/* LanguageTool, a natural language style checker
* Copyright (C) 2015 Daniel Naber (http://www.danielnaber.de)
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
* USA
*/
package org.languagetool.dev.bigdata;
import org.junit.Ignore;
import org.junit.Test;
import org.languagetool.languagemodel.LuceneLanguageModel;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import static org.junit.Assert.fail;
/**
 * Manual sanity check for a locally available ngram index: looks up a fixed
 * list of one- to four-token phrases and fails as soon as one of them occurs
 * fewer than {@link #MIN_COUNT} times. The test is ignored by default because
 * it needs the index directory configured in {@link #NGRAM_DIR}.
 */
public class LanguageModelSanityTest {

  /** Directory of the ngram index that is opened by the test. */
  private static final String NGRAM_DIR = "/data/google-ngram-index";

  /** Smallest occurrence count that is still accepted for a phrase. */
  private static final long MIN_COUNT = 10;

  /**
   * Opens the index at {@link #NGRAM_DIR} and checks a set of phrases that
   * cover lowercase/uppercase words, digits, punctuation and the
   * {@code _START_}/{@code _END_} tokens (presumably sentence boundary
   * markers of the index - confirm against the index format).
   *
   * @throws IOException declared by the original signature; presumably raised
   *     when the index cannot be opened
   */
  @Test
  @Ignore("Interactive use only, requires local ngram index")
  public void testEnglishLanguageModelSanity() throws IOException {
    // try-with-resources so the index is released even when an assertion fails
    try (LuceneLanguageModel lm = new LuceneLanguageModel(new File(NGRAM_DIR))) {
      // 1gram:
      assertMatches(lm, "the");
      assertMatches(lm, "The");
      assertMatches(lm, ",");
      assertMatches(lm, "0");
      assertMatches(lm, "1");
      assertMatches(lm, "2");
      assertMatches(lm, "3");
      assertMatches(lm, "4");
      assertMatches(lm, "5");
      assertMatches(lm, "6");
      assertMatches(lm, "7");
      assertMatches(lm, "8");
      assertMatches(lm, "9");
      assertMatches(lm, ":");
      assertMatches(lm, "(");
      assertMatches(lm, ")");
      assertMatches(lm, "£");
      // 2gram:
      assertMatches(lm, "the man");
      assertMatches(lm, "The man");
      assertMatches(lm, "_START_ the");
      assertMatches(lm, "_START_ The");
      assertMatches(lm, "it _END_");
      assertMatches(lm, "it .");
      assertMatches(lm, "Also ,");
      assertMatches(lm, "is 0");
      assertMatches(lm, ": it");
      assertMatches(lm, "( it");
      assertMatches(lm, "it )");
      assertMatches(lm, "£ 5");
      // 3gram:
      assertMatches(lm, "the man who");
      assertMatches(lm, "The man who");
      assertMatches(lm, "_START_ The man");
      assertMatches(lm, "it was _END_");
      assertMatches(lm, "it was .");
      assertMatches(lm, "Also , it");
      assertMatches(lm, "it is 0");
      assertMatches(lm, ": it is");
      assertMatches(lm, "( it is");
      assertMatches(lm, "it is )");
      assertMatches(lm, "five - pound");
      assertMatches(lm, "is £ 5");
      assertMatches(lm, "it 's a");
      assertMatches(lm, "it ' s");
      // 4gram:
      assertMatches(lm, "the man who could");
      assertMatches(lm, "The man who could");
      assertMatches(lm, "five - pound note");
      assertMatches(lm, "_START_ The man who");
      assertMatches(lm, "which it was _END_");
      assertMatches(lm, "Also , it is");
      assertMatches(lm, "when it is 0");
      assertMatches(lm, "it is £ 5");
    }
  }

  /**
   * Splits {@code phrase} at single spaces, asks the language model for the
   * count of the resulting token list, prints tokens and count to stdout and
   * fails the test if the count is below {@link #MIN_COUNT}.
   *
   * @param lm language model to query
   * @param phrase tokens of the ngram, separated by single spaces
   */
  private void assertMatches(LuceneLanguageModel lm, String phrase) {
    String[] words = phrase.split(" ");
    long count = lm.getCount(Arrays.asList(words));
    System.out.println(Arrays.toString(words) + ": " + count);
    if (count < MIN_COUNT) {
      fail("Only got " + count + " matches for " + Arrays.toString(words));
    }
  }

}