/* LanguageTool, a natural language style checker * Copyright (C) 2014 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.languagemodel; import org.junit.Ignore; import org.junit.Test; import org.languagetool.JLanguageTool; import java.io.File; import java.net.URL; import java.util.Arrays; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; public class LuceneSingleIndexLanguageModelTest extends LanguageModelTest { @Test public void testLanguageModel() throws Exception { URL ngramUrl = JLanguageTool.getDataBroker().getFromResourceDirAsUrl("/yy/ngram-index"); try (LuceneLanguageModel model = new LuceneLanguageModel(new File(ngramUrl.getFile()))) { assertThat(model.getCount("the"), is(55L)); assertThat(model.getCount(Arrays.asList("the", "nice")), is(3L)); assertThat(model.getCount(Arrays.asList("the", "nice", "building")), is(1L)); assertThat(model.getCount("not-in-here"), is(0L)); assertThat(model.getTotalTokenCount(), is(3L)); } } /** * Some values for average time per lookup on 2grams on a 3.7GB Lucene 4.8.1 index with 118,941,740 docs: * -no data in OS cache, index on external USB disk: 17626µs = 17ms * -no data in OS cache, index on SSD: 739µs = <0ms * -all data in OS cache (by running the test more than once): 163µs = <0ms * * Some values for average time per lookup on 3grams on a 7.0GB Lucene 4.9 index: * -no data in OS cache, index on external USB disk: 13256µs = 13ms * -no data in OS cache, index on SSD: 791µs = <0ms * -all(?) data in OS cache (by running the test more than once): 162µs = <0ms * * The tests have been performed on a Dell XSP13 (i7-3537U CPU) under Ubuntu 12.04, with Java 1.7. */ @Test @Ignore("for interactive use only") public void testPerformance() throws Exception { // 2grams: //LanguageModel model = new LuceneLanguageModel(new File("/media/Data/google-ngram/2gram/lucene-index/merged/")); //super.testPerformance(model, 2); // 3grams: //LanguageModel model = new LuceneLanguageModel(new File("/media/Data/google-ngram/3gram/aggregated/lucene-index/merged/")); LuceneLanguageModel model = new LuceneLanguageModel(new File("/data/google-gram-index/")); super.testPerformance(model, 3); } }