/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask;

import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.text.Collator;
import java.util.List;
import java.util.Locale;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.benchmark.BenchmarkTestCase;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.benchmark.byTask.stats.TaskStats;
import org.apache.lucene.benchmark.byTask.tasks.CountingSearchTestTask;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.collation.CollationKeyAnalyzer;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;

/**
 * Test very simply that perf tasks - simple algorithms - are doing what they should.
 */
public class TestPerfTasksLogic extends BenchmarkTestCase {

  @Override
  public void setUp() throws Exception {
    super.setUp();
    copyToWorkDir("reuters.first20.lines.txt");
    copyToWorkDir("test-mapping-ISOLatin1Accent-partial.txt");
  }

  /**
   * Test index creation logic
   */
  public void testIndexAndSearchTasks() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "ResetSystemErase",
        "CreateIndex",
        "{ AddDoc } : 1000",
        "ForceMerge(1)",
        "CloseIndex",
        "OpenReader",
        "{ CountingSearchTest } : 200",
        "CloseReader",
        "[ CountingSearchTest > : 70",
        "[ CountingSearchTest > : 9",
    };

    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;

    // 3. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 4. test specific checks after the benchmark run completed.
assertEquals("TestSearchTask was supposed to be called!",279,CountingSearchTestTask.numSearches); assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory())); // now we should be able to open the index for write. IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())) .setOpenMode(OpenMode.APPEND)); iw.close(); IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory()); assertEquals("1000 docs were added to the index, this is what we expect to find!",1000,ir.numDocs()); ir.close(); } /** * Test timed sequence task. */ public void testTimedSearchTask() throws Exception { String algLines[] = { "log.step=100000", "ResetSystemErase", "CreateIndex", "{ AddDoc } : 100", "ForceMerge(1)", "CloseIndex", "OpenReader", "{ CountingSearchTest } : .5s", "CloseReader", }; CountingSearchTestTask.numSearches = 0; execBenchmark(algLines); assertTrue(CountingSearchTestTask.numSearches > 0); long elapsed = CountingSearchTestTask.prevLastMillis - CountingSearchTestTask.startMillis; assertTrue("elapsed time was " + elapsed + " msec", elapsed <= 1500); } // disabled until we fix BG thread prio -- this test // causes build to hang public void testBGSearchTaskThreads() throws Exception { String algLines[] = { "log.time.step.msec = 100", "log.step=100000", "ResetSystemErase", "CreateIndex", "{ AddDoc } : 1000", "ForceMerge(1)", "CloseIndex", "OpenReader", "{", " [ \"XSearch\" { CountingSearchTest > : * ] : 2 &-1", " Wait(0.5)", "}", "CloseReader", "RepSumByPref X" }; CountingSearchTestTask.numSearches = 0; execBenchmark(algLines); // NOTE: cannot assert this, because on a super-slow // system, it could be after waiting 0.5 seconds that // the search threads hadn't yet succeeded in starting // up and then they start up and do no searching: //assertTrue(CountingSearchTestTask.numSearches > 0); } /** * Test Exhasting Doc Maker logic */ public void testExhaustContentSource() throws Exception { // 1. alg definition (required in every "logic" test) String algLines[] = { "# ----- properties ", "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource", "content.source.log.step=1", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", "# ----- alg ", "CreateIndex", "{ AddDoc } : * ", "ForceMerge(1)", "CloseIndex", "OpenReader", "{ CountingSearchTest } : 100", "CloseReader", "[ CountingSearchTest > : 30", "[ CountingSearchTest > : 9", }; // 2. we test this value later CountingSearchTestTask.numSearches = 0; // 3. execute the algorithm (required in every "logic" test) Benchmark benchmark = execBenchmark(algLines); // 4. test specific checks after the benchmark run completed. assertEquals("TestSearchTask was supposed to be called!",139,CountingSearchTestTask.numSearches); assertTrue("Index does not exist?...!", DirectoryReader.indexExists(benchmark.getRunData().getDirectory())); // now we should be able to open the index for write. IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(), new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND)); iw.close(); IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory()); assertEquals("1 docs were added to the index, this is what we expect to find!",1,ir.numDocs()); ir.close(); } // LUCENE-1994: test thread safety of SortableSingleDocMaker public void testDocMakerThreadSafety() throws Exception { // 1. 
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource",
        "doc.term.vector=false",
        "log.step.AddDoc=10000",
        "content.source.forever=true",
        "directory=RAMDirectory",
        "doc.reuse.fields=false",
        "doc.stored=true",
        "doc.tokenized=false",
        "doc.index.props=true",
        "# ----- alg ",
        "CreateIndex",
        "[ { AddDoc > : 250 ] : 4",
        "CloseIndex",
    };

    // 2. we test this value later
    CountingSearchTestTask.numSearches = 0;

    // 3. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    DirectoryReader r = DirectoryReader.open(benchmark.getRunData().getDirectory());
    final int maxDoc = r.maxDoc();
    assertEquals(1000, maxDoc);
    for(int i=0;i<1000;i++) {
      assertNotNull("doc " + i + " has null country", r.document(i).getField("country"));
    }
    r.close();
  }

  /**
   * Test Parallel Doc Maker logic (for LUCENE-940)
   */
  public void testParallelDocMaker() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=FSDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "# ----- alg ",
        "CreateIndex",
        "[ { AddDoc } : * ] : 4 ",
        "CloseIndex",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test WriteLineDoc and LineDocSource.
   */
  public void testLineDocFile() throws Exception {
    Path lineFile = createTempFile("test.reuters.lines", ".txt");

    // We will call WriteLineDocs this many times
    final int NUM_TRY_DOCS = 50;

    // Creates a line file with first 50 docs from SingleDocSource
    String algLines1[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource",
        "content.source.forever=true",
        "line.file.out=" + lineFile.toAbsolutePath().toString().replace('\\', '/'),
        "# ----- alg ",
        "{WriteLineDoc()}:" + NUM_TRY_DOCS,
    };

    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);

    BufferedReader r = Files.newBufferedReader(lineFile, StandardCharsets.UTF_8);
    int numLines = 0;
    String line;
    while((line = r.readLine()) != null) {
      if (numLines==0 && line.startsWith(WriteLineDocTask.FIELDS_HEADER_INDICATOR)) {
        continue; // do not count the header line as a doc
      }
      numLines++;
    }
    r.close();
    assertEquals("did not see the right number of docs; should be " + NUM_TRY_DOCS + " but was " + numLines,
        NUM_TRY_DOCS, numLines);

    // Index the line docs
    String algLines2[] = {
        "# ----- properties ",
        "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + lineFile.toAbsolutePath().toString().replace('\\', '/'),
        "content.source.forever=false",
        "doc.reuse.fields=false",
        "ram.flush.mb=4",
        "# ----- alg ",
        "ResetSystemErase",
        "CreateIndex",
        "{AddDoc}: *",
        "CloseIndex",
    };

    // Run algo
    benchmark = execBenchmark(algLines2);

    // now we should be able to open the index for write.
    IndexWriter iw = new IndexWriter(benchmark.getRunData().getDirectory(),
        new IndexWriterConfig(new MockAnalyzer(random())).setOpenMode(OpenMode.APPEND));
    iw.close();
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals(numLines + " lines were created but " + ir.numDocs() + " docs are in the index",
        numLines, ir.numDocs());
    ir.close();
  }

  /**
   * Test ReadTokensTask
   */
  public void testReadTokens() throws Exception {
    // We will call ReadTokens on this many docs
    final int NUM_DOCS = 20;

    // Read tokens from first NUM_DOCS docs from Reuters and
    // then build index from the same docs
    String algLines1[] = {
        "# ----- properties ",
        "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "# ----- alg ",
        "{ReadTokens}: " + NUM_DOCS,
        "ResetSystemErase",
        "CreateIndex",
        "{AddDoc}: " + NUM_DOCS,
        "CloseIndex",
    };

    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);

    List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();

    // Count how many tokens all ReadTokens saw
    int totalTokenCount1 = 0;
    for (final TaskStats stat : stats) {
      if (stat.getTask().getName().equals("ReadTokens")) {
        totalTokenCount1 += stat.getCount();
      }
    }

    // Separately count how many tokens are actually in the index:
    IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals(NUM_DOCS, reader.numDocs());

    int totalTokenCount2 = 0;

    Fields fields = MultiFields.getFields(reader);

    for (String fieldName : fields) {
      if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD)
          || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
        continue;
      }
      Terms terms = fields.terms(fieldName);
      if (terms == null) {
        continue;
      }
      TermsEnum termsEnum = terms.iterator();
      PostingsEnum docs = null;
      while(termsEnum.next() != null) {
        docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
        while(docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          totalTokenCount2 += docs.freq();
        }
      }
    }
    reader.close();

    // Make sure they are the same
    assertEquals(totalTokenCount1, totalTokenCount2);
  }

  /**
   * Test that " {[AddDoc(4000)]: 4} : * " works correctly (for LUCENE-941)
   */
  public void testParallelExhausted() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "CreateIndex",
        "{ [ AddDoc]: 4} : * ",
        "ResetInputs ",
        "{ [ AddDoc]: 4} : * ",
        "CloseIndex",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 2 * 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test that exhaust in loop works as expected (LUCENE-1115).
   */
  public void testExhaustedLooped() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "task.max.depth.log=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        " ResetSystemErase",
        " CreateIndex",
        " { \"AddDocs\" AddDoc > : * ",
        " CloseIndex",
        "} : 2",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test that we can close IndexWriter with argument "false".
   */
  public void testCloseIndexFalse() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "ram.flush.mb=-1",
        "max.buffered=2",
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        " ResetSystemErase",
        " CreateIndex",
        " { \"AddDocs\" AddDoc > : * ",
        " CloseIndex(false)",
        "} : 2",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  public static class MyMergeScheduler extends SerialMergeScheduler {
    boolean called;
    public MyMergeScheduler() {
      super();
      called = true;
    }
  }

  /**
   * Test that we can set the merge scheduler.
   */
  public void testMergeScheduler() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.scheduler=" + MyMergeScheduler.class.getName(),
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        " ResetSystemErase",
        " CreateIndex",
        " { \"AddDocs\" AddDoc > : * ",
        "} : 2",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    assertTrue("did not use the specified MergeScheduler",
        ((MyMergeScheduler) benchmark.getRunData().getIndexWriter().getConfig().getMergeScheduler()).called);
    benchmark.getRunData().getIndexWriter().close();

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  public static class MyMergePolicy extends LogDocMergePolicy {
    boolean called;
    public MyMergePolicy() {
      called = true;
    }
  }

  /**
   * Test that we can set the merge policy.
   */
  public void testMergePolicy() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=" + MyMergePolicy.class.getName(),
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        " ResetSystemErase",
        " CreateIndex",
        " { \"AddDocs\" AddDoc > : * ",
        "} : 2",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
    assertTrue("did not use the specified MergePolicy",
        ((MyMergePolicy) benchmark.getRunData().getIndexWriter().getConfig().getMergePolicy()).called);
    benchmark.getRunData().getIndexWriter().close();

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();
  }

  /**
   * Test that IndexWriter settings stick.
   */
  public void testIndexWriterSettings() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=2",
        "compound=cmpnd:true:false",
        "doc.term.vector=vector:false:true",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "merge.factor=3",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        " ResetSystemErase",
        " CreateIndex",
        " { \"AddDocs\" AddDoc > : * ",
        " NewRound",
        "} : 2",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);
    final IndexWriter writer = benchmark.getRunData().getIndexWriter();
    assertEquals(2, writer.getConfig().getMaxBufferedDocs());
    assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, (int) writer.getConfig().getRAMBufferSizeMB());
    assertEquals(3, ((LogMergePolicy) writer.getConfig().getMergePolicy()).getMergeFactor());
    assertEquals(0.0d, writer.getConfig().getMergePolicy().getNoCFSRatio(), 0.0);
    writer.close();
    Directory dir = benchmark.getRunData().getDirectory();
    IndexReader reader = DirectoryReader.open(dir);
    Fields tfv = reader.getTermVectors(0);
    assertNotNull(tfv);
    assertTrue(tfv.size() > 0);
    reader.close();
  }

  /**
   * Test indexing with facets tasks.
   */
  public void testIndexingWithFacets() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=100",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "doc.stored=false",
        "merge.factor=3",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "ResetSystemErase",
        "CreateIndex",
        "CreateTaxonomyIndex",
        "{ \"AddDocs\" AddFacetedDoc > : * ",
        "CloseIndex",
        "CloseTaxonomyIndex",
        "OpenTaxonomyReader",
    };
    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    PerfRunData runData = benchmark.getRunData();
    assertNull("taxo writer was not properly closed", runData.getTaxonomyWriter());
    TaxonomyReader taxoReader = runData.getTaxonomyReader();
    assertNotNull("taxo reader was not opened", taxoReader);
    assertTrue("nothing was added to the taxonomy (expecting root and at least one additional category)",
        taxoReader.getSize() > 1);
    taxoReader.close();
  }

  /**
   * Test that we can call forceMerge(maxNumSegments).
   */
  public void testForceMerge() throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "ram.flush.mb=-1",
        "max.buffered=3",
        "doc.term.vector=false",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "merge.policy=org.apache.lucene.index.LogDocMergePolicy",
        "doc.stored=false",
        "doc.tokenized=false",
        "debug.level=1",
        "# ----- alg ",
        "{ \"Rounds\"",
        " ResetSystemErase",
        " CreateIndex",
        " { \"AddDocs\" AddDoc > : * ",
        " ForceMerge(3)",
        " CloseIndex()",
        "} : 2",
    };

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test number of docs in the index
    IndexReader ir = DirectoryReader.open(benchmark.getRunData().getDirectory());
    int ndocsExpected = 20; // first 20 reuters docs.
    assertEquals("wrong number of docs in the index!", ndocsExpected, ir.numDocs());
    ir.close();

    // Make sure we have 3 segments:
    SegmentInfos infos = SegmentInfos.readLatestCommit(benchmark.getRunData().getDirectory());
    assertEquals(3, infos.size());
  }

  /**
   * Test disabling task count (LUCENE-1136).
   */
  public void testDisableCounting() throws Exception {
    doTestDisableCounting(true);
    doTestDisableCounting(false);
  }

  private void doTestDisableCounting(boolean disable) throws Exception {
    // 1. alg definition (required in every "logic" test)
    String algLines[] = disableCountingLines(disable);

    // 2. execute the algorithm (required in every "logic" test)
    Benchmark benchmark = execBenchmark(algLines);

    // 3. test counters
    int n = disable ? 0 : 1;
    int nChecked = 0;
    for (final TaskStats stats : benchmark.getRunData().getPoints().taskStats()) {
      String taskName = stats.getTask().getName();
      if (taskName.equals("Rounds")) {
        assertEquals("Wrong total count!", 20 + 2 * n, stats.getCount());
        nChecked++;
      } else if (taskName.equals("CreateIndex")) {
        assertEquals("Wrong count for CreateIndex!", n, stats.getCount());
        nChecked++;
      } else if (taskName.equals("CloseIndex")) {
        assertEquals("Wrong count for CloseIndex!", n, stats.getCount());
        nChecked++;
      }
    }
    assertEquals("Missing some tasks to check!", 3, nChecked);
  }

  private String[] disableCountingLines (boolean disable) {
"-" : ""; return new String[] { "# ----- properties ", "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", "docs.file=" + getReuters20LinesFile(), "content.source.log.step=30", "doc.term.vector=false", "content.source.forever=false", "directory=RAMDirectory", "doc.stored=false", "doc.tokenized=false", "task.max.depth.log=1", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " "+dis+"CreateIndex", // optionally disable counting here " { \"AddDocs\" AddDoc > : * ", " "+dis+" CloseIndex", // optionally disable counting here (with extra blanks) "}", "RepSumByName", }; } /** * Test that we can change the Locale in the runData, * that it is parsed as we expect. */ public void testLocale() throws Exception { // empty Locale: clear it (null) Benchmark benchmark = execBenchmark(getLocaleConfig("")); assertNull(benchmark.getRunData().getLocale()); // ROOT locale benchmark = execBenchmark(getLocaleConfig("ROOT")); assertEquals(new Locale(""), benchmark.getRunData().getLocale()); // specify just a language benchmark = execBenchmark(getLocaleConfig("de")); assertEquals(new Locale("de"), benchmark.getRunData().getLocale()); // specify language + country benchmark = execBenchmark(getLocaleConfig("en,US")); assertEquals(new Locale("en", "US"), benchmark.getRunData().getLocale()); // specify language + country + variant benchmark = execBenchmark(getLocaleConfig("no,NO,NY")); assertEquals(new Locale("no", "NO", "NY"), benchmark.getRunData().getLocale()); } private String[] getLocaleConfig(String localeParam) { String algLines[] = { "# ----- properties ", "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", "docs.file=" + getReuters20LinesFile(), "content.source.log.step=3", "content.source.forever=false", "directory=RAMDirectory", "# ----- alg ", "{ \"Rounds\"", " ResetSystemErase", " NewLocale(" + localeParam + ")", " CreateIndex", " { \"AddDocs\" AddDoc > : * ", " NewRound", "} : 1", }; return algLines; } /** * Test that we can create CollationAnalyzers. 
   */
  public void testCollator() throws Exception {
    // ROOT locale
    Benchmark benchmark = execBenchmark(getCollatorConfig("ROOT", "impl:jdk"));
    CollationKeyAnalyzer expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");

    // specify just a language
    benchmark = execBenchmark(getCollatorConfig("de", "impl:jdk"));
    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("de")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");

    // specify language + country
    benchmark = execBenchmark(getCollatorConfig("en,US", "impl:jdk"));
    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("en", "US")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");

    // specify language + country + variant
    benchmark = execBenchmark(getCollatorConfig("no,NO,NY", "impl:jdk"));
    expected = new CollationKeyAnalyzer(Collator.getInstance(new Locale("no", "NO", "NY")));
    assertEqualCollation(expected, benchmark.getRunData().getAnalyzer(), "foobar");
  }

  private void assertEqualCollation(Analyzer a1, Analyzer a2, String text) throws Exception {
    TokenStream ts1 = a1.tokenStream("bogus", text);
    TokenStream ts2 = a2.tokenStream("bogus", text);
    ts1.reset();
    ts2.reset();
    TermToBytesRefAttribute termAtt1 = ts1.addAttribute(TermToBytesRefAttribute.class);
    TermToBytesRefAttribute termAtt2 = ts2.addAttribute(TermToBytesRefAttribute.class);
    assertTrue(ts1.incrementToken());
    assertTrue(ts2.incrementToken());
    BytesRef bytes1 = termAtt1.getBytesRef();
    BytesRef bytes2 = termAtt2.getBytesRef();
    assertEquals(bytes1, bytes2);
    assertFalse(ts1.incrementToken());
    assertFalse(ts2.incrementToken());
    ts1.close();
    ts2.close();
  }

  private String[] getCollatorConfig(String localeParam, String collationParam) {
    String algLines[] = {
        "# ----- properties ",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "content.source.log.step=3",
        "content.source.forever=false",
        "directory=RAMDirectory",
        "# ----- alg ",
        "{ \"Rounds\"",
        " ResetSystemErase",
        " NewLocale(" + localeParam + ")",
        " NewCollationAnalyzer(" + collationParam + ")",
        " CreateIndex",
        " { \"AddDocs\" AddDoc > : * ",
        " NewRound",
        "} : 1",
    };
    return algLines;
  }

  /**
   * Test that we can create shingle analyzers using AnalyzerFactory.
   */
  public void testShingleAnalyzer() throws Exception {
    String text = "one,two,three, four five six";

    // StandardTokenizer, maxShingleSize, and outputUnigrams
    Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig(
        "shingle-analyzer", "StandardTokenizer,ShingleFilter"));
    benchmark.getRunData().getAnalyzer().tokenStream("bogus", text).close();
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
        new String[] { "one", "one two", "two", "two three", "three", "three four",
                       "four", "four five", "five", "five six", "six" });

    // StandardTokenizer, maxShingleSize = 3, and outputUnigrams = false
    benchmark = execBenchmark(getAnalyzerFactoryConfig(
        "shingle-analyzer", "StandardTokenizer,ShingleFilter(maxShingleSize:3,outputUnigrams:false)"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
        new String[] { "one two", "one two three", "two three", "two three four",
                       "three four", "three four five", "four five", "four five six", "five six" });

    // WhitespaceTokenizer, default maxShingleSize and outputUnigrams
    benchmark = execBenchmark(getAnalyzerFactoryConfig(
        "shingle-analyzer", "WhitespaceTokenizer,ShingleFilter"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
        new String[] { "one,two,three,", "one,two,three, four", "four", "four five",
                       "five", "five six", "six" });

    // WhitespaceTokenizer, maxShingleSize=3 and outputUnigrams=false
    benchmark = execBenchmark(getAnalyzerFactoryConfig(
        "shingle-factory", "WhitespaceTokenizer,ShingleFilter(outputUnigrams:false,maxShingleSize:3)"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
        new String[] { "one,two,three, four", "one,two,three, four five",
                       "four five", "four five six", "five six" });
  }

  private String[] getAnalyzerFactoryConfig(String name, String params) {
    final String singleQuoteEscapedName = name.replaceAll("'", "\\\\'");
    String algLines[] = {
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "work.dir=" + getWorkDir().toAbsolutePath().toString().replaceAll("\\\\", "/"), // Fix Windows path
        "content.source.forever=false",
        "directory=RAMDirectory",
        "AnalyzerFactory(name:'" + singleQuoteEscapedName + "', " + params + ")",
        "NewAnalyzer('" + singleQuoteEscapedName + "')",
        "CreateIndex",
        "{ \"AddDocs\" AddDoc > : * "
    };
    return algLines;
  }

  public void testAnalyzerFactory() throws Exception {
    String text = "Fortieth, Quarantième, Cuadragésimo";
    Benchmark benchmark = execBenchmark(getAnalyzerFactoryConfig(
        "ascii folded, pattern replaced, standard tokenized, downcased, bigrammed.'analyzer'",
        "positionIncrementGap:100,offsetGap:1111,"
        + "MappingCharFilter(mapping:'test-mapping-ISOLatin1Accent-partial.txt'),"
        + "PatternReplaceCharFilterFactory(pattern:'e(\\\\\\\\S*)m',replacement:\"$1xxx$1\"),"
        + "StandardTokenizer,LowerCaseFilter,NGramTokenFilter(minGramSize:2,maxGramSize:2)"));
    BaseTokenStreamTestCase.assertAnalyzesTo(benchmark.getRunData().getAnalyzer(), text,
        new String[] { "fo", "or", "rt", "ti", "ie", "et", "th", "qu", "ua", "ar", "ra", "an", "nt", "ti",
                       "ix", "xx", "xx", "xe", "cu", "ua", "ad", "dr", "ra", "ag", "gs", "si",
                       "ix", "xx", "xx", "xs", "si", "io"});
  }

  private String getReuters20LinesFile() {
    return getWorkDirResourcePath("reuters.first20.lines.txt");
  }
}