/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.lucene.search.spell; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import java.util.Locale; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.MockAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.Directory; import org.apache.lucene.util.English; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.NamedThreadFactory; /** * Spell checker test case */ public class TestSpellChecker extends LuceneTestCase { private SpellCheckerMock spellChecker; private Directory userindex, spellindex; private Analyzer analyzer; private List<IndexSearcher> searchers; @Override public void setUp() throws Exception { super.setUp(); //create a user index userindex = newDirectory(); analyzer = new MockAnalyzer(random()); IndexWriter writer = new IndexWriter(userindex, new IndexWriterConfig(analyzer)); for (int i = 0; i < 1000; i++) { Document doc = new Document(); doc.add(newTextField("field1", English.intToEnglish(i), Field.Store.YES)); doc.add(newTextField("field2", English.intToEnglish(i + 1), Field.Store.YES)); // + word thousand doc.add(newTextField("field3", "fvei" + (i % 2 == 0 ? " five" : ""), Field.Store.YES)); // + word thousand writer.addDocument(doc); } { Document doc = new Document(); doc.add(newTextField("field1", "eight", Field.Store.YES)); // "eight" in // the index // twice writer.addDocument(doc); } { Document doc = new Document(); doc .add(newTextField("field1", "twenty-one twenty-one", Field.Store.YES)); // "twenty-one" in the index thrice writer.addDocument(doc); } { Document doc = new Document(); doc.add(newTextField("field1", "twenty", Field.Store.YES)); // "twenty" // in the // index // twice writer.addDocument(doc); } writer.close(); searchers = Collections.synchronizedList(new ArrayList<IndexSearcher>()); // create the spellChecker spellindex = newDirectory(); spellChecker = new SpellCheckerMock(spellindex); } @Override public void tearDown() throws Exception { userindex.close(); if (!spellChecker.isClosed()) spellChecker.close(); spellindex.close(); analyzer.close(); super.tearDown(); } public void testBuild() throws IOException { IndexReader r = DirectoryReader.open(userindex); spellChecker.clearIndex(); addwords(r, spellChecker, "field1"); int num_field1 = this.numdoc(); addwords(r, spellChecker, "field2"); int num_field2 = this.numdoc(); assertEquals(num_field2, num_field1 + 1); assertLastSearcherOpen(4); checkCommonSuggestions(r); checkLevenshteinSuggestions(r); spellChecker.setStringDistance(new JaroWinklerDistance()); spellChecker.setAccuracy(0.8f); checkCommonSuggestions(r); checkJaroWinklerSuggestions(); // the accuracy is set to 0.8 by default, but the best result has a score of 0.925 String[] similar = spellChecker.suggestSimilar("fvie", 2, 0.93f); assertTrue(similar.length == 0); similar = spellChecker.suggestSimilar("fvie", 2, 0.92f); assertTrue(similar.length == 1); similar = spellChecker.suggestSimilar("fiv", 2); assertTrue(similar.length > 0); assertEquals(similar[0], "five"); spellChecker.setStringDistance(new NGramDistance(2)); spellChecker.setAccuracy(0.5f); checkCommonSuggestions(r); checkNGramSuggestions(); r.close(); } public void testComparator() throws Exception { IndexReader r = DirectoryReader.open(userindex); Directory compIdx = newDirectory(); SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator()); addwords(r, compareSP, "field3"); String[] similar = compareSP.suggestSimilar("fvie", 2, r, "field3", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertTrue(similar.length == 2); //five and fvei have the same score, but different frequencies. assertEquals("fvei", similar[0]); assertEquals("five", similar[1]); r.close(); if (!compareSP.isClosed()) compareSP.close(); compIdx.close(); } public void testBogusField() throws Exception { IndexReader r = DirectoryReader.open(userindex); Directory compIdx = newDirectory(); SpellChecker compareSP = new SpellCheckerMock(compIdx, new LevensteinDistance(), new SuggestWordFrequencyComparator()); addwords(r, compareSP, "field3"); String[] similar = compareSP.suggestSimilar("fvie", 2, r, "bogusFieldBogusField", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(0, similar.length); r.close(); if (!compareSP.isClosed()) compareSP.close(); compIdx.close(); } public void testSuggestModes() throws Exception { IndexReader r = DirectoryReader.open(userindex); spellChecker.clearIndex(); addwords(r, spellChecker, "field1"); { String[] similar = spellChecker.suggestSimilar("eighty", 2, r, "field1", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(1, similar.length); assertEquals("eighty", similar[0]); } { String[] similar = spellChecker.suggestSimilar("eight", 2, r, "field1", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(1, similar.length); assertEquals("eight", similar[0]); } { String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1", SuggestMode.SUGGEST_MORE_POPULAR); assertEquals(5, similar.length); assertEquals("eight", similar[0]); } { String[] similar = spellChecker.suggestSimilar("twenty", 5, r, "field1", SuggestMode.SUGGEST_MORE_POPULAR); assertEquals(1, similar.length); assertEquals("twenty-one", similar[0]); } { String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1", SuggestMode.SUGGEST_MORE_POPULAR); assertEquals(0, similar.length); } { String[] similar = spellChecker.suggestSimilar("eighty", 5, r, "field1", SuggestMode.SUGGEST_ALWAYS); assertEquals(5, similar.length); assertEquals("eight", similar[0]); } { String[] similar = spellChecker.suggestSimilar("eight", 5, r, "field1", SuggestMode.SUGGEST_ALWAYS); assertEquals(5, similar.length); assertEquals("eighty", similar[0]); } r.close(); } private void checkCommonSuggestions(IndexReader r) throws IOException { String[] similar = spellChecker.suggestSimilar("fvie", 2); assertTrue(similar.length > 0); assertEquals(similar[0], "five"); similar = spellChecker.suggestSimilar("five", 2); if (similar.length > 0) { assertFalse(similar[0].equals("five")); // don't suggest a word for itself } similar = spellChecker.suggestSimilar("fiv", 2); assertTrue(similar.length > 0); assertEquals(similar[0], "five"); similar = spellChecker.suggestSimilar("fives", 2); assertTrue(similar.length > 0); assertEquals(similar[0], "five"); assertTrue(similar.length > 0); similar = spellChecker.suggestSimilar("fie", 2); assertEquals(similar[0], "five"); // test restraint to a field similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(0, similar.length); // there isn't the term thousand in the field field1 similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(1, similar.length); // there is the term thousand in the field field2 } private void checkLevenshteinSuggestions(IndexReader r) throws IOException { // test small word String[] similar = spellChecker.suggestSimilar("fvie", 2); assertEquals(1, similar.length); assertEquals(similar[0], "five"); similar = spellChecker.suggestSimilar("five", 2); assertEquals(1, similar.length); assertEquals(similar[0], "nine"); // don't suggest a word for itself similar = spellChecker.suggestSimilar("fiv", 2); assertEquals(1, similar.length); assertEquals(similar[0], "five"); similar = spellChecker.suggestSimilar("ive", 2); assertEquals(2, similar.length); assertEquals(similar[0], "five"); assertEquals(similar[1], "nine"); similar = spellChecker.suggestSimilar("fives", 2); assertEquals(1, similar.length); assertEquals(similar[0], "five"); similar = spellChecker.suggestSimilar("fie", 2); assertEquals(2, similar.length); assertEquals(similar[0], "five"); assertEquals(similar[1], "nine"); similar = spellChecker.suggestSimilar("fi", 2); assertEquals(1, similar.length); assertEquals(similar[0], "five"); // test restraint to a field similar = spellChecker.suggestSimilar("tousand", 10, r, "field1", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(0, similar.length); // there isn't the term thousand in the field field1 similar = spellChecker.suggestSimilar("tousand", 10, r, "field2", SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); assertEquals(1, similar.length); // there is the term thousand in the field field2 similar = spellChecker.suggestSimilar("onety", 2); assertEquals(2, similar.length); assertEquals(similar[0], "ninety"); assertEquals(similar[1], "one"); // should not throw exception spellChecker.suggestSimilar("tousand", 10, r, null, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX); } private void checkJaroWinklerSuggestions() throws IOException { String[] similar = spellChecker.suggestSimilar("onety", 2); assertEquals(2, similar.length); assertEquals(similar[0], "one"); assertEquals(similar[1], "ninety"); } private void checkNGramSuggestions() throws IOException { String[] similar = spellChecker.suggestSimilar("onety", 2); assertEquals(2, similar.length); assertEquals(similar[0], "one"); assertEquals(similar[1], "ninety"); } private void addwords(IndexReader r, SpellChecker sc, String field) throws IOException { long time = System.currentTimeMillis(); sc.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(null), false); time = System.currentTimeMillis() - time; //System.out.println("time to build " + field + ": " + time); } private int numdoc() throws IOException { IndexReader rs = DirectoryReader.open(spellindex); int num = rs.numDocs(); assertTrue(num != 0); //System.out.println("num docs: " + num); rs.close(); return num; } public void testClose() throws IOException { IndexReader r = DirectoryReader.open(userindex); spellChecker.clearIndex(); String field = "field1"; addwords(r, spellChecker, "field1"); int num_field1 = this.numdoc(); addwords(r, spellChecker, "field2"); int num_field2 = this.numdoc(); assertEquals(num_field2, num_field1 + 1); checkCommonSuggestions(r); assertLastSearcherOpen(4); spellChecker.close(); assertSearchersClosed(); expectThrows(AlreadyClosedException.class, () -> { spellChecker.close(); }); expectThrows(AlreadyClosedException.class, () -> { checkCommonSuggestions(r); }); expectThrows(AlreadyClosedException.class, () -> { spellChecker.clearIndex(); }); expectThrows(AlreadyClosedException.class, () -> { spellChecker.indexDictionary(new LuceneDictionary(r, field), newIndexWriterConfig(null), false); }); expectThrows(AlreadyClosedException.class, () -> { spellChecker.setSpellIndex(spellindex); }); assertEquals(4, searchers.size()); assertSearchersClosed(); r.close(); } /* * tests if the internally shared indexsearcher is correctly closed * when the spellchecker is concurrently accessed and closed. */ public void testConcurrentAccess() throws IOException, InterruptedException { assertEquals(1, searchers.size()); final IndexReader r = DirectoryReader.open(userindex); spellChecker.clearIndex(); assertEquals(2, searchers.size()); addwords(r, spellChecker, "field1"); assertEquals(3, searchers.size()); int num_field1 = this.numdoc(); addwords(r, spellChecker, "field2"); assertEquals(4, searchers.size()); int num_field2 = this.numdoc(); assertEquals(num_field2, num_field1 + 1); int numThreads = 5 + random().nextInt(5); ExecutorService executor = Executors.newFixedThreadPool(numThreads, new NamedThreadFactory("testConcurrentAccess")); SpellCheckWorker[] workers = new SpellCheckWorker[numThreads]; for (int i = 0; i < numThreads; i++) { SpellCheckWorker spellCheckWorker = new SpellCheckWorker(r); executor.execute(spellCheckWorker); workers[i] = spellCheckWorker; } int iterations = 5 + random().nextInt(5); for (int i = 0; i < iterations; i++) { Thread.sleep(100); // concurrently reset the spell index spellChecker.setSpellIndex(this.spellindex); // for debug - prints the internal open searchers // showSearchersOpen(); } spellChecker.close(); executor.shutdown(); // wait for 60 seconds - usually this is very fast but coverage runs could take quite long executor.awaitTermination(60L, TimeUnit.SECONDS); for (int i = 0; i < workers.length; i++) { assertFalse(String.format(Locale.ROOT, "worker thread %d failed", i), workers[i].failed); assertTrue(String.format(Locale.ROOT, "worker thread %d is still running but should be terminated", i), workers[i].terminated); } // 4 searchers more than iterations // 1. at creation // 2. clearIndex() // 2. and 3. during addwords assertEquals(iterations + 4, searchers.size()); assertSearchersClosed(); r.close(); } private void assertLastSearcherOpen(int numSearchers) { assertEquals(numSearchers, searchers.size()); IndexSearcher[] searcherArray = searchers.toArray(new IndexSearcher[0]); for (int i = 0; i < searcherArray.length; i++) { if (i == searcherArray.length - 1) { assertTrue("expected last searcher open but was closed", searcherArray[i].getIndexReader().getRefCount() > 0); } else { assertFalse("expected closed searcher but was open - Index: " + i, searcherArray[i].getIndexReader().getRefCount() > 0); } } } private void assertSearchersClosed() { for (IndexSearcher searcher : searchers) { assertEquals(0, searcher.getIndexReader().getRefCount()); } } // For debug // private void showSearchersOpen() { // int count = 0; // for (IndexSearcher searcher : searchers) { // if(searcher.getIndexReader().getRefCount() > 0) // ++count; // } // System.out.println(count); // } private class SpellCheckWorker implements Runnable { private final IndexReader reader; volatile boolean terminated = false; volatile boolean failed = false; SpellCheckWorker(IndexReader reader) { super(); this.reader = reader; } @Override public void run() { try { while (true) { try { checkCommonSuggestions(reader); } catch (AlreadyClosedException e) { return; } catch (Throwable e) { e.printStackTrace(); failed = true; return; } } } finally { terminated = true; } } } class SpellCheckerMock extends SpellChecker { public SpellCheckerMock(Directory spellIndex) throws IOException { super(spellIndex); } public SpellCheckerMock(Directory spellIndex, StringDistance sd) throws IOException { super(spellIndex, sd); } public SpellCheckerMock(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException { super(spellIndex, sd, comparator); } @Override IndexSearcher createSearcher(Directory dir) throws IOException { IndexSearcher searcher = super.createSearcher(dir); TestSpellChecker.this.searchers.add(searcher); return searcher; } } }