/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

/**
 * Tests {@link FuzzyQuery}.
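 * Covers prefix lengths, edit-distance limits, maxExpansions, rewrite methods,
 * and a random test that compares results against a brute-force Levenshtein
 * (with transpositions) implementation.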
 *
 */
public class TestFuzzyQuery extends LuceneTestCase {

  public void testBasicPrefix() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("abc", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    writer.close();

    FuzzyQuery query = new FuzzyQuery(new Term("field", "abc"), FuzzyQuery.defaultMaxEdits, 1);
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);
    reader.close();
    directory.close();
  }

  public void testFuzziness() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("aaaaa", writer);
    addDoc("aaaab", writer);
    addDoc("aaabb", writer);
    addDoc("aabbb", writer);
    addDoc("abbbb", writer);
    addDoc("bbbbb", writer);
    addDoc("ddddd", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    writer.close();

    FuzzyQuery query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);

    // same with prefix
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 1);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 2);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 3);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 4);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(2, hits.length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 5);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 6);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);

    // test scoring
    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals("3 documents should match", 3, hits.length);
    List<String> order = Arrays.asList("bbbbb", "abbbb", "aabbb");
    for (int i = 0; i < hits.length; i++) {
      final String term = searcher.doc(hits[i].doc).get("field");
      //System.out.println(hits[i].score);
      assertEquals(order.get(i), term);
    }

    // test pq size by supplying maxExpansions=2
    // This query would normally return 3 documents, because 3 terms match (see above):
    query = new FuzzyQuery(new Term("field", "bbbbb"), FuzzyQuery.defaultMaxEdits, 0, 2, false);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals("only 2 documents should match", 2, hits.length);
    order = Arrays.asList("bbbbb", "abbbb");
    for (int i = 0; i < hits.length; i++) {
      final String term = searcher.doc(hits[i].doc).get("field");
      //System.out.println(hits[i].score);
      assertEquals(order.get(i), term);
    }

    // not similar enough:
    query = new FuzzyQuery(new Term("field", "xxxxx"), FuzzyQuery.defaultMaxEdits, 0);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(0, hits.length);
    query = new FuzzyQuery(new Term("field", "aaccc"), FuzzyQuery.defaultMaxEdits, 0); // edit distance to "aaaaa" = 3
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // query identical to a word in the index:
    query = new FuzzyQuery(new Term("field", "aaaaa"), FuzzyQuery.defaultMaxEdits, 0);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
    // default allows for up to two edits:
    assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
    assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));

    // query similar to a word in the index:
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 0);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
    assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
    assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));

    // now with prefix
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 1);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
    assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
    assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 2);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
    assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
    assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 3);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
    assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
    assertEquals(searcher.doc(hits[2].doc).get("field"), ("aaabb"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 4);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(2, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("aaaaa"));
    assertEquals(searcher.doc(hits[1].doc).get("field"), ("aaaab"));
    query = new FuzzyQuery(new Term("field", "aaaac"), FuzzyQuery.defaultMaxEdits, 5);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(0, hits.length);

    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));

    // now with prefix
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 1);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 2);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 3);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 4);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals(searcher.doc(hits[0].doc).get("field"), ("ddddd"));
    query = new FuzzyQuery(new Term("field", "ddddX"), FuzzyQuery.defaultMaxEdits, 5);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(0, hits.length);

    // different field = no match:
    query = new FuzzyQuery(new Term("anotherfield", "ddddX"), FuzzyQuery.defaultMaxEdits, 0);
    hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(0, hits.length);

    reader.close();
    directory.close();
  }

  public void test2() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory,
        new MockAnalyzer(random(), MockTokenizer.KEYWORD, false));
    addDoc("LANGE", writer);
    addDoc("LUETH", writer);
    addDoc("PIRSING", writer);
    addDoc("RIEGEL", writer);
    addDoc("TRZECZIAK", writer);
    addDoc("WALKER", writer);
    addDoc("WBR", writer);
    addDoc("WE", writer);
    addDoc("WEB", writer);
    addDoc("WEBE", writer);
    addDoc("WEBER", writer);
    addDoc("WEBERE", writer);
    addDoc("WEBREE", writer);
    addDoc("WEBEREI", writer);
    addDoc("WBRE", writer);
    addDoc("WITTKOPF", writer);
    addDoc("WOJNAROWSKI", writer);
    addDoc("WRICKE", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    writer.close();

    FuzzyQuery query = new FuzzyQuery(new Term("field", "WEBER"), 2, 1);
    //query.setRewriteMethod(FuzzyQuery.SCORING_BOOLEAN_QUERY_REWRITE);
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(8, hits.length);

    reader.close();
    directory.close();
  }

  public void testSingleQueryExactMatchScoresHighest() throws Exception {
    // See issue LUCENE-329 - IDF shouldn't wreck similarity ranking
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("smith", writer);
    addDoc("smith", writer);
    addDoc("smith", writer);
    addDoc("smith", writer);
    addDoc("smith", writer);
    addDoc("smith", writer);
    addDoc("smythe", writer);
    addDoc("smdssasd", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    searcher.setSimilarity(new ClassicSimilarity()); // avoid randomisation of similarity algo by test framework
    writer.close();

    String searchTerms[] = { "smith", "smythe", "smdssasd" };
    for (String searchTerm : searchTerms) {
      FuzzyQuery query = new FuzzyQuery(new Term("field", searchTerm), 2, 1);
      ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
      Document bestDoc = searcher.doc(hits[0].doc);
      assertTrue(hits.length > 0);
      String topMatch = bestDoc.get("field");
      assertEquals(searchTerm, topMatch);
      if (hits.length > 1) {
        Document worstDoc = searcher.doc(hits[hits.length - 1].doc);
        String worstMatch = worstDoc.get("field");
        assertNotSame(searchTerm, worstMatch);
      }
    }
    reader.close();
    directory.close();
  }

  public void testMultipleQueriesIdfWorks() throws Exception {
    // With issue LUCENE-329 it could be argued that a MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite
    // is the solution, as it disables IDF.
    // However, IDF is still useful, as in this case where there are multiple FuzzyQueries.
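    // The index below has five docs with the common forename "michael" (one misspelled
    // as "micheal") but only one "cutting" and one "cuttin", so the rare surname carries
    // more IDF weight and its matches should rank first.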
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("michael smith", writer);
    addDoc("michael lucero", writer);
    addDoc("doug cutting", writer);
    addDoc("doug cuttin", writer);
    addDoc("michael wardle", writer);
    addDoc("micheal vegas", writer);
    addDoc("michael lydon", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    searcher.setSimilarity(new ClassicSimilarity()); // avoid randomisation of similarity algo by test framework
    writer.close();

    BooleanQuery.Builder query = new BooleanQuery.Builder();
    String commonSearchTerm = "michael";
    FuzzyQuery commonQuery = new FuzzyQuery(new Term("field", commonSearchTerm), 2, 1);
    query.add(commonQuery, Occur.SHOULD);

    String rareSearchTerm = "cutting";
    FuzzyQuery rareQuery = new FuzzyQuery(new Term("field", rareSearchTerm), 2, 1);
    query.add(rareQuery, Occur.SHOULD);
    ScoreDoc[] hits = searcher.search(query.build(), 1000).scoreDocs;

    // Matches on the rare surname should be worth more than matches on the common forename
    assertEquals(7, hits.length);
    Document bestDoc = searcher.doc(hits[0].doc);
    String topMatch = bestDoc.get("field");
    assertTrue(topMatch.contains(rareSearchTerm));

    Document runnerUpDoc = searcher.doc(hits[1].doc);
    String runnerUpMatch = runnerUpDoc.get("field");
    assertTrue(runnerUpMatch.contains("cuttin"));

    Document worstDoc = searcher.doc(hits[hits.length - 1].doc);
    String worstMatch = worstDoc.get("field");
    assertTrue(worstMatch.contains("micheal")); // misspelling of common name

    reader.close();
    directory.close();
  }

  /**
   * MultiTermQuery provides (via attribute) information about which values
   * must be competitive to enter the priority queue.
   *
   * FuzzyQuery optimizes itself around this information; if the attribute
   * is not implemented correctly, there will be problems!
   */
  public void testTieBreaker() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("a123456", writer);
    addDoc("c123456", writer);
    addDoc("d123456", writer);
    addDoc("e123456", writer);

    Directory directory2 = newDirectory();
    RandomIndexWriter writer2 = new RandomIndexWriter(random(), directory2);
    addDoc("a123456", writer2);
    addDoc("b123456", writer2);
    addDoc("b123456", writer2);
    addDoc("b123456", writer2);
    addDoc("c123456", writer2);
    addDoc("f123456", writer2);

    IndexReader ir1 = writer.getReader();
    IndexReader ir2 = writer2.getReader();

    MultiReader mr = new MultiReader(ir1, ir2);
    IndexSearcher searcher = newSearcher(mr);
    FuzzyQuery fq = new FuzzyQuery(new Term("field", "z123456"), 1, 0, 2, false);
    TopDocs docs = searcher.search(fq, 2);
    assertEquals(5, docs.totalHits); // 5 docs, from the a and b's
    mr.close();
    ir1.close();
    ir2.close();
    writer.close();
    writer2.close();
    directory.close();
    directory2.close();
  }

  /** Test the TopTermsBoostOnlyBooleanQueryRewrite rewrite method.
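   * This rewrite is expected to score each expanded term only by its fuzzy boost
   * (closeness to the query term) and to ignore index statistics such as IDF,
   * so the exact "Lucene" matches should rank above the rarer "Lucenne".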
   */
  public void testBoostOnlyRewrite() throws Exception {
    Directory directory = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), directory);
    addDoc("Lucene", writer);
    addDoc("Lucene", writer);
    addDoc("Lucenne", writer);
    IndexReader reader = writer.getReader();
    IndexSearcher searcher = newSearcher(reader);
    writer.close();

    FuzzyQuery query = new FuzzyQuery(new Term("field", "lucene"));
    query.setRewriteMethod(new MultiTermQuery.TopTermsBoostOnlyBooleanQueryRewrite(50));
    ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs;
    assertEquals(3, hits.length);
    // normally, 'Lucenne' would be the first result as IDF will skew the score.
    assertEquals("Lucene", reader.document(hits[0].doc).get("field"));
    assertEquals("Lucene", reader.document(hits[1].doc).get("field"));
    assertEquals("Lucenne", reader.document(hits[2].doc).get("field"));
    reader.close();
    directory.close();
  }

  public void testGiga() throws Exception {
    MockAnalyzer analyzer = new MockAnalyzer(random());
    Directory index = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), index);

    addDoc("Lucene in Action", w);
    addDoc("Lucene for Dummies", w);

    //addDoc("Giga", w);
    addDoc("Giga byte", w);

    addDoc("ManagingGigabytesManagingGigabyte", w);
    addDoc("ManagingGigabytesManagingGigabytes", w);

    addDoc("The Art of Computer Science", w);
    addDoc("J. K. Rowling", w);
    addDoc("JK Rowling", w);
    addDoc("Joanne K Roling", w);
    addDoc("Bruce Willis", w);
    addDoc("Willis bruce", w);
    addDoc("Brute willis", w);
    addDoc("B. willis", w);
    IndexReader r = w.getReader();
    w.close();

    Query q = new FuzzyQuery(new Term("field", "giga"), 0);

    // 3. search
    IndexSearcher searcher = newSearcher(r);
    ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("Giga byte", searcher.doc(hits[0].doc).get("field"));
    r.close();
    index.close();
  }

  public void testDistanceAsEditsSearching() throws Exception {
    Directory index = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), index);
    addDoc("foobar", w);
    addDoc("test", w);
    addDoc("working", w);
    IndexReader reader = w.getReader();
    IndexSearcher searcher = newSearcher(reader);
    w.close();

    FuzzyQuery q = new FuzzyQuery(new Term("field", "fouba"), 2);
    ScoreDoc[] hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));

    q = new FuzzyQuery(new Term("field", "foubara"), 2);
    hits = searcher.search(q, 10).scoreDocs;
    assertEquals(1, hits.length);
    assertEquals("foobar", searcher.doc(hits[0].doc).get("field"));

    expectThrows(IllegalArgumentException.class, () -> {
      new FuzzyQuery(new Term("field", "t"), 3);
    });

    reader.close();
    index.close();
  }

  public void testValidation() {
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
      new FuzzyQuery(new Term("field", "foo"), -1, 0, 1, false);
    });
    assertTrue(expected.getMessage().contains("maxEdits"));

    expected = expectThrows(IllegalArgumentException.class, () -> {
      new FuzzyQuery(new Term("field", "foo"), LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + 1, 0, 1, false);
    });
    assertTrue(expected.getMessage().contains("maxEdits must be between"));

    expected = expectThrows(IllegalArgumentException.class, () -> {
      new FuzzyQuery(new Term("field", "foo"), 1, -1, 1, false);
    });
    assertTrue(expected.getMessage().contains("prefixLength cannot be negative"));

    expected = expectThrows(IllegalArgumentException.class, () -> {
      new FuzzyQuery(new Term("field", "foo"), 1, 0, -1, false);
    });
    assertTrue(expected.getMessage().contains("maxExpansions must be positive"));
  }

  private void addDoc(String text, RandomIndexWriter writer) throws IOException {
    Document doc = new Document();
    doc.add(newTextField("field", text, Field.Store.YES));
    writer.addDocument(doc);
  }

  private String randomSimpleString(int digits) {
    int termLength = TestUtil.nextInt(random(), 1, 8);
    char[] chars = new char[termLength];
    for (int i = 0; i < termLength; i++) {
      chars[i] = (char) ('a' + random().nextInt(digits));
    }
    return new String(chars);
  }

  @SuppressWarnings({"unchecked","rawtypes"})
  public void testRandom() throws Exception {
    int digits = TestUtil.nextInt(random(), 2, 3);
    // underestimate of the total number of unique terms that randomSimpleString
    // may generate; it assumes all terms have a length of 7
    int vocabularySize = digits << 7;
    int numTerms = Math.min(atLeast(100), vocabularySize);
    Set<String> terms = new HashSet<>();
    while (terms.size() < numTerms) {
      terms.add(randomSimpleString(digits));
    }

    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    for (String term : terms) {
      Document doc = new Document();
      doc.add(new StringField("field", term, Field.Store.YES));
      w.addDocument(doc);
    }
    DirectoryReader r = w.getReader();
    //System.out.println("TEST: reader=" + r);
    IndexSearcher s = newSearcher(r);
    int iters = atLeast(1000);
    for (int iter = 0; iter < iters; iter++) {
      String queryTerm = randomSimpleString(digits);
      int prefixLength = random().nextInt(queryTerm.length());
      String queryPrefix = queryTerm.substring(0, prefixLength);

      // we don't look at scores here:
      List<TermAndScore>[] expected = new List[3];
      for (int ed = 0; ed < 3; ed++) {
        expected[ed] = new ArrayList<TermAndScore>();
      }
      for (String term : terms) {
        if (term.startsWith(queryPrefix) == false) {
          continue;
        }
        int ed = getDistance(term, queryTerm);
        float score = 1f - (float) ed / (float) Math.min(queryTerm.length(), term.length());
        while (ed < 3) {
          expected[ed].add(new TermAndScore(term, score));
          ed++;
        }
      }

      for (int ed = 0; ed < 3; ed++) {
        Collections.sort(expected[ed]);
        int queueSize = TestUtil.nextInt(random(), 1, terms.size());
        /*
        System.out.println("\nTEST: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength);
        for(TermAndScore ent : expected[ed]) {
          System.out.println("  " + ent);
        }
        */
        FuzzyQuery query = new FuzzyQuery(new Term("field", queryTerm), ed, prefixLength, queueSize, true);
        TopDocs hits = s.search(query, terms.size());
        Set<String> actual = new HashSet<>();
        for (ScoreDoc hit : hits.scoreDocs) {
          Document doc = s.doc(hit.doc);
          actual.add(doc.get("field"));
          //System.out.println("  actual: " + doc.get("field") + " score=" + hit.score);
        }
        Set<String> expectedTop = new HashSet<>();
        int limit = Math.min(queueSize, expected[ed].size());
        for (int i = 0; i < limit; i++) {
          expectedTop.add(expected[ed].get(i).term);
        }

        if (actual.equals(expectedTop) == false) {
          StringBuilder sb = new StringBuilder();
          sb.append("FAILED: query=" + queryTerm + " ed=" + ed + " queueSize=" + queueSize + " vs expected match size=" + expected[ed].size() + " prefixLength=" + prefixLength + "\n");
          boolean first = true;
          for (String term : actual) {
            if (expectedTop.contains(term) == false) {
              if (first) {
                sb.append("  these matched but shouldn't:\n");
                first = false;
              }
              sb.append("    " + term + "\n");
            }
          }
          first = true;
          for (String term : expectedTop) {
            if (actual.contains(term) == false) {
              if (first) {
                sb.append("  these did not match but should:\n");
                first = false;
              }
              sb.append("    " + term + "\n");
            }
          }
          throw new AssertionError(sb.toString());
        }
      }
    }

    IOUtils.close(r, w, dir);
  }

  private static class TermAndScore implements Comparable<TermAndScore> {
    final String term;
    final float score;

    public TermAndScore(String term, float score) {
      this.term = term;
      this.score = score;
    }

    @Override
    public int compareTo(TermAndScore other) {
      // higher score sorts first, and if scores are tied, lower term sorts first
      if (score > other.score) {
        return -1;
      } else if (score < other.score) {
        return 1;
      } else {
        return term.compareTo(other.term);
      }
    }

    @Override
    public String toString() {
      return term + " score=" + score;
    }
  }

  // Poached from LuceneLevenshteinDistance.java (from suggest module): it supports
  // transpositions (treats them as ed=1, not ed=2)
  private static int getDistance(String target, String other) {
    IntsRef targetPoints;
    IntsRef otherPoints;
    int n;
    int d[][]; // cost array

    // NOTE: if we cared, we could use 3*m space instead of m*n space, similar to
    // what LevenshteinDistance does, except cycling thru a ring of three
    // horizontal cost arrays... but this comparator is never actually used by
    // DirectSpellChecker, it's only used for merging results from multiple shards
    // in "distributed spellcheck", and it's inefficient in other ways too...

    // cheaper to do this up front once
    targetPoints = toIntsRef(target);
    otherPoints = toIntsRef(other);
    n = targetPoints.length;
    final int m = otherPoints.length;
    d = new int[n+1][m+1];

    if (n == 0 || m == 0) {
      if (n == m) {
        return 0;
      } else {
        return Math.max(n, m);
      }
    }

    // indexes into strings s and t
    int i; // iterates through s
    int j; // iterates through t

    int t_j; // jth character of t
    int cost; // cost

    for (i = 0; i <= n; i++) {
      d[i][0] = i;
    }

    for (j = 0; j <= m; j++) {
      d[0][j] = j;
    }

    for (j = 1; j <= m; j++) {
      t_j = otherPoints.ints[j-1];
      for (i = 1; i <= n; i++) {
        cost = targetPoints.ints[i-1] == t_j ? 0 : 1;
        // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
        d[i][j] = Math.min(Math.min(d[i-1][j]+1, d[i][j-1]+1), d[i-1][j-1]+cost);
        // transposition
        if (i > 1 && j > 1 && targetPoints.ints[i-1] == otherPoints.ints[j-2] && targetPoints.ints[i-2] == otherPoints.ints[j-1]) {
          d[i][j] = Math.min(d[i][j], d[i-2][j-2] + cost);
        }
      }
    }

    return d[n][m];
  }

  private static IntsRef toIntsRef(String s) {
    IntsRef ref = new IntsRef(s.length()); // worst case
    int utf16Len = s.length();
    for (int i = 0, cp = 0; i < utf16Len; i += Character.charCount(cp)) {
      cp = ref.ints[ref.length++] = Character.codePointAt(s, i);
    }
    return ref;
  }
}