/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.suggest.document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.suggest.BitsProducer;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.FiniteStringsIterator;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
import org.apache.lucene.util.automaton.Operations;
import org.apache.lucene.util.automaton.UTF32ToUTF8;

/**
 * A {@link CompletionQuery} that matches documents containing terms
 * within an edit distance of the specified prefix.
 * <p>
 * This query boosts documents relative to how similar the indexed terms are to the
 * provided prefix.
 * <p>
 * Example usage of querying an analyzed prefix within an edit distance of 1 of 'subg'
 * against a field 'suggest_field' is as follows:
 *
 * <pre class="prettyprint">
 *  CompletionQuery query = new FuzzyCompletionQuery(analyzer, new Term("suggest_field", "subg"));
 * </pre>
 *
 * @lucene.experimental
 */
public class FuzzyCompletionQuery extends PrefixCompletionQuery {

  /**
   * Measure maxEdits, minFuzzyLength, transpositions and nonFuzzyPrefix
   * parameters in Unicode code points (actual letters)
   * instead of bytes.
   */
  public static final boolean DEFAULT_UNICODE_AWARE = false;

  /**
   * The default minimum length of the key before any edits are allowed.
   */
  public static final int DEFAULT_MIN_FUZZY_LENGTH = 3;

  /**
   * The default prefix length where edits are not allowed.
   */
  public static final int DEFAULT_NON_FUZZY_PREFIX = 1;

  /**
   * The default maximum number of edits for fuzzy
   * suggestions.
   */
  public static final int DEFAULT_MAX_EDITS = 1;

  /**
   * The default transposition value passed to {@link LevenshteinAutomata}
   */
  public static final boolean DEFAULT_TRANSPOSITIONS = true;

  private final int maxEdits;
  private final boolean transpositions;
  private final int nonFuzzyPrefix;
  private final int minFuzzyLength;
  private final boolean unicodeAware;
  private final int maxDeterminizedStates;

  /**
   * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer)}
   * with no filter
   */
  public FuzzyCompletionQuery(Analyzer analyzer, Term term) {
    this(analyzer, term, null);
  }

  /**
   * Calls {@link FuzzyCompletionQuery#FuzzyCompletionQuery(Analyzer, Term, BitsProducer,
   * int, boolean, int, int, boolean, int)}
   * with defaults for <code>maxEdits</code>, <code>transpositions</code>,
   * <code>nonFuzzyPrefix</code>, <code>minFuzzyLength</code>,
   * <code>unicodeAware</code> and <code>maxDeterminizedStates</code>
   *
   * See {@link #DEFAULT_MAX_EDITS}, {@link #DEFAULT_TRANSPOSITIONS},
   * {@link #DEFAULT_NON_FUZZY_PREFIX}, {@link #DEFAULT_MIN_FUZZY_LENGTH},
   * {@link #DEFAULT_UNICODE_AWARE} and {@link Operations#DEFAULT_MAX_DETERMINIZED_STATES}
   * for defaults
   */
  public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter) {
    this(analyzer, term, filter, DEFAULT_MAX_EDITS, DEFAULT_TRANSPOSITIONS, DEFAULT_NON_FUZZY_PREFIX,
        DEFAULT_MIN_FUZZY_LENGTH, DEFAULT_UNICODE_AWARE, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
  }

  /**
   * Constructs an analyzed fuzzy prefix completion query
   *
   * @param analyzer used to analyze the provided {@link Term#text()}
   * @param term query is run against {@link Term#field()} and {@link Term#text()}
   *             is analyzed with <code>analyzer</code>
   * @param filter used to query on a sub set of documents
   * @param maxEdits maximum number of acceptable edits; must be between 0 and
   *                 {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} inclusive
   * @param transpositions value passed to {@link LevenshteinAutomata}
   * @param nonFuzzyPrefix prefix length where edits are not allowed; must not be negative
   * @param minFuzzyLength minimum prefix length before any edits are allowed
   * @param unicodeAware treat prefix as unicode rather than bytes
   * @param maxDeterminizedStates maximum automaton states allowed for {@link LevenshteinAutomata}
   * @throws IllegalArgumentException if <code>maxEdits</code> is outside the supported range
   *                                  or <code>nonFuzzyPrefix</code> is negative
   */
  public FuzzyCompletionQuery(Analyzer analyzer, Term term, BitsProducer filter, int maxEdits,
                              boolean transpositions, int nonFuzzyPrefix, int minFuzzyLength,
                              boolean unicodeAware, int maxDeterminizedStates) {
    super(analyzer, term, filter);
    // Fail fast: an unsupported distance makes LevenshteinAutomata#toAutomaton return null
    // (or index out of bounds for negative values), which would otherwise only surface
    // much later, while the weight is being built or used.
    if (maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
      throw new IllegalArgumentException("maxEdits must be between 0 and "
          + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + " (got " + maxEdits + ")");
    }
    // A negative prefix length would make the suffix copy in toLevenshteinAutomata
    // read outside of the analyzed string.
    if (nonFuzzyPrefix < 0) {
      throw new IllegalArgumentException("nonFuzzyPrefix must be >= 0 (got " + nonFuzzyPrefix + ")");
    }
    this.maxEdits = maxEdits;
    this.transpositions = transpositions;
    this.nonFuzzyPrefix = nonFuzzyPrefix;
    this.minFuzzyLength = minFuzzyLength;
    this.unicodeAware = unicodeAware;
    this.maxDeterminizedStates = maxDeterminizedStates;
  }

  /**
   * Analyzes the term text, expands every analyzed path into a Levenshtein automaton
   * and wraps the result in a weight that boosts by matched prefix length.
   * <p>
   * The <code>searcher</code>, <code>needsScores</code> and <code>boost</code> arguments
   * are not consulted by this implementation.
   */
  @Override
  public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    CompletionTokenStream stream = (CompletionTokenStream) analyzer.tokenStream(getField(), getTerm().text());
    Set<IntsRef> refs = new HashSet<>();
    Automaton automaton = toLevenshteinAutomata(stream.toAutomaton(unicodeAware), refs);
    if (unicodeAware) {
      // the fuzzy automaton was built over code points; convert its labels to UTF-8 bytes
      Automaton utf8automaton = new UTF32ToUTF8().convert(automaton);
      utf8automaton = Operations.determinize(utf8automaton, maxDeterminizedStates);
      automaton = utf8automaton;
    }
    // TODO Accumulating all refs is bad, because the resulting set may be very big.
    // TODO Better iterate over automaton again inside FuzzyCompletionWeight?
    return new FuzzyCompletionWeight(this, automaton, refs);
  }

  /**
   * Builds the fuzzy automaton for every finite string accepted by <code>automaton</code>.
   * <p>
   * Strings that are no longer than <code>nonFuzzyPrefix</code>, or shorter than
   * <code>minFuzzyLength</code>, are matched exactly; for all others the first
   * <code>nonFuzzyPrefix</code> labels are kept as an exact prefix and the remainder may
   * differ by up to <code>maxEdits</code> edits.
   *
   * @param automaton automaton whose finite strings are expanded
   * @param refs      receives a deep copy of every finite string that was visited
   * @return the exact or fuzzy automaton for a single path, a determinized union for
   *         several paths, or an empty automaton if there is no path at all
   */
  private Automaton toLevenshteinAutomata(Automaton automaton, Set<IntsRef> refs) {
    List<Automaton> subs = new ArrayList<>();
    FiniteStringsIterator finiteStrings = new FiniteStringsIterator(automaton);
    for (IntsRef string; (string = finiteStrings.next()) != null;) {
      // the iterator's result is copied because it is retained beyond this iteration
      refs.add(IntsRef.deepCopyOf(string));

      if (string.length <= nonFuzzyPrefix || string.length < minFuzzyLength) {
        subs.add(Automata.makeString(string.ints, string.offset, string.length));
      } else {
        int[] ints = new int[string.length - nonFuzzyPrefix];
        System.arraycopy(string.ints, string.offset + nonFuzzyPrefix, ints, 0, ints.length);
        // TODO: maybe add alphaMin to LevenshteinAutomata,
        // and pass 1 instead of 0?  We probably don't want
        // to allow the trailing dedup bytes to be
        // edited... but then 0 byte is "in general" allowed
        // on input (but not in UTF8).
        LevenshteinAutomata lev = new LevenshteinAutomata(ints,
            unicodeAware ? Character.MAX_CODE_POINT : 255,
            transpositions);
        String exactPrefix = UnicodeUtil.newString(string.ints, string.offset, nonFuzzyPrefix);
        subs.add(lev.toAutomaton(maxEdits, exactPrefix));
      }
    }

    if (subs.isEmpty()) {
      // automaton is empty, there is no accepted paths through it
      return Automata.makeEmpty(); // matches nothing
    } else if (subs.size() == 1) {
      // no synonyms or anything: just a single path through the tokenstream
      return subs.get(0);
    } else {
      // multiple paths: this is really scary! is it slow?
      // maybe we should not do this and throw UOE?
      Automaton a = Operations.union(subs);
      // TODO: we could call toLevenshteinAutomata() before det?
      // this only happens if you have multiple paths anyway (e.g. synonyms)
      return Operations.determinize(a, maxDeterminizedStates);
    }
  }

  /**
   * Get the maximum edit distance for fuzzy matches
   */
  public int getMaxEdits() {
    return maxEdits;
  }

  /**
   * Return whether transpositions count as a single edit
   */
  public boolean isTranspositions() {
    return transpositions;
  }

  /**
   * Get the length of a prefix where no edits are permitted
   */
  public int getNonFuzzyPrefix() {
    return nonFuzzyPrefix;
  }

  /**
   * Get the minimum length of a term considered for matching
   */
  public int getMinFuzzyLength() {
    return minFuzzyLength;
  }

  /**
   * Return true if lengths are measured in unicode code-points rather than bytes
   */
  public boolean isUnicodeAware() {
    return unicodeAware;
  }

  /**
   * Get the maximum number of determinized states permitted
   */
  public int getMaxDeterminizedStates() {
    return maxDeterminizedStates;
  }

  /**
   * Renders the query as <code>[field:]text*~maxEdits[,filter...]</code>; the field is
   * omitted when it equals the <code>field</code> argument.
   */
  @Override
  public String toString(String field) {
    StringBuilder buffer = new StringBuilder();
    if (!getField().equals(field)) {
      buffer.append(getField());
      buffer.append(":");
    }
    buffer.append(getTerm().text());
    buffer.append('*');
    buffer.append('~');
    buffer.append(maxEdits);
    if (getFilter() != null) {
      buffer.append(",");
      buffer.append("filter");
      buffer.append(getFilter().toString());
    }
    return buffer.toString();
  }

  /**
   * Weight whose boost is the length of the longest common prefix between the
   * matched path and any of the analyzed query strings.
   */
  private static class FuzzyCompletionWeight extends CompletionWeight {
    private final Set<IntsRef> refs;
    private int currentBoost = 0;

    public FuzzyCompletionWeight(CompletionQuery query, Automaton automaton, Set<IntsRef> refs) throws IOException {
      super(query, automaton);
      this.refs = refs;
    }

    @Override
    protected void setNextMatch(IntsRef pathPrefix) {
      // NOTE: the last letter of the matched prefix for the exact
      // match never makes it through here
      // so an exact match and a match with only a edit at the
      // end is boosted the same
      int maxCount = 0;
      for (IntsRef ref : refs) {
        maxCount = Math.max(maxCount, commonPrefixLength(ref, pathPrefix));
      }
      currentBoost = maxCount;
    }

    /** Counts the leading labels that <code>a</code> and <code>b</code> have in common. */
    private static int commonPrefixLength(IntsRef a, IntsRef b) {
      int limit = Math.min(a.length, b.length);
      int count = 0;
      while (count < limit && a.ints[a.offset + count] == b.ints[b.offset + count]) {
        count++;
      }
      return count;
    }

    @Override
    protected float boost() {
      return currentBoost;
    }
  }
}