package experiments.collective.entdoccentric.LTR;
import java.io.IOException;
import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.TopTermsRewrite;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.util.automaton.LevenshteinAutomata;
/**
* Implements the fuzzy search query. The similarity measurement is based on the
* Damerau-Levenshtein (optimal string alignment) algorithm, though you can
* explicitly choose classic Levenshtein by passing <code>false</code> to the
* <code>transpositions</code> parameter.
*
* <p>
* This query uses {@link LearnToRankFuzzyQuery.LTRTopTermsScoringBooleanQueryRewrite}
* as default. So terms will be collected and scored according to their edit
* distance. Only the top terms are used for building the {@link BooleanQuery}.
* It is not recommended to change the rewrite mode for fuzzy queries.
*
* <p>
* At most, this query will match terms up to
* {@value org.apache.lucene.util.automaton.LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE} edits.
* Higher distances (especially with transpositions enabled) are generally not useful and
* will match a significant amount of the term dictionary. If you really want this, consider
* using an n-gram indexing technique (such as the SpellChecker in the
* <a href="{@docRoot}/../suggest/overview-summary.html">suggest module</a>) instead.
*/
public class LearnToRankFuzzyQuery extends MultiTermQuery {

    /** Default maximum edit distance: the largest distance the Levenshtein automata support. */
    public final static int defaultMaxEdits = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;

    /** Default length of the common (non-fuzzy) prefix. */
    public final static int defaultPrefixLength = 0;

    /** Default upper bound on the number of terms the query expands to. */
    public final static int defaultMaxExpansions = 50;

    /** By default a transposition counts as a single primitive edit. */
    public final static boolean defaultTranspositions = true;

    /**
     * @deprecated pass integer edit distances instead.
     */
    @Deprecated
    public final static float defaultMinSimilarity = LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE;

    /**
     * Similarity handed to the most recently constructed query. Queries no
     * longer read this value; it is retained only as the fallback for rewrites
     * created through the legacy single-argument
     * {@link LTRTopTermsScoringBooleanQueryRewrite#LTRTopTermsScoringBooleanQueryRewrite(int)}
     * constructor, which has no similarity of its own.
     */
    private static volatile Similarity lastSimilarity = null;

    private final int maxEdits;
    private final int maxExpansions;
    private final boolean transpositions;
    private final int prefixLength;
    private final Term term;

    /**
     * Similarity passed to the term queries generated for this query. Held per
     * instance so that constructing another query cannot change the similarity
     * this one is rewritten with.
     */
    private final Similarity sim;

    /**
     * Create a new fuzzy query that will match terms with an edit distance of
     * at most <code>maxEdits</code> to <code>term</code>. If a
     * {@code prefixLength > 0} is specified, a common prefix of that length is
     * also required.
     *
     * @param term
     *            the term to search for
     * @param maxEdits
     *            must be {@code >= 0} and {@code <=}
     *            {@link LevenshteinAutomata#MAXIMUM_SUPPORTED_DISTANCE}.
     * @param prefixLength
     *            length of common (non-fuzzy) prefix
     * @param maxExpansions
     *            the maximum number of terms to match. If this number is
     *            greater than {@link BooleanQuery#getMaxClauseCount} when the
     *            query is rewritten, then the maxClauseCount will be used
     *            instead.
     * @param transpositions
     *            true if transpositions should be treated as a primitive edit
     *            operation. If this is false, comparisons will implement the
     *            classic Levenshtein algorithm.
     * @param sim
     *            similarity handed to every {@code LearnToRankTermQuery}
     *            produced when this query is rewritten
     * @throws IllegalArgumentException
     *             if <code>maxEdits</code> is out of range, or
     *             <code>prefixLength</code> or <code>maxExpansions</code> is
     *             negative
     */
    public LearnToRankFuzzyQuery(Term term, int maxEdits, int prefixLength,
            int maxExpansions, boolean transpositions, Similarity sim) {
        super(term.field());
        if (maxEdits < 0
                || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
            throw new IllegalArgumentException(
                    "maxEdits must be between 0 and "
                            + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
        }
        if (prefixLength < 0) {
            throw new IllegalArgumentException(
                    "prefixLength cannot be negative.");
        }
        if (maxExpansions < 0) {
            throw new IllegalArgumentException(
                    "maxExpansions cannot be negative.");
        }
        this.term = term;
        this.maxEdits = maxEdits;
        this.prefixLength = prefixLength;
        this.transpositions = transpositions;
        this.maxExpansions = maxExpansions;
        this.sim = sim;
        // Still published for rewrites built via the legacy one-argument
        // constructor; this query's own rewrite is bound to its own similarity.
        lastSimilarity = sim;
        setRewriteMethod(new LTRTopTermsScoringBooleanQueryRewrite(maxExpansions, sim));
    }

    /**
     * Calls
     * {@link #LearnToRankFuzzyQuery(Term, int, int, int, boolean, Similarity)
     * LearnToRankFuzzyQuery(term, maxEdits, prefixLength, defaultMaxExpansions,
     * defaultTranspositions, sim)}.
     */
    public LearnToRankFuzzyQuery(Term term, int maxEdits, int prefixLength,
            Similarity sim) {
        this(term, maxEdits, prefixLength, defaultMaxExpansions,
                defaultTranspositions, sim);
    }

    /**
     * Calls {@link #LearnToRankFuzzyQuery(Term, int, int, Similarity)
     * LearnToRankFuzzyQuery(term, maxEdits, defaultPrefixLength, sim)}.
     */
    public LearnToRankFuzzyQuery(Term term, int maxEdits, Similarity sim) {
        this(term, maxEdits, defaultPrefixLength, sim);
    }

    /**
     * Calls {@link #LearnToRankFuzzyQuery(Term, int, Similarity)
     * LearnToRankFuzzyQuery(term, defaultMaxEdits, sim)}.
     */
    public LearnToRankFuzzyQuery(Term term, Similarity sim) {
        this(term, defaultMaxEdits, sim);
    }

    /**
     * @return the maximum number of edit distances allowed for this query to
     *         match.
     */
    public int getMaxEdits() {
        return maxEdits;
    }

    /**
     * Returns the non-fuzzy prefix length. This is the number of characters at
     * the start of a term that must be identical (not fuzzy) to the query term
     * if the query is to match that term.
     */
    public int getPrefixLength() {
        return prefixLength;
    }

    /**
     * Returns the pattern term.
     */
    public Term getTerm() {
        return term;
    }

    /**
     * Supplies the enumeration of candidate terms: a single-term enumeration
     * when only an exact match is possible, otherwise a
     * {@link FuzzyTermsEnum} configured with this query's settings.
     */
    @Override
    protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts)
            throws IOException {
        // No edits allowed, or the required prefix spans the whole term:
        // the query can only match the exact term.
        if (maxEdits == 0 || prefixLength >= term.text().length()) {
            return new SingleTermsEnum(terms.iterator(null), term.bytes());
        }
        return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits,
                prefixLength, transpositions);
    }

    /**
     * Renders the query as <code>[field:]text~maxEdits</code> followed by the
     * boost suffix; the field is omitted when it equals <code>field</code>.
     */
    @Override
    public String toString(String field) {
        final StringBuilder buffer = new StringBuilder();
        if (!term.field().equals(field)) {
            buffer.append(term.field());
            buffer.append(":");
        }
        buffer.append(term.text());
        buffer.append('~');
        buffer.append(Integer.toString(maxEdits));
        buffer.append(ToStringUtils.boost(getBoost()));
        return buffer.toString();
    }

    /**
     * Hash over the superclass hash, the edit/prefix/expansion settings, the
     * transposition flag and the term. The similarity is not included, in
     * line with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = super.hashCode();
        result = prime * result + maxEdits;
        result = prime * result + prefixLength;
        result = prime * result + maxExpansions;
        result = prime * result + (transpositions ? 0 : 1);
        // term is dereferenced in the constructor, so it is never null here.
        result = prime * result + term.hashCode();
        return result;
    }

    /**
     * Two queries are equal when the superclass considers them equal, they are
     * of the same class, and their edit distance, prefix length, expansion
     * limit, transposition flag and term all match.
     * <p>
     * NOTE(review): the similarity is not compared, as before. Confirm that
     * is acceptable if queries differing only in similarity can reach a cache
     * keyed on query equality.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!super.equals(obj) || getClass() != obj.getClass()) {
            return false;
        }
        final LearnToRankFuzzyQuery other = (LearnToRankFuzzyQuery) obj;
        return maxEdits == other.maxEdits
                && prefixLength == other.prefixLength
                && maxExpansions == other.maxExpansions
                && transpositions == other.transpositions
                && term.equals(other.term);
    }

    /**
     * Helper function to convert from deprecated "minimumSimilarity" fractions
     * to raw edit distances.
     *
     * @param minimumSimilarity
     *            scaled similarity
     * @param termLen
     *            length (in unicode codepoints) of the term.
     * @return equivalent number of maxEdits
     * @deprecated pass integer edit distances instead.
     */
    @Deprecated
    public static int floatToEdits(float minimumSimilarity, int termLen) {
        if (minimumSimilarity >= 1f) {
            // Values >= 1 are already edit counts; just cap them.
            return (int) Math.min(minimumSimilarity,
                    LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
        } else if (minimumSimilarity == 0.0f) {
            return 0; // 0 means exact, not infinite # of edits!
        } else {
            return Math.min((int) ((1D - minimumSimilarity) * termLen),
                    LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE);
        }
    }

    /**
     * Top-terms rewrite that builds a {@link BooleanQuery} whose clauses are
     * {@code LearnToRankTermQuery} instances, each created with a
     * {@link Similarity}.
     */
    public static final class LTRTopTermsScoringBooleanQueryRewrite extends
            TopTermsRewrite<BooleanQuery> {

        /** Similarity bound at construction; only meaningful when {@link #bound} is true. */
        private final Similarity similarity;

        /**
         * True when a similarity was supplied explicitly (even if it is
         * <code>null</code>); false for the legacy constructor, which falls
         * back to the most recently constructed query's similarity.
         */
        private final boolean bound;

        /**
         * Create a rewrite for at most <code>size</code> terms that uses the
         * similarity of whichever {@link LearnToRankFuzzyQuery} was
         * constructed most recently at the time a clause is added.
         * <p>
         * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
         * <code>size</code>, then it will be used instead.
         *
         * @deprecated use
         *             {@link #LTRTopTermsScoringBooleanQueryRewrite(int, Similarity)}
         *             so the similarity does not depend on global state.
         */
        @Deprecated
        public LTRTopTermsScoringBooleanQueryRewrite(int size) {
            super(size);
            this.similarity = null;
            this.bound = false;
        }

        /**
         * Create a rewrite for at most <code>size</code> terms whose clauses
         * are all created with <code>similarity</code>.
         * <p>
         * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than
         * <code>size</code>, then it will be used instead.
         *
         * @param size
         *            maximum number of terms to keep
         * @param similarity
         *            similarity passed to every generated term query
         */
        public LTRTopTermsScoringBooleanQueryRewrite(int size, Similarity similarity) {
            super(size);
            this.similarity = similarity;
            this.bound = true;
        }

        @Override
        protected int getMaxSize() {
            return BooleanQuery.getMaxClauseCount();
        }

        @Override
        protected BooleanQuery getTopLevelQuery() {
            return new BooleanQuery(true);
        }

        /**
         * Adds one optional (SHOULD) clause for <code>term</code>, carrying the
         * given boost, to the top-level query.
         */
        @Override
        protected void addClause(BooleanQuery topLevel, Term term,
                int docCount, float boost, TermContext states) {
            final Similarity effective = bound ? similarity : lastSimilarity;
            final LearnToRankTermQuery tq = new LearnToRankTermQuery(term,
                    states, effective);
            tq.setBoost(boost);
            topLevel.add(tq, BooleanClause.Occur.SHOULD);
        }
    }
}