package org.apache.lucene.search; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; import java.util.PriorityQueue; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.BytesRef; import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Fields; import org.apache.lucene.index.Terms; import org.apache.lucene.queryParser.QueryParser; // for javadoc import org.apache.lucene.util.Attribute; import org.apache.lucene.util.AttributeImpl; import org.apache.lucene.util.PagedBytes; /** * An abstract {@link Query} that matches documents * containing a subset of terms provided by a {@link * FilteredTermsEnum} enumeration. * * <p>This query cannot be used directly; you must subclass * it and define {@link #getTermsEnum} to provide a {@link * FilteredTermsEnum} that iterates through the terms to be * matched. * * <p><b>NOTE</b>: if {@link #setRewriteMethod} is either * {@link #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} or {@link * #SCORING_BOOLEAN_QUERY_REWRITE}, you may encounter a * {@link BooleanQuery.TooManyClauses} exception during * searching, which happens when the number of terms to be * searched exceeds {@link * BooleanQuery#getMaxClauseCount()}. Setting {@link * #setRewriteMethod} to {@link #CONSTANT_SCORE_FILTER_REWRITE} * prevents this. * * <p>The recommended rewrite method is {@link * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT}: it doesn't spend CPU * computing unhelpful scores, and it tries to pick the most * performant rewrite method given the query. If you * need scoring (like {@link FuzzyQuery}, use * {@link TopTermsScoringBooleanQueryRewrite} which uses * a priority queue to only collect competitive terms * and not hit this limitation. * * Note that {@link QueryParser} produces * MultiTermQueries using {@link * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} by default. */ public abstract class MultiTermQuery extends Query { protected final String field; protected RewriteMethod rewriteMethod = CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; transient int numberOfTerms = 0; /** Add this {@link Attribute} to a {@link TermsEnum} returned by {@link #getTermsEnum} * and update the boost on each returned term. This enables to control the boost factor * for each matching term in {@link #SCORING_BOOLEAN_QUERY_REWRITE} or * {@link TopTermsBooleanQueryRewrite} mode. * {@link FuzzyQuery} is using this to take the edit distance into account. */ public static interface BoostAttribute extends Attribute { /** Sets the boost in this attribute */ public void setBoost(float boost); /** Retrieves the boost, default is {@code 1.0f}. */ public float getBoost(); /** Sets the maximum boost for terms that would never get * into the priority queue of {@link MultiTermQuery.TopTermsBooleanQueryRewrite}. * This value is not changed by {@link AttributeImpl#clear} * and not used in {@code equals()} and {@code hashCode()}. * Do not change the value in the {@link TermsEnum}! */ public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost); /** Retrieves the maximum boost that is not competitive, * default is megative infinity. You can use this boost value * as a hint when writing the {@link TermsEnum}. */ public float getMaxNonCompetitiveBoost(); } /** Implementation class for {@link BoostAttribute}. */ public static final class BoostAttributeImpl extends AttributeImpl implements BoostAttribute { private float boost = 1.0f, maxNonCompetitiveBoost = Float.NEGATIVE_INFINITY; public void setBoost(float boost) { this.boost = boost; } public float getBoost() { return boost; } public void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { this.maxNonCompetitiveBoost = maxNonCompetitiveBoost; } public float getMaxNonCompetitiveBoost() { return maxNonCompetitiveBoost; } @Override public void clear() { boost = 1.0f; } @Override public boolean equals(Object other) { if (this == other) return true; if (other instanceof BoostAttributeImpl) return ((BoostAttributeImpl) other).boost == boost; return false; } @Override public int hashCode() { return Float.floatToIntBits(boost); } @Override public void copyTo(AttributeImpl target) { ((BoostAttribute) target).setBoost(boost); } } /** Abstract class that defines how the query is rewritten. */ public static abstract class RewriteMethod implements Serializable { public abstract Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException; } private static final class ConstantScoreFilterRewrite extends RewriteMethod { @Override public Query rewrite(IndexReader reader, MultiTermQuery query) { Query result = new ConstantScoreQuery(new MultiTermQueryWrapperFilter<MultiTermQuery>(query)); result.setBoost(query.getBoost()); return result; } // Make sure we are still a singleton even after deserializing protected Object readResolve() { return CONSTANT_SCORE_FILTER_REWRITE; } } /** A rewrite method that first creates a private Filter, * by visiting each term in sequence and marking all docs * for that term. Matching documents are assigned a * constant score equal to the query's boost. * * <p> This method is faster than the BooleanQuery * rewrite methods when the number of matched terms or * matched documents is non-trivial. Also, it will never * hit an errant {@link BooleanQuery.TooManyClauses} * exception. * * @see #setRewriteMethod */ public final static RewriteMethod CONSTANT_SCORE_FILTER_REWRITE = new ConstantScoreFilterRewrite(); private abstract static class BooleanQueryRewrite extends RewriteMethod { protected final int collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException { final Fields fields = MultiFields.getFields(reader); if (fields == null) { // reader has no fields return 0; } final Terms terms = fields.terms(query.field); if (terms == null) { // field does not exist return 0; } final TermsEnum termsEnum = query.getTermsEnum(reader); assert termsEnum != null; if (termsEnum == TermsEnum.EMPTY) return 0; final BoostAttribute boostAtt = termsEnum.attributes().addAttribute(BoostAttribute.class); collector.boostAtt = boostAtt; int count = 0; BytesRef bytes; while ((bytes = termsEnum.next()) != null) { if (collector.collect(bytes, boostAtt.getBoost())) { count++; } else { break; } } collector.boostAtt = null; return count; } protected static abstract class TermCollector { private BoostAttribute boostAtt = null; /** return false to stop collecting */ public abstract boolean collect(BytesRef bytes, float boost) throws IOException; /** set the minimum boost as a hint for the term producer */ protected final void setMaxNonCompetitiveBoost(float maxNonCompetitiveBoost) { assert boostAtt != null; boostAtt.setMaxNonCompetitiveBoost(maxNonCompetitiveBoost); } } } private static class ScoringBooleanQueryRewrite extends BooleanQueryRewrite { @Override public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final BooleanQuery result = new BooleanQuery(true); final Term placeholderTerm = new Term(query.field); query.incTotalNumberOfTerms(collectTerms(reader, query, new TermCollector() { public boolean collect(BytesRef bytes, float boost) { // add new TQ, we must clone the term, else it may get overwritten! TermQuery tq = new TermQuery(placeholderTerm.createTerm(new BytesRef(bytes))); tq.setBoost(query.getBoost() * boost); // set the boost result.add(tq, BooleanClause.Occur.SHOULD); // add to query return true; } })); return result; } // Make sure we are still a singleton even after deserializing protected Object readResolve() { return SCORING_BOOLEAN_QUERY_REWRITE; } } /** A rewrite method that first translates each term into * {@link BooleanClause.Occur#SHOULD} clause in a * BooleanQuery, and keeps the scores as computed by the * query. Note that typically such scores are * meaningless to the user, and require non-trivial CPU * to compute, so it's almost always better to use {@link * #CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} instead. * * <p><b>NOTE</b>: This rewrite method will hit {@link * BooleanQuery.TooManyClauses} if the number of terms * exceeds {@link BooleanQuery#getMaxClauseCount}. * * @see #setRewriteMethod */ public final static RewriteMethod SCORING_BOOLEAN_QUERY_REWRITE = new ScoringBooleanQueryRewrite(); /** * Base rewrite method for collecting only the top terms * via a priority queue. */ public static abstract class TopTermsBooleanQueryRewrite extends BooleanQueryRewrite { private final int size; /** * Create a TopTermsBooleanQueryRewrite for * at most <code>size</code> terms. * <p> * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than * <code>size</code>, then it will be used instead. */ public TopTermsBooleanQueryRewrite(int size) { this.size = size; } /** * Create a TopTermsBooleanQueryRewrite that is limited * to at most {@link BooleanQuery#getMaxClauseCount} terms. */ public TopTermsBooleanQueryRewrite() { this(Integer.MAX_VALUE); } /** Return a suitable Query for a MultiTermQuery term. */ protected abstract Query getQuery(Term term); @Override public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { final int maxSize = Math.min(size, BooleanQuery.getMaxClauseCount()); final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<ScoreTerm>(); collectTerms(reader, query, new TermCollector() { public boolean collect(BytesRef bytes, float boost) { // ignore uncompetetive hits if (stQueue.size() >= maxSize && boost <= stQueue.peek().boost) return true; // add new entry in PQ, we must clone the term, else it may get overwritten! st.bytes.copy(bytes); st.boost = boost; stQueue.offer(st); // possibly drop entries from queue st = (stQueue.size() > maxSize) ? stQueue.poll() : new ScoreTerm(); setMaxNonCompetitiveBoost((stQueue.size() >= maxSize) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY); return true; } // reusable instance private ScoreTerm st = new ScoreTerm(); }); final Term placeholderTerm = new Term(query.field); final BooleanQuery bq = new BooleanQuery(true); for (final ScoreTerm st : stQueue) { // add new query, we must clone the term, else it may get overwritten! Query tq = getQuery(placeholderTerm.createTerm(st.bytes)); tq.setBoost(query.getBoost() * st.boost); // set the boost bq.add(tq, BooleanClause.Occur.SHOULD); // add to query } query.incTotalNumberOfTerms(bq.clauses().size()); return bq; } @Override public int hashCode() { final int prime = 17; int result = 1; result = prime * result + size; return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; TopTermsBooleanQueryRewrite other = (TopTermsBooleanQueryRewrite) obj; if (size != other.size) return false; return true; } private static class ScoreTerm implements Comparable<ScoreTerm> { public final BytesRef bytes = new BytesRef(); public float boost; public int compareTo(ScoreTerm other) { if (this.boost == other.boost) // TODO: is it OK to use default compare here? return other.bytes.compareTo(this.bytes); else return Float.compare(this.boost, other.boost); } } } /** * A rewrite method that first translates each term into * {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, and keeps the * scores as computed by the query. * * <p> * This rewrite mode only uses the top scoring terms so it will not overflow * the boolean max clause count. It is the default rewrite mode for * {@link FuzzyQuery}. * * @see #setRewriteMethod */ public static final class TopTermsScoringBooleanQueryRewrite extends TopTermsBooleanQueryRewrite { /** * Create a TopTermsScoringBooleanQueryRewrite that is limited * to at most {@link BooleanQuery#getMaxClauseCount} terms. */ public TopTermsScoringBooleanQueryRewrite() { super(); } /** * Create a TopTermsScoringBooleanQueryRewrite for * at most <code>size</code> terms. * <p> * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than * <code>size</code>, then it will be used instead. */ public TopTermsScoringBooleanQueryRewrite(int size) { super(size); } @Override protected Query getQuery(Term term) { return new TermQuery(term); } } /** * A rewrite method that first translates each term into * {@link BooleanClause.Occur#SHOULD} clause in a BooleanQuery, but the scores * are only computed as the boost. * <p> * This rewrite method only uses the top scoring terms so it will not overflow * the boolean max clause count. * * @see #setRewriteMethod */ public static final class TopTermsBoostOnlyBooleanQueryRewrite extends TopTermsBooleanQueryRewrite { /** * Create a TopTermsBoostOnlyBooleanQueryRewrite that is limited * to at most {@link BooleanQuery#getMaxClauseCount} terms. */ public TopTermsBoostOnlyBooleanQueryRewrite() { super(); } /** * Create a TopTermsBoostOnlyBooleanQueryRewrite for * at most <code>size</code> terms. * <p> * NOTE: if {@link BooleanQuery#getMaxClauseCount} is smaller than * <code>size</code>, then it will be used instead. */ public TopTermsBoostOnlyBooleanQueryRewrite(int size) { super(size); } @Override protected Query getQuery(Term term) { return new ConstantScoreQuery(new QueryWrapperFilter(new TermQuery(term))); } } private static class ConstantScoreBooleanQueryRewrite extends ScoringBooleanQueryRewrite implements Serializable { @Override public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { Query result = super.rewrite(reader, query); assert result instanceof BooleanQuery; // TODO: if empty boolean query return NullQuery? if (!((BooleanQuery) result).clauses().isEmpty()) { // strip the scores off result = new ConstantScoreQuery(new QueryWrapperFilter(result)); result.setBoost(query.getBoost()); } return result; } // Make sure we are still a singleton even after deserializing @Override protected Object readResolve() { return CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE; } } /** Like {@link #SCORING_BOOLEAN_QUERY_REWRITE} except * scores are not computed. Instead, each matching * document receives a constant score equal to the * query's boost. * * <p><b>NOTE</b>: This rewrite method will hit {@link * BooleanQuery.TooManyClauses} if the number of terms * exceeds {@link BooleanQuery#getMaxClauseCount}. * * @see #setRewriteMethod */ public final static RewriteMethod CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE = new ConstantScoreBooleanQueryRewrite(); /** A rewrite method that tries to pick the best * constant-score rewrite method based on term and * document counts from the query. If both the number of * terms and documents is small enough, then {@link * #CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE} is used. * Otherwise, {@link #CONSTANT_SCORE_FILTER_REWRITE} is * used. */ public static class ConstantScoreAutoRewrite extends BooleanQueryRewrite { // Defaults derived from rough tests with a 20.0 million // doc Wikipedia index. With more than 350 terms in the // query, the filter method is fastest: public static int DEFAULT_TERM_COUNT_CUTOFF = 350; // If the query will hit more than 1 in 1000 of the docs // in the index (0.1%), the filter method is fastest: public static double DEFAULT_DOC_COUNT_PERCENT = 0.1; private int termCountCutoff = DEFAULT_TERM_COUNT_CUTOFF; private double docCountPercent = DEFAULT_DOC_COUNT_PERCENT; /** If the number of terms in this query is equal to or * larger than this setting then {@link * #CONSTANT_SCORE_FILTER_REWRITE} is used. */ public void setTermCountCutoff(int count) { termCountCutoff = count; } /** @see #setTermCountCutoff */ public int getTermCountCutoff() { return termCountCutoff; } /** If the number of documents to be visited in the * postings exceeds this specified percentage of the * maxDoc() for the index, then {@link * #CONSTANT_SCORE_FILTER_REWRITE} is used. * @param percent 0.0 to 100.0 */ public void setDocCountPercent(double percent) { docCountPercent = percent; } /** @see #setDocCountPercent */ public double getDocCountPercent() { return docCountPercent; } @Override public Query rewrite(final IndexReader reader, final MultiTermQuery query) throws IOException { // Get the enum and start visiting terms. If we // exhaust the enum before hitting either of the // cutoffs, we use ConstantBooleanQueryRewrite; else, // ConstantFilterRewrite: final int docCountCutoff = (int) ((docCountPercent / 100.) * reader.maxDoc()); final int termCountLimit = Math.min(BooleanQuery.getMaxClauseCount(), termCountCutoff); final CutOffTermCollector col = new CutOffTermCollector(reader, query.field, docCountCutoff, termCountLimit); collectTerms(reader, query, col); if (col.hasCutOff) { return CONSTANT_SCORE_FILTER_REWRITE.rewrite(reader, query); } else if (col.termCount == 0) { return new BooleanQuery(true); } else { final PagedBytes.Reader bytesReader = col.pendingTerms.freeze(false); try { final BooleanQuery bq = new BooleanQuery(true); final Term placeholderTerm = new Term(query.field); long start = col.startOffset; for(int i = 0; i < col.termCount; i++) { final BytesRef bytes = new BytesRef(); start = bytesReader.fillUsingLengthPrefix3(bytes, start); bq.add(new TermQuery(placeholderTerm.createTerm(bytes)), BooleanClause.Occur.SHOULD); } // Strip scores final Query result = new ConstantScoreQuery(new QueryWrapperFilter(bq)); result.setBoost(query.getBoost()); query.incTotalNumberOfTerms(col.termCount); return result; } finally { bytesReader.close(); } } } private static final class CutOffTermCollector extends TermCollector { CutOffTermCollector(IndexReader reader, String field, int docCountCutoff, int termCountLimit) { this.reader = reader; this.field = field; this.docCountCutoff = docCountCutoff; this.termCountLimit = termCountLimit; } public boolean collect(BytesRef bytes, float boost) throws IOException { termCount++; if (termCount >= termCountLimit || docVisitCount >= docCountCutoff) { hasCutOff = true; return false; } pendingTerms.copyUsingLengthPrefix(bytes); // Loading the TermInfo from the terms dict here // should not be costly, because 1) the // query/filter will load the TermInfo when it // runs, and 2) the terms dict has a cache: docVisitCount += reader.docFreq(field, bytes); return true; } int docVisitCount = 0; boolean hasCutOff = false; int termCount = 0; final IndexReader reader; final String field; final int docCountCutoff, termCountLimit; final PagedBytes pendingTerms = new PagedBytes(15); // max term size is 32 KiB final long startOffset = pendingTerms.getPointer(); } @Override public int hashCode() { final int prime = 1279; return (int) (prime * termCountCutoff + Double.doubleToLongBits(docCountPercent)); } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; ConstantScoreAutoRewrite other = (ConstantScoreAutoRewrite) obj; if (other.termCountCutoff != termCountCutoff) { return false; } if (Double.doubleToLongBits(other.docCountPercent) != Double.doubleToLongBits(docCountPercent)) { return false; } return true; } } /** Read-only default instance of {@link * ConstantScoreAutoRewrite}, with {@link * ConstantScoreAutoRewrite#setTermCountCutoff} set to * {@link * ConstantScoreAutoRewrite#DEFAULT_TERM_COUNT_CUTOFF} * and {@link * ConstantScoreAutoRewrite#setDocCountPercent} set to * {@link * ConstantScoreAutoRewrite#DEFAULT_DOC_COUNT_PERCENT}. * Note that you cannot alter the configuration of this * instance; you'll need to create a private instance * instead. */ public final static RewriteMethod CONSTANT_SCORE_AUTO_REWRITE_DEFAULT = new ConstantScoreAutoRewrite() { @Override public void setTermCountCutoff(int count) { throw new UnsupportedOperationException("Please create a private instance"); } @Override public void setDocCountPercent(double percent) { throw new UnsupportedOperationException("Please create a private instance"); } // Make sure we are still a singleton even after deserializing protected Object readResolve() { return CONSTANT_SCORE_AUTO_REWRITE_DEFAULT; } }; /** * Constructs a query matching terms that cannot be represented with a single * Term. */ public MultiTermQuery(final String field) { this.field = field; assert field != null; } /** Returns the field name for this query */ public final String getField() { return field; } /** Construct the enumeration to be used, expanding the * pattern term. This method should only be called if * the field exists (ie, implementations can assume the * field does exist). This method should not return null * (should instead return {@link TermsEnum#EMPTY} if no * terms match). The TermsEnum must already be * positioned to the first matching term. */ protected abstract TermsEnum getTermsEnum(IndexReader reader) throws IOException; /** * Expert: Return the number of unique terms visited during execution of the query. * If there are many of them, you may consider using another query type * or optimize your total term count in index. * <p>This method is not thread safe, be sure to only call it when no query is running! * If you re-use the same query instance for another * search, be sure to first reset the term counter * with {@link #clearTotalNumberOfTerms}. * <p>On optimized indexes / no MultiReaders, you get the correct number of * unique terms for the whole index. Use this number to compare different queries. * For non-optimized indexes this number can also be achieved in * non-constant-score mode. In constant-score mode you get the total number of * terms seeked for all segments / sub-readers. * @see #clearTotalNumberOfTerms */ public int getTotalNumberOfTerms() { return numberOfTerms; } /** * Expert: Resets the counting of unique terms. * Do this before executing the query/filter. * @see #getTotalNumberOfTerms */ public void clearTotalNumberOfTerms() { numberOfTerms = 0; } protected void incTotalNumberOfTerms(int inc) { numberOfTerms += inc; } @Override public Query rewrite(IndexReader reader) throws IOException { return rewriteMethod.rewrite(reader, this); } /** * @see #setRewriteMethod */ public RewriteMethod getRewriteMethod() { return rewriteMethod; } /** * Sets the rewrite method to be used when executing the * query. You can use one of the four core methods, or * implement your own subclass of {@link RewriteMethod}. */ public void setRewriteMethod(RewriteMethod method) { rewriteMethod = method; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + Float.floatToIntBits(getBoost()); result = prime * result + rewriteMethod.hashCode(); if (field != null) result = prime * result + field.hashCode(); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; MultiTermQuery other = (MultiTermQuery) obj; if (Float.floatToIntBits(getBoost()) != Float.floatToIntBits(other.getBoost())) return false; if (!rewriteMethod.equals(other.rewriteMethod)) { return false; } return (other.field == null ? field == null : other.field.equals(field)); } }