package org.apache.lucene.search; /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.util.Bits; import org.apache.lucene.util.ToStringUtils; import java.io.IOException; import java.util.Collection; import java.util.Collections; import java.util.Set; /** * A query that applies a filter to the results of another query. * * <p>Note: the bits are retrieved from the filter each time this * query is used in a search - use a CachingWrapperFilter to avoid * regenerating the bits every time. * @since 1.4 * @see CachingWrapperFilter */ public class FilteredQuery extends Query { private final Query query; private final Filter filter; private final FilterStrategy strategy; /** * Constructs a new query which applies a filter to the results of the original query. * {@link Filter#getDocIdSet} will be called every time this query is used in a search. * @param query Query to be filtered, cannot be <code>null</code>. * @param filter Filter to apply to query results, cannot be <code>null</code>. */ public FilteredQuery(Query query, Filter filter) { this(query, filter, RANDOM_ACCESS_FILTER_STRATEGY); } /** * Expert: Constructs a new query which applies a filter to the results of the original query. * {@link Filter#getDocIdSet} will be called every time this query is used in a search. * @param query Query to be filtered, cannot be <code>null</code>. * @param filter Filter to apply to query results, cannot be <code>null</code>. * @param strategy a filter strategy used to create a filtered scorer. * * @see FilterStrategy */ public FilteredQuery(Query query, Filter filter, FilterStrategy strategy) { if (query == null || filter == null) throw new IllegalArgumentException("Query and filter cannot be null."); if (strategy == null) throw new IllegalArgumentException("FilterStrategy can not be null"); this.strategy = strategy; this.query = query; this.filter = filter; } /** * Returns a Weight that applies the filter to the enclosed query's Weight. * This is accomplished by overriding the Scorer returned by the Weight. */ @Override public Weight createWeight(final IndexSearcher searcher) throws IOException { final Weight weight = query.createWeight (searcher); return new Weight() { @Override public boolean scoresDocsOutOfOrder() { return true; } @Override public float getValueForNormalization() throws IOException { return weight.getValueForNormalization() * getBoost() * getBoost(); // boost sub-weight } @Override public void normalize(float norm, float topLevelBoost) { weight.normalize(norm, topLevelBoost * getBoost()); // incorporate boost } @Override public Explanation explain(AtomicReaderContext ir, int i) throws IOException { Explanation inner = weight.explain (ir, i); Filter f = FilteredQuery.this.filter; DocIdSet docIdSet = f.getDocIdSet(ir, ir.reader().getLiveDocs()); DocIdSetIterator docIdSetIterator = docIdSet == null ? DocIdSetIterator.empty() : docIdSet.iterator(); if (docIdSetIterator == null) { docIdSetIterator = DocIdSetIterator.empty(); } if (docIdSetIterator.advance(i) == i) { return inner; } else { Explanation result = new Explanation (0.0f, "failure to match filter: " + f.toString()); result.addDetail(inner); return result; } } // return this query @Override public Query getQuery() { return FilteredQuery.this; } // return a filtering scorer @Override public Scorer scorer(AtomicReaderContext context, Bits acceptDocs) throws IOException { assert filter != null; DocIdSet filterDocIdSet = filter.getDocIdSet(context, acceptDocs); if (filterDocIdSet == null) { // this means the filter does not accept any documents. return null; } return strategy.filteredScorer(context, weight, filterDocIdSet); } // return a filtering top scorer @Override public BulkScorer bulkScorer(AtomicReaderContext context, boolean scoreDocsInOrder, Bits acceptDocs) throws IOException { assert filter != null; DocIdSet filterDocIdSet = filter.getDocIdSet(context, acceptDocs); if (filterDocIdSet == null) { // this means the filter does not accept any documents. return null; } return strategy.filteredBulkScorer(context, weight, scoreDocsInOrder, filterDocIdSet); } }; } /** * A scorer that consults the filter iff a document was matched by the * delegate scorer. This is useful if the filter computation is more expensive * than document scoring or if the filter has a linear running time to compute * the next matching doc like exact geo distances. */ private static final class QueryFirstScorer extends Scorer { private final Scorer scorer; private int scorerDoc = -1; private final Bits filterBits; protected QueryFirstScorer(Weight weight, Bits filterBits, Scorer other) { super(weight); this.scorer = other; this.filterBits = filterBits; } @Override public int nextDoc() throws IOException { int doc; for(;;) { doc = scorer.nextDoc(); if (doc == Scorer.NO_MORE_DOCS || filterBits.get(doc)) { return scorerDoc = doc; } } } @Override public int advance(int target) throws IOException { int doc = scorer.advance(target); if (doc != Scorer.NO_MORE_DOCS && !filterBits.get(doc)) { return scorerDoc = nextDoc(); } else { return scorerDoc = doc; } } @Override public int docID() { return scorerDoc; } @Override public float score() throws IOException { return scorer.score(); } @Override public int freq() throws IOException { return scorer.freq(); } @Override public Collection<ChildScorer> getChildren() { return Collections.singleton(new ChildScorer(scorer, "FILTERED")); } @Override public long cost() { return scorer.cost(); } } private static class QueryFirstBulkScorer extends BulkScorer { private final Scorer scorer; private final Bits filterBits; public QueryFirstBulkScorer(Scorer scorer, Bits filterBits) { this.scorer = scorer; this.filterBits = filterBits; } @Override public boolean score(Collector collector, int maxDoc) throws IOException { // the normalization trick already applies the boost of this query, // so we can use the wrapped scorer directly: collector.setScorer(scorer); if (scorer.docID() == -1) { scorer.nextDoc(); } while (true) { final int scorerDoc = scorer.docID(); if (scorerDoc < maxDoc) { if (filterBits.get(scorerDoc)) { collector.collect(scorerDoc); } scorer.nextDoc(); } else { break; } } return scorer.docID() != Scorer.NO_MORE_DOCS; } } /** * A Scorer that uses a "leap-frog" approach (also called "zig-zag join"). The scorer and the filter * take turns trying to advance to each other's next matching document, often * jumping past the target document. When both land on the same document, it's * collected. */ private static class LeapFrogScorer extends Scorer { private final DocIdSetIterator secondary; private final DocIdSetIterator primary; private final Scorer scorer; protected int primaryDoc = -1; protected int secondaryDoc = -1; protected LeapFrogScorer(Weight weight, DocIdSetIterator primary, DocIdSetIterator secondary, Scorer scorer) { super(weight); this.primary = primary; this.secondary = secondary; this.scorer = scorer; } private final int advanceToNextCommonDoc() throws IOException { for (;;) { if (secondaryDoc < primaryDoc) { secondaryDoc = secondary.advance(primaryDoc); } else if (secondaryDoc == primaryDoc) { return primaryDoc; } else { primaryDoc = primary.advance(secondaryDoc); } } } @Override public final int nextDoc() throws IOException { primaryDoc = primaryNext(); return advanceToNextCommonDoc(); } protected int primaryNext() throws IOException { return primary.nextDoc(); } @Override public final int advance(int target) throws IOException { if (target > primaryDoc) { primaryDoc = primary.advance(target); } return advanceToNextCommonDoc(); } @Override public final int docID() { return secondaryDoc; } @Override public final float score() throws IOException { return scorer.score(); } @Override public final int freq() throws IOException { return scorer.freq(); } @Override public final Collection<ChildScorer> getChildren() { return Collections.singleton(new ChildScorer(scorer, "FILTERED")); } @Override public long cost() { return Math.min(primary.cost(), secondary.cost()); } } // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer private static final class PrimaryAdvancedLeapFrogScorer extends LeapFrogScorer { private final int firstFilteredDoc; protected PrimaryAdvancedLeapFrogScorer(Weight weight, int firstFilteredDoc, DocIdSetIterator filterIter, Scorer other) { super(weight, filterIter, other, other); this.firstFilteredDoc = firstFilteredDoc; this.primaryDoc = firstFilteredDoc; // initialize to prevent and advance call to move it further } @Override protected int primaryNext() throws IOException { if (secondaryDoc != -1) { return super.primaryNext(); } else { return firstFilteredDoc; } } } /** Rewrites the query. If the wrapped is an instance of * {@link MatchAllDocsQuery} it returns a {@link ConstantScoreQuery}. Otherwise * it returns a new {@code FilteredQuery} wrapping the rewritten query. */ @Override public Query rewrite(IndexReader reader) throws IOException { final Query queryRewritten = query.rewrite(reader); if (queryRewritten != query) { // rewrite to a new FilteredQuery wrapping the rewritten query final Query rewritten = new FilteredQuery(queryRewritten, filter, strategy); rewritten.setBoost(this.getBoost()); return rewritten; } else { // nothing to rewrite, we are done! return this; } } /** Returns this FilteredQuery's (unfiltered) Query */ public final Query getQuery() { return query; } /** Returns this FilteredQuery's filter */ public final Filter getFilter() { return filter; } /** Returns this FilteredQuery's {@link FilterStrategy} */ public FilterStrategy getFilterStrategy() { return this.strategy; } // inherit javadoc @Override public void extractTerms(Set<Term> terms) { getQuery().extractTerms(terms); } /** Prints a user-readable version of this query. */ @Override public String toString (String s) { StringBuilder buffer = new StringBuilder(); buffer.append("filtered("); buffer.append(query.toString(s)); buffer.append(")->"); buffer.append(filter); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); } /** Returns true iff <code>o</code> is equal to this. */ @Override public boolean equals(Object o) { if (o == this) return true; if (!super.equals(o)) return false; assert o instanceof FilteredQuery; final FilteredQuery fq = (FilteredQuery) o; return fq.query.equals(this.query) && fq.filter.equals(this.filter) && fq.strategy.equals(this.strategy); } /** Returns a hash code value for this object. */ @Override public int hashCode() { int hash = super.hashCode(); hash = hash * 31 + strategy.hashCode(); hash = hash * 31 + query.hashCode(); hash = hash * 31 + filter.hashCode(); return hash; } /** * A {@link FilterStrategy} that conditionally uses a random access filter if * the given {@link DocIdSet} supports random access (returns a non-null value * from {@link DocIdSet#bits()}) and * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, int)} returns * <code>true</code>. Otherwise this strategy falls back to a "zig-zag join" ( * {@link FilteredQuery#LEAP_FROG_FILTER_FIRST_STRATEGY}) strategy. * * <p> * Note: this strategy is the default strategy in {@link FilteredQuery} * </p> */ public static final FilterStrategy RANDOM_ACCESS_FILTER_STRATEGY = new RandomAccessFilterStrategy(); /** * A filter strategy that uses a "leap-frog" approach (also called "zig-zag join"). * The scorer and the filter * take turns trying to advance to each other's next matching document, often * jumping past the target document. When both land on the same document, it's * collected. * <p> * Note: This strategy uses the filter to lead the iteration. * </p> */ public static final FilterStrategy LEAP_FROG_FILTER_FIRST_STRATEGY = new LeapFrogFilterStrategy(false); /** * A filter strategy that uses a "leap-frog" approach (also called "zig-zag join"). * The scorer and the filter * take turns trying to advance to each other's next matching document, often * jumping past the target document. When both land on the same document, it's * collected. * <p> * Note: This strategy uses the query to lead the iteration. * </p> */ public static final FilterStrategy LEAP_FROG_QUERY_FIRST_STRATEGY = new LeapFrogFilterStrategy(true); /** * A filter strategy that advances the Query or rather its {@link Scorer} first and consults the * filter {@link DocIdSet} for each matched document. * <p> * Note: this strategy requires a {@link DocIdSet#bits()} to return a non-null value. Otherwise * this strategy falls back to {@link FilteredQuery#LEAP_FROG_QUERY_FIRST_STRATEGY} * </p> * <p> * Use this strategy if the filter computation is more expensive than document * scoring or if the filter has a linear running time to compute the next * matching doc like exact geo distances. * </p> */ public static final FilterStrategy QUERY_FIRST_FILTER_STRATEGY = new QueryFirstFilterStrategy(); /** Abstract class that defines how the filter ({@link DocIdSet}) applied during document collection. */ public static abstract class FilterStrategy { /** * Returns a filtered {@link Scorer} based on this strategy. * * @param context * the {@link AtomicReaderContext} for which to return the {@link Scorer}. * @param weight the {@link FilteredQuery} {@link Weight} to create the filtered scorer. * @param docIdSet the filter {@link DocIdSet} to apply * @return a filtered scorer * * @throws IOException if an {@link IOException} occurs */ public abstract Scorer filteredScorer(AtomicReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException; /** * Returns a filtered {@link BulkScorer} based on this * strategy. This is an optional method: the default * implementation just calls {@link #filteredScorer} and * wraps that into a BulkScorer. * * @param context * the {@link AtomicReaderContext} for which to return the {@link Scorer}. * @param weight the {@link FilteredQuery} {@link Weight} to create the filtered scorer. * @param docIdSet the filter {@link DocIdSet} to apply * @return a filtered top scorer */ public BulkScorer filteredBulkScorer(AtomicReaderContext context, Weight weight, boolean scoreDocsInOrder, DocIdSet docIdSet) throws IOException { Scorer scorer = filteredScorer(context, weight, docIdSet); if (scorer == null) { return null; } // This impl always scores docs in order, so we can // ignore scoreDocsInOrder: return new Weight.DefaultBulkScorer(scorer); } } /** * A {@link FilterStrategy} that conditionally uses a random access filter if * the given {@link DocIdSet} supports random access (returns a non-null value * from {@link DocIdSet#bits()}) and * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, int)} returns * <code>true</code>. Otherwise this strategy falls back to a "zig-zag join" ( * {@link FilteredQuery#LEAP_FROG_FILTER_FIRST_STRATEGY}) strategy . */ public static class RandomAccessFilterStrategy extends FilterStrategy { @Override public Scorer filteredScorer(AtomicReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException { final DocIdSetIterator filterIter = docIdSet.iterator(); if (filterIter == null) { // this means the filter does not accept any documents. return null; } final int firstFilterDoc = filterIter.nextDoc(); if (firstFilterDoc == DocIdSetIterator.NO_MORE_DOCS) { return null; } final Bits filterAcceptDocs = docIdSet.bits(); // force if RA is requested final boolean useRandomAccess = filterAcceptDocs != null && useRandomAccess(filterAcceptDocs, firstFilterDoc); if (useRandomAccess) { // if we are using random access, we return the inner scorer, just with other acceptDocs return weight.scorer(context, filterAcceptDocs); } else { assert firstFilterDoc > -1; // we are gonna advance() this scorer, so we set inorder=true/toplevel=false // we pass null as acceptDocs, as our filter has already respected acceptDocs, no need to do twice final Scorer scorer = weight.scorer(context, null); // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer return (scorer == null) ? null : new PrimaryAdvancedLeapFrogScorer(weight, firstFilterDoc, filterIter, scorer); } } /** * Expert: decides if a filter should be executed as "random-access" or not. * random-access means the filter "filters" in a similar way as deleted docs are filtered * in Lucene. This is faster when the filter accepts many documents. * However, when the filter is very sparse, it can be faster to execute the query+filter * as a conjunction in some cases. * * The default implementation returns <code>true</code> if the first document accepted by the * filter is < 100. * * @lucene.internal */ protected boolean useRandomAccess(Bits bits, int firstFilterDoc) { //TODO once we have a cost API on filters and scorers we should rethink this heuristic return firstFilterDoc < 100; } } private static final class LeapFrogFilterStrategy extends FilterStrategy { private final boolean scorerFirst; private LeapFrogFilterStrategy(boolean scorerFirst) { this.scorerFirst = scorerFirst; } @Override public Scorer filteredScorer(AtomicReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException { final DocIdSetIterator filterIter = docIdSet.iterator(); if (filterIter == null) { // this means the filter does not accept any documents. return null; } // we pass null as acceptDocs, as our filter has already respected acceptDocs, no need to do twice final Scorer scorer = weight.scorer(context, null); if (scorer == null) { return null; } if (scorerFirst) { return new LeapFrogScorer(weight, scorer, filterIter, scorer); } else { return new LeapFrogScorer(weight, filterIter, scorer, scorer); } } } /** * A filter strategy that advances the {@link Scorer} first and consults the * {@link DocIdSet} for each matched document. * <p> * Note: this strategy requires a {@link DocIdSet#bits()} to return a non-null value. Otherwise * this strategy falls back to {@link FilteredQuery#LEAP_FROG_QUERY_FIRST_STRATEGY} * </p> * <p> * Use this strategy if the filter computation is more expensive than document * scoring or if the filter has a linear running time to compute the next * matching doc like exact geo distances. * </p> */ private static final class QueryFirstFilterStrategy extends FilterStrategy { @Override public Scorer filteredScorer(final AtomicReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException { Bits filterAcceptDocs = docIdSet.bits(); if (filterAcceptDocs == null) { // Filter does not provide random-access Bits; we // must fallback to leapfrog: return LEAP_FROG_QUERY_FIRST_STRATEGY.filteredScorer(context, weight, docIdSet); } final Scorer scorer = weight.scorer(context, null); return scorer == null ? null : new QueryFirstScorer(weight, filterAcceptDocs, scorer); } @Override public BulkScorer filteredBulkScorer(final AtomicReaderContext context, Weight weight, boolean scoreDocsInOrder, // ignored (we always top-score in order) DocIdSet docIdSet) throws IOException { Bits filterAcceptDocs = docIdSet.bits(); if (filterAcceptDocs == null) { // Filter does not provide random-access Bits; we // must fallback to leapfrog: return LEAP_FROG_QUERY_FIRST_STRATEGY.filteredBulkScorer(context, weight, scoreDocsInOrder, docIdSet); } final Scorer scorer = weight.scorer(context, null); return scorer == null ? null : new QueryFirstBulkScorer(scorer, filterAcceptDocs); } } }