FilteredQuery.java example

Explorer
heliosearch-master
- lucene
- solr
package org.apache.lucene.search;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ToStringUtils;

import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Set;


/**
 * A query that applies a filter to the results of another query.
 *
 * <p>Note: the bits are retrieved from the filter each time this
 * query is used in a search - use a CachingWrapperFilter to avoid
 * regenerating the bits every time.
 * @since   1.4
 * @see     CachingWrapperFilter
 */
public class FilteredQuery extends Query {

  private final Query query;
  private final Filter filter;
  private final FilterStrategy strategy;

  /**
   * Constructs a new query which applies a filter to the results of the original query.
   * {@link Filter#getDocIdSet} will be called every time this query is used in a search.
   * @param query  Query to be filtered, cannot be <code>null</code>.
   * @param filter Filter to apply to query results, cannot be <code>null</code>.
   */
  public FilteredQuery(Query query, Filter filter) {
    this(query, filter, RANDOM_ACCESS_FILTER_STRATEGY);
  }
  
  /**
   * Expert: Constructs a new query which applies a filter to the results of the original query.
   * {@link Filter#getDocIdSet} will be called every time this query is used in a search.
   * @param query  Query to be filtered, cannot be <code>null</code>.
   * @param filter Filter to apply to query results, cannot be <code>null</code>.
   * @param strategy a filter strategy used to create a filtered scorer. 
   * 
   * @see FilterStrategy
   */
  public FilteredQuery(Query query, Filter filter, FilterStrategy strategy) {
    if (query == null || filter == null)
      throw new IllegalArgumentException("Query and filter cannot be null.");
    if (strategy == null)
      throw new IllegalArgumentException("FilterStrategy can not be null");
    this.strategy = strategy;
    this.query = query;
    this.filter = filter;
  }
  
  /**
   * Returns a Weight that applies the filter to the enclosed query's Weight.
   * This is accomplished by overriding the Scorer returned by the Weight.
   */
  @Override
  public Weight createWeight(final IndexSearcher searcher) throws IOException {
    final Weight weight = query.createWeight (searcher);
    return new Weight() {
      
      @Override
      public boolean scoresDocsOutOfOrder() {
        return true;
      }

      @Override
      public float getValueForNormalization() throws IOException { 
        return weight.getValueForNormalization() * getBoost() * getBoost(); // boost sub-weight
      }

      @Override
      public void normalize(float norm, float topLevelBoost) { 
        weight.normalize(norm, topLevelBoost * getBoost()); // incorporate boost
      }

      @Override
      public Explanation explain(AtomicReaderContext ir, int i) throws IOException {
        Explanation inner = weight.explain (ir, i);
        Filter f = FilteredQuery.this.filter;
        DocIdSet docIdSet = f.getDocIdSet(ir, ir.reader().getLiveDocs());
        DocIdSetIterator docIdSetIterator = docIdSet == null ? DocIdSetIterator.empty() : docIdSet.iterator();
        if (docIdSetIterator == null) {
          docIdSetIterator = DocIdSetIterator.empty();
        }
        if (docIdSetIterator.advance(i) == i) {
          return inner;
        } else {
          Explanation result = new Explanation
            (0.0f, "failure to match filter: " + f.toString());
          result.addDetail(inner);
          return result;
        }
      }

      // return this query
      @Override
      public Query getQuery() {
        return FilteredQuery.this;
      }

      // return a filtering scorer
      @Override
      public Scorer scorer(AtomicReaderContext context, Bits acceptDocs) throws IOException {
        assert filter != null;

        DocIdSet filterDocIdSet = filter.getDocIdSet(context, acceptDocs);
        if (filterDocIdSet == null) {
          // this means the filter does not accept any documents.
          return null;
        }

        return strategy.filteredScorer(context, weight, filterDocIdSet);
      }

      // return a filtering top scorer
      @Override
      public BulkScorer bulkScorer(AtomicReaderContext context, boolean scoreDocsInOrder, Bits acceptDocs) throws IOException {
        assert filter != null;

        DocIdSet filterDocIdSet = filter.getDocIdSet(context, acceptDocs);
        if (filterDocIdSet == null) {
          // this means the filter does not accept any documents.
          return null;
        }

        return strategy.filteredBulkScorer(context, weight, scoreDocsInOrder, filterDocIdSet);
      }
    };
  }
  
  /**
   * A scorer that consults the filter iff a document was matched by the
   * delegate scorer. This is useful if the filter computation is more expensive
   * than document scoring or if the filter has a linear running time to compute
   * the next matching doc like exact geo distances.
   */
  private static final class QueryFirstScorer extends Scorer {
    private final Scorer scorer;
    private int scorerDoc = -1;
    private final Bits filterBits;

    protected QueryFirstScorer(Weight weight, Bits filterBits, Scorer other) {
      super(weight);
      this.scorer = other;
      this.filterBits = filterBits;
    }

    @Override
    public int nextDoc() throws IOException {
      int doc;
      for(;;) {
        doc = scorer.nextDoc();
        if (doc == Scorer.NO_MORE_DOCS || filterBits.get(doc)) {
          return scorerDoc = doc;
        }
      } 
    }
    
    @Override
    public int advance(int target) throws IOException {
      int doc = scorer.advance(target);
      if (doc != Scorer.NO_MORE_DOCS && !filterBits.get(doc)) {
        return scorerDoc = nextDoc();
      } else {
        return scorerDoc = doc;
      }
    }

    @Override
    public int docID() {
      return scorerDoc;
    }
    
    @Override
    public float score() throws IOException {
      return scorer.score();
    }
    
    @Override
    public int freq() throws IOException { return scorer.freq(); }
    
    @Override
    public Collection<ChildScorer> getChildren() {
      return Collections.singleton(new ChildScorer(scorer, "FILTERED"));
    }

    @Override
    public long cost() {
      return scorer.cost();
    }
  }

  private static class QueryFirstBulkScorer extends BulkScorer {

    private final Scorer scorer;
    private final Bits filterBits;

    public QueryFirstBulkScorer(Scorer scorer, Bits filterBits) {
      this.scorer = scorer;
      this.filterBits = filterBits;
    }

    @Override
    public boolean score(Collector collector, int maxDoc) throws IOException {
      // the normalization trick already applies the boost of this query,
      // so we can use the wrapped scorer directly:
      collector.setScorer(scorer);
      if (scorer.docID() == -1) {
        scorer.nextDoc();
      }
      while (true) {
        final int scorerDoc = scorer.docID();
        if (scorerDoc < maxDoc) {
          if (filterBits.get(scorerDoc)) {
            collector.collect(scorerDoc);
          }
          scorer.nextDoc();
        } else {
          break;
        }
      }

      return scorer.docID() != Scorer.NO_MORE_DOCS;
    }
  }
  
  /**
   * A Scorer that uses a "leap-frog" approach (also called "zig-zag join"). The scorer and the filter
   * take turns trying to advance to each other's next matching document, often
   * jumping past the target document. When both land on the same document, it's
   * collected.
   */
  private static class LeapFrogScorer extends Scorer {
    private final DocIdSetIterator secondary;
    private final DocIdSetIterator primary;
    private final Scorer scorer;
    protected int primaryDoc = -1;
    protected int secondaryDoc = -1;

    protected LeapFrogScorer(Weight weight, DocIdSetIterator primary, DocIdSetIterator secondary, Scorer scorer) {
      super(weight);
      this.primary = primary;
      this.secondary = secondary;
      this.scorer = scorer;
    }

    private final int advanceToNextCommonDoc() throws IOException {
      for (;;) {
        if (secondaryDoc < primaryDoc) {
          secondaryDoc = secondary.advance(primaryDoc);
        } else if (secondaryDoc == primaryDoc) {
          return primaryDoc;
        } else {
          primaryDoc = primary.advance(secondaryDoc);
        }
      }
    }

    @Override
    public final int nextDoc() throws IOException {
      primaryDoc = primaryNext();
      return advanceToNextCommonDoc();
    }
    
    protected int primaryNext() throws IOException {
      return primary.nextDoc();
    }
    
    @Override
    public final int advance(int target) throws IOException {
      if (target > primaryDoc) {
        primaryDoc = primary.advance(target);
      }
      return advanceToNextCommonDoc();
    }

    @Override
    public final int docID() {
      return secondaryDoc;
    }
    
    @Override
    public final float score() throws IOException {
      return scorer.score();
    }
    
    @Override
    public final int freq() throws IOException {
      return scorer.freq();
    }
    
    @Override
    public final Collection<ChildScorer> getChildren() {
      return Collections.singleton(new ChildScorer(scorer, "FILTERED"));
    }

    @Override
    public long cost() {
      return Math.min(primary.cost(), secondary.cost());
    }
  }
  
  // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer
  private static final class PrimaryAdvancedLeapFrogScorer extends LeapFrogScorer {
    private final int firstFilteredDoc;

    protected PrimaryAdvancedLeapFrogScorer(Weight weight, int firstFilteredDoc, DocIdSetIterator filterIter, Scorer other) {
      super(weight, filterIter, other, other);
      this.firstFilteredDoc = firstFilteredDoc;
      this.primaryDoc = firstFilteredDoc; // initialize to prevent and advance call to move it further
    }

    @Override
    protected int primaryNext() throws IOException {
      if (secondaryDoc != -1) {
        return super.primaryNext();
      } else {
        return firstFilteredDoc;
      }
    }
  }
  
  /** Rewrites the query. If the wrapped is an instance of
   * {@link MatchAllDocsQuery} it returns a {@link ConstantScoreQuery}. Otherwise
   * it returns a new {@code FilteredQuery} wrapping the rewritten query. */
  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    final Query queryRewritten = query.rewrite(reader);
    
    if (queryRewritten != query) {
      // rewrite to a new FilteredQuery wrapping the rewritten query
      final Query rewritten = new FilteredQuery(queryRewritten, filter, strategy);
      rewritten.setBoost(this.getBoost());
      return rewritten;
    } else {
      // nothing to rewrite, we are done!
      return this;
    }
  }

  /** Returns this FilteredQuery's (unfiltered) Query */
  public final Query getQuery() {
    return query;
  }

  /** Returns this FilteredQuery's filter */
  public final Filter getFilter() {
    return filter;
  }
  
  /** Returns this FilteredQuery's {@link FilterStrategy} */
  public FilterStrategy getFilterStrategy() {
    return this.strategy;
  }

  // inherit javadoc
  @Override
  public void extractTerms(Set<Term> terms) {
    getQuery().extractTerms(terms);
  }

  /** Prints a user-readable version of this query. */
  @Override
  public String toString (String s) {
    StringBuilder buffer = new StringBuilder();
    buffer.append("filtered(");
    buffer.append(query.toString(s));
    buffer.append(")->");
    buffer.append(filter);
    buffer.append(ToStringUtils.boost(getBoost()));
    return buffer.toString();
  }

  /** Returns true iff <code>o</code> is equal to this. */
  @Override
  public boolean equals(Object o) {
    if (o == this)
      return true;
    if (!super.equals(o))
      return false;
    assert o instanceof FilteredQuery;
    final FilteredQuery fq = (FilteredQuery) o;
    return fq.query.equals(this.query) && fq.filter.equals(this.filter) && fq.strategy.equals(this.strategy);
  }

  /** Returns a hash code value for this object. */
  @Override
  public int hashCode() {
    int hash = super.hashCode();
    hash = hash * 31 + strategy.hashCode();
    hash = hash * 31 + query.hashCode();
    hash = hash * 31 + filter.hashCode();
    return hash;
  }
  
  /**
   * A {@link FilterStrategy} that conditionally uses a random access filter if
   * the given {@link DocIdSet} supports random access (returns a non-null value
   * from {@link DocIdSet#bits()}) and
   * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, int)} returns
   * <code>true</code>. Otherwise this strategy falls back to a "zig-zag join" (
   * {@link FilteredQuery#LEAP_FROG_FILTER_FIRST_STRATEGY}) strategy.
   * 
   * <p>
   * Note: this strategy is the default strategy in {@link FilteredQuery}
   * </p>
   */
  public static final FilterStrategy RANDOM_ACCESS_FILTER_STRATEGY = new RandomAccessFilterStrategy();
  
  /**
   * A filter strategy that uses a "leap-frog" approach (also called "zig-zag join"). 
   * The scorer and the filter
   * take turns trying to advance to each other's next matching document, often
   * jumping past the target document. When both land on the same document, it's
   * collected.
   * <p>
   * Note: This strategy uses the filter to lead the iteration.
   * </p> 
   */
  public static final FilterStrategy LEAP_FROG_FILTER_FIRST_STRATEGY = new LeapFrogFilterStrategy(false);
  
  /**
   * A filter strategy that uses a "leap-frog" approach (also called "zig-zag join"). 
   * The scorer and the filter
   * take turns trying to advance to each other's next matching document, often
   * jumping past the target document. When both land on the same document, it's
   * collected.
   * <p>
   * Note: This strategy uses the query to lead the iteration.
   * </p> 
   */
  public static final FilterStrategy LEAP_FROG_QUERY_FIRST_STRATEGY = new LeapFrogFilterStrategy(true);
  
  /**
   * A filter strategy that advances the Query or rather its {@link Scorer} first and consults the
   * filter {@link DocIdSet} for each matched document.
   * <p>
   * Note: this strategy requires a {@link DocIdSet#bits()} to return a non-null value. Otherwise
   * this strategy falls back to {@link FilteredQuery#LEAP_FROG_QUERY_FIRST_STRATEGY}
   * </p>
   * <p>
   * Use this strategy if the filter computation is more expensive than document
   * scoring or if the filter has a linear running time to compute the next
   * matching doc like exact geo distances.
   * </p>
   */
  public static final FilterStrategy QUERY_FIRST_FILTER_STRATEGY = new QueryFirstFilterStrategy();
  
  /** Abstract class that defines how the filter ({@link DocIdSet}) applied during document collection. */
  public static abstract class FilterStrategy {
    
    /**
     * Returns a filtered {@link Scorer} based on this strategy.
     * 
     * @param context
     *          the {@link AtomicReaderContext} for which to return the {@link Scorer}.
     * @param weight the {@link FilteredQuery} {@link Weight} to create the filtered scorer.
     * @param docIdSet the filter {@link DocIdSet} to apply
     * @return a filtered scorer
     * 
     * @throws IOException if an {@link IOException} occurs
     */
    public abstract Scorer filteredScorer(AtomicReaderContext context,
        Weight weight, DocIdSet docIdSet) throws IOException;

    /**
     * Returns a filtered {@link BulkScorer} based on this
     * strategy.  This is an optional method: the default
     * implementation just calls {@link #filteredScorer} and
     * wraps that into a BulkScorer.
     *
     * @param context
     *          the {@link AtomicReaderContext} for which to return the {@link Scorer}.
     * @param weight the {@link FilteredQuery} {@link Weight} to create the filtered scorer.
     * @param docIdSet the filter {@link DocIdSet} to apply
     * @return a filtered top scorer
     */
    public BulkScorer filteredBulkScorer(AtomicReaderContext context,
        Weight weight, boolean scoreDocsInOrder, DocIdSet docIdSet) throws IOException {
      Scorer scorer = filteredScorer(context, weight, docIdSet);
      if (scorer == null) {
        return null;
      }
      // This impl always scores docs in order, so we can
      // ignore scoreDocsInOrder:
      return new Weight.DefaultBulkScorer(scorer);
    }
  }
  
  /**
   * A {@link FilterStrategy} that conditionally uses a random access filter if
   * the given {@link DocIdSet} supports random access (returns a non-null value
   * from {@link DocIdSet#bits()}) and
   * {@link RandomAccessFilterStrategy#useRandomAccess(Bits, int)} returns
   * <code>true</code>. Otherwise this strategy falls back to a "zig-zag join" (
   * {@link FilteredQuery#LEAP_FROG_FILTER_FIRST_STRATEGY}) strategy .
   */
  public static class RandomAccessFilterStrategy extends FilterStrategy {

    @Override
    public Scorer filteredScorer(AtomicReaderContext context, Weight weight, DocIdSet docIdSet) throws IOException {
      final DocIdSetIterator filterIter = docIdSet.iterator();
      if (filterIter == null) {
        // this means the filter does not accept any documents.
        return null;
      }  

      final int firstFilterDoc = filterIter.nextDoc();
      if (firstFilterDoc == DocIdSetIterator.NO_MORE_DOCS) {
        return null;
      }
      
      final Bits filterAcceptDocs = docIdSet.bits();
      // force if RA is requested
      final boolean useRandomAccess = filterAcceptDocs != null && useRandomAccess(filterAcceptDocs, firstFilterDoc);
      if (useRandomAccess) {
        // if we are using random access, we return the inner scorer, just with other acceptDocs
        return weight.scorer(context, filterAcceptDocs);
      } else {
        assert firstFilterDoc > -1;
        // we are gonna advance() this scorer, so we set inorder=true/toplevel=false
        // we pass null as acceptDocs, as our filter has already respected acceptDocs, no need to do twice
        final Scorer scorer = weight.scorer(context, null);
        // TODO once we have way to figure out if we use RA or LeapFrog we can remove this scorer
        return (scorer == null) ? null : new PrimaryAdvancedLeapFrogScorer(weight, firstFilterDoc, filterIter, scorer);
      }
    }
    
    /**
     * Expert: decides if a filter should be executed as "random-access" or not.
     * random-access means the filter "filters" in a similar way as deleted docs are filtered
     * in Lucene. This is faster when the filter accepts many documents.
     * However, when the filter is very sparse, it can be faster to execute the query+filter
     * as a conjunction in some cases.
     * 
     * The default implementation returns <code>true</code> if the first document accepted by the
     * filter is < 100.
     * 
     * @lucene.internal
     */
    protected boolean useRandomAccess(Bits bits, int firstFilterDoc) {
      //TODO once we have a cost API on filters and scorers we should rethink this heuristic
      return firstFilterDoc < 100;
    }
  }
  
  private static final class LeapFrogFilterStrategy extends FilterStrategy {
    
    private final boolean scorerFirst;
    
    private LeapFrogFilterStrategy(boolean scorerFirst) {
      this.scorerFirst = scorerFirst;
    }

    @Override
    public Scorer filteredScorer(AtomicReaderContext context,
        Weight weight, DocIdSet docIdSet) throws IOException {
      final DocIdSetIterator filterIter = docIdSet.iterator();
      if (filterIter == null) {
        // this means the filter does not accept any documents.
        return null;
      }
      // we pass null as acceptDocs, as our filter has already respected acceptDocs, no need to do twice
      final Scorer scorer = weight.scorer(context, null);
      if (scorer == null) {
        return null;
      }

      if (scorerFirst) {
        return new LeapFrogScorer(weight, scorer, filterIter, scorer);  
      } else {
        return new LeapFrogScorer(weight, filterIter, scorer, scorer);  
      }
    }
  }
  
  /**
   * A filter strategy that advances the {@link Scorer} first and consults the
   * {@link DocIdSet} for each matched document.
   * <p>
   * Note: this strategy requires a {@link DocIdSet#bits()} to return a non-null value. Otherwise
   * this strategy falls back to {@link FilteredQuery#LEAP_FROG_QUERY_FIRST_STRATEGY}
   * </p>
   * <p>
   * Use this strategy if the filter computation is more expensive than document
   * scoring or if the filter has a linear running time to compute the next
   * matching doc like exact geo distances.
   * </p>
   */
  private static final class QueryFirstFilterStrategy extends FilterStrategy {
    @Override
    public Scorer filteredScorer(final AtomicReaderContext context,
        Weight weight,
        DocIdSet docIdSet) throws IOException {
      Bits filterAcceptDocs = docIdSet.bits();
      if (filterAcceptDocs == null) {
        // Filter does not provide random-access Bits; we
        // must fallback to leapfrog:
        return LEAP_FROG_QUERY_FIRST_STRATEGY.filteredScorer(context, weight, docIdSet);
      }
      final Scorer scorer = weight.scorer(context, null);
      return scorer == null ? null : new QueryFirstScorer(weight,
          filterAcceptDocs, scorer);
    }

    @Override
    public BulkScorer filteredBulkScorer(final AtomicReaderContext context,
        Weight weight,
        boolean scoreDocsInOrder, // ignored (we always top-score in order)
        DocIdSet docIdSet) throws IOException {
      Bits filterAcceptDocs = docIdSet.bits();
      if (filterAcceptDocs == null) {
        // Filter does not provide random-access Bits; we
        // must fallback to leapfrog:
        return LEAP_FROG_QUERY_FIRST_STRATEGY.filteredBulkScorer(context, weight, scoreDocsInOrder, docIdSet);
      }
      final Scorer scorer = weight.scorer(context, null);
      return scorer == null ? null : new QueryFirstBulkScorer(scorer, filterAcceptDocs);
    }
  }
  
}