/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.solr.common.SolrException;

/** @lucene.experimental */
public class DocSetUtil {

  /** The cut-off point for small sets (SortedIntDocSet) vs large sets (BitDocSet) */
  public static int smallSetSize(int maxDoc) {
    return (maxDoc>>6)+5;  // The +5 is for better test coverage for small sets
  }

  /**
   * Iterates DocSets to test for equality - slow and for testing purposes only.
   * @lucene.internal
   */
  public static boolean equals(DocSet a, DocSet b) {
    DocIterator iter1 = a.iterator();
    DocIterator iter2 = b.iterator();

    for (;;) {
      boolean n1 = iter1.hasNext();
      boolean n2 = iter2.hasNext();
      if (n1 != n2) {
        return false;
      }
      if (!n1) return true;  // made it to end
      int d1 = iter1.nextDoc();
      int d2 = iter2.nextDoc();
      if (d1 != d2) {
        return false;
      }
    }
  }

  /**
   * This variant of getDocSet will attempt to do some deduplication
   * on certain DocSets such as DocSets that match numDocs.  This means it can return
   * a cached version of the set, and the returned set should not be modified.
   * @lucene.experimental
   */
  public static DocSet getDocSet(DocSetCollector collector, SolrIndexSearcher searcher) {
    if (collector.size() == searcher.numDocs()) {
      if (!searcher.isLiveDocsInstantiated()) {
        searcher.setLiveDocs( collector.getDocSet() );
      }
      try {
        return searcher.getLiveDocs();
      } catch (IOException e) {
        // should be impossible... liveDocs should exist, so no IO should be necessary
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
    return collector.getDocSet();
  }
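  // Illustrative usage sketch (not part of the original file; variable names are hypothetical):
  // a typical caller sizes a DocSetCollector for the index, runs the search, and then lets
  // getDocSet() swap a result matching every live document for the searcher's cached liveDocs.
  //
  //   DocSetCollector collector = new DocSetCollector(searcher.maxDoc());
  //   searcher.search(query, collector);
  //   DocSet docs = DocSetUtil.getDocSet(collector, searcher);  // may return searcher.getLiveDocs()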
  /**
   * This variant of getDocSet maps all sets with size numDocs to searcher.getLiveDocs.
   * The returned set should not be modified.
   * @lucene.experimental
   */
  public static DocSet getDocSet(DocSet docs, SolrIndexSearcher searcher) {
    if (docs.size() == searcher.numDocs()) {
      if (!searcher.isLiveDocsInstantiated()) {
        searcher.setLiveDocs( docs );
      }
      try {
        // if this docset has the same cardinality as liveDocs, return liveDocs instead
        // so this set will be short lived garbage.
        return searcher.getLiveDocs();
      } catch (IOException e) {
        // should be impossible... liveDocs should exist, so no IO should be necessary
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      }
    }
    return docs;
  }

  // implementers of DocSetProducer should not call this with themselves or it will result in an infinite loop
  public static DocSet createDocSet(SolrIndexSearcher searcher, Query query, DocSet filter) throws IOException {

    if (filter != null) {
      Filter luceneFilter = filter.getTopFilter();
      query = new BooleanQuery.Builder()
          .add(query, BooleanClause.Occur.MUST)
          .add(luceneFilter, BooleanClause.Occur.FILTER)
          .build();
    }

    if (query instanceof TermQuery) {
      DocSet set = createDocSet(searcher, ((TermQuery)query).getTerm() );
      // assert equals(set, createDocSetGeneric(searcher, query));
      return set;
    } else if (query instanceof DocSetProducer) {
      DocSet set = ((DocSetProducer) query).createDocSet(searcher);
      // assert equals(set, createDocSetGeneric(searcher, query));
      return set;
    }

    return createDocSetGeneric(searcher, query);
  }
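  // Illustrative usage sketch (field names and queries are hypothetical, not from this file):
  // with a non-null filter the query is wrapped in a BooleanQuery and handled by
  // createDocSetGeneric() below, while a bare TermQuery with a null filter takes the
  // specialized createDocSet(searcher, term) path.
  //
  //   DocSet inStock = searcher.getDocSet(new TermQuery(new Term("inStock", "true")));
  //   DocSet books   = DocSetUtil.createDocSet(searcher, new TermQuery(new Term("cat", "book")), inStock);
  //   DocSet all     = DocSetUtil.createDocSet(searcher, new TermQuery(new Term("cat", "book")), null);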
  // code to produce docsets for non-docsetproducer queries
  public static DocSet createDocSetGeneric(SolrIndexSearcher searcher, Query query) throws IOException {

    int maxDoc = searcher.getIndexReader().maxDoc();
    DocSetCollector collector = new DocSetCollector(maxDoc);

    // This may throw an ExitableDirectoryReader.ExitingReaderException
    // but we should not catch it here, as we don't know how this DocSet will be used (it could be negated before use) or cached.
    searcher.search(query, collector);

    return getDocSet(collector, searcher);
  }

  public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException {

    DirectoryReader reader = searcher.getRawReader();  // raw reader to avoid extra wrapping overhead
    int maxDoc = searcher.getIndexReader().maxDoc();
    int smallSetSize = smallSetSize(maxDoc);

    String field = term.field();
    BytesRef termVal = term.bytes();

    int maxCount = 0;
    int firstReader = -1;
    List<LeafReaderContext> leaves = reader.leaves();
    PostingsEnum[] postList = new PostingsEnum[leaves.size()];  // use array for slightly higher scanning cost, but fewer memory allocations

    for (LeafReaderContext ctx : leaves) {
      assert leaves.get(ctx.ord) == ctx;
      LeafReader r = ctx.reader();
      Fields f = r.fields();
      Terms t = f.terms(field);
      if (t == null) continue;  // field is missing
      TermsEnum te = t.iterator();
      if (te.seekExact(termVal)) {
        maxCount += te.docFreq();
        postList[ctx.ord] = te.postings(null, PostingsEnum.NONE);
        if (firstReader < 0) firstReader = ctx.ord;
      }
    }

    DocSet answer = null;
    if (maxCount == 0) {
      answer = DocSet.EMPTY;
    } else if (maxCount <= smallSetSize) {
      answer = createSmallSet(leaves, postList, maxCount, firstReader);
    } else {
      answer = createBigSet(leaves, postList, maxDoc, firstReader);
    }
    return DocSetUtil.getDocSet( answer, searcher );
  }

  private static DocSet createSmallSet(List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxPossible, int firstReader) throws IOException {
    int[] docs = new int[maxPossible];
    int sz = 0;

    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;

      for (; ; ) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
        docs[sz++] = globalId;
      }
    }

    return new SortedIntDocSet(docs, sz);
  }

  private static DocSet createBigSet(List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxDoc, int firstReader) throws IOException {
    long[] bits = new long[FixedBitSet.bits2words(maxDoc)];
    int sz = 0;

    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;

      for (; ; ) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
        bits[globalId >> 6] |= (1L << globalId);
        sz++;
      }
    }

    BitDocSet docSet = new BitDocSet( new FixedBitSet(bits, maxDoc), sz );

    int smallSetSize = smallSetSize(maxDoc);
    if (sz < smallSetSize) {
      // make this optional?
      DocSet smallSet = toSmallSet( docSet );
      // assert equals(docSet, smallSet);
      return smallSet;
    }

    return docSet;
  }

  public static DocSet toSmallSet(BitDocSet bitSet) {
    int sz = bitSet.size();
    int[] docs = new int[sz];
    FixedBitSet bs = bitSet.getBits();
    int doc = -1;
    for (int i=0; i<sz; i++) {
      doc = bs.nextSetBit(doc + 1);
      docs[i] = doc;
    }
    return new SortedIntDocSet(docs);
  }
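  // Rough sizing illustration (back-of-the-envelope, not from the original file): with
  // maxDoc = 1,000,000 the small-set cut-off is (1,000,000 >> 6) + 5 = 15,630 docs.
  // A SortedIntDocSet of that size holds ~15,630 ints (~61 KB), whereas a FixedBitSet over
  // the whole index always takes maxDoc/8 bytes (~122 KB) no matter how few bits are set,
  // which is why createBigSet() downgrades to a SortedIntDocSet via toSmallSet() when the
  // collected count ends up below the cut-off.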
  public static void collectSortedDocSet(DocSet docs, IndexReader reader, Collector collector) throws IOException {
    // TODO add SortedDocSet sub-interface and take that.
    // TODO collectUnsortedDocSet: iterate segment, then all docSet per segment.

    final List<LeafReaderContext> leaves = reader.leaves();
    final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
    int segBase = 0;
    int segMax;
    int adjustedMax = 0;
    LeafReaderContext ctx = null;
    LeafCollector leafCollector = null;
    for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
      final int doc = docsIt.nextDoc();
      if (doc >= adjustedMax) {
        do {
          ctx = ctxIt.next();
          segBase = ctx.docBase;
          segMax = ctx.reader().maxDoc();
          adjustedMax = segBase + segMax;
        } while (doc >= adjustedMax);
        leafCollector = collector.getLeafCollector(ctx);
      }
      if (doc < segBase) {
        throw new IllegalStateException("algorithm expects sorted DocSet but wasn't: " + docs.getClass());
      }
      leafCollector.collect(doc - segBase);  // per-seg collectors
    }
  }

}