RandomSamplingFacetsCollector.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.facet.FacetsConfig.DimConfig;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BitDocIdSet;
import org.apache.lucene.util.FixedBitSet;

/**
 * Collects hits for subsequent faceting, using sampling if needed. Once you've
 * run a search and collected hits into this, instantiate one of the
 * {@link Facets} subclasses to do the facet counting. Note that this collector
 * does not collect the scores of matching docs (i.e.
 * {@link FacetsCollector.MatchingDocs#scores} is {@code null}).
 * <p>
 * If you require the original set of hits, you can call
 * {@link #getOriginalMatchingDocs()}. Also, since the counts of the top-facets
 * are based on the sampled set, you can amortize the counts by calling
 * {@link #amortizeFacetCounts}.
 */
public class RandomSamplingFacetsCollector extends FacetsCollector {
  
  /**
   * Faster alternative for java.util.Random, inspired by
   * http://dmurphy747.wordpress.com/2011/03/23/xorshift-vs-random-
   * performance-in-java/
   * <p>
   * Has a period of 2^64-1
   */
  private static class XORShift64Random {
    
    /** Current generator state; never zero, since a zero state would stay zero forever. */
    private long x;
    
    /**
     * Creates a xorshift random generator using the provided seed. A seed of
     * {@code 0} is replaced by a fixed non-zero constant, because the xor/shift
     * steps map a zero state back to zero.
     */
    public XORShift64Random(long seed) {
      x = seed == 0 ? 0xdeadbeef : seed;
    }
    
    /** Advances the generator state and returns it as the next random long value. */
    public long randomLong() {
      x ^= (x << 21);
      x ^= (x >>> 35);
      x ^= (x << 4);
      return x;
    }
    
    /**
     * Get the next random int, between 0 (inclusive) and n (exclusive).
     * The remainder may be negative, so its absolute value is returned;
     * {@code n} must not be {@code 0}.
     */
    public int nextInt(int n) {
      int res = (int) (randomLong() % n);
      return (res < 0) ? -res : res;
    }
    
  }
  
  /** Marker for state that has not been computed yet (or that no longer applies). */
  private static final int NOT_CALCULATED = -1;
  
  private final int sampleSize;
  private final XORShift64Random random;
  
  /** sampleSize / totalHits; only assigned once sampling actually takes place. */
  private double samplingRate;
  /** Cached sampled view of the hits, created lazily by {@link #getMatchingDocs()}. */
  private List<MatchingDocs> sampledDocs;
  /** Sum of the per-segment hit counts, computed lazily by {@link #getMatchingDocs()}. */
  private int totalHits = NOT_CALCULATED;
  /** Number of documents still missing from the bin that was open when the previous segment ended. */
  private int leftoverBin = NOT_CALCULATED;
  /**
   * Offset (within the leftover part of the bin) of the document to sample in
   * the next segment, or NOT_CALCULATED if that bin was already sampled.
   */
  private int leftoverIndex = NOT_CALCULATED;
  
  /**
   * Constructor with the given sample size and default seed.
   * 
   * @see #RandomSamplingFacetsCollector(int, long)
   */
  public RandomSamplingFacetsCollector(int sampleSize) {
    this(sampleSize, 0);
  }
  
  /**
   * Constructor with the given sample size and seed.
   * 
   * @param sampleSize
   *          The preferred sample size. If the number of hits is greater than
   *          the size, sampling will be done using a sample ratio of sampling
   *          size / totalN. For example: 1000 hits, sample size = 10 results in
   *          samplingRatio of 0.01. If the number of hits is lower, no sampling
   *          is done at all
   * @param seed
   *          The random seed. If {@code 0} then a fixed default seed is used
   *          instead.
   */
  public RandomSamplingFacetsCollector(int sampleSize, long seed) {
    super(false);
    this.sampleSize = sampleSize;
    this.random = new XORShift64Random(seed);
    this.sampledDocs = null;
  }
  
  /**
   * Returns the sampled list of the matching documents. Note that a
   * {@link FacetsCollector.MatchingDocs} instance is returned per segment, even
   * if no hits from that segment are included in the sampled set.
   * <p>
   * Note: One or more of the MatchingDocs might be empty (not containing any
   * hits) as result of sampling.
   * <p>
   * Note: {@code MatchingDocs.totalHits} is copied from the original
   * MatchingDocs, scores is set to {@code null}
   * <p>
   * If the total number of hits does not exceed the sample size, the original
   * (unsampled) matching documents are returned. The total hit count and the
   * sampled list are computed on first use and cached afterwards.
   */
  @Override
  public List<MatchingDocs> getMatchingDocs() {
    List<MatchingDocs> matchingDocs = super.getMatchingDocs();
    
    if (totalHits == NOT_CALCULATED) {
      totalHits = 0;
      for (MatchingDocs md : matchingDocs) {
        totalHits += md.totalHits;
      }
    }
    
    // few enough hits: no sampling needed
    if (totalHits <= sampleSize) {
      return matchingDocs;
    }
    
    if (sampledDocs == null) {
      samplingRate = (1.0 * sampleSize) / totalHits;
      sampledDocs = createSampledDocs(matchingDocs);
    }
    return sampledDocs;
  }
  
  /** Returns the original matching documents. */
  public List<MatchingDocs> getOriginalMatchingDocs() {
    return super.getMatchingDocs();
  }
  
  /** Create a sampled copy of the matching documents list, one entry per input entry and in the same order. */
  private List<MatchingDocs> createSampledDocs(List<MatchingDocs> matchingDocsList) {
    List<MatchingDocs> sampledDocsList = new ArrayList<>(matchingDocsList.size());
    for (MatchingDocs docs : matchingDocsList) {
      sampledDocsList.add(createSample(docs));
    }
    return sampledDocsList;
  }
  
  /**
   * Create a sample of the given hits.
   * <p>
   * The hits are walked in bins of {@code 1 / samplingRate} documents and one
   * randomly chosen document is kept per bin. A bin that is still open when
   * the segment's hits run out is carried over to the next call through
   * {@code leftoverBin} / {@code leftoverIndex}.
   * 
   * @throws RuntimeException wrapping any {@link IOException} raised while
   *         iterating over the hits
   */
  private MatchingDocs createSample(MatchingDocs docs) {
    int maxdoc = docs.context.reader().maxDoc();
    
    // TODO: we could try the WAH8DocIdSet here as well, as the results will be sparse
    FixedBitSet sampleDocs = new FixedBitSet(maxdoc);
    
    int binSize = (int) (1.0 / samplingRate);
    
    try {
      int counter = 0;
      int limit, randomIndex;
      if (leftoverBin != NOT_CALCULATED) {
        // continue the bin that was left open by the previous segment
        limit = leftoverBin;
        // either NOT_CALCULATED, which means we already sampled from that bin,
        // or the next document to sample
        randomIndex = leftoverIndex;
      } else {
        limit = binSize;
        randomIndex = random.nextInt(binSize);
      }
      final DocIdSetIterator it = docs.bits.iterator();
      for (int doc = it.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = it.nextDoc()) {
        if (counter == randomIndex) {
          sampleDocs.set(doc);
        }
        counter++;
        if (counter >= limit) {
          // bin exhausted: start a fresh one
          counter = 0;
          limit = binSize;
          randomIndex = random.nextInt(binSize);
        }
      }
      
      if (counter == 0) {
        // we either exhausted the bin and the iterator at the same time, or
        // this segment had no results. in the latter case we might want to
        // carry leftover to the next segment as is, but that complicates the
        // code and doesn't seem so important.
        leftoverBin = leftoverIndex = NOT_CALCULATED;
      } else {
        leftoverBin = limit - counter;
        if (randomIndex >= counter) {
          // the document to sample was not reached in this segment, so it has
          // to be picked from the next one. this includes the case where it is
          // the very next document (randomIndex == counter, i.e. offset 0);
          // leaving that case out would keep a stale leftoverIndex around.
          leftoverIndex = randomIndex - counter;
        } else {
          // we sampled a document from the bin, so just skip over remaining
          // documents in the bin in the next segment.
          leftoverIndex = NOT_CALCULATED;
        }
      }
      
      return new MatchingDocs(docs.context, new BitDocIdSet(sampleDocs), docs.totalHits, null);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }
  
  /**
   * Note: if you use a counting {@link Facets} implementation, you can amortize the
   * sampled counts by calling this method. Uses the {@link FacetsConfig} and
   * the {@link IndexSearcher} to determine the upper bound for each facet value.
   * <p>
   * Each count is divided by the sampling rate and then capped: label counts by
   * the document frequency of the corresponding facet term, the total count by
   * the number of documents in the reader.
   * 
   * @param res the facet result computed on the sampled hits; may be {@code null}
   * @param config used to look up the index field name of the result's dimension
   * @param searcher provides the index reader used for the upper bounds
   * @return a new result with corrected counts, or {@code res} itself if it is
   *         {@code null} or the total hit count does not exceed the sample size
   * @throws IOException if reading the document frequency from the index fails
   */
  public FacetResult amortizeFacetCounts(FacetResult res, FacetsConfig config, IndexSearcher searcher) throws IOException {
    if (res == null || totalHits <= sampleSize) {
      return res;
    }
    
    LabelAndValue[] fixedLabelValues = new LabelAndValue[res.labelValues.length];
    IndexReader reader = searcher.getIndexReader();
    DimConfig dimConfig = config.getDimConfig(res.dim);
    
    // +2 to prepend dimension, append child label
    String[] childPath = new String[res.path.length + 2];
    childPath[0] = res.dim;
    
    System.arraycopy(res.path, 0, childPath, 1, res.path.length); // reuse
    
    for (int i = 0; i < res.labelValues.length; i++) {
      // only the last path component changes between iterations
      childPath[res.path.length + 1] = res.labelValues[i].label;
      String fullPath = FacetsConfig.pathToString(childPath, childPath.length);
      // the corrected count can never exceed the number of documents containing the term
      int max = reader.docFreq(new Term(dimConfig.indexFieldName, fullPath));
      int correctedCount = (int) (res.labelValues[i].value.doubleValue() / samplingRate);
      correctedCount = Math.min(max, correctedCount);
      fixedLabelValues[i] = new LabelAndValue(res.labelValues[i].label, correctedCount);
    }
    
    // cap the total count on the total number of non-deleted documents in the reader
    int correctedTotalCount = res.value.intValue();
    if (correctedTotalCount > 0) {
      correctedTotalCount = Math.min(reader.numDocs(), (int) (res.value.doubleValue() / samplingRate));
    }
    
    return new FacetResult(res.dim, res.path, correctedTotalCount, fixedLabelValues, res.childCount);
  }
  
  /**
   * Returns the sampling rate that was used. The rate is only assigned when
   * {@link #getMatchingDocs()} actually samples; until then this returns the
   * field's default of {@code 0.0}.
   */
  public double getSamplingRate() {
    return samplingRate;
  }
  
}