package org.apache.lucene.facet.search.sampling;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.facet.search.CategoryListIterator;
import org.apache.lucene.facet.search.FacetArrays;
import org.apache.lucene.facet.search.ScoredDocIDs;
import org.apache.lucene.facet.search.aggregator.Aggregator;
import org.apache.lucene.facet.search.params.FacetRequest;
import org.apache.lucene.facet.search.params.FacetSearchParams;
import org.apache.lucene.facet.search.results.FacetResult;
import org.apache.lucene.facet.search.results.FacetResultNode;
import org.apache.lucene.facet.search.results.MutableFacetResultNode;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Sampling definition for facets accumulation.
 * <p>
 * The Sampler uses TAKMI style counting to provide a 'best guess' top-K result
 * set of the facets accumulated.
 * <p>
 * Note: Sampling accumulation (Accumulation over a sampled-set of the results),
 * does not guarantee accurate values for
 * {@link FacetResult#getNumValidDescendants()} &amp;
 * {@link FacetResultNode#getResidue()}.
 * 
 * @lucene.experimental
 */
public abstract class Sampler {

  /** Sampling parameters in effect for this sampler; validated at construction. */
  protected final SamplingParams samplingParams;

  /**
   * Construct with default {@link SamplingParams}.
   */
  public Sampler() {
    this(new SamplingParams());
  }

  /**
   * Construct with certain {@link SamplingParams}.
   * @param params sampling params in effect
   * @throws IllegalArgumentException if the provided SamplingParams are not valid
   */
  public Sampler(SamplingParams params) throws IllegalArgumentException {
    if (!params.validate()) {
      throw new IllegalArgumentException("The provided SamplingParams are not valid!!");
    }
    this.samplingParams = params;
  }

  /**
   * Check whether sampling should be applied to the input docIds. Sampling
   * kicks in only when the set is larger than
   * {@link SamplingParams#getSamplingThreshold()}.
   * @param docIds full set of matching documents
   * @return true if a sample should be computed, false to use the full set
   */
  public boolean shouldSample(ScoredDocIDs docIds) {
    return docIds.size() > samplingParams.getSamplingThreshold();
  }
  
  /**
   * Compute a sample set out of the input set, based on the {@link SamplingParams#getSampleRatio()}
   * in effect. Sub classes can override to alter how the sample set is
   * computed.
   * <p> 
   * If the input set is of size smaller than {@link SamplingParams#getMinSampleSize()}, 
   * the input set is returned (no sampling takes place).
   * <p>
   * Other than that, the returned set size will not be larger than {@link SamplingParams#getMaxSampleSize()} 
   * nor smaller than {@link SamplingParams#getMinSampleSize()}.  
   * @param docids
   *          full set of matching documents out of which a sample is needed.
   * @return the sample, with its actual (effective) sampling ratio
   * @throws IOException if sample creation fails
   */
  public SampleResult getSampleSet(ScoredDocIDs docids) throws IOException {
    if (!shouldSample(docids)) {
      // Set is small enough — no sampling; actual ratio is exactly 1.
      return new SampleResult(docids, 1d);
    }

    int actualSize = docids.size();
    int sampleSetSize = (int) (actualSize * samplingParams.getSampleRatio());
    // Clamp the requested sample size into [minSampleSize, maxSampleSize].
    sampleSetSize = Math.max(sampleSetSize, samplingParams.getMinSampleSize());
    sampleSetSize = Math.min(sampleSetSize, samplingParams.getMaxSampleSize());

    return createSample(docids, actualSize, sampleSetSize);
  }

  /**
   * Create and return a sample of the input set
   * @param docids input set out of which a sample is to be created
   * @param actualSize original size of set, prior to sampling
   * @param sampleSetSize required size of sample set
   * @return sample of the input set in the required size
   * @throws IOException if sample creation fails
   */
  protected abstract SampleResult createSample(ScoredDocIDs docids, int actualSize,
      int sampleSetSize) throws IOException;

  /**
   * Get a fixer of sample facet accumulation results. Default implementation
   * returns a <code>TakmiSampleFixer</code> which is adequate only for
   * counting. For any other accumulator, provide a different fixer.
   * @param indexReader the index for which fixing is performed
   * @param taxonomyReader the taxonomy for which fixing is performed
   * @param searchParams search params in effect
   */
  public SampleFixer getSampleFixer(
      IndexReader indexReader, TaxonomyReader taxonomyReader,
      FacetSearchParams searchParams) {
    return new TakmiSampleFixer(indexReader, taxonomyReader, searchParams);
  }
  
  /**
   * Result of sample computation: the sampled doc IDs and the ratio
   * actually achieved (which may differ from the requested ratio
   * because of min/max clamping).
   */
  public final static class SampleResult {
    /** The sampled documents. */
    public final ScoredDocIDs docids;
    /** Effective sampling ratio: sample size / original size (1.0 when no sampling occurred). */
    public final double actualSampleRatio;

    protected SampleResult(ScoredDocIDs docids, double actualSampleRatio) {
      this.docids = docids;
      this.actualSampleRatio = actualSampleRatio;
    }
  }
  
  /**
   * Return the sampling params in effect
   */
  public final SamplingParams getSamplingParams() {
    return samplingParams;
  }

  /**
   * Trim the input facet result.<br>
   * Note: It is only valid to call this method with result obtained for a
   * facet request created through {@link #overSampledSearchParams(FacetSearchParams)}.
   * 
   * @throws IllegalArgumentException
   *           if called with results not obtained for requests created
   *           through {@link #overSampledSearchParams(FacetSearchParams)}
   */
  public FacetResult trimResult(FacetResult facetResult) throws IllegalArgumentException {
    double overSampleFactor = getSamplingParams().getOversampleFactor();
    if (overSampleFactor <= 1) { // no factoring done?
      return facetResult;
    }

    OverSampledFacetRequest sampledFreq = null;

    try {
      sampledFreq = (OverSampledFacetRequest)facetResult.getFacetRequest();
    } catch (ClassCastException e) {
      // The request was not wrapped by overSampledSearchParams() — we have no
      // original request to trim back to; preserve the cause for diagnosis.
      throw new IllegalArgumentException(
          "It is only valid to call this method with result obtained for a " +
          "facet request created through sampler.overSampledSearchParams()",
          e);
    }

    FacetRequest origFrq = sampledFreq.orig;

    // Trim the over-sampled children list back to the originally requested top-K.
    MutableFacetResultNode trimmedRootNode = MutableFacetResultNode.toImpl(facetResult.getFacetResultNode());
    trimmedRootNode.trimSubResults(origFrq.getNumResults());
    
    return new FacetResult(origFrq, trimmedRootNode, facetResult.getNumValidDescendants());
  }

  /**
   * Over-sampled search params, wrapping each request with an over-sampled one.
   * Over-sampling asks each request for more results than originally requested,
   * to compensate for the statistical error introduced by sampling; use
   * {@link #trimResult(FacetResult)} to trim back to the original top-K.
   */
  public FacetSearchParams overSampledSearchParams(FacetSearchParams original) {
    FacetSearchParams res = original;
    // So now we can sample -> altering the searchParams to accommodate for the statistical error for the sampling
    double overSampleFactor = getSamplingParams().getOversampleFactor();
    if (overSampleFactor > 1) { // any factoring to do?
      res = new FacetSearchParams(original.getFacetIndexingParams());
      for (FacetRequest frq: original.getFacetRequests()) {
        int overSampledNumResults = (int) Math.ceil(frq.getNumResults() * overSampleFactor);
        res.addFacetRequest(new OverSampledFacetRequest(frq, overSampledNumResults));
      }
    }
    return res;
  }
  
  /**
   * Wrapping a facet request for over sampling.
   * Implementation detail: even if the original request is a count request, no
   * statistics will be computed for it as the wrapping is not a count request.
   * This is ok, as the sampling accumulator is later computing the statistics
   * over the original requests.
   */
  private static class OverSampledFacetRequest extends FacetRequest {
    /** The request being wrapped; exposed so {@link Sampler#trimResult} can recover it. */
    final FacetRequest orig;

    public OverSampledFacetRequest(FacetRequest orig, int num) {
      super(orig.getCategoryPath(), num);
      this.orig = orig;
      // Mirror all other settings of the wrapped request, so only the
      // number of requested results differs.
      setDepth(orig.getDepth());
      setNumLabel(orig.getNumLabel());
      setResultMode(orig.getResultMode());
      setSortBy(orig.getSortBy());
      setSortOrder(orig.getSortOrder());
    }

    @Override
    public CategoryListIterator createCategoryListIterator(IndexReader reader,
        TaxonomyReader taxo, FacetSearchParams sParams, int partition)
        throws IOException {
      return orig.createCategoryListIterator(reader, taxo, sParams, partition);
    }

    @Override
    public Aggregator createAggregator(boolean useComplements,
        FacetArrays arrays, IndexReader indexReader,
        TaxonomyReader taxonomy) throws IOException {
      return orig.createAggregator(useComplements, arrays, indexReader,
          taxonomy);
    }

    @Override
    public double getValueOf(FacetArrays arrays, int idx) {
      return orig.getValueOf(arrays, idx);
    }

    @Override
    public boolean requireDocumentScore() {
      return orig.requireDocumentScore();
    }

    @Override
    public boolean supportsComplements() {
      return orig.supportsComplements();
    }
  }
}