TestRandomSamplingFacetsCollector.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.facet;

import java.util.List;
import java.util.Random;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.taxonomy.FastTaxonomyFacetCounts;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiCollector;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.IOUtils;

public class TestRandomSamplingFacetsCollector extends FacetTestCase {
  
  // The first 50 chi-square value for p-value=0.05, taken from:
  // http://en.wikibooks.org/wiki/Engineering_Tables/Chi-Squared_Distibution
  private static final float[] CHI_SQUARE_VALUES = new float[] {0.0f, 3.841f,
      5.991f, 7.815f, 9.488f, 11.07f, 12.592f, 14.067f, 15.507f, 16.919f,
      18.307f, 19.675f, 21.026f, 22.362f, 23.685f, 24.996f, 26.296f, 27.587f,
      28.869f, 30.144f, 31.41f, 32.671f, 33.924f, 35.172f, 36.415f, 37.652f,
      38.885f, 40.113f, 41.337f, 42.557f, 43.773f, 44.985f, 46.194f, 47.4f,
      48.602f, 49.802f, 50.998f, 52.192f, 53.384f, 54.572f, 55.758f, 56.942f,
      58.124f, 59.304f, 60.481f, 61.656f, 62.83f, 64.001f, 65.171f, 66.339f,
      67.505f};
  
  public void testRandomSampling() throws Exception {
    Directory dir = newDirectory();
    Directory taxoDir = newDirectory();
    
    Random random = random();
    DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
    RandomIndexWriter writer = new RandomIndexWriter(random, dir);
    
    FacetsConfig config = new FacetsConfig();
    
    final int numCategories = 10;
    int numDocs = atLeast(10000);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      doc.add(new StringField("EvenOdd", (i % 2 == 0) ? "even" : "odd", Store.NO));
      doc.add(new FacetField("iMod10", Integer.toString(i % numCategories)));
      writer.addDocument(config.build(taxoWriter, doc));
    }
    writer.forceMerge(CHI_SQUARE_VALUES.length - 1);
    
    // NRT open
    IndexSearcher searcher = newSearcher(writer.getReader());
    TaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoWriter);
    IOUtils.close(writer, taxoWriter);
    
    // Test empty results
    RandomSamplingFacetsCollector collectRandomZeroResults = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong());
    
    // There should be no divisions by zero
    searcher.search(new TermQuery(new Term("EvenOdd", "NeverMatches")), collectRandomZeroResults);
    
    // There should be no divisions by zero and no null result
    assertNotNull(collectRandomZeroResults.getMatchingDocs());
    
    // There should be no results at all
    for (MatchingDocs doc : collectRandomZeroResults.getMatchingDocs()) {
      assertEquals(0, doc.totalHits);
    }
    
    // Now start searching and retrieve results.
    
    // Use a query to select half of the documents.
    TermQuery query = new TermQuery(new Term("EvenOdd", "even"));
    
    RandomSamplingFacetsCollector random10Percent = new RandomSamplingFacetsCollector(numDocs / 10, random.nextLong()); // 10% of total docs, 20% of the hits

    FacetsCollector fc = new FacetsCollector();
    
    searcher.search(query, MultiCollector.wrap(fc, random10Percent));
    
    final List<MatchingDocs> matchingDocs = random10Percent.getMatchingDocs();

    // count the total hits and sampled docs, also store the number of sampled
    // docs per segment
    int totalSampledDocs = 0, totalHits = 0;
    int[] numSampledDocs = new int[matchingDocs.size()];
//    System.out.println("numSegments=" + numSampledDocs.length);
    for (int i = 0; i < numSampledDocs.length; i++) {
      MatchingDocs md = matchingDocs.get(i);
      final DocIdSetIterator iter = md.bits.iterator();
      while (iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) ++numSampledDocs[i];
      totalSampledDocs += numSampledDocs[i];
      totalHits += md.totalHits;
    }
    
    // compute the chi-square value for the sampled documents' distribution
    float chi_square = 0;
    for (int i = 0; i < numSampledDocs.length; i++) {
      MatchingDocs md = matchingDocs.get(i);
      float ei = (float) md.totalHits / totalHits;
      if (ei > 0.0f) {
        float oi = (float) numSampledDocs[i] / totalSampledDocs;
        chi_square += (Math.pow(ei - oi, 2) / ei);
      }
    }
    
    // Verify that the chi-square value isn't too big. According to
    // http://en.wikipedia.org/wiki/Chi-squared_distribution#Table_of_.CF.872_value_vs_p-value,
    // we basically verify that there is a really small chance of hitting a very
    // bad sample (p-value < 0.05), for n-degrees of freedom. The number 'n' depends
    // on the number of segments.
    assertTrue("chisquare not statistically significant enough: " + chi_square, chi_square < CHI_SQUARE_VALUES[numSampledDocs.length]);
    
    // Test amortized counts - should be 5X the sampled count, but maximum numDocs/10
    final FastTaxonomyFacetCounts random10FacetCounts = new FastTaxonomyFacetCounts(taxoReader, config, random10Percent);
    final FacetResult random10Result = random10FacetCounts.getTopChildren(10, "iMod10");
    final FacetResult amortized10Result = random10Percent.amortizeFacetCounts(random10Result, config, searcher);
    for (int i = 0; i < amortized10Result.labelValues.length; i++) {
      LabelAndValue amortized = amortized10Result.labelValues[i];
      LabelAndValue sampled = random10Result.labelValues[i];
      // since numDocs may not divide by 10 exactly, allow for some slack in the amortized count 
      assertEquals(amortized.value.floatValue(), Math.min(5 * sampled.value.floatValue(), numDocs / 10.f), 1.0);
    }
    
    IOUtils.close(searcher.getIndexReader(), taxoReader, dir, taxoDir);
  }
  
}