/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import java.lang.invoke.MethodHandles;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import org.apache.lucene.util.TestUtil;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.response.FieldStatsInfo;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.util.hll.HLL;

import com.google.common.hash.Hashing;
import com.google.common.hash.HashFunction;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Slow
@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-9062")
public class TestDistributedStatsComponentCardinality extends BaseDistributedSearchTestCase {

  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

  final static HashFunction HASHER = Hashing.murmur3_128();

  final static long BIG_PRIME = 982451653L;

  final static int MIN_NUM_DOCS = 10000;
  final static int MAX_NUM_DOCS = MIN_NUM_DOCS * 2;

  final static List<String> STAT_FIELDS =
      Collections.unmodifiableList(Arrays.asList("int_i", "long_l", "string_s"));

  final int NUM_DOCS;
  final long MAX_LONG;
  final long MIN_LONG;

  public TestDistributedStatsComponentCardinality() {
    super();
    // we want some randomness in the shard number, but we don't want multiple iterations
    fixShardCount(TEST_NIGHTLY ? 7 : random().nextInt(3) + 1);

    handle.put("maxScore", SKIPVAL);

    NUM_DOCS = TestUtil.nextInt(random(), 10000, 15000);
    MAX_LONG = TestUtil.nextLong(random(), 0, NUM_DOCS * BIG_PRIME);
    MIN_LONG = MAX_LONG - (((long) NUM_DOCS - 1) * BIG_PRIME);
  }
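  // Illustrative sketch (not executed; the real bounds are randomized above): because
  // consecutive docs differ by exactly BIG_PRIME, the indexed long values form an
  // arithmetic sequence of NUM_DOCS distinct values,
  //
  //   MAX_LONG, MAX_LONG - BIG_PRIME, MAX_LONG - 2*BIG_PRIME, ..., MIN_LONG
  //
  // which is what lets the assertions below treat the number of docs matching a range
  // query as the exact cardinality of every stat field.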
  /** CAUTION: this builds a very large index */
  public void buildIndex() throws Exception {
    log.info("Building an index of {} docs", NUM_DOCS);

    // we want a big spread in the long values we use, decrement by BIG_PRIME as we index
    long longValue = MAX_LONG;
    for (int i = 1; i <= NUM_DOCS; i++) {
      // with these values, we know that every doc indexed has a unique value in all of the
      // fields we will compute cardinality against -- which means the number of docs
      // matching a query is the true cardinality for each field
      final String strValue = "s" + longValue;
      indexDoc(sdoc("id", "" + i,
                    "int_i", "" + i,
                    "int_i_prehashed_l", "" + HASHER.hashInt(i).asLong(),
                    "long_l", "" + longValue,
                    "long_l_prehashed_l", "" + HASHER.hashLong(longValue).asLong(),
                    "string_s", strValue,
                    "string_s_prehashed_l", "" + HASHER.hashString(strValue).asLong()));

      longValue -= BIG_PRIME;
    }
    commit();
  }

  public void test() throws Exception {
    buildIndex();

    { // simple sanity checks - don't leak variables
      QueryResponse rsp = null;
      rsp = query(params("rows", "0", "q", "id:42"));
      assertEquals(1, rsp.getResults().getNumFound());

      rsp = query(params("rows", "0", "q", "*:*",
                         "stats", "true",
                         "stats.field", "{!min=true max=true}long_l"));
      assertEquals(NUM_DOCS, rsp.getResults().getNumFound());
      assertEquals(MIN_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMin()));
      assertEquals(MAX_LONG, Math.round((double) rsp.getFieldStatsInfo().get("long_l").getMax()));
    }

    final int NUM_QUERIES = atLeast(100);

    // Some randomized queries with randomized log2m and max regwidth
    for (int i = 0; i < NUM_QUERIES; i++) {

      // testing shows that on random data, at the size we're dealing with,
      // MINIMUM_LOG2M_PARAM is just too absurdly small to give anything remotely close to
      // the theoretically expected relative error.
      //
      // So we have to use a slightly higher lower bound on what log2m values we randomly test
      final int log2m = TestUtil.nextInt(random(),
                                         2 + HLL.MINIMUM_LOG2M_PARAM,
                                         HLL.MAXIMUM_LOG2M_PARAM);

      // use max regwidth to try and prevent hash collisions from introducing problems
      final int regwidth = HLL.MAXIMUM_REGWIDTH_PARAM;

      final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS - 2000);
      final int highId = TestUtil.nextInt(random(), lowId + 1000, NUM_DOCS);
      final int numMatches = 1 + highId - lowId;

      SolrParams p = buildCardinalityQ(lowId, highId, log2m, regwidth);
      QueryResponse rsp = query(p);
      assertEquals("sanity check num matches, p=" + p,
                   numMatches, rsp.getResults().getNumFound());

      Map<String,FieldStatsInfo> stats = rsp.getFieldStatsInfo();

      for (String f : STAT_FIELDS) {
        // regardless of log2m and regwidth, the estimated cardinality of the
        // hashed vs prehashed values should be exactly the same for each field
        assertEquals(f + ": hashed vs prehashed, real=" + numMatches + ", p=" + p,
                     stats.get(f).getCardinality().longValue(),
                     stats.get(f + "_prehashed_l").getCardinality().longValue());
      }

      for (String f : STAT_FIELDS) {
        // check the relative error of the estimate returned against the known truth
        // (use floating point division -- integer division would truncate the ratio to 0)
        final double relErr = expectedRelativeError(log2m);
        final long estimate = stats.get(f).getCardinality().longValue();
        assertTrue(f + ": relativeErr=" + relErr + ", estimate=" + estimate
                   + ", real=" + numMatches + ", p=" + p,
                   (Math.abs(numMatches - estimate) / (double) numMatches) < relErr);
      }
    }

    // Some randomized queries with both low and high accuracy options
    for (int i = 0; i < NUM_QUERIES; i++) {

      final int lowId = TestUtil.nextInt(random(), 1, NUM_DOCS - 2000);
      final int highId = TestUtil.nextInt(random(), lowId + 1000, NUM_DOCS);
      final int numMatches = 1 + highId - lowId;

      // WTF? - https://github.com/aggregateknowledge/java-hll/issues/15
      //
      // apparently we can't rely on estimates always being more accurate with higher log2m values?
      // so for now, just try testing accuracy values that differ by at least 0.5
      //
      // (that should give us a significant enough log2m diff that the "highAccuracy" is always
      // more accurate -- if not, then the entire premise of the float value is fundamentally bogus)
      //
      final double lowAccuracy = random().nextDouble() / 2;
      // final double highAccuracy = Math.min(1.0D, lowAccuracy + (random().nextDouble() / 2));
      final double highAccuracy = Math.min(1.0D, lowAccuracy + 0.5D);

      SolrParams p = buildCardinalityQ(lowId, highId, lowAccuracy, highAccuracy);
      QueryResponse rsp = query(p);
      assertEquals("sanity check num matches, p=" + p,
                   numMatches, rsp.getResults().getNumFound());

      Map<String,FieldStatsInfo> stats = rsp.getFieldStatsInfo();

      // can't use STAT_FIELDS here ...
      //
      // heuristic differences for regwidth on 32 bit values mean we get differences
      // between estimates for the normal field vs the prehashed (long) field
      //
      // so we settle for only testing things where the regwidth is consistent
      // w/the prehashed long...
      for (String f : new String[] { "long_l", "string_s" }) {
        // regardless of accuracy, the estimated cardinality of the
        // hashed vs prehashed values should be exactly the same for each field
        assertEquals(f + ": hashed vs prehashed (low), real=" + numMatches + ", p=" + p,
                     stats.get("low_" + f).getCardinality().longValue(),
                     stats.get("low_" + f + "_prehashed_l").getCardinality().longValue());
        assertEquals(f + ": hashed vs prehashed (high), real=" + numMatches + ", p=" + p,
                     stats.get("high_" + f).getCardinality().longValue(),
                     stats.get("high_" + f + "_prehashed_l").getCardinality().longValue());
      }

      for (String f : STAT_FIELDS) {
        for (String ff : new String[] { f, f + "_prehashed_l" }) {
          // for both the prehashed and regular fields, the high accuracy option
          // should always produce an estimate at least as good as the low accuracy option
          long poorEst = stats.get("low_" + ff).getCardinality();
          long goodEst = stats.get("high_" + ff).getCardinality();
          assertTrue(ff + ": goodEst=" + goodEst + ", poorEst=" + poorEst
                     + ", real=" + numMatches + ", p=" + p,
                     Math.abs(numMatches - goodEst) <= Math.abs(numMatches - poorEst));
        }
      }
    }
  }

  /**
   * Returns the (max) expected relative error according to the HLL algorithm docs
   */
  private static double expectedRelativeError(final int log2m) {
    final long m = 1L << log2m;
    // theoretical error is 1.04 / sqrt(m)
    // fudge slightly to account for variance in random data
    return 1.1D / Math.sqrt(m);
  }
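  // Worked example (illustrative log2m value, not one chosen by the test): log2m=13 gives
  // m = 2^13 = 8192 registers, so the theoretical bound is 1.04 / sqrt(8192) ~= 0.0115 and
  // the fudged bound returned above is 1.1 / sqrt(8192) ~= 0.0122 -- i.e. estimates are
  // expected to fall within roughly 1.2% of the true count.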
  /**
   * Helper utility for building up a set of query params.
   *
   * The main query is a simple range query against the id field (using lowId TO highId).
   * 2 stats.field params are generated for every field in {@link #STAT_FIELDS} --
   * both with and w/o a prehashed_l suffix -- using the specified log2m and regwidth.
   *
   * The response keys will be the full field names.
   */
  private static SolrParams buildCardinalityQ(final int lowId,
                                              final int highId,
                                              final int log2m,
                                              final int regwidth) {
    ModifiableSolrParams p = params("q", "id:[" + lowId + " TO " + highId + "]",
                                    "rows", "0",
                                    "stats", "true");
    final String prefix = "{!cardinality=true hllLog2m=" + log2m + " hllRegwidth=" + regwidth;
    for (String f : STAT_FIELDS) {
      p.add("stats.field", prefix + "}" + f);
      p.add("stats.field", prefix + " hllPreHashed=true}" + f + "_prehashed_l");
    }
    return p;
  }
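  // Illustrative (hypothetical argument values): buildCardinalityQ(1, 500, 13, 6) yields
  //   q=id:[1 TO 500]&rows=0&stats=true
  // plus, for each stat field (shown here for long_l):
  //   stats.field={!cardinality=true hllLog2m=13 hllRegwidth=6}long_l
  //   stats.field={!cardinality=true hllLog2m=13 hllRegwidth=6 hllPreHashed=true}long_l_prehashed_l
  // The two-accuracy variant below instead adds each field twice under explicit keys, e.g.
  //   stats.field={!cardinality=0.2 key=low_long_l}long_l
  //   stats.field={!cardinality=0.7 key=high_long_l}long_l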
  /**
   * Helper utility for building up a set of query params.
   *
   * The main query is a simple range query against the id field (using lowId TO highId).
   * 4 stats.field params are generated for every field in {@link #STAT_FIELDS} --
   * both with and w/o a prehashed_l suffix, and using both the low and high accuracy values.
   *
   * The response keys will be the full field names with either a "low_" or "high_" prefix.
   */
  private static SolrParams buildCardinalityQ(final int lowId,
                                              final int highId,
                                              final double lowAccuracy,
                                              final double highAccuracy) {
    ModifiableSolrParams p = params("q", "id:[" + lowId + " TO " + highId + "]",
                                    "rows", "0",
                                    "stats", "true");
    final String[] prefixes = new String[] {
        "{!cardinality=" + lowAccuracy + " key=low_",
        "{!cardinality=" + highAccuracy + " key=high_"
    };
    for (String f : STAT_FIELDS) {
      for (String prefix : prefixes) {
        p.add("stats.field", prefix + f + "}" + f);
        p.add("stats.field", prefix + f + "_prehashed_l hllPreHashed=true}" + f + "_prehashed_l");
      }
    }
    return p;
  }
}