StatsField.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.handler.component;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.queries.function.FunctionQuery;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.FieldCacheSource;
import org.apache.lucene.queries.function.valuesource.QueryValueSource;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.request.DocValuesStats;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.NumberType;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocIterator;
import org.apache.solr.search.DocSet;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
import org.apache.solr.search.QueryParsing;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.util.hll.HLL;
import org.apache.solr.util.hll.HLLType;

import com.google.common.hash.HashFunction;
import com.google.common.hash.Hashing;

/**
 * Models all of the information associated with a single {@link StatsParams#STATS_FIELD}
 * instance.
 *
 * @see StatsComponent
 */
public class StatsField {
  
  /**
   * An enumeration representing the superset of all possible stat values that can be computed.
   * Each of these enum values can be specified as a local param in a <code>stats.field</code> 
   * (eg: <code>stats.field={!min=true mean=true}my_field_name</code>) but not all enum values 
   * are valid for all field types (eg: <code>mean</code> is meaningless for String fields)
   *
   * @lucene.internal
   * @lucene.experimental
   */
  public static enum Stat {
    min(true),
    max(true),
    missing(true),
    sum(true),
    count(true),
    mean(false, sum, count),
    sumOfSquares(true),
    stddev(false, sum, count, sumOfSquares),
    distinctValues(true),
    countDistinct(false, distinctValues),
    percentiles(true) {
      /**
       * Special handling for percentiles: the local param value is a comma separated list 
       * of numbers rather than a boolean.  When at least one number is specified they are 
       * appended to the percentiles list of the StatsField, and the optional 
       * <code>tdigestCompression</code> local param is applied.
       *
       * @return true if at least one percentile was specified, else false
       * @throws SolrException (BAD_REQUEST) if any of the values is not a parsable number
       */
      @Override
      boolean parseParams(StatsField sf) {
        String percentileParas = sf.localParams.get(this.name());
        if (percentileParas != null) {
          // parse into a scratch list first, so nothing is recorded if any value is invalid
          List<Double> percentiles = new ArrayList<>();
          try {
            for (String percentile : StrUtils.splitSmart(percentileParas, ',')) {
              percentiles.add(Double.parseDouble(percentile));
            }
            if (!percentiles.isEmpty()) {
              sf.percentilesList.addAll(percentiles);
              sf.tdigestCompression = sf.localParams.getDouble("tdigestCompression", 
                                                               sf.tdigestCompression);
              return true;
            }
          } catch (NumberFormatException e) {
            throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
                + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
                + e.getMessage(), e);
          }

        }
        return false;
      }
    },
    cardinality(true) { 
      /**
       * Special handling for cardinality: the local params are parsed into an 
       * {@link HllOptions} instance which is recorded on the StatsField.
       *
       * @return true if the parsed options are non-null, else false
       * @throws SolrException (BAD_REQUEST) if the options can not be parsed
       */
      @Override
      boolean parseParams(StatsField sf) {
        try {
          sf.hllOpts = HllOptions.parseHllOptions(sf.localParams, sf.schemaField);
          return (null != sf.hllOpts);
        } catch (Exception e) {
          // any failure (including a SolrException from the option parsing itself) is 
          // wrapped so that the message also reports the local params being parsed
          throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse "
              + StatsParams.STATS_FIELD + " local params: " + sf.localParams + " due to: "
              + e.getMessage(), e);
        }
      }
    };

    /** The distributed dependencies of this stat, optionally including the stat itself */
    private final List<Stat> distribDeps;
    
    /**
     * Sole constructor for Stat enum values
     * @param selfDep indicates that when computing this stat across a distributed result 
     *        set, each shard must compute this stat <i>in addition to</i> any other 
     *        distributed dependencies.
     * @param deps the set of stat values, other than this one, which are a distributed 
     *        dependency and must be computed and returned by each individual shards in 
     *        order to compute <i>this</i> stat over the entire distributed result set.
     * @see #getDistribDeps
     */
    Stat(boolean selfDep, Stat... deps) {
      distribDeps = new ArrayList<>(deps.length+1);
      distribDeps.addAll(Arrays.asList(deps));
      if (selfDep) { 
        distribDeps.add(this);
      }
    }
    
    /**
     * Given a String, returns the corresponding Stat enum value if any, otherwise returns null.
     */
    public static Stat forName(String paramKey) {
      try {
        return Stat.valueOf(paramKey);
      } catch (IllegalArgumentException e) {
        // not the name of any Stat
        return null;
      }
    }
    
    /**
     * The stats that must be computed and returned by each shard involved in a distributed 
     * request in order to compute the overall value for this stat across the entire distributed 
     * result set.  A Stat instance may include itself in the <code>getDistribDeps()</code> result,
     * but that is not always the case.
     *
     * @return a new set on every call, modifications to it do not affect this Stat
     */
    public EnumSet<Stat> getDistribDeps() {
      // NOTE: EnumSet.copyOf(Collection) rejects an empty (non EnumSet) collection, so 
      // populate an initially empty set instead -- this also works for a Stat with no deps
      final EnumSet<Stat> deps = EnumSet.noneOf(Stat.class);
      deps.addAll(this.distribDeps);
      return deps;
    }
    
    /** 
     * Called when the name of a stat is found as a local param on this {@link StatsField}
     * @return true if the user is requesting this stat, else false
     */
    boolean parseParams(StatsField sf) {
      return sf.localParams.getBool(this.name(), false);
    }
    
  }

  /**
   * The equivalent stats if "calcdistinct" is specified
   * @see Stat#countDistinct
   * @see Stat#distinctValues
   */
  private static final EnumSet<Stat> CALCDISTINCT_PSUEDO_STAT = EnumSet.of(Stat.countDistinct, Stat.distinctValues);

  /**
   * The set of stats computed by default when no localparams are used to specify explicit stats.
   * This set is unmodifiable.
   */
  public final static Set<Stat> DEFAULT_STATS = Collections.<Stat>unmodifiableSet
    (EnumSet.of(Stat.min, Stat.max, Stat.missing, Stat.sum, Stat.count, Stat.mean, Stat.sumOfSquares, Stat.stddev));

  // searcher of the current request, used for all local computation
  private final SolrIndexSearcher searcher;
  // the current request/response
  private final ResponseBuilder rb;
  private final String originalParam; // for error messages
  // local params of the stats.field; synthesized from the bare field name if none were used
  private final SolrParams localParams;
  private final ValueSource valueSource; // may be null if simple field stats
  private final SchemaField schemaField; // may be null if function/query stats
  // key identifying this instance in the response
  private final String key;
  // request level default for the "calcdistinct" local param
  private final boolean  topLevelCalcDistinct;
  // StatsParams.STATS_FACET values for this key, never null
  private final String[] facets;
  // values of the CommonParams.TAG local param, never null
  private final List<String> tagList;
  // values of the CommonParams.EXCLUDE local param, never null
  private final List<String> excludeTagList;
  // the distributed dependencies of every stat in statsInResponse
  private final EnumSet<Stat> statsToCalculate = EnumSet.noneOf(Stat.class);
  // the stats that were requested (explicitly, or by default)
  private final EnumSet<Stat> statsInResponse = EnumSet.noneOf(Stat.class);
  // populated when the percentiles local param is parsed
  private final List<Double> percentilesList= new ArrayList<Double>();
  // value of the "isShard" request param
  private final boolean isShard;
  
  // may be overridden by the "tdigestCompression" local param when percentiles are parsed
  private double tdigestCompression = 100.0D;
  // set when the cardinality local param is parsed, otherwise null
  private HllOptions hllOpts;
  
  /**
   * Parses a single raw {@link StatsParams#STATS_FIELD} value in the context of the current 
   * request: resolves it to either a {@link SchemaField} or a {@link ValueSource}, and 
   * records the response key, requested stats, facets, tags and excluded tags.
   *
   * @param rb the current request/response
   * @param statsParam the raw {@link StatsParams#STATS_FIELD} string
   * @throws SolrException (BAD_REQUEST) if the param has a syntax error
   */
  public StatsField(ResponseBuilder rb, String statsParam) { 
    this.rb = rb;
    this.searcher = rb.req.getSearcher();
    this.originalParam = statsParam;

    final SolrParams reqParams = rb.req.getParams();
    try {
      this.isShard = reqParams.getBool("isShard", false);

      SolrParams parsedLocalParams = QueryParsing.getLocalParams(originalParam, reqParams);
      if (parsedLocalParams == null) {
        // no local params at all: the whole param is a bare field name, 
        // so synthesize local params containing nothing but that value
        final ModifiableSolrParams bareFieldParams = new ModifiableSolrParams();
        bareFieldParams.add(QueryParsing.V, originalParam);
        parsedLocalParams = bareFieldParams;
      }
      this.localParams = parsedLocalParams;

      final String parserName = parsedLocalParams.get(QueryParsing.TYPE);
      final SchemaField resolvedField;
      final ValueSource resolvedSource;

      if ( ! StringUtils.isBlank(parserName) ) {
        // stats over a query (or function)

        // NOTE: QParser.getParser(...) could be used here, but that would redundantly 
        // reparse everything.  ( TODO: refactor a common method in QParser ?)
        final QParserPlugin plugin = rb.req.getCore().getQueryPlugin(parserName);
        final QParser parser = plugin.createParser(parsedLocalParams.get(QueryParsing.V),
                                                   parsedLocalParams, reqParams, rb.req);

        // the most direct ValueSource for whatever kind of query was parsed
        final ValueSource candidate = extractValueSource(parser.parse());

        // a ValueSource that directly corresponds to a SchemaField is treated exactly 
        // as if stats had been requested on that field
        // ie:  "stats.field={!func key=foo}field(foo)" == "stats.field=foo"
        final SchemaField directField = extractSchemaField(candidate, searcher.getSchema());
        resolvedField = directField;
        resolvedSource = (null == directField) ? candidate : null;

      } else {
        // plain stats over a field
        resolvedField = searcher.getSchema().getField(parsedLocalParams.get(QueryParsing.V));
        resolvedSource = null;
      }

      assert ( (null == resolvedSource) ^ (null == resolvedField) ) : "exactly one of vs & sf must be null";

      this.schemaField = resolvedField;
      this.valueSource = resolvedSource;

    } catch (SyntaxError e) {
      throw new SolrException(ErrorCode.BAD_REQUEST, "Unable to parse " + 
                              StatsParams.STATS_FIELD + ": " + originalParam + " due to: "
                              + e.getMessage(), e);
    }

    // response key: the explicit output key local param if any, otherwise the main 
    // param value, otherwise the entire original param string
    final String valueOrOriginal = localParams.get(CommonParams.VALUE, originalParam);
    this.key = localParams.get(CommonParams.OUTPUT_KEY, valueOrOriginal);

    // the top level calcdistinct param may be overridden per field, when there is a field
    if (null == schemaField) {
      this.topLevelCalcDistinct = reqParams.getBool(StatsParams.STATS_CALC_DISTINCT, false);
    } else {
      this.topLevelCalcDistinct = reqParams.getFieldBool(schemaField.getName(),
                                                         StatsParams.STATS_CALC_DISTINCT, false);
    }

    populateStatsSets();

    final String[] facetParams = reqParams.getFieldParams(key, StatsParams.STATS_FACET);
    this.facets = (null != facetParams) ? facetParams : new String[0];

    final String tagStr = localParams.get(CommonParams.TAG);
    if (null != tagStr) {
      this.tagList = StrUtils.splitSmart(tagStr,',');
    } else {
      this.tagList = Collections.<String>emptyList();
    }

    // tags of filters to exclude, determines if a special base DocSet is needed
    final String excludeStr = localParams.get(CommonParams.EXCLUDE);
    if (null != excludeStr) {
      this.excludeTagList = StrUtils.splitSmart(excludeStr,',');
    } else {
      this.excludeTagList = Collections.<String>emptyList();
    }

    assert ( (null == this.valueSource) ^ (null == this.schemaField) ) 
      : "exactly one of valueSource & schemaField must be null";
  }

  /**
   * Determines the {@link ValueSource} to use for computing stats over a {@link Query}.
   * A {@link FunctionQuery} provides its own ValueSource directly, any other Query 
   * is wrapped in a {@link QueryValueSource} (constructed with <code>0.0F</code> as 
   * its second argument).
   *
   * @param q Query whose scores we have been asked to compute stats of
   * @return a ValueSource to use for computing the stats
   */
  private static ValueSource extractValueSource(Query q) {
    if (q instanceof FunctionQuery) {
      // common case: a function, whose ValueSource can be used as is
      final FunctionQuery func = (FunctionQuery) q;
      return func.getValueSource();
    }
    // stats over an arbitrary query, which must be wrapped
    return new QueryValueSource(q, 0.0F);
  }

  /**
   * Looks up the {@link SchemaField} that a {@link ValueSource} directly maps to, if any.
   * Only a {@link FieldCacheSource} is considered a direct mapping, in which case the 
   * field it names is looked up in the schema.
   *
   * @param vs ValueSource we've been asked to compute stats of
   * @param schema The Schema to use
   * @return Corresponding {@link SchemaField} or null if the ValueSource is more complex
   * @see FieldCacheSource
   */
  private static SchemaField extractSchemaField(ValueSource vs, IndexSchema schema) {
    if ( ! (vs instanceof FieldCacheSource) ) {
      return null;
    }
    final FieldCacheSource fieldSource = (FieldCacheSource) vs;
    return schema.getField(fieldSource.getField());
  }

  /** 
   * The key to be used when referring to this {@link StatsField} instance in the 
   * response to clients.
   *
   * @return the response key chosen when this instance was constructed
   */
  public String getOutputKey() {
    return this.key;
  }

  /**
   * Computes the base {@link DocSet} over which global stats for the local index 
   * should be computed for the current request.
   *
   * This is the main DocSet of the {@link ResponseBuilder}, unless the 
   * {@link CommonParams#EXCLUDE ex} local param excludes at least one 
   * {@link CommonParams#TAG tag}ged query -- in which case the DocSet is recomputed 
   * from the main query and filters that were not excluded.
   *
   * @return the DocSet to compute stats over
   * @throws SolrException (BAD_REQUEST) if an excluded query can not be parsed
   */
  public DocSet computeBaseDocSet() throws IOException {

    final DocSet mainDocs = rb.getResults().docSet;
    final Map<?,?> taggedParsers = (Map<?,?>) rb.req.getContext().get("tags");

    if (excludeTagList.isEmpty() || null == taggedParsers) {
      // nothing was asked to be excluded, or nothing was tagged in the first place
      return mainDocs;
    }

    // queries are collected by identity, the value of each entry is irrelevant
    final IdentityHashMap<Query,Boolean> excluded = new IdentityHashMap<>();
    for (String tag : excludeTagList) {
      final Object tagged = taggedParsers.get(tag);
      // each tag is expected to map to a collection of QParser instances,
      // anything else is silently skipped since that may change in the future
      if (tagged instanceof Collection) {
        for (Object candidate : (Collection<?>) tagged) {
          if (candidate instanceof QParser) {
            final QParser parser = (QParser) candidate;
            try {
              excluded.put(parser.getQuery(), Boolean.TRUE);
            } catch (SyntaxError e) {
              // this should not be possible, since the request should have already
              // failed when attempting to execute the query, but just in case...
              throw new SolrException(ErrorCode.BAD_REQUEST, "Excluded query can't be parsed: " + 
                                      originalParam + " due to: " + e.getMessage(), e);
            }
          }
        }
      }
    }
    if (excluded.isEmpty()) {
      return mainDocs;
    }

    final List<Query> remaining = new ArrayList<>();

    // the main query, unless it was excluded
    final Query mainQuery = rb.getQuery();
    if ( ! excluded.containsKey(mainQuery) ) {
      remaining.add(mainQuery);
    }

    // every filter that was not excluded
    if (null != rb.getFilters()) {
      for (Query filter : rb.getFilters()) {
        if ( ! excluded.containsKey(filter) ) {
          remaining.add(filter);
        }
      }
    }

    // the new base docset
    return searcher.getDocSet(remaining);
  }

  /**
   * Computes the {@link StatsValues} for this {@link StatsField} relative to the 
   * specified {@link DocSet}.
   *
   * @param base the documents to compute stats over
   * @return the computed stats, which are empty if no stats need to be calculated
   * @see #computeBaseDocSet
   */
  public StatsValues computeLocalStatsValues(DocSet base) throws IOException {

    if (statsToCalculate.isEmpty()) { 
      // perf optimization: nothing to compute
      // ie: stats.field={!min=$domin}myfield&domin=false
      return StatsValuesFactory.createStatsValues(this);
    }

    if (null == schemaField || schemaField.getType().isPointField()) {
      // an explicit function ValueSource, or a point field
      return computeLocalValueSourceStats(base);
    }

    final boolean multiValued = 
      schemaField.multiValued() || schemaField.getType().multiValuedFieldCache();
    if ( ! multiValued ) {
      // a single valued field
      return computeLocalValueSourceStats(base);
    }

    // TODO: should this also be used for single-valued string fields? (should work fine)
    return DocValuesStats.getCounts(searcher, this, base, facets);
  }

  /**
   * Computes the {@link StatsValues} for this {@link StatsField} by accumulating every 
   * document of the specified {@link DocSet}, one index segment at a time.  Stats for 
   * each of the facet fields are accumulated along the way, and added to the result.
   *
   * @param base the documents to compute stats over
   * @return the accumulated stats, including the stats of every facet field
   * @throws SolrException (BAD_REQUEST) if any of the facet fields is multi-valued
   */
  private StatsValues computeLocalValueSourceStats(DocSet base) throws IOException {

    final IndexSchema schema = searcher.getSchema();

    final StatsValues overall = StatsValuesFactory.createStatsValues(this);

    final List<FieldFacetStats> perFacet = new ArrayList<>();
    for (String facetField : facets) {
      final SchemaField facetSchemaField = schema.getField(facetField);

      if (facetSchemaField.multiValued()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Stats can only facet on single-valued fields, not: " + facetField );
      }

      perFacet.add(new FieldFacetStats(searcher, facetSchemaField, this));
    }

    final Iterator<LeafReaderContext> leaves = searcher.getIndexReader().leaves().iterator();
    LeafReaderContext leaf = null;
    final DocIterator docs = base.iterator();
    while (docs.hasNext()) {
      final int globalDoc = docs.nextDoc();

      if (null == leaf || globalDoc >= leaf.docBase + leaf.reader().maxDoc()) {
        // this doc is past the end of the current segment (if any): 
        // move forward to the segment which contains it
        do {
          leaf = leaves.next();
        } while (null == leaf || globalDoc >= leaf.docBase + leaf.reader().maxDoc());
        assert globalDoc >= leaf.docBase;

        // every accumulator needs to know about the new segment
        overall.setNextReader(leaf);
        for (FieldFacetStats facet : perFacet) {
          facet.setNextReader(leaf);
        }
      }

      // accumulators are given the doc id relative to the segment
      final int segmentDoc = globalDoc - leaf.docBase;
      overall.accumulate(segmentDoc);
      for (FieldFacetStats facet : perFacet) {
        facet.facet(segmentDoc);
      }
    }

    for (FieldFacetStats facet : perFacet) {
      overall.addFacet(facet.name, facet.facetStatsValues);
    }
    return overall;
  }

  /**
   * The searcher that should be used for processing local stats
   *
   * @return the searcher obtained from the request when this instance was constructed
   * @see SolrQueryRequest#getSearcher
   */
  public SolrIndexSearcher getSearcher() {
    // see AbstractStatsValues.setNextReader
    return this.searcher;
  }

  /**
   * The {@link SchemaField} whose results these stats are computed over.
   *
   * @return the field, or null if the stats are computed over the results of a 
   *         function or query
   * @see #getValueSource
   */
  public SchemaField getSchemaField() {
    return this.schemaField;
  }

  /**
   * The {@link ValueSource} of a function or query whose results these stats are 
   * computed over.
   *
   * @return the ValueSource, or null if the stats are directly over a {@link SchemaField}
   * @see #getSchemaField
   */
  public ValueSource getValueSource() {
    return this.valueSource;
  }

  /**
   * The values of the {@link CommonParams#TAG tag} local param of this {@link StatsField}
   *
   * @return the tags, empty (never null) if the local param was not specified
   */
  public List<String> getTagList() {
    return this.tagList;
  }

  /**
   * Describes this instance using the original raw param string it was constructed from.
   *
   * @return a string of the form <code>StatsField&lt;originalParam&gt;</code>
   */
  @Override
  public String toString() {
    return "StatsField<" + originalParam + ">";
  }

  /**
   * A helper method which inspects the {@link #localParams} associated with this StatsField, 
   * and uses them to populate the {@link #statsInResponse} and {@link #statsToCalculate} data 
   * structures.
   *
   * The requested stats are those specified as individual local params, or 
   * {@link #DEFAULT_STATS} if none are, plus the distinct stats when "calcdistinct" applies. 
   * The stats to calculate are the distributed dependencies of all the requested stats.
   */
  private void populateStatsSets() {
    // stats specified individually as local params
    boolean explicitStatSeen = false;
    for (Iterator<String> names = localParams.getParameterNamesIterator(); names.hasNext(); ) {
      final Stat stat = Stat.forName(names.next());
      if (null == stat) {
        // some other local param
        continue;
      }
      // even a stat that turns out not to be requested prevents the default set
      explicitStatSeen = true;
      if (stat.parseParams(this)) {
        statsInResponse.add(stat);
      }
    }

    // the default set is used only if no individual stat was specified, and calcdistinct 
    // (a pseudo-stat) was not enabled as a local param
    if ( ! explicitStatSeen && ! localParams.getBool("calcdistinct", false) ) {
      statsInResponse.addAll(DEFAULT_STATS);
    }

    // calcdistinct is a pseudo-stat, its local param defaults to the top level param
    if (localParams.getBool("calcdistinct", topLevelCalcDistinct)) {
      for (Stat distinctStat : CALCDISTINCT_PSUEDO_STAT) {
        // included unless the local param of the specific stat says otherwise
        final boolean wanted = localParams.getBool(distinctStat.name(), true);
        if (wanted) {
          statsInResponse.add(distinctStat);
        }
      }
    }

    for (Stat requested : statsInResponse) {
      statsToCalculate.addAll(requested.getDistribDeps());
    }
  }

  /**
   * Indicates if the specified stat needs to be calculated, either because it was 
   * requested or because it is a distributed dependency of a requested stat.
   *
   * @param stat the stat of interest
   * @return true if the stat is in the set of stats to calculate
   */
  public boolean calculateStats(Stat stat) {
    return this.statsToCalculate.contains(stat);
  }
  
  /**
   * Indicates if the specified stat belongs in the response.  For a shard request that 
   * is every stat that gets calculated, otherwise only the stats that were requested.
   *
   * @param stat the stat of interest
   * @return true if the stat should be included in the response
   */
  public boolean includeInResponse(Stat stat) {
    final EnumSet<Stat> included = isShard ? statsToCalculate : statsInResponse;
    return included.contains(stat);
  }

  /**
   * The percentiles collected while parsing the <code>percentiles</code> local param
   *
   * @return the internal list (not a copy), empty if no percentiles were specified
   */
  public List<Double> getPercentilesList() {
    return this.percentilesList;
  }
  
  /**
   * The value of the <code>isShard</code> param of the request
   *
   * @return true if the request param was set to true, false if it was false or missing
   */
  public boolean getIsShard() {
    return this.isShard;
  }
  
  /**
   * The compression value associated with the percentiles of this {@link StatsField}
   *
   * @return 100 unless it was overridden by the <code>tdigestCompression</code> local 
   *         param while parsing percentiles
   */
  public double getTdigestCompression() {
    return this.tdigestCompression;
  }

  /**
   * The options produced by parsing the <code>cardinality</code> local param
   *
   * @return the options, null if that local param was not parsed or did not 
   *         result in any options
   */
  public HllOptions getHllOptions() {
    return this.hllOpts;
  }

  /**
   * Helper Struct for parsing and encapsulating all of the options related to building a {@link HLL}
   *
   * @see Stat#cardinality
   * @lucene.internal
   */
  public static final class HllOptions {
    /** may be null, which indicates that the values are pre-hashed */
    final HashFunction hasher;
    
    // NOTE: this explanation linked to from the java-hll jdocs...
    // https://github.com/aggregateknowledge/postgresql-hll/blob/master/README.markdown#explanation-of-parameters-and-tuning
    // ..if i'm understanding the regwidth chart correctly, a value of 6 should be enough
    // to support any max cardinality given that we're always dealing with hashes and 
    // the cardinality of the set of all long values is 2**64 == 1.9e19
    //
    // But i guess that assumes a *perfect* hash and high log2m? ... if the hash algo is imperfect 
    // and/or log2m is low (ie: user is less concerned about accuracy), then many diff hash values 
    // might fall in the same register (ie: bucket) and having a wider register to count more of 
    // them may be useful

    final int log2m;  
    final int regwidth;
    
    /** error message used when the cardinality local param is a number outside of 0 to 1 */
    final static String ERR = "cardinality must be specified as 'true' (for default tunning) or decimal number between 0 and 1 to adjust accuracy vs memory usage (large number is more memory and more accuracy)";

    private HllOptions(int log2m, int regwidth, HashFunction hasher) {
      this.log2m = log2m;
      this.regwidth = regwidth;
      this.hasher = hasher;
    }

    /** 
     * Creates an HllOptions based on the (local) params specified (if appropriate).
     *
     * The <code>cardinality</code> local param is either a boolean, or a number between 
     * 0 and 1 (inclusive) expressing an accuracy preference that is used to scale both 
     * log2m and regwidth.  The <code>hllLog2m</code> and <code>hllRegwidth</code> local 
     * params override whatever values that results in, and <code>hllPreHashed</code> 
     * indicates that no hash function should be used.
     *
     * @param localParams the LocalParams for this {@link StatsField}
     * @param field the field corresponding to this {@link StatsField}, may be null if these stats are over a value source
     * @return the {@link HllOptions} to use based on the params, or null if no {@link HLL} should be computed
     * @throws SolrException if there are invalid options
     */
    public static HllOptions parseHllOptions(SolrParams localParams, SchemaField field) 
      throws SolrException {

      String cardinalityOpt = localParams.get(Stat.cardinality.name());
      if (StringUtils.isBlank(cardinalityOpt)) {
        return null;
      }

      final NumberType hashableNumType = getHashableNumericType(field);

      // some sane defaults
      int log2m = 13;   // roughly equivalent to "cardinality='0.33'"
      int regwidth = 6; // with decent hash, this is plenty for all valid long hashes

      if (NumberType.FLOAT.equals(hashableNumType) || NumberType.INTEGER.equals(hashableNumType)) {
        // for 32bit values, we can adjust our default regwidth down a bit
        regwidth--;

        // NOTE: EnumField uses LegacyNumericType.INT, and in theory we could be super conservative
        // with it, but there's no point - just let the EXPLICIT HLL handle it
      }

      // TODO: we could attempt additional reductions in the default regwidth based on index
      // statistics -- but that doesn't seem worth the effort.  for tiny indexes, the 
      // EXPLICIT and SPARSE HLL representations have us nicely covered, and in general we don't 
      // want to be too aggressive about lowering regwidth or we could get really poor results if 
      // log2m is also low and there is heavy hashkey collision

      try {
        // NFE will short out here if it's not a number
        final double accuracyOpt = Double.parseDouble(cardinalityOpt);

        // if a float between 0 and 1 is specified, treat it as a preference of accuracy
        // - 0 means accuracy is not a concern, save RAM
        // - 1 means be as accurate as possible, using as much RAM as needed.

        // NOTE: "NaN" is parsed without error, but every comparison with it is false, 
        // so it must be rejected explicitly or it would silently be treated like 0
        if (Double.isNaN(accuracyOpt) || accuracyOpt < 0D || 1.0D < accuracyOpt) {
          throw new SolrException(ErrorCode.BAD_REQUEST, ERR);
        }

        // use accuracyOpt as a scaling factor between min & max legal log2m values
        log2m = HLL.MINIMUM_LOG2M_PARAM
          + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_LOG2M_PARAM - HLL.MINIMUM_LOG2M_PARAM));

        // use accuracyOpt as a scaling factor for regwidth as well, BUT...
        // be more conservative -- HLL.MINIMUM_REGWIDTH_PARAM is too absurdly low to be useful
        // use previously computed (hashableNumType) default regwidth -1 as lower bound for scaling
        final int minHeuristicRegwidth = regwidth-1;
        regwidth = minHeuristicRegwidth
          + (int) Math.round(accuracyOpt * (HLL.MAXIMUM_REGWIDTH_PARAM - minHeuristicRegwidth));

      } catch (NumberFormatException nfe) {
        // param value isn't a number -- let's check for simple true/false
        if (! localParams.getBool(Stat.cardinality.name(), false)) {
          return null;
        }
      }

      // let explicit params override both the default and/or any accuracy specification
      log2m = localParams.getInt("hllLog2m", log2m);
      regwidth = localParams.getInt("hllRegwidth", regwidth);

      // validate legal values
      checkRange("hllLog2m", log2m, HLL.MINIMUM_LOG2M_PARAM, HLL.MAXIMUM_LOG2M_PARAM);
      checkRange("hllRegwidth", regwidth, HLL.MINIMUM_REGWIDTH_PARAM, HLL.MAXIMUM_REGWIDTH_PARAM);
      
      HashFunction hasher = localParams.getBool("hllPreHashed", false) ? null : Hashing.murmur3_128();

      if (null == hasher) {
        // if this is a function, or a non Long field, pre-hashed is invalid
        // NOTE: we ignore hashableNumType - it's LONG for non numerics like Strings
        final NumberType fieldNumType = (null == field) ? null : field.getType().getNumberType();
        if (!(NumberType.LONG.equals(fieldNumType) || NumberType.DATE.equals(fieldNumType))) { 
          throw new SolrException(ErrorCode.BAD_REQUEST, "hllPreHashed is only supported with Long based fields");
        }
      }

      // if we're still here, then we need an HLL...
      return new HllOptions(log2m, regwidth, hasher);
    }

    /** 
     * Fails with a BAD_REQUEST error, which reports both the legal range and the 
     * offending value, unless the value is between min and max (inclusive).
     */
    private static void checkRange(String paramName, int value, int min, int max) {
      if (value < min || max < value) {
        throw new SolrException(ErrorCode.BAD_REQUEST, paramName + " must be at least " + 
                                min + " and at most " + max + " (" + value + ")");
      }
    }

    /** @see HLL */
    public int getLog2m() {
      return log2m;
    }
    /** @see HLL */
    public int getRegwidth() {
      return regwidth;
    }
    /** May be null if user has indicated that field values are pre-hashed */
    public HashFunction getHasher() {
      return hasher;
    }
    /** Creates a new, empty, {@link HLL} using the log2m and regwidth of these options */
    public HLL newHLL() {
      // Although it (in theory) saves memory for "medium" size sets, the SPARSE type seems to have
      // some nasty impacts on response time as it gets larger - particularly in distrib requests.
      // Merging large SPARSE HLLs is much much slower than merging FULL HLLs with the same num docs
      //
      // TODO: add more tuning options for this.
      return new HLL(getLog2m(), getRegwidth(), -1 /* auto explicit threshold */,
                     false /* no sparse representation */, HLLType.EMPTY);
                     
    }
  }

  /**
   * Returns the effective {@link NumberType} for the field for the purposes of hash values.
   * ie: If the field has an explicit NumberType that is returned; If the field has no explicit
   * NumberType then {@link NumberType#LONG} is returned;  If field is null, then
   * {@link NumberType#FLOAT} is assumed for ValueSource.
   *
   * @param field the field of interest, may be null
   * @return the effective type, never null
   */
  private static NumberType getHashableNumericType(SchemaField field) {
    if (null == field) {
      return NumberType.FLOAT;
    }
    final NumberType declared = field.getType().getNumberType();
    if (null != declared) {
      return declared;
    }
    return NumberType.LONG;
  }
}