HLLUtil.java example

Explorer
lucene-solr-master
- lucene
- solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.util.hll;

/**
 * Static functions for computing constants and parameters used in the HLL
 * algorithm.
 */
final class HLLUtil {
    /**
     * Precomputed <code>pwMaxMask</code> values indexed by <code>registerSizeInBits</code>.
     * Calculated with this formula:
     * <pre>
     *     int maxRegisterValue = (1 << registerSizeInBits) - 1;
     *     // Mask with all bits set except for (maxRegisterValue - 1) least significant bits (see #addRaw())
     *     return ~((1L << (maxRegisterValue - 1)) - 1);
     * </pre>
     *
     * @see #pwMaxMask(int)
     */
    private static final long[] PW_MASK = {
            ~((1L << (((1 << 0) - 1) - 1)) - 1),
            ~((1L << (((1 << 1) - 1) - 1)) - 1),
            ~((1L << (((1 << 2) - 1) - 1)) - 1),
            ~((1L << (((1 << 3) - 1) - 1)) - 1),
            ~((1L << (((1 << 4) - 1) - 1)) - 1),
            ~((1L << (((1 << 5) - 1) - 1)) - 1),
            ~((1L << (((1 << 6) - 1) - 1)) - 1),
            ~((1L << (((1 << 7) - 1) - 1)) - 1),
            ~((1L << (((1 << 8) - 1) - 1)) - 1)
    };

    /**
     * Precomputed <code>twoToL</code> values indexed by a linear combination of
     * <code>regWidth</code> and <code>log2m</code>.
     *
     * The array is one-dimensional and can be accessed by using index
     * <code>(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m</code>
     * for <code>regWidth</code> and <code>log2m</code> between the specified
     * <code>HLL.{MINIMUM,MAXIMUM}_{REGWIDTH,LOG2M}_PARAM</code> constants.
     *
     * @see #largeEstimator(int, int, double)
     * @see #largeEstimatorCutoff(int, int)
     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 2^L</a>"
     */
    private static final double[] TWO_TO_L = new double[(HLL.MAXIMUM_REGWIDTH_PARAM + 1) * (HLL.MAXIMUM_LOG2M_PARAM + 1)];

    /**
     * Spacing constant used to compute offsets into {@link #TWO_TO_L}.
     */
    private static final int REG_WIDTH_INDEX_MULTIPLIER = HLL.MAXIMUM_LOG2M_PARAM + 1;

    static {
        for(int regWidth = HLL.MINIMUM_REGWIDTH_PARAM; regWidth <= HLL.MAXIMUM_REGWIDTH_PARAM; regWidth++) {
            for(int log2m = HLL.MINIMUM_LOG2M_PARAM ; log2m <= HLL.MAXIMUM_LOG2M_PARAM; log2m++) {
                int maxRegisterValue = (1 << regWidth) - 1;

                // Since 1 is added to p(w) in the insertion algorithm, only
                // (maxRegisterValue - 1) bits are inspected hence the hash
                // space is one power of two smaller.
                final int pwBits = (maxRegisterValue - 1);
                final int totalBits = (pwBits + log2m);
                final double twoToL = Math.pow(2, totalBits);
                TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * regWidth) + log2m] = twoToL;
            }
        }
    }

    // ************************************************************************
    /**
     * Computes the bit-width of HLL registers necessary to estimate a set of
     * the specified cardinality.
     *
     * @param  expectedUniqueElements an upper bound on the number of unique
     *         elements that are expected.  This must be greater than zero.
     * @return a register size in bits (i.e. <code>log2(log2(n))</code>)
     */
    public static int registerBitSize(final long expectedUniqueElements) {
        return Math.max(HLL.MINIMUM_REGWIDTH_PARAM,
                        (int)Math.ceil(NumberUtil.log2(NumberUtil.log2(expectedUniqueElements))));
    }

    // ========================================================================
    /**
     * Computes the 'alpha-m-squared' constant used by the HyperLogLog algorithm.
     *
     * @param  m this must be a power of two, cannot be less than
     *         16 (2<sup>4</sup>), and cannot be greater than 65536 (2<sup>16</sup>).
     * @return gamma times <code>registerCount</code> squared where gamma is
     *         based on the value of <code>registerCount</code>.
     * @throws IllegalArgumentException if <code>registerCount</code> is less
     *         than 16.
     */
    public static double alphaMSquared(final int m) {
        switch(m) {
            case 1/*2^0*/:
            case 2/*2^1*/:
            case 4/*2^2*/:
            case 8/*2^3*/:
                throw new IllegalArgumentException("'m' cannot be less than 16 (" + m + " < 16).");

            case 16/*2^4*/:
                return 0.673 * m * m;

            case 32/*2^5*/:
                return 0.697 * m * m;

            case 64/*2^6*/:
                return 0.709 * m * m;

            default/*>2^6*/:
                return (0.7213 / (1.0 + 1.079 / m)) * m * m;
        }
    }

    // ========================================================================
    /**
     * Computes a mask that prevents overflow of HyperLogLog registers.
     *
     * @param  registerSizeInBits the size of the HLL registers, in bits.
     * @return mask a <code>long</code> mask to prevent overflow of the registers
     * @see #registerBitSize(long)
     */
    public static long pwMaxMask(final int registerSizeInBits) {
        return PW_MASK[registerSizeInBits];
    }

    // ========================================================================
    /**
     * The cutoff for using the "small range correction" formula, in the
     * HyperLogLog algorithm.
     *
     * @param  m the number of registers in the HLL. <em>m<em> in the paper.
     * @return the cutoff for the small range correction.
     * @see #smallEstimator(int, int)
     */
    public static double smallEstimatorCutoff(final int m) {
        return ((double)m * 5) / 2;
    }

    /**
     * The "small range correction" formula from the HyperLogLog algorithm. Only
     * appropriate if both the estimator is smaller than <pre>(5/2) * m</pre> and
     * there are still registers that have the zero value.
     *
     * @param  m the number of registers in the HLL. <em>m<em> in the paper.
     * @param  numberOfZeroes the number of registers with value zero. <em>V</em>
     *         in the paper.
     * @return a corrected cardinality estimate.
     */
    public static double smallEstimator(final int m, final int numberOfZeroes) {
        return m * Math.log((double)m / numberOfZeroes);
    }

    /**
     * The cutoff for using the "large range correction" formula, from the
     * HyperLogLog algorithm, adapted for 64 bit hashes.
     *
     * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
     * @param  registerSizeInBits the size of the HLL registers, in bits.
     * @return the cutoff for the large range correction.
     * @see #largeEstimator(int, int, double)
     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction' cutoff</a>"
     */
    public static double largeEstimatorCutoff(final int log2m, final int registerSizeInBits) {
        return (TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m]) / 30.0;
    }

    /**
     * The "large range correction" formula from the HyperLogLog algorithm, adapted
     * for 64 bit hashes. Only appropriate for estimators whose value exceeds
     * the return of {@link #largeEstimatorCutoff(int, int)}.
     *
     * @param  log2m log-base-2 of the number of registers in the HLL. <em>b<em> in the paper.
     * @param  registerSizeInBits the size of the HLL registers, in bits.
     * @param  estimator the original estimator ("E" in the paper).
     * @return a corrected cardinality estimate.
     * @see "<a href='http://research.neustar.biz/2013/01/24/hyperloglog-googles-take-on-engineering-hll/'>Blog post with section on 64 bit hashes and 'large range correction'</a>"
     */
    public static double largeEstimator(final int log2m, final int registerSizeInBits, final double estimator) {
        final double twoToL = TWO_TO_L[(REG_WIDTH_INDEX_MULTIPLIER * registerSizeInBits) + log2m];
        return -1 * twoToL * Math.log(1.0 - (estimator/twoToL));
    }
}