BloomCalculations.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.commoncrawl.util;

/**
 * The following calculations are taken from:
 * http://www.cs.wisc.edu/~cao/papers/summary-cache/node8.html
 * "Bloom Filters - the math"
 * 
 * This class's static methods are meant to facilitate the use of the Bloom
 * Filter class by helping to choose correct values of 'bits per element' and
 * 'number of hash functions, k'.
 * 
 * @author Avinash Lakshman (alakshman@facebook.com)
 * @author Prashant Malik (pmalik@facebook.com)
 */
public class BloomCalculations {

  // Bounds of the usable (non-dummy) region of the lookup tables below.
  private static final int   maxBuckets     = 15;
  private static final int   minBuckets     = 2;
  private static final int   minK           = 1;
  private static final int   maxK           = 8;

  /**
   * optKPerBuckets[i] is the number of hash functions returned by
   * {@link #computeBestK(int)} for i buckets per element.
   */
  private static final int[] optKPerBuckets = new int[] {
      1, // dummy K for 0 buckets per element
      1, // dummy K for 1 buckets per element
      1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 8, 8, 8, 8 };

  /**
   * In the following table, the row 'i' shows false positive rates if i buckets
   * per element are used. Column 'j' shows false positive rates if j hash
   * functions are used. The first row is 'i=0', the first column is 'j=0'. Each
   * cell (i,j) is the false positive rate determined by using i buckets per
   * element and j hash functions.
   */
  static final double[][]    probs          = new double[][] {
      { 1.0 }, // dummy row representing 0 buckets per element
      { 1.0, 1.0 }, // dummy row representing 1 buckets per element
      { 1.0, 0.393, 0.400 },
      { 1.0, 0.283, 0.237, 0.253 },
      { 1.0, 0.221, 0.155, 0.147, 0.160 },
      { 1.0, 0.181, 0.109, 0.092, 0.092, 0.101 }, // 5
      { 1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638 },
      { 1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364 },
      { 1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229 },
      { 1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145 }, // 9
      { 1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846 },
      { 1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509 },
      { 1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314 },
      { 1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217,
      0.00199 },
      { 1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146,
      0.00129 },
      { 1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852 } // 15
                                            };   // the first column is a dummy
                                                  // column representing K=0.

  /**
   * Given the number of buckets that can be used per element, return the
   * optimal number of hash functions in order to minimize the false positive
   * rate.
   * 
   * @param bucketsPerElement
   *          buckets available per element; must be non-negative. Values
   *          beyond the end of the table are clamped to the last table entry.
   * @return The number of hash functions that minimize the false positive rate.
   */
  public static int computeBestK(int bucketsPerElement) {
    assert bucketsPerElement >= 0;
    if (bucketsPerElement >= optKPerBuckets.length) {
      return optKPerBuckets[optKPerBuckets.length - 1];
    }
    return optKPerBuckets[bucketsPerElement];
  }

  /**
   * A wrapper class that holds two key parameters for a Bloom Filter: the
   * number of hash functions used, and the number of buckets per element used.
   */
  public static final class BloomSpecification {
    final int K;                // number of hash functions.
    final int bucketsPerElement;

    /**
     * @param k
     *          number of hash functions
     * @param bucketsPerElement
     *          number of buckets per element
     */
    public BloomSpecification(int k, int bucketsPerElement) {
      K = k;
      this.bucketsPerElement = bucketsPerElement;
    }
  }

  /**
   * Given a maximum tolerable false positive probability, compute a Bloom
   * specification which will give no more than the specified false positive
   * rate, but minimize the number of buckets per element and the number of
   * hash functions used. Because bandwidth (and therefore total bitvector
   * size) is considered more expensive than computing power, preference is
   * given to minimizing buckets per element rather than number of hash
   * functions.
   * 
   * @param maxFalsePosProb
   *          The maximum tolerable false positive rate.
   * @return A Bloom Specification whose tabulated false positive rate does not
   *         exceed maxFalsePosProb. If maxFalsePosProb is below the smallest
   *         rate in the table, the largest tabulated specification (maxK hash
   *         functions, maxBuckets buckets per element) is returned instead.
   */
  public static BloomSpecification computeBucketsAndK(double maxFalsePosProb) {
    // Handle the trivial cases
    if (maxFalsePosProb >= probs[minBuckets][minK]) {
      // The smallest tabulated configuration already satisfies the request.
      // Constructor order is (k, bucketsPerElement).
      return new BloomSpecification(optKPerBuckets[minBuckets], minBuckets);
    }
    if (maxFalsePosProb < probs[maxBuckets][maxK]) {
      return new BloomSpecification(maxK, maxBuckets);
    }

    // First find the minimal required number of buckets. The guard above
    // guarantees the search stops at or before the maxBuckets row.
    int bucketsPerElement = minBuckets;
    int K = optKPerBuckets[minBuckets];
    while (probs[bucketsPerElement][K] > maxFalsePosProb) {
      bucketsPerElement++;
      K = optKPerBuckets[bucketsPerElement];
    }
    // Now that the number of buckets is sufficient, see if we can relax K
    // without losing too much precision. Column 0 holds 1.0, which is above
    // any maxFalsePosProb that reaches this point, so K never drops below 1.
    while (probs[bucketsPerElement][K - 1] <= maxFalsePosProb) {
      K--;
    }

    return new BloomSpecification(K, bucketsPerElement);
  }

  /**
   * Max elements you can put in a Bloom filter of a particular size with a
   * specified error rate.
   * 
   * @param nbits
   *          total bitsize of the bloom filter
   * @param errorRate
   *          the false positive rate we're looking for
   * @param hashCount
   *          how many hashes we're using
   * @return the computed element count, truncated to a long
   */
  public static long calcMaxElements(long nbits, double errorRate, int hashCount) {
    // n = -(m / k) * ln(1 - p^(1/k)), with p^(1/k) written as exp(ln(p) / k).
    return (long) (-nbits * 1.0 / hashCount * Math.log(1 - Math.exp(Math.log(errorRate) / hashCount)));
  }

  /**
   * Get error rate for a given bloom filter based on size and hash count.
   * 
   * @param numElements
   *          how many elements we're storing in the bloom filter
   * @param nbits
   *          total bitsize of the bloom filter
   * @param hashCount
   *          how many hashes we're using
   * @return the false positive rate (1 - e^(-k*n/m))^k
   */
  public static double calcErrorRate(long numElements, long nbits, int hashCount) {
    // Multiply in double so that hashCount * numElements cannot wrap around
    // as a long for very large element counts.
    double exponent = -((double) hashCount * numElements) / nbits;
    return Math.exp(Math.log(1 - Math.exp(exponent)) * hashCount);
  }

  /**
   * Prints the command-line usage to standard error and terminates the JVM.
   * 
   * @param exitCode
   *          the process exit status passed to System.exit
   */
  public static void usage(int exitCode) {
    String error = "Usage: \n" +
        "calcErrorRate numElements nbits hashCount\n" +
        "calcMaxElements nbits errorRate hashCount";
    System.err.println(error);
    System.exit(exitCode);
  }

  /**
   * Command-line entry point. Expects a command name followed by its three
   * arguments (see {@link #usage(int)}), and prints the computed value to
   * standard output. Too few arguments or an unknown command prints the usage
   * text and exits with status 1.
   * 
   * @param args
   *          command name followed by its three numeric arguments
   */
  public static void main(String[] args) {
    if (args.length < 4) {
      usage(1);
    }

    if (args[0].equals("calcErrorRate")) {
      long numElements = Long.parseLong(args[1]);
      long nbits = Long.parseLong(args[2]);
      int hashCount = Integer.parseInt(args[3]);
      System.out.println(calcErrorRate(numElements, nbits, hashCount));
    } else if (args[0].equals("calcMaxElements")) {
      long nbits = Long.parseLong(args[1]);
      double errorRate = Double.parseDouble(args[2]);
      int hashCount = Integer.parseInt(args[3]);
      System.out.println(calcMaxElements(nbits, errorRate, hashCount));
    } else {
      // Unknown command: report it instead of silently doing nothing.
      usage(1);
    }
  }

}