/**
 * Copyright 2013 Oak Ridge National Laboratory
 * Author: James Horey <horeyjl@ornl.gov>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/

package gov.ornl.keva.core;

/**
 * Java libs.
 **/
import java.util.BitSet;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;

/**
 * Hashing functions.
 **/
import net.jpountz.xxhash.XXHash32;
import net.jpountz.xxhash.XXHashFactory;

/**
 * A bloom filter is a space-efficient data structure used to test membership
 * against some set of data. The interesting property of a bloom filter is that
 * its false negative rate is zero, while it has a small false positive rate.
 * That means that if the filter returns "true" for some membership test, the
 * data may not actually reside in the set, and a more expensive test is needed
 * to find out. A "false" answer is always definitive.
 *
 * @author James Horey
 */
public class BloomFilter {
    /**
     * Initial hash seed value. This is just a random value, but
     * must be consistent so that we can find items.
     */
    private static final int HASH_SEED = 3571;

    /**
     * Fast hash capabilities.
     */
    private static final XXHashFactory HASH_FACTORY = XXHashFactory.fastestInstance();
    private static final XXHash32 HASHER = HASH_FACTORY.hash32();

    /**
     * Static table of false positive rates. The columns are for increasing values of
     * 'k' (from 0 to 8). The rows are the ratio of 'm / n'.
     */
    private static final double[][] FP_TABLE = new double[][]{
        {1.0},
        {1.0, 1.0},
        {1.0, 0.393, 0.400},
        {1.0, 0.283, 0.237, 0.253},
        {1.0, 0.221, 0.155, 0.147, 0.160},
        {1.0, 0.181, 0.109, 0.092, 0.092, 0.101},
        {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638},
        {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364},
        {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229},
        {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145},
        {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846},
        {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509},
        {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314},
        {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199},
        {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129},
        {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852}
    };

    private int k;         // Number of hashes.
    private int m;         // Length of the bit field.
    private int n;         // Number of expected items.
    private double p;      // False positive rate.
    private BitSet filter; // This is where we actually store our bits.

    /**
     * Create a default bloom filter with reasonable settings:
     * 10,000 expected items, a nominal 5% false positive rate,
     * 7 bits per expected item and 3 hashes.
     */
    public BloomFilter() {
        n = 10000; // Let's assume the user wants to store 10,000 items.
        p = 0.05;  // 5% false positive rate.
        m = 7 * n; // Minimize the length of m.
        k = 3;     // Set k to 3.

        // The k probes are derived by chaining a single hash function
        // (each round is seeded with the previous round's output).
        filter = new BitSet(m); // Stores results in a bitset.
    }

    /**
     * Create a bloom filter for the number of items expected and the
     * false positive rate.
     *
     * The requested rate is clamped to the range supported by the static
     * table; rates below the smallest table entry (including negative and
     * NaN values) select the smallest supported rate.
     *
     * @param fpRate False positive rate
     * @param expected Number of expected elements. Should be positive: with 0
     *                 the bit field has length 0 and {@link #add} /
     *                 {@link #contains} fail with an ArithmeticException.
     * @throws IllegalArgumentException if the required bit-field length does
     *         not fit in an int
     */
    public BloomFilter(double fpRate, int expected) {
        this.n = expected; // Set the expected number of elements.
        this.p = fpRate;   // Set the false positive rate.

        // Get the minimum m, and k values for this false positive rate.
        // This also replaces 'p' with the table entry that was selected.
        int[] mk = getMinMK(p);
        m = mk[0];
        k = mk[1];

        // The k probes are derived by chaining a single hash function
        // (each round is seeded with the previous round's output).
        filter = new BitSet(m); // Stores results in a bitset.
    }

    /**
     * Get the minimum 'm', and then the minimum 'k' whose table entry is less
     * than or equal to the supplied probability. As a side effect, 'p' is set
     * to the selected table entry.
     *
     * The table is scanned row by row, so the first match has the smallest
     * row index and, within that row, the smallest column index.
     *
     * @param minP Requested false positive rate
     * @return Two-element array: [0] is the bit-field length, [1] the number
     *         of hashes. Never null.
     **/
    private int[] getMinMK(double minP) {
        final int lastRow = FP_TABLE.length - 1;
        final int lastCol = FP_TABLE[lastRow].length - 1;
        final double lowest = FP_TABLE[lastRow][lastCol]; // Lowest rate we support.

        if (!(minP >= lowest)) {
            // Too small, negative, or NaN: fall back to the lowest supported
            // rate. Previously only negative values were clamped, so a rate in
            // [0, lowest) matched nothing and the lookup returned null.
            minP = lowest;
        } else if (minP > 1.00) {
            minP = 1.00; // Obviously can't be greater than 1.00.
        }

        // Find the first entry that satisfies this probability.
        for (int i = 0; i < FP_TABLE.length; ++i) {
            for (int j = 0; j < FP_TABLE[i].length; ++j) {
                if (FP_TABLE[i][j] <= minP) {
                    return selectEntry(i, j);
                }
            }
        }

        // Not reachable after clamping, but never hand back null.
        return selectEntry(lastRow, lastCol);
    }

    /**
     * Turn a table position into the (m, k) pair and record the entry as 'p'.
     *
     * NOTE(review): the table is documented as indexed directly by m/n (rows)
     * and k (columns), yet the filter is sized with (row + 1) * n bits and
     * (col + 1) hashes. This is kept as-is so newly built filters keep their
     * existing dimensions -- confirm whether the offset is intentional.
     */
    private int[] selectEntry(int row, int col) {
        // Multiply in long so a large 'n' cannot silently wrap around.
        long bits = (long) (row + 1) * n;
        if (bits > Integer.MAX_VALUE) {
            throw new IllegalArgumentException(
                "Bloom filter too large: " + (row + 1) + " bits per item for "
                + n + " expected items");
        }

        p = FP_TABLE[row][col]; // Set to our actual prob.
        return new int[] { (int) bits, col + 1 };
    }

    /**
     * Map one round's hash value onto a position in the bit field.
     * The remainder lies strictly between -m and m, so the absolute
     * value is always a valid index.
     */
    private int bitIndex(int hashValue, int round) {
        return Math.abs((hashValue + round) % m);
    }

    /**
     * Get the expected number of elements for this filter.
     *
     * @return Number of expected elements
     */
    public int getExpected() {
        return n;
    }

    /**
     * Get the false positive rate recorded for this filter.
     *
     * @return False positive rate
     */
    public double getFalsePositiveRate() {
        return p;
    }

    /**
     * Add this item to the filter.
     *
     * @param data Data element to add. Must not be null.
     **/
    public void add(byte[] data) {
        int hashValue = HASH_SEED;
        for (int i = 0; i < k; ++i) {
            // Each round re-hashes the data, seeded with the previous result.
            hashValue = HASHER.hash(data, 0, data.length, hashValue);
            filter.set(bitIndex(hashValue, i)); // Set the bits appropriately.
        }
    }

    /**
     * Indicate whether this item is contained in the filter.
     * Bloom filters have a false positive rate, so a True value
     * does not necessarily mean that the item is actually in
     * the set, but simply that it has a high probability of being
     * in the set.
     *
     * @param data Data element we are searching for. Must not be null.
     * @return True if the element might be found. False otherwise.
     **/
    public boolean contains(byte[] data) {
        int hashValue = HASH_SEED;
        for (int i = 0; i < k; ++i) {
            // Must mirror the probe sequence used by add().
            hashValue = HASHER.hash(data, 0, data.length, hashValue);
            if (!filter.get(bitIndex(hashValue, i))) { // Check if there is a bit unset.
                return false;
            }
        }

        // All the bits are set properly.
        return true;
    }

    /**
     * Clear the filter.
     */
    public void clear() {
        filter.clear(); // Reset to all false.
    }

    /**
     * Get the size of the filter.
     *
     * @return Length of the bit field, in bits.
     */
    public int size() {
        return m;
    }

    /**
     * Get the amount of memory used by the filter, in bytes. This is the
     * fixed-size header fields plus the storage currently allocated by the
     * bit field, so it is an upper bound on what {@link #serialize} writes.
     *
     * @return Memory consumption of the filter, in bytes.
     */
    public int memory() {
        return (Double.SIZE / 8) +       // FP rate
            (Integer.SIZE / 8) +         // Number of hashes
            (Integer.SIZE / 8) +         // Number of expected items
            (Integer.SIZE / 8) +         // Number of bits
            (filter.size() / Byte.SIZE); // Bit field size (BitSet.size() is in bits)
    }

    /**
     * Serialize the bloom filter into the supplied buffer. The layout is:
     * false positive rate (double), number of hashes (int), number of
     * expected items (int), bit-field length (int), followed by the bit
     * field bytes. The length of the bit field bytes is not recorded.
     *
     * @param buffer Buffer to write the serialized filter into
     */
    public void serialize(ByteBuffer buffer) {
        buffer.putDouble(p); // False positive rate.
        buffer.putInt(k);    // Number of hashes.
        buffer.putInt(n);    // Number of expected items.
        buffer.putInt(m);    // Length of bit field.
        buffer.put(filter.toByteArray()); // Bit field.
    }

    /**
     * Initialize the bloom filter from the buffer. Everything remaining in
     * the buffer after the header fields is consumed as the bit field, so the
     * buffer must not contain unrelated data after the filter.
     *
     * @param buffer Buffer positioned at the start of a serialized filter
     */
    public void unSerialize(ByteBuffer buffer) {
        p = buffer.getDouble();
        k = buffer.getInt();
        n = buffer.getInt();
        m = buffer.getInt();

        byte[] bits = new byte[buffer.remaining()];
        buffer.get(bits);
        filter = BitSet.valueOf(bits);
    }
}