/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hive.common.util;

import java.io.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * BloomFilter is a probabilistic data structure for set membership checks. BloomFilters are
 * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
 * a bloom filter, false positives (element not present in the bloom filter but test() says true)
 * are possible, but false negatives are not (if an element is present then test() will never
 * say false). The false positive probability is configurable (default: 5%); the lower the false
 * positive probability, the greater the space requirement.
 * Bloom filters are sensitive to the number of elements that will be inserted. The expected number
 * of entries must be specified when the filter is created. If the number of insertions exceeds
 * the specified number of entries, the false positive probability will increase accordingly.
 *
 * Internally, this implementation of bloom filter uses the Murmur3 fast non-cryptographic hash
 * algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
 * collisions for specific sequences of repeating bytes. Check the following link for more info
 * https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
 */
public class BloomFilter {
  public static final double DEFAULT_FPP = 0.05;
  protected BitSet bitSet;
  protected int numBits;
  protected int numHashFunctions;

  /**
   * Creates an uninitialized filter; the bit set and sizing fields are left unset.
   */
  public BloomFilter() {
  }

  /**
   * Creates a filter sized for the given number of entries using {@link #DEFAULT_FPP}.
   *
   * @param expectedEntries expected number of insertions, must be &gt; 0
   */
  public BloomFilter(long expectedEntries) {
    this(expectedEntries, DEFAULT_FPP);
  }

  /**
   * Throws {@link IllegalArgumentException} with the given message when the expression is false.
   */
  static void checkArgument(boolean expression, String message) {
    if (!expression) {
      throw new IllegalArgumentException(message);
    }
  }

  /**
   * Creates a filter sized for the given number of entries and false positive probability.
   *
   * @param expectedEntries expected number of insertions, must be &gt; 0
   * @param fpp desired false positive probability, must be in (0.0, 1.0)
   * @throws IllegalArgumentException if an argument is out of range, or if the required number
   *         of bits does not fit in an int
   */
  public BloomFilter(long expectedEntries, double fpp) {
    checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
    checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
    int nb = optimalNumOfBits(expectedEntries, fpp);
    // make 'm' multiple of 64. Note that a full 64 bits are added when nb is already a multiple
    // of 64; this is kept as-is so that sizes stay identical to previously created filters.
    // The sum is computed as a long: optimalNumOfBits() saturates at Integer.MAX_VALUE for very
    // large inputs, and rounding that up in int arithmetic would wrap to a negative size.
    long roundedBits = (long) nb + (Long.SIZE - (nb % Long.SIZE));
    checkArgument(roundedBits <= Integer.MAX_VALUE,
        "expectedEntries " + expectedEntries + " with fpp " + fpp
            + " requires more bits than a BloomFilter can hold");
    this.numBits = (int) roundedBits;
    this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
    this.bitSet = new BitSet(numBits);
  }

  /**
   * A constructor to support rebuilding the BloomFilter from a serialized representation.
   *
   * @param bits the long words backing the bit set, in order; copied into a new array
   * @param numBits the number of bits used as the modulus when mapping hashes to positions
   * @param numFuncs the number of hash functions
   */
  public BloomFilter(List<Long> bits, int numBits, int numFuncs) {
    long[] copied = new long[bits.size()];
    for (int i = 0; i < bits.size(); i++) {
      copied[i] = bits.get(i);
    }
    bitSet = new BitSet(copied);
    this.numBits = numBits;
    numHashFunctions = numFuncs;
  }

  /**
   * Computes the number of hash functions, (m / n) * ln(2), rounded and never less than 1.
   *
   * @param n expected number of entries
   * @param m number of bits
   */
  static int optimalNumOfHashFunctions(long n, long m) {
    return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
  }

  /**
   * Computes the number of bits, -n * ln(p) / (ln(2))^2, truncated to an int. The double to int
   * cast saturates at Integer.MAX_VALUE when the result is too large.
   *
   * @param n expected number of entries
   * @param p false positive probability
   */
  static int optimalNumOfBits(long n, double p) {
    return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
  }

  /**
   * Adds a byte array to the filter. A null array is added using the null hash code.
   *
   * @param val the bytes to add, may be null
   */
  public void add(byte[] val) {
    if (val == null) {
      addBytes(val, -1, -1);
    } else {
      addBytes(val, 0, val.length);
    }
  }

  /**
   * Adds a slice of a byte array to the filter.
   *
   * @param val the source bytes; when null, offset and length are ignored
   * @param offset start of the slice
   * @param length number of bytes in the slice
   */
  public void addBytes(byte[] val, int offset, int length) {
    // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
    // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively
    // implement a Bloom filter without any loss in the asymptotic false positive probability'

    // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
    // in the above paper
    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
        Murmur3.hash64(val, offset, length);
    addHash(hash64);
  }

  /**
   * Sets numHashFunctions bits derived from the lower and upper 32 bits of the given hash.
   */
  private void addHash(long hash64) {
    int hash1 = (int) hash64;
    int hash2 = (int) (hash64 >>> 32);

    for (int i = 1; i <= numHashFunctions; i++) {
      int combinedHash = hash1 + (i * hash2);
      // hashcode should be positive, flip all the bits if it's negative
      if (combinedHash < 0) {
        combinedHash = ~combinedHash;
      }
      int pos = combinedHash % numBits;
      bitSet.set(pos);
    }
  }

  /**
   * Adds a string to the filter. A null string is added the same way as a null byte array.
   *
   * @param val the string to add, may be null
   */
  public void addString(String val) {
    if (val == null) {
      add(null);
    } else {
      // NOTE(review): getBytes() uses the platform default charset, so non-ASCII strings may
      // hash differently on JVMs with different defaults. Left unchanged because switching
      // charset would change hashes relative to filters that were already persisted.
      add(val.getBytes());
    }
  }

  /**
   * Adds a long to the filter, hashed with {@code getLongHash}.
   */
  public void addLong(long val) {
    addHash(getLongHash(val));
  }

  /**
   * Adds a double to the filter via its {@link Double#doubleToLongBits(double)} representation.
   */
  public void addDouble(double val) {
    addLong(Double.doubleToLongBits(val));
  }

  /**
   * Tests whether a byte array may have been added.
   *
   * @param val the bytes to test, may be null
   * @return false if the value was definitely not added, true if it may have been
   */
  public boolean test(byte[] val) {
    if (val == null) {
      return testBytes(val, -1, -1);
    }
    return testBytes(val, 0, val.length);
  }

  /**
   * Tests whether a slice of a byte array may have been added.
   *
   * @param val the source bytes; when null, offset and length are ignored
   * @param offset start of the slice
   * @param length number of bytes in the slice
   * @return false if the value was definitely not added, true if it may have been
   */
  public boolean testBytes(byte[] val, int offset, int length) {
    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
        Murmur3.hash64(val, offset, length);
    return testHash(hash64);
  }

  /**
   * Checks the same bit positions that {@code addHash} would set for the given hash.
   */
  private boolean testHash(long hash64) {
    int hash1 = (int) hash64;
    int hash2 = (int) (hash64 >>> 32);

    for (int i = 1; i <= numHashFunctions; i++) {
      int combinedHash = hash1 + (i * hash2);
      // hashcode should be positive, flip all the bits if it's negative
      if (combinedHash < 0) {
        combinedHash = ~combinedHash;
      }
      int pos = combinedHash % numBits;
      if (!bitSet.get(pos)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests whether a string may have been added. Uses the same byte conversion as
   * {@link #addString(String)}.
   *
   * @param val the string to test, may be null
   * @return false if the value was definitely not added, true if it may have been
   */
  public boolean testString(String val) {
    if (val == null) {
      return test(null);
    } else {
      // NOTE(review): platform default charset, must stay in sync with addString().
      return test(val.getBytes());
    }
  }

  /**
   * Tests whether a long may have been added.
   */
  public boolean testLong(long val) {
    return testHash(getLongHash(val));
  }

  // Thomas Wang's integer hash function
  // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
  private long getLongHash(long key) {
    key = (~key) + (key << 21); // key = (key << 21) - key - 1;
    key = key ^ (key >> 24);
    key = (key + (key << 3)) + (key << 8); // key * 265
    key = key ^ (key >> 14);
    key = (key + (key << 2)) + (key << 4); // key * 21
    key = key ^ (key >> 28);
    key = key + (key << 31);
    return key;
  }

  /**
   * Tests whether a double may have been added, via its
   * {@link Double#doubleToLongBits(double)} representation.
   */
  public boolean testDouble(double val) {
    return testLong(Double.doubleToLongBits(val));
  }

  /**
   * @return size of the backing bit set in bytes
   */
  public long sizeInBytes() {
    return getBitSize() / 8;
  }

  /**
   * @return size of the backing bit set in bits (number of long words * 64)
   */
  public int getBitSize() {
    return bitSet.getData().length * Long.SIZE;
  }

  public int getNumHashFunctions() {
    return numHashFunctions;
  }

  /**
   * @return the long array backing the bit set (not a copy)
   */
  public long[] getBitSet() {
    return bitSet.getData();
  }

  @Override
  public String toString() {
    return "m: " + numBits + " k: " + numHashFunctions;
  }

  /**
   * Merge the specified bloom filter with current bloom filter.
   *
   * @param that - bloom filter to merge
   * @throws IllegalArgumentException if {@code that} is this same instance, or if the number of
   *         bits or the number of hash functions differ
   */
  public void merge(BloomFilter that) {
    if (this != that && this.numBits == that.numBits
        && this.numHashFunctions == that.numHashFunctions) {
      this.bitSet.putAll(that.bitSet);
    } else {
      throw new IllegalArgumentException("BloomFilters are not compatible for merging."
          + " this - " + this.toString() + " that - " + that.toString());
    }
  }

  /**
   * Clears every bit in the filter.
   */
  public void reset() {
    this.bitSet.clear();
  }

  /**
   * Serialize a bloom filter
   * @param out output stream to write to
   * @param bloomFilter BloomFilter that needs to be serialized
   * @throws IOException if writing to the stream fails
   */
  public static void serialize(OutputStream out, BloomFilter bloomFilter) throws IOException {
    /**
     * Serialized BloomFilter format:
     * 1 byte for the number of hash functions.
     * 1 big endian int (that is how DataOutputStream works) for the number of bits (numBits);
     *   deserialize() derives the number of longs from it as numBits / 64
     * big endian longs in the BloomFilter bitset
     */
    DataOutputStream dataOutputStream = new DataOutputStream(out);
    dataOutputStream.writeByte(bloomFilter.numHashFunctions);
    dataOutputStream.writeInt(bloomFilter.numBits);
    for (long value : bloomFilter.getBitSet()) {
      dataOutputStream.writeLong(value);
    }
  }

  /**
   * Deserialize a bloom filter
   * Read a byte stream, which was written by {@linkplain #serialize(OutputStream, BloomFilter)}
   * into a {@code BloomFilter}
   * @param in input bytestream
   * @return deserialized BloomFilter
   * @throws IOException if the stream is null, cannot be read, or a RuntimeException occurs
   *         while rebuilding the filter
   */
  public static BloomFilter deserialize(InputStream in) throws IOException {
    if (in == null) {
      throw new IOException("Input stream is null");
    }

    try {
      DataInputStream dataInputStream = new DataInputStream(in);
      int numHashFunc = dataInputStream.readByte();
      int numBits = dataInputStream.readInt();
      // The bit set was written as whole longs; numBits / 64 gives how many to read back.
      int sz = (numBits / Long.SIZE);
      List<Long> data = new ArrayList<Long>();
      for (int i = 0; i < sz; i++) {
        data.add(dataInputStream.readLong());
      }
      return new BloomFilter(data, numBits, numHashFunc);
    } catch (RuntimeException e) {
      throw new IOException("Unable to deserialize BloomFilter", e);
    }
  }

  // Given a byte array consisting of a serialized BloomFilter, gives the offset (from 0)
  // for the start of the serialized long values that make up the bitset.
  // NumHashFunctions (1 byte) + NumBits (4 bytes)
  public static final int START_OF_SERIALIZED_LONGS = 5;

  /**
   * Merges BloomFilter bf2 into bf1.
   * Assumes 2 BloomFilters with the same size/hash functions are serialized to byte arrays
   * @param bf1Bytes array holding the serialized bf1; modified in place
   * @param bf1Start offset of the serialized bf1 within bf1Bytes
   * @param bf1Length length in bytes of the serialized bf1
   * @param bf2Bytes array holding the serialized bf2; only read
   * @param bf2Start offset of the serialized bf2 within bf2Bytes
   * @param bf2Length length in bytes of the serialized bf2
   * @throws IllegalArgumentException if the lengths differ or the 5 header bytes do not match
   */
  public static void mergeBloomFilterBytes(
      byte[] bf1Bytes, int bf1Start, int bf1Length,
      byte[] bf2Bytes, int bf2Start, int bf2Length) {
    if (bf1Length != bf2Length) {
      throw new IllegalArgumentException("bf1Length " + bf1Length + " does not match bf2Length " + bf2Length);
    }

    // Validate the header: number of hash functions (1 byte) and number of bits (4 bytes)
    // must be identical in both serialized filters.
    for (int idx = 0; idx < START_OF_SERIALIZED_LONGS; ++idx) {
      if (bf1Bytes[bf1Start + idx] != bf2Bytes[bf2Start + idx]) {
        throw new IllegalArgumentException("bf1 NumHashFunctions/NumBits does not match bf2");
      }
    }

    // Just bitwise-OR the bits together - size/# functions should be the same,
    // rest of the data is serialized long values for the bitset which are supposed to be bitwise-ORed.
    for (int idx = START_OF_SERIALIZED_LONGS; idx < bf1Length; ++idx) {
      bf1Bytes[bf1Start + idx] |= bf2Bytes[bf2Start + idx];
    }
  }

  /**
   * Bare metal bit set implementation. For performance reasons, this implementation does not check
   * for index bounds nor expand the bit set size if the specified index is greater than the size.
   */
  public class BitSet {
    private final long[] data;

    /**
     * Creates a zeroed bit set with enough long words to hold the given number of bits.
     *
     * @param bits - number of bits
     */
    public BitSet(long bits) {
      this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
    }

    /**
     * Deserialize long array as bit set.
     *
     * @param data - bit array; used directly, not copied
     */
    public BitSet(long[] data) {
      assert data.length > 0 : "data length is zero!";
      this.data = data;
    }

    /**
     * Sets the bit at specified index.
     *
     * @param index - position
     */
    public void set(int index) {
      // index >>> 6 selects the long word; the shift distance of a long uses only the low
      // 6 bits of index, which selects the bit within that word.
      data[index >>> 6] |= (1L << index);
    }

    /**
     * Returns true if the bit is set in the specified index.
     *
     * @param index - position
     * @return - value at the bit position
     */
    public boolean get(int index) {
      return (data[index >>> 6] & (1L << index)) != 0;
    }

    /**
     * Number of bits
     */
    public long bitSize() {
      return (long) data.length * Long.SIZE;
    }

    /**
     * @return the backing long array (not a copy)
     */
    public long[] getData() {
      return data;
    }

    /**
     * Combines the two BitArrays using bitwise OR.
     */
    public void putAll(BitSet array) {
      assert data.length == array.data.length :
          "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
      for (int i = 0; i < data.length; i++) {
        data[i] |= array.data[i];
      }
    }

    /**
     * Clear the bit set.
     */
    public void clear() {
      Arrays.fill(data, 0);
    }
  }
}