/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hive.common.util;
import java.io.*;

import java.nio.charset.StandardCharsets;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* BloomFilter is a probabilistic data structure for set membership check. BloomFilters are
* highly space efficient when compared to using a HashSet. Because of the probabilistic nature of
* bloom filter false positive (element not present in bloom filter but test() says true) are
* possible but false negatives are not possible (if element is present then test() will never
* say false). The false positive probability is configurable (default: 5%); the storage
* requirement increases or decreases accordingly: the lower the false positive probability,
* the greater the space requirement.
* Bloom filters are sensitive to number of elements that will be inserted in the bloom filter.
* During the creation of the bloom filter, the expected number of entries must be specified. If
* the number of insertions exceeds the specified initial number of entries, the false positive
* probability will increase accordingly.
*
* Internally, this implementation of bloom filter uses Murmur3 fast non-cryptographic hash
* algorithm. Although Murmur2 is slightly faster than Murmur3 in Java, it suffers from hash
* collisions for specific sequence of repeating bytes. Check the following link for more info
* https://code.google.com/p/smhasher/wiki/MurmurHash2Flaw
*/
public class BloomFilter {
  public static final double DEFAULT_FPP = 0.05;

  /** Backing bit array; always holds a whole number of 64-bit words. */
  protected BitSet bitSet;
  /** Total number of bits (m) in {@link #bitSet}; always a multiple of 64. */
  protected int numBits;
  /** Number of hash functions (k) applied per element. */
  protected int numHashFunctions;

  /** No-arg constructor for subclasses/frameworks; leaves the filter uninitialized. */
  public BloomFilter() {
  }

  /**
   * Creates a filter sized for {@code expectedEntries} elements with the default
   * false positive probability {@link #DEFAULT_FPP}.
   *
   * @param expectedEntries expected number of insertions, must be &gt; 0
   * @throws IllegalArgumentException if {@code expectedEntries <= 0}
   */
  public BloomFilter(long expectedEntries) {
    this(expectedEntries, DEFAULT_FPP);
  }

  /** Throws {@link IllegalArgumentException} with {@code message} if {@code expression} is false. */
  static void checkArgument(boolean expression, String message) {
    if (!expression) {
      throw new IllegalArgumentException(message);
    }
  }

  /**
   * Creates a filter sized for {@code expectedEntries} elements at the given
   * false positive probability.
   *
   * @param expectedEntries expected number of insertions, must be &gt; 0
   * @param fpp desired false positive probability, exclusive range (0.0, 1.0)
   * @throws IllegalArgumentException if either argument is out of range
   */
  public BloomFilter(long expectedEntries, double fpp) {
    checkArgument(expectedEntries > 0, "expectedEntries should be > 0");
    checkArgument(fpp > 0.0 && fpp < 1.0, "False positive probability should be > 0.0 & < 1.0");
    int nb = optimalNumOfBits(expectedEntries, fpp);
    // make 'm' multiple of 64 so the bit set maps exactly onto long words.
    // NOTE: when nb is already a multiple of 64 this adds one extra (unneeded) word;
    // kept as-is so new filters stay bit-compatible with previously serialized ones.
    this.numBits = nb + (Long.SIZE - (nb % Long.SIZE));
    this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, numBits);
    this.bitSet = new BitSet(numBits);
  }

  /**
   * A constructor to support rebuilding the BloomFilter from a serialized representation.
   *
   * @param bits bit set contents as 64-bit words (as written by {@link #serialize})
   * @param numBits total number of bits in the filter
   * @param numFuncs number of hash functions
   */
  public BloomFilter(List<Long> bits, int numBits, int numFuncs) {
    long[] copied = new long[bits.size()];
    for (int i = 0; i < bits.size(); i++) {
      copied[i] = bits.get(i);
    }
    bitSet = new BitSet(copied);
    this.numBits = numBits;
    numHashFunctions = numFuncs;
  }

  /** Optimal number of hash functions: k = round((m / n) * ln 2), at least 1. */
  static int optimalNumOfHashFunctions(long n, long m) {
    return Math.max(1, (int) Math.round((double) m / n * Math.log(2)));
  }

  /** Optimal number of bits: m = -n * ln(p) / (ln 2)^2. */
  static int optimalNumOfBits(long n, double p) {
    return (int) (-n * Math.log(p) / (Math.log(2) * Math.log(2)));
  }

  /**
   * Adds a byte array to the filter. {@code null} is a valid value and is
   * mapped to {@link Murmur3#NULL_HASHCODE}.
   *
   * @param val value to add, may be null
   */
  public void add(byte[] val) {
    if (val == null) {
      addBytes(val, -1, -1);
    } else {
      addBytes(val, 0, val.length);
    }
  }

  /**
   * Adds a slice of a byte array to the filter. {@code offset}/{@code length}
   * are ignored when {@code val} is null.
   *
   * @param val value to add, may be null
   * @param offset start offset within {@code val}
   * @param length number of bytes to hash
   */
  public void addBytes(byte[] val, int offset, int length) {
    // We use the trick mentioned in "Less Hashing, Same Performance: Building a Better Bloom Filter"
    // by Kirsch et.al. From abstract 'only two hash functions are necessary to effectively
    // implement a Bloom filter without any loss in the asymptotic false positive probability'
    // Lets split up 64-bit hashcode into two 32-bit hash codes and employ the technique mentioned
    // in the above paper
    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
        Murmur3.hash64(val, offset, length);
    addHash(hash64);
  }

  /** Derives k bit positions from the 64-bit hash (Kirsch-Mitzenmacher) and sets them. */
  private void addHash(long hash64) {
    int hash1 = (int) hash64;
    int hash2 = (int) (hash64 >>> 32);
    for (int i = 1; i <= numHashFunctions; i++) {
      int combinedHash = hash1 + (i * hash2);
      // hashcode should be positive, flip all the bits if it's negative
      if (combinedHash < 0) {
        combinedHash = ~combinedHash;
      }
      int pos = combinedHash % numBits;
      bitSet.set(pos);
    }
  }

  /**
   * Adds a string to the filter. The string is encoded as UTF-8 explicitly so
   * the filter contents do not depend on the JVM's platform default charset.
   * {@code null} is a valid value.
   *
   * @param val value to add, may be null
   */
  public void addString(String val) {
    if (val == null) {
      add(null);
    } else {
      add(val.getBytes(StandardCharsets.UTF_8));
    }
  }

  /** Adds a long to the filter (hashed with {@link #getLongHash(long)}). */
  public void addLong(long val) {
    addHash(getLongHash(val));
  }

  /** Adds a double to the filter via its IEEE-754 bit pattern. */
  public void addDouble(double val) {
    addLong(Double.doubleToLongBits(val));
  }

  /**
   * Tests membership of a byte array. May return a false positive but never a
   * false negative. {@code null} is a valid value.
   *
   * @param val value to test, may be null
   * @return true if the value might have been added, false if definitely not
   */
  public boolean test(byte[] val) {
    if (val == null) {
      return testBytes(val, -1, -1);
    }
    return testBytes(val, 0, val.length);
  }

  /**
   * Tests membership of a slice of a byte array. {@code offset}/{@code length}
   * are ignored when {@code val} is null.
   *
   * @param val value to test, may be null
   * @param offset start offset within {@code val}
   * @param length number of bytes to hash
   * @return true if the value might have been added, false if definitely not
   */
  public boolean testBytes(byte[] val, int offset, int length) {
    long hash64 = val == null ? Murmur3.NULL_HASHCODE :
        Murmur3.hash64(val, offset, length);
    return testHash(hash64);
  }

  /** Checks the k bit positions derived from the hash; mirrors {@link #addHash(long)}. */
  private boolean testHash(long hash64) {
    int hash1 = (int) hash64;
    int hash2 = (int) (hash64 >>> 32);
    for (int i = 1; i <= numHashFunctions; i++) {
      int combinedHash = hash1 + (i * hash2);
      // hashcode should be positive, flip all the bits if it's negative
      if (combinedHash < 0) {
        combinedHash = ~combinedHash;
      }
      int pos = combinedHash % numBits;
      if (!bitSet.get(pos)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Tests membership of a string, encoded as UTF-8 to match
   * {@link #addString(String)}. {@code null} is a valid value.
   *
   * @param val value to test, may be null
   * @return true if the value might have been added, false if definitely not
   */
  public boolean testString(String val) {
    if (val == null) {
      return test(null);
    } else {
      return test(val.getBytes(StandardCharsets.UTF_8));
    }
  }

  /** Tests membership of a long; must use the same hash as {@link #addLong(long)}. */
  public boolean testLong(long val) {
    return testHash(getLongHash(val));
  }

  // Thomas Wang's integer hash function
  // http://web.archive.org/web/20071223173210/http://www.concentric.net/~Ttwang/tech/inthash.htm
  // NOTE: the signed '>>' shifts are kept deliberately — changing them to '>>>'
  // would alter the bit positions and break compatibility with existing filters.
  private long getLongHash(long key) {
    key = (~key) + (key << 21); // key = (key << 21) - key - 1;
    key = key ^ (key >> 24);
    key = (key + (key << 3)) + (key << 8); // key * 265
    key = key ^ (key >> 14);
    key = (key + (key << 2)) + (key << 4); // key * 21
    key = key ^ (key >> 28);
    key = key + (key << 31);
    return key;
  }

  /** Tests membership of a double via its IEEE-754 bit pattern. */
  public boolean testDouble(double val) {
    return testLong(Double.doubleToLongBits(val));
  }

  /** @return size of the bit set in bytes */
  public long sizeInBytes() {
    return getBitSize() / 8;
  }

  /** @return size of the bit set in bits (always a multiple of 64) */
  public int getBitSize() {
    return bitSet.getData().length * Long.SIZE;
  }

  /** @return number of hash functions (k) used by this filter */
  public int getNumHashFunctions() {
    return numHashFunctions;
  }

  /** @return the underlying long array of the bit set (live reference, not a copy) */
  public long[] getBitSet() {
    return bitSet.getData();
  }

  @Override
  public String toString() {
    return "m: " + numBits + " k: " + numHashFunctions;
  }

  /**
   * Merge the specified bloom filter with current bloom filter.
   * Both filters must have identical size and hash function count. Note that
   * merging a filter with itself is also rejected with an exception.
   *
   * @param that - bloom filter to merge
   * @throws IllegalArgumentException if the filters are not compatible
   */
  public void merge(BloomFilter that) {
    if (this != that && this.numBits == that.numBits && this.numHashFunctions == that.numHashFunctions) {
      this.bitSet.putAll(that.bitSet);
    } else {
      throw new IllegalArgumentException("BloomFilters are not compatible for merging." +
          " this - " + this.toString() + " that - " + that.toString());
    }
  }

  /** Clears all bits; the filter then reports every value as absent. */
  public void reset() {
    this.bitSet.clear();
  }

  /**
   * Serialize a bloom filter
   * @param out output stream to write to
   * @param bloomFilter BloomFilter that needs to be serialized
   */
  public static void serialize(OutputStream out, BloomFilter bloomFilter) throws IOException {
    /**
     * Serialized BloomFilter format:
     * 1 byte for the number of hash functions.
     * 1 big endian int(That is how OutputStream works) for the number of longs in the bitset
     * big endian longs in the BloomFilter bitset
     */
    DataOutputStream dataOutputStream = new DataOutputStream(out);
    dataOutputStream.writeByte(bloomFilter.numHashFunctions);
    dataOutputStream.writeInt(bloomFilter.numBits);
    for (long value : bloomFilter.getBitSet()) {
      dataOutputStream.writeLong(value);
    }
  }

  /**
   * Deserialize a bloom filter
   * Read a byte stream, which was written by {@linkplain #serialize(OutputStream, BloomFilter)}
   * into a {@code BloomFilter}
   * @param in input bytestream
   * @return deserialized BloomFilter
   */
  public static BloomFilter deserialize(InputStream in) throws IOException {
    if (in == null) {
      throw new IOException("Input stream is null");
    }
    try {
      DataInputStream dataInputStream = new DataInputStream(in);
      int numHashFunc = dataInputStream.readByte();
      int numBits = dataInputStream.readInt();
      // numBits is always a multiple of Long.SIZE for filters written by serialize()
      int sz = (numBits / Long.SIZE);
      List<Long> data = new ArrayList<Long>();
      for (int i = 0; i < sz; i++) {
        data.add(dataInputStream.readLong());
      }
      return new BloomFilter(data, numBits, numHashFunc);
    } catch (RuntimeException e) {
      // Preserve the original failure as the cause.
      throw new IOException("Unable to deserialize BloomFilter", e);
    }
  }

  // Given a byte array consisting of a serialized BloomFilter, gives the offset (from 0)
  // for the start of the serialized long values that make up the bitset.
  // NumHashFunctions (1 byte) + NumBits (4 bytes)
  public static final int START_OF_SERIALIZED_LONGS = 5;

  /**
   * Merges BloomFilter bf2 into bf1.
   * Assumes 2 BloomFilters with the same size/hash functions are serialized to byte arrays.
   * The merge is performed in place on bf1's bytes.
   *
   * @param bf1Bytes bytes of the destination filter (modified in place)
   * @param bf1Start offset of the serialized filter within {@code bf1Bytes}
   * @param bf1Length length of the serialized filter
   * @param bf2Bytes bytes of the source filter
   * @param bf2Start offset of the serialized filter within {@code bf2Bytes}
   * @param bf2Length length of the serialized filter
   * @throws IllegalArgumentException if the headers or lengths do not match
   */
  public static void mergeBloomFilterBytes(
      byte[] bf1Bytes, int bf1Start, int bf1Length,
      byte[] bf2Bytes, int bf2Start, int bf2Length) {
    if (bf1Length != bf2Length) {
      throw new IllegalArgumentException("bf1Length " + bf1Length + " does not match bf2Length " + bf2Length);
    }
    // Validation on the bitset size/3 hash functions.
    for (int idx = 0; idx < START_OF_SERIALIZED_LONGS; ++idx) {
      if (bf1Bytes[bf1Start + idx] != bf2Bytes[bf2Start + idx]) {
        throw new IllegalArgumentException("bf1 NumHashFunctions/NumBits does not match bf2");
      }
    }
    // Just bitwise-OR the bits together - size/# functions should be the same,
    // rest of the data is serialized long values for the bitset which are supposed to be bitwise-ORed.
    for (int idx = START_OF_SERIALIZED_LONGS; idx < bf1Length; ++idx) {
      bf1Bytes[bf1Start + idx] |= bf2Bytes[bf2Start + idx];
    }
  }

  /**
   * Bare metal bit set implementation. For performance reasons, this implementation does not check
   * for index bounds nor expand the bit set size if the specified index is greater than the size.
   * Declared static: it uses no state from the enclosing BloomFilter, so keeping
   * it non-static would only retain a hidden reference to the outer instance.
   */
  public static class BitSet {
    private final long[] data;

    /** Creates a bit set large enough to hold {@code bits} bits (rounded up to whole longs). */
    public BitSet(long bits) {
      this(new long[(int) Math.ceil((double) bits / (double) Long.SIZE)]);
    }

    /**
     * Deserialize long array as bit set.
     *
     * @param data - bit array
     */
    public BitSet(long[] data) {
      assert data.length > 0 : "data length is zero!";
      this.data = data;
    }

    /**
     * Sets the bit at specified index.
     *
     * @param index - position
     */
    public void set(int index) {
      // index >>> 6 selects the word; (1L << index) uses only the low 6 bits of index
      data[index >>> 6] |= (1L << index);
    }

    /**
     * Returns true if the bit is set in the specified index.
     *
     * @param index - position
     * @return - value at the bit position
     */
    public boolean get(int index) {
      return (data[index >>> 6] & (1L << index)) != 0;
    }

    /**
     * Number of bits
     */
    public long bitSize() {
      return (long) data.length * Long.SIZE;
    }

    /** @return the underlying long array (live reference, not a copy) */
    public long[] getData() {
      return data;
    }

    /**
     * Combines the two BitArrays using bitwise OR.
     */
    public void putAll(BitSet array) {
      assert data.length == array.data.length :
          "BitArrays must be of equal length (" + data.length + "!= " + array.data.length + ")";
      for (int i = 0; i < data.length; i++) {
        data[i] |= array.data[i];
      }
    }

    /**
     * Clear the bit set.
     */
    public void clear() {
      Arrays.fill(data, 0);
    }
  }
}