/**
*
*/
package com.taobao.top.analysis.util.bloom;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.concurrent.atomic.AtomicInteger;
/**
* @author fangweng
* @email fangweng@taobao.com
* @date 2011-3-16
*
* Implements a Bloom filter, as defined by Bloom in 1970. The Bloom
* filter is a data structure that was introduced in 1970 and that has
* been adopted by the networking research community in the past decade
* thanks to the bandwidth efficiencies that it offers for the
* transmission of set membership information between networked hosts. A
* sender encodes the information into a bit vector, the Bloom filter,
* that is more compact than a conventional representation. Computation
* and space costs for construction are linear in the number of elements.
* The receiver uses the filter to test whether various elements are
* members of the set. Though the filter will occasionally return a false
* positive, it will never return a false negative. When creating the
* filter, the sender can choose its desired point in a trade-off between
* the false positive rate and the size.
*
*/
public class ByteBloomFilter implements BloomFilter {

    /** Format version written as the first int of the serialized meta block. */
    public static final int VERSION = 1;

    /** Length of the bit vector in bytes; shrinks when {@link #compactBloom()} folds the filter. */
    protected int byteSize;

    /** Number of bit positions probed/set per key. */
    protected final int hashCount;

    /** Hash function used to derive the bit positions of a key. */
    protected Hash hash;

    /** Number of keys added so far (incremented once per {@code add} call). */
    protected AtomicInteger keyCount = new AtomicInteger(0);

    /** Number of keys the filter was sized for; halved on every fold in {@link #compactBloom()}. */
    protected int maxKeys;

    /** Backing bit vector. */
    protected ByteBuffer bloom;

    /** Single-bit masks, indexed by bit position within a byte (0..7). */
    private static final byte[] bitvals = { (byte) 0x01, (byte) 0x02,
            (byte) 0x04, (byte) 0x08, (byte) 0x10, (byte) 0x20, (byte) 0x40,
            (byte) 0x80 };

    /**
     * Builds a filter from serialized meta data.
     *
     * <p>Four ints are read from {@code meta}, in order: version, byte size,
     * hash count and key count (the same order {@link #getMetaWriter()}
     * writes them). {@code maxKeys} is set to the stored key count. Only the
     * meta data is read here; the bit vector itself is freshly allocated.
     *
     * @param meta
     *            buffer positioned at the start of the meta block
     * @throws IllegalArgumentException
     *             if the version does not equal {@link #VERSION} or the
     *             stored values fail {@link #sanityCheck()}
     */
    public ByteBloomFilter(ByteBuffer meta) throws IllegalArgumentException {
        int version = meta.getInt();
        if (version != VERSION) {
            throw new IllegalArgumentException("Bad version");
        }
        this.byteSize = meta.getInt();
        this.hashCount = meta.getInt();
        this.keyCount = new AtomicInteger(meta.getInt());
        this.maxKeys = this.keyCount.intValue();
        this.hash = MurmurHash.getInstance();
        sanityCheck();
        allocBloom();
    }

    /**
     * Builds an empty filter sized for {@code maxKeys} entries at the
     * requested false positive rate.
     *
     * @param maxKeys
     *            expected number of keys; must be &gt; 0
     * @param errorRate
     *            desired false positive rate; must lie strictly between 0 and 1
     * @param foldFactor
     *            the byte size is rounded up to a multiple of
     *            {@code 2^foldFactor} so that the filter can later be folded
     *            in half that many times by {@link #compactBloom()}
     * @throws IllegalArgumentException
     *             if an argument is out of range or the derived size or hash
     *             count is not positive
     */
    public ByteBloomFilter(int maxKeys, float errorRate,
            int foldFactor) throws IllegalArgumentException {
        // Reject maxKeys <= 0 up front: maxKeys == 0 would otherwise surface
        // as an ArithmeticException from the integer division below instead
        // of the declared IllegalArgumentException.
        if (maxKeys <= 0) {
            throw new IllegalArgumentException(
                    "maxKeys must be > 0, got " + maxKeys);
        }
        // Written as a negated conjunction so that NaN is rejected as well.
        if (!(errorRate > 0.0f && errorRate < 1.0f)) {
            throw new IllegalArgumentException(
                    "errorRate must be in (0, 1), got " + errorRate);
        }
        /*
         * Bloom filters are very sensitive to the number of elements inserted
         * into them. For HBase, the number of entries depends on the size of
         * the data stored in the column. Currently the default region size is
         * 256MB, so entry count ~= 256MB / (average value size for column).
         * Despite this rule of thumb, there is no efficient way to calculate
         * the entry count after compactions. Therefore, it is often easier to
         * use a dynamic bloom filter that will add extra space instead of
         * allowing the error rate to grow.
         *
         * ( http://www.eecs.harvard.edu/~michaelm/NEWWORK/postscripts/
         * BloomFilterSurvey.pdf )
         *
         * m   number of bits in the Bloom filter (bitSize)
         * n   number of elements inserted into the Bloom filter (maxKeys)
         * k   number of hash functions used (hashCount)
         * e   desired false positive rate (errorRate)
         *
         * If we fix the error rate (e) and know the number of entries, then
         * the optimal bloom size is
         *   m = -n * ln(e) / (ln(2)^2) ~= n * ln(e) / ln(0.6185)
         *
         * The probability of false positives is minimized when
         *   k = (m / n) * ln(2).
         */
        int bitSize = (int) Math.ceil(maxKeys
                * (Math.log(errorRate) / Math.log(0.6185)));
        // bitSize / maxKeys is an integer division. The truncation is kept
        // on purpose so that a given (maxKeys, errorRate) pair yields the
        // same hash count as before.
        int functionCount = (int) Math.ceil(Math.log(2) * (bitSize / maxKeys));

        // Round the byte size up to a multiple of 2^foldFactor so folding is
        // possible.
        int sizeInBytes = (bitSize + 7) / 8;
        int mask = (1 << foldFactor) - 1;
        if ((mask & sizeInBytes) != 0) {
            sizeInBytes >>= foldFactor;
            ++sizeInBytes;
            sizeInBytes <<= foldFactor;
        }

        this.byteSize = sizeInBytes;
        this.hashCount = functionCount;
        this.hash = MurmurHash.getInstance();
        this.keyCount = new AtomicInteger(0);
        this.maxKeys = maxKeys;
        sanityCheck();
        allocBloom();
    }

    /**
     * Replaces the hash function used by {@code add} and {@code contains}.
     *
     * @param hash
     *            the hash function to use from now on
     */
    @Override
    public void setHash(Hash hash) {
        this.hash = hash;
    }

    /**
     * Validates the configured parameters.
     *
     * @throws IllegalArgumentException
     *             if the byte size or hash count is not positive, no hash
     *             function is set, or the key count is negative
     */
    void sanityCheck() throws IllegalArgumentException {
        if (this.byteSize <= 0) {
            // The value being checked is the byte size, so name it as such.
            throw new IllegalArgumentException("byteSize must be > 0");
        }
        if (this.hashCount <= 0) {
            throw new IllegalArgumentException(
                    "Hash function count must be > 0");
        }
        if (this.hash == null) {
            throw new IllegalArgumentException("hashType must be known");
        }
        if (this.keyCount.intValue() < 0) {
            throw new IllegalArgumentException("must have positive keyCount");
        }
    }

    /**
     * Verifies that a bit vector has the configured length.
     *
     * @param bloom
     *            buffer to check
     * @throws IllegalArgumentException
     *             if the buffer's limit differs from the configured byte size
     */
    void bloomCheck(ByteBuffer bloom) throws IllegalArgumentException {
        if (this.byteSize != bloom.limit()) {
            throw new IllegalArgumentException(
                    "Configured bloom length should match actual length");
        }
    }

    /**
     * Allocates the backing bit vector of {@code byteSize} bytes.
     *
     * @throws IllegalArgumentException
     *             if the bit vector has already been allocated
     */
    @Override
    public void allocBloom() {
        // No concurrency protection is applied here.
        if (this.bloom != null) {
            throw new IllegalArgumentException("can only create bloom once.");
        }
        this.bloom = ByteBuffer.allocate(this.byteSize);
        assert this.bloom.hasArray();
    }

    /**
     * Adds a key, using the whole array as the key bytes.
     *
     * @param buf
     *            key bytes
     */
    @Override
    public void add(byte[] buf) {
        add(buf, 0, buf.length);
    }

    /**
     * Adds a key taken from a sub-range of {@code buf} and increments the key
     * count.
     *
     * @param buf
     *            array holding the key bytes
     * @param offset
     *            index of the first key byte
     * @param len
     *            number of key bytes
     */
    @Override
    public void add(byte[] buf, int offset, int len) {
        /*
         * For faster hashing, use combinatorial generation
         * http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
         *
         * Two base hashes are computed (the second seeded with the first) and
         * every probe position is derived as hash1 + i * hash2.
         */
        int hash1 = this.hash.hash(buf, offset, len, 0);
        int hash2 = this.hash.hash(buf, offset, len, hash1);
        int bitCount = this.byteSize * 8;
        for (int i = 0; i < this.hashCount; i++) {
            // The remainder may be negative; its magnitude is always a valid
            // bit index.
            int hashLoc = Math.abs((hash1 + i * hash2) % bitCount);
            set(hashLoc);
        }
        this.keyCount.incrementAndGet();
    }

    /**
     * Tests a key, using the whole array as the key bytes.
     *
     * @param buf
     *            key bytes
     * @return {@code false} if at least one probed bit is unset, {@code true}
     *         otherwise
     */
    @Override
    public boolean contains(byte[] buf) {
        return contains(buf, 0, buf.length);
    }

    /**
     * Tests a key taken from a sub-range of {@code buf}. The same bit
     * positions as in {@link #add(byte[], int, int)} are probed.
     *
     * @param buf
     *            array holding the key bytes
     * @param offset
     *            index of the first key byte
     * @param length
     *            number of key bytes
     * @return {@code false} if at least one probed bit is unset, {@code true}
     *         otherwise
     * @throws IllegalArgumentException
     *             if the bit vector's limit differs from the configured byte
     *             size
     */
    @Override
    public boolean contains(byte[] buf, int offset, int length) {
        if (bloom.limit() != this.byteSize) {
            throw new IllegalArgumentException(
                    "Bloom does not match expected size");
        }
        int hash1 = this.hash.hash(buf, offset, length, 0);
        int hash2 = this.hash.hash(buf, offset, length, hash1);
        int bitCount = this.byteSize * 8;
        for (int i = 0; i < this.hashCount; i++) {
            int hashLoc = Math.abs((hash1 + i * hash2) % bitCount);
            if (!get(hashLoc, bloom)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Set the bit at the specified index to 1.
     *
     * @param pos
     *            index of bit
     */
    void set(int pos) {
        int bytePos = pos / 8;
        int bitPos = pos % 8;
        byte curByte = bloom.get(bytePos);
        curByte |= bitvals[bitPos];
        bloom.put(bytePos, curByte);
    }

    /**
     * Check if bit at specified index is 1.
     *
     * @param pos
     *            index of bit
     * @param theBloom
     *            bit vector to read from
     * @return true if bit at specified index is 1, false if 0.
     */
    static boolean get(int pos, ByteBuffer theBloom) {
        int bytePos = pos / 8;
        int bitPos = pos % 8;
        byte curByte = theBloom.get(bytePos);
        curByte &= bitvals[bitPos];
        return (curByte != 0);
    }

    /**
     * @return number of keys added so far
     */
    @Override
    public int getKeyCount() {
        return this.keyCount.intValue();
    }

    /**
     * @return number of keys the filter is currently sized for
     */
    @Override
    public int getMaxKeys() {
        return this.maxKeys;
    }

    /**
     * @return current length of the bit vector in bytes
     */
    @Override
    public int getByteSize() {
        return this.byteSize;
    }

    /**
     * Shrinks the filter by folding it in half repeatedly while the byte size
     * is even and the (halved) capacity is still more than twice the number
     * of keys actually added. Folding ORs the upper pieces of the array onto
     * the first piece; afterwards the bit vector, {@code byteSize} and
     * {@code maxKeys} are reduced accordingly. Does nothing when no key has
     * been added or the buffer has no accessible backing array.
     */
    @Override
    public void compactBloom() {
        // Read the key count once; it is used for every loop iteration.
        int keys = this.keyCount.intValue();
        // see if the actual size is exponentially smaller than expected.
        if (keys > 0 && this.bloom.hasArray()) {
            int pieces = 1;
            int newByteSize = this.byteSize;
            int newMaxKeys = this.maxKeys;
            // Computed as long: (keys << 1) overflows to a negative int for
            // keys > 2^30, which would make the fold condition always true.
            long foldThreshold = 2L * keys;
            // while exponentially smaller & folding is lossless
            while ((newByteSize & 1) == 0 && newMaxKeys > foldThreshold) {
                pieces <<= 1;
                newByteSize >>= 1;
                newMaxKeys >>= 1;
            }
            // if we should fold these into pieces
            if (pieces > 1) {
                byte[] array = this.bloom.array();
                int start = this.bloom.arrayOffset();
                int end = start + newByteSize;
                // 'off' walks through pieces 2..n consecutively while 'pos'
                // restarts at the first piece for each of them.
                int off = end;
                for (int p = 1; p < pieces; ++p) {
                    for (int pos = start; pos < end; ++pos) {
                        array[pos] |= array[off++];
                    }
                }
                // folding done, only use a subset of this array
                this.bloom.rewind();
                this.bloom.limit(newByteSize);
                this.bloom = this.bloom.slice();
                this.byteSize = newByteSize;
                this.maxKeys = newMaxKeys;
            }
        }
    }

    /**
     * Writes just the bloom filter to the output array
     *
     * @param out
     *            OutputStream to place bloom
     * @throws IOException
     *             Error writing bloom array, or the buffer has no accessible
     *             backing array
     */
    public void writeBloom(final DataOutput out) throws IOException {
        if (!this.bloom.hasArray()) {
            throw new IOException(
                    "Only writes ByteBuffer with underlying array.");
        }
        out.write(bloom.array(), bloom.arrayOffset(), bloom.limit());
    }

    /**
     * @return a write-only {@link Writable} that emits the meta block
     *         (version, byte size, hash count, key count)
     */
    @Override
    public Writable getMetaWriter() {
        return new MetaWriter();
    }

    /**
     * @return a write-only {@link Writable} that emits the raw bit vector
     */
    @Override
    public Writable getDataWriter() {
        return new DataWriter();
    }

    /**
     * Serializes the filter's meta data; reading is not supported.
     */
    private class MetaWriter implements Writable {
        protected MetaWriter() {
        }

        @Override
        public void readFields(DataInput arg0) throws IOException {
            throw new IOException("Cant read with this class.");
        }

        @Override
        public void write(DataOutput out) throws IOException {
            // Order must match what the ByteBuffer constructor reads.
            out.writeInt(VERSION);
            out.writeInt(byteSize);
            out.writeInt(hashCount);
            out.writeInt(keyCount.intValue());
        }
    }

    /**
     * Serializes the filter's bit vector; reading is not supported.
     */
    private class DataWriter implements Writable {
        protected DataWriter() {
        }

        @Override
        public void readFields(DataInput arg0) throws IOException {
            throw new IOException("Cant read with this class.");
        }

        @Override
        public void write(DataOutput out) throws IOException {
            writeBloom(out);
        }
    }
}