/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nearinfinity.bloomfilter;
import java.io.IOException;
import java.io.Serializable;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import com.nearinfinity.bloomfilter.bitset.BloomFilterBitSet;
import com.nearinfinity.bloomfilter.bitset.ThreadSafeBitSet;
/**
 * A simple bloom filter backed by a {@link BloomFilterBitSet}. Each key is
 * hashed {@code hashes} times with murmur hash; successive rounds are made to
 * differ by incrementing the first byte of the key between rounds.
 *
 * <p>Note that {@link #addBytes(byte[], int, int)} and
 * {@link #testBytes(byte[], int, int)} temporarily modify the caller's key
 * array while they run (the original content is restored before they return),
 * so the same array must not be read or written by another thread during a
 * call.
 *
 * @author Aaron McCurry (amccurry@nearinfinity.com)
 */
public class BloomFilter extends BloomFilterFormulas implements Serializable {

    private static final long serialVersionUID = -4837894658242080928L;

    /** Seed handed to every murmur hash invocation. */
    private static final int SEED = 1;

    private BloomFilterBitSet bitSet;
    /** Half of {@link #numberOfBits}; see {@link #getIndex(int)} for how it is used. */
    private long numberOfBitsDivBy2;
    private long elementSize;
    private double probabilityOfFalsePositives;
    /** Number of hash rounds applied to every key. */
    private int hashes;
    private int numberOfBits;

    /**
     * Creates an empty, unconfigured filter. The instance is only usable after
     * its state has been loaded with {@link #read(IndexInput)}.
     */
    public BloomFilter() {
        // state is populated by read(IndexInput)
    }

    /**
     * Creates a bloom filter sized for the given false positive probability
     * and expected number of elements.
     *
     * @param probabilityOfFalsePositives the probability of false positives for the given number of elements.
     * @param elementSize the number of elements the filter is expected to hold.
     */
    public BloomFilter(double probabilityOfFalsePositives, long elementSize) {
        // Compute the bit count once and derive everything else from it.
        this.numberOfBits = getNumberOfBits(probabilityOfFalsePositives, elementSize);
        this.hashes = getOptimalNumberOfHashesByBits(elementSize, this.numberOfBits);
        this.numberOfBitsDivBy2 = this.numberOfBits / 2;
        this.bitSet = new ThreadSafeBitSet(this.numberOfBits);
        this.probabilityOfFalsePositives = probabilityOfFalsePositives;
        this.elementSize = elementSize;
    }

    /**
     * Writes the filter parameters followed by the bit set to the given output.
     * The field order matches the order expected by {@link #read(IndexInput)}.
     *
     * @param output the output to write to.
     * @throws IOException if writing to the output fails.
     */
    public void write(IndexOutput output) throws IOException {
        output.writeLong(numberOfBitsDivBy2);
        output.writeLong(elementSize);
        output.writeLong(Double.doubleToLongBits(probabilityOfFalsePositives));
        output.writeInt(hashes);
        output.writeInt(numberOfBits);
        bitSet.write(output);
    }

    /**
     * Replaces the state of this filter with the parameters and bit set read
     * from the given input, in the order produced by {@link #write(IndexOutput)}.
     *
     * @param input the input to read from.
     * @throws IOException if reading from the input fails.
     */
    public void read(IndexInput input) throws IOException {
        numberOfBitsDivBy2 = input.readLong();
        elementSize = input.readLong();
        probabilityOfFalsePositives = Double.longBitsToDouble(input.readLong());
        hashes = input.readInt();
        numberOfBits = input.readInt();
        bitSet = new ThreadSafeBitSet();
        bitSet.read(input);
    }

    /**
     * Add a key to the bloom filter.
     *
     * <p>The key array is modified while the hashes are computed and restored
     * to its original content before this method returns, even if hashing
     * fails with an exception.
     *
     * @param key the array containing the key.
     * @param offset the index of the first byte of the key within the array.
     * @param length the number of bytes in the key.
     */
    public void addBytes(byte[] key, int offset, int length) {
        // The perturbed byte must lie inside the hashed range, otherwise every
        // round produces the same hash. An empty key has no byte to perturb.
        final boolean perturb = length > 0;
        final byte original = perturb ? key[offset] : 0;
        try {
            for (int i = 0; i < hashes; i++) {
                setBitSet(MurmurHash.hash(SEED, key, offset, length));
                if (perturb) {
                    key[offset]++;
                }
            }
        } finally {
            if (perturb) {
                key[offset] = original; // reset to original value
            }
        }
    }

    /**
     * Tests a key in the bloom filter, it may provide false positives.
     *
     * <p>The key array is modified while the hashes are computed and restored
     * to its original content before this method returns, even if hashing
     * fails with an exception.
     *
     * @param key the array containing the key.
     * @param offset the index of the first byte of the key within the array.
     * @param length the number of bytes in the key.
     * @return {@code false} if the key was definitely never added,
     *         {@code true} if every bit for the key is set.
     */
    public boolean testBytes(byte[] key, int offset, int length) {
        // Must perturb the same byte as addBytes so both derive the same hash chain.
        final boolean perturb = length > 0;
        final byte original = perturb ? key[offset] : 0;
        try {
            for (int i = 0; i < hashes; i++) {
                if (!testBitSet(MurmurHash.hash(SEED, key, offset, length))) {
                    return false;
                }
                if (perturb) {
                    key[offset]++;
                }
            }
            return true;
        } finally {
            if (perturb) {
                key[offset] = original; // reset to original value
            }
        }
    }

    /**
     * Gets the memory size reported by the underlying bit set.
     *
     * @return the value of {@code bitSet.getMemorySize()}; presumably the
     *         number of bytes in the heap (not counting jvm overhead) -
     *         confirm against the bit set implementation.
     */
    public long getMemorySize() {
        return bitSet.getMemorySize();
    }

    /**
     * Sets the bit position in the bit set.
     *
     * @param hash the hash produced by the murmur class.
     */
    private void setBitSet(int hash) {
        bitSet.set(getIndex(hash));
    }

    /**
     * Tests the bit position in the bit set.
     *
     * @param hash the hash produced by the murmur class.
     * @return whether the bit for the given hash is set.
     */
    private boolean testBitSet(int hash) {
        return bitSet.get(getIndex(hash));
    }

    /**
     * Gets the index into the bit set for the given hash.
     *
     * <p>Java's {@code %} keeps the sign of the dividend, so
     * {@code hash % numberOfBitsDivBy2} lies strictly between
     * {@code -numberOfBitsDivBy2} and {@code numberOfBitsDivBy2}. Adding
     * {@code numberOfBitsDivBy2} shifts that into
     * {@code [1, 2 * numberOfBitsDivBy2 - 1]}: negative hashes land in the
     * lower half of the bit set and non-negative hashes in the upper half.
     *
     * @param hash the hash produced by the murmur class.
     * @return the index position.
     * @throws ArithmeticException if {@code numberOfBitsDivBy2} is zero, i.e.
     *         the filter has fewer than two bits.
     */
    private long getIndex(int hash) {
        return (hash % numberOfBitsDivBy2) + numberOfBitsDivBy2;
    }
}