/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Nov 4, 2008
*/
package com.bigdata.btree;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.nio.ByteBuffer;
import org.apache.log4j.Logger;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.ICounterSet;
import com.bigdata.counters.Instrument;
import com.bigdata.io.LongPacker;
import com.bigdata.io.SerializerUtil;
import com.bigdata.rawstore.IRawStore;
/**
* Encapsulates the actual implementation class and provides the protocol for
* (de-)serialization.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* @todo Compare to the latest DSI code for the bloom filter. Has it been made
* faster?
*/
public class BloomFilter implements IBloomFilter, Externalizable {
private static final transient Logger log = Logger.getLogger(BloomFilter.class);
/**
*
*/
private static final long serialVersionUID = -4011582802868293737L;
/**
* The implementation object. This is cleared by {@link #disable()}.
*
* @serial
*/
private it.unimi.dsi.util.BloomFilter2 filter;
/**
* The natural logarithm of 2, used in the computation of the number of
* bits.
*/
private transient final static double LN2 = Math.log(2);
/**
* The expected #of index entries (from the ctor).
*/
private int n;
/**
* The target error rate when there are {@link #n} index entries (from the
* ctor).
*/
private double p;
/**
* The #of index entries at which the filter will have reached its maximum
* error rate (from the ctor).
* <p>
* Note: The value applied by {@link BTree} and normally calculated by the
* {@link BloomFilterFactory}.
*/
private int maxN;
/**
* The expected #of index entries (from the ctor).
*/
final public int getN() {
return n;
}
/**
* The target error rate when there are {@link #getN()} index entries (from
* the ctor).
* <p>
* Note: This class does not know the actual false positive error rate.
* However, that is tracked by the {@link AbstractBTree#btreeCounters}.
*/
final public double getP() {
return p;
}
/**
* The false positive error rate estimated as <code>2^-d</code>, where
* <i>d</i> is the #of hash functions. This will be close to but typically
* not exactly the same as the value of {@link #getP()} specified to the
* ctor.
* <p>
* Note: This class does not know the actual false positive error rate.
* However, that is tracked by the {@link AbstractBTree#btreeCounters}.
*/
public double getErrorRate() {
return Math.pow(2, -filter.d());
}
/**
* The #of index entries at which the filter will have reached its maximum
* error rate (from the ctor).
*/
final public int getMaxN() {
return maxN;
}
/**
* De-serialization ctor.
*/
public BloomFilter() {
}
/**
* Ctor specifies <code>maxN := n * 2</code>.
*
* @param n
* The expected #of index entries.
* @param p
* The target error rate.
*/
public BloomFilter(final int n, final double p) {
this(n, p, n * 2);
}
/**
*
* @param n
* The expected #of index entries.
* @param p
* The target error rate.
* @param maxN
* The #of index entries at which the filter will have reached
* its maximum error rate. (The value is normally calculated by
* the {@link BloomFilterFactory}.)
*
* @throws IllegalArgumentException
* if <i>n</i> is non-positive.
* @throws IllegalArgumentException
* unless <i>p</i> lies in (0:1).
* @throws IllegalArgumentException
* if <i>maxN</i> is LT <i>n</i>.
*
*/
public BloomFilter(final int n, final double p, final int maxN) {
if (n < 1) {
throw new IllegalArgumentException();
}
if (p <= 0.0 || p > 1.0) {
throw new IllegalArgumentException();
}
if (maxN < n) {
throw new IllegalArgumentException();
}
final int d = getHashFunctionCount(p);
filter = new it.unimi.dsi.util.BloomFilter2(n, d);
if (log.isDebugEnabled())
log.debug("n=" + n + ", p=" + p + ", d=" + d + ", m=" + filter.m());
this.n = n;
this.p = p;
this.maxN = maxN;
}
/**
* The #of hash functions used by the filter.
*/
final public int getHashFunctionCount() {
return filter.d();
}
/**
* The bit length of the filter.
*/
final public long getBitLength() {
return filter.m();
}
/**
* Return the #of hash functions required to achieve the specified error
* rate. If <code>p</code> is the probability of a false positive and
* <code>d</code> is the #of hash functions, then
*
* <pre>
* p = pow(2, -d)
* </pre>
*
* and
*
* <pre>
* d = ceil(-ln(p) / ln(2))
* </pre>
*/
public static int getHashFunctionCount(final double errorRate) {
if (errorRate <= 0.0 || errorRate > 1.0) {
throw new IllegalArgumentException();
}
final double p = errorRate;
final double d2 = -(Math.log(p) / LN2);
final int d = (int) Math.ceil(d2);
return d;
}
/**
* Return the bit length required to provision a filter having the specified
* #of hash functions and the target capacity. This is
*
* <pre>
*
* bitLength = ceil(nentries * (hashFunctionCount / ln(2)))
*
* </pre>
*
* @param hashFunctionCount
* The #of hash functions.
* @param nentries
* The target capacity.
*
* @return The required bit length.
*/
public static long getBitLength(final int hashFunctionCount,
final int nentries) {
final long bitLength = (long) Math.ceil(nentries
* (hashFunctionCount / LN2));
return bitLength;
}
/**
* This returns the #of index entries at which the bloom filter will have
* the specified <em>expected</em> error rate. The probability of a false
* positive is
*
* <pre>
* p = (1 - (1 - 1 / m) ˆ kn) ˆ k
* </pre>
*
* where m is the bit length of the filter, k is the #of hash functions, and
* n is the #of items that have been inserted into the filter.
*
* or approximately
*
* <pre>
* p = (1 - e ˆ -kn / m) ˆ k
* </pre>
*
* solving for <code>n</code> we obtain
*
* <pre>
* n = -m ln( 1 - pˆ(1/k) ) / k
* </pre>
*
* @param k
* The #of hash functions.
* @param m
* The bit length of the filter.
* @param p
* The expected error rate.
*
* @return The #of index entries at which the expected bloom filter error
* rate will be the specified value.
*
* @see http://en.wikipedia.org/wiki/Bloom_filter
*/
static public int getEntryCountForErrorRate(final int k, final long m,
final double p) {
final double n = -m * Math.log(1 - Math.pow(p, 1d / k)) / k;
if(log.isDebugEnabled())
log.debug("p=" + p + ", m=" + m + ", k=" + k + ", n=" + n);
return (int) n;
}
/**
* @throws IllegalStateException
* if the filter has been {@link #disable()}d
*/
public boolean add(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
if (!enabled)
throw new IllegalStateException();
if (filter.add(key)) {
// filter state was modified.
dirty = true;
counters.nbloomAdd++;
return true;
}
return false;
}
/**
* @throws IllegalStateException
* if the filter has been {@link #disable()}d
*/
public boolean contains(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
if (!enabled)
throw new IllegalStateException();
counters.nbloomTest++;
if(!filter.contains(key)) {
counters.nbloomRejects++;
return false;
}
return true;
}
public String toString() {
final StringBuilder sb = new StringBuilder();
sb.append("BloomFilter");
sb.append("{ minSize=" + filter.size());
sb.append(", n=" + n);
sb.append(", p=" + p);
sb.append(", maxN=" + maxN);
sb.append(", bitLength=" + filter.m());
sb.append(", hashFunctionCount=" + filter.d());
sb.append(", errorRate=" + getErrorRate());
if (dirty)
sb.append(", dirty");
if (!enabled)
sb.append(", disabled");
if (addr != 0L)
sb.append(", addr=" + addr);
sb.append("}");
return sb.toString();
}
/*
* Persistence protocol.
*/
/**
* Address that can be used to read this object from the store.
* <p>
* Note: This is not persisted since we do not have the address until after
* we have written out the state of this record. However the value is
* written into each {@link Checkpoint} record.
*/
private transient long addr;
/**
* Presumed clean until {@link #add(byte[])} indicates that the filter state
* was changed.
*/
private transient boolean dirty = false;
/**
* Address that can be used to read this object from the store.
* <p>
* Note: This is not a persistent property. However the value is set when
* the record is read from, or written on, the store.
*/
public final long getAddr() {
return addr;
}
/**
* Read a bloom filter record from the store.
*
* @param store
* the store.
* @param addr
* the address of the bloom filter record.
*
* @return the de-serialized bloom filter record. The address from which it
* was loaded is set on the bloom filter as a side-effect.
*/
public static BloomFilter read(final IRawStore store, final long addr) {
final BloomFilter filter = (BloomFilter) SerializerUtil
.deserialize(store.read(addr));
// save the address from which the record was loaded.
filter.addr = addr;
if (log.isInfoEnabled())
log.info("Read bloom filter: bytesOnDisk="
+ store.getByteCount(addr) + ": " + filter);
return filter;
}
/**
* Return <code>true</code> iff the state of the filter has been modified
* but not yet written onto the store. The filter is presumed clean when
* created or when it is read from the store. The filter will remain clean
* until {@link #add(byte[])} returns <code>true</code>, indicating that
* the state of the filter has been changed.
*/
final public boolean isDirty() {
return dirty;
}
// /**
// * Marks the filter as dirty.
// */
// final protected void setDirty() {
//
// dirty = true;
//
// }
/**
* Writes the bloom filter on the store and clears the {@link #isDirty()}
* flag.
* <p>
* Note: This also sets the address on {@link #addr} as a side-effect, but
* the address is NOT written into the store since it is not available until
* after the record has been serialized.
* <p>
* Note: This method DOES NOT test {@link #isDirty()}.
*
* @param store
* The store.
*
* @return The address on which it was written.
*
* @throws IllegalStateException
* if the filter is not dirty.
* @throws IllegalStateException
* if the filter is not enabled.
*/
public long write(final IRawStore store) {
if (!dirty)
throw new IllegalStateException();
if (!enabled)
throw new IllegalStateException();
addr = store.write(ByteBuffer.wrap(SerializerUtil.serialize(this)));
dirty = false;
if (log.isInfoEnabled())
log.info("Wrote bloom filter: bytesOnDisk="
+ store.getByteCount(addr) + ": " + filter);
return addr;
}
/**
* Disables the bloom filter associated with the index. A disabled bloom
* filter can not be persisted and will not respond to queries or permit
* mutations.
* <p>
* Note: This method is invoked by {@link BTree#insert(byte[], byte[])} when
* the #of index entries exceeds the maximum allowed for the bloom filter.
* At that point the {@link BTree} is dirty. {@link Checkpoint} will notice
* that the bloom filter is disabled will write its address as 0L so the
* bloom filter is no longer reachable from the post-checkpoint record.
* <p>
* @return the current address for recycling
*/
final public long disable() {
final long ret = addr;
if (enabled) {
enabled = false;
// release the filter impl. this is often 1-10M of data!
filter = null;
addr = 0;
if (log.isInfoEnabled())
log.info("disabled.");
}
return ret;
}
private transient boolean enabled = true;
/**
* Return <code>true</code> unless the bloom filter has been disabled.
* <p>
* Note: A bloom filter may be disabled is the #of index entries has
* exceeded the maximum desired error rate for the bloom filter.
*
* @return iff the bloom filter is enabled.
*/
final public boolean isEnabled() {
return enabled;
}
private final static transient int VERSION0 = 0x0;
/**
* Note: On read, the {@link #addr} is set to <code>0L</code>, the
* {@link #dirty} flag is cleared, and {@link #enabled} flag is set. It's
* necessary to override serialization otherwise java will default the
* {@link #enabled} flag to <code>false</code> when an object is
* de-serialized - whoops!
*
* @param in
* @throws IOException
* @throws ClassNotFoundException
*/
public void readExternal(ObjectInput in) throws IOException,
ClassNotFoundException {
final int version = (int) LongPacker.unpackLong(in);
if (version != VERSION0)
throw new IOException("Unknown version=" + version);
n = (int) LongPacker.unpackLong(in);
maxN = (int) LongPacker.unpackLong(in);
p = in.readDouble();
filter = (it.unimi.dsi.util.BloomFilter2) in.readObject();
dirty = false;
addr = 0L;
enabled = true;
}
/**
*
* @param out
* @throws IOException
*/
public void writeExternal(ObjectOutput out) throws IOException {
LongPacker.packLong(out, VERSION0);
LongPacker.packLong(out, n);
LongPacker.packLong(out, maxN);
out.writeDouble( p );
out.writeObject(filter);
}
public void falsePos() {
counters.nbloomFalsePos++;
}
/**
* Counters are not persistent.
*/
public transient BloomFilterCounters counters = new BloomFilterCounters();
/**
* Counters for bloom filter access and notification of false positives.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* @todo use long here and in {@link BTreeCounters}?
*/
public static class BloomFilterCounters {
/**
* #of keys added to the bloom filter.
*/
public int nbloomAdd = 0;
/** #of keys tested by the bloom filter in contains/lookup(key). */
public int nbloomTest = 0;
/** #of keys rejected by the bloom filter in contains/lookup(key). */
public int nbloomRejects = 0;
/** #of false positives from the bloom filter in contains/lookup(key). */
public int nbloomFalsePos = 0;
/**
* @todo summarize when the {@link BTreeCounters} are summarized.
*
* @param o
*/
public void add(BloomFilterCounters o) {
nbloomAdd += o.nbloomAdd;
nbloomTest += o.nbloomTest;
nbloomRejects += o.nbloomRejects;
nbloomFalsePos += o.nbloomFalsePos;
}
/**
* The effective acceptance rate for the bloom filter (<code>1 - rejectRate</code>).
* This is the rate at which the bloom filter reports that the key is in
* the index. False positives occur when the filter accepts a key and
* the index is consulted but the key is not found in the index.
*
* @return The bloom filter acceptance rate.
*/
public double getBloomAcceptRate() {
return 1 - getBloomRejectionRate();
}
/**
* The effective rejection rate (correct rejection rate) for the bloom
* filter. Bloom filters do not make false negative errors, so any time
* the filter rejects a key we assume that it was a correct rejection.
*
* @return The bloom filter correct rejection rate.
*/
public double getBloomRejectionRate() {
if (nbloomTest == 0)
return 0d;
return (nbloomRejects / (double) nbloomTest);
}
/**
* The effective error rate (false positive rate) for the bloom filter.
* A false positive is an instance where the bloom filter reports that
* the key is in the index but a read against the index demonstrates
* that the key does not exist in the index. False positives are in the
* nature of bloom filters and arise because keys may be hash equivalent
* for the bloom filter.
*
* @return The bloom filter error rate.
*/
public double getBloomErrorRate() {
if (nbloomTest == 0)
return 0d;
return (nbloomFalsePos / (double) nbloomTest);
}
/**
* Return a {@link CounterSet} reporting on the various counters tracked
* in the instance fields of this class.
*
* FIXME Integrate with {@link BTreeCounters}. This needs to happen when we
* setup the bloom filter, so that is in _reopen() for both
* {@link BTree} and {@link IndexSegment}.
*/
synchronized public ICounterSet getCounters() {
if (counterSet == null) {
counterSet = new CounterSet();
counterSet.addCounter("#add", new Instrument<Integer>() {
protected void sample() {
setValue(nbloomAdd);
}
});
counterSet.addCounter("#test", new Instrument<Integer>() {
protected void sample() {
setValue(nbloomTest);
}
});
counterSet.addCounter("#reject", new Instrument<Integer>() {
protected void sample() {
setValue(nbloomRejects);
}
});
counterSet.addCounter("#falsePos", new Instrument<Integer>() {
protected void sample() {
setValue(nbloomFalsePos);
}
});
counterSet.addCounter("rejectRate", new Instrument<Double>() {
protected void sample() {
setValue(getBloomRejectionRate());
}
});
counterSet.addCounter("errorRate", new Instrument<Double>() {
protected void sample() {
setValue(getBloomErrorRate());
}
});
}
return counterSet;
}
private CounterSet counterSet;
/**
* XML representation of the {@link CounterSet}.
*/
public String toString() {
// in XML.
return getCounters().asXML(null/* filter */);
}
/**
* Returns a human readable representation of the bloom filter
* performance, including the correct rejection rate and the false
* positive rate to date (or at least since the bloom filter was
* read from the store).
*/
public String getBloomFilterPerformance() {
return "bloom filter" + //
": nadd=" + nbloomAdd + //
", ntest=" + nbloomTest + //
", nreject=" + nbloomRejects + //
", nfalsePos=" + nbloomFalsePos + //
", rejectRate=" + getBloomRejectionRate() + //
", errorRate=" + getBloomErrorRate()//
;
}
}
}