package it.unimi.dsi.util;
/*
* DSI utilities
*
* Copyright (C) 2006-2009 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.dsi.bits.LongArrayBitVector;
import java.io.Serializable;
import java.util.Random;
import cern.jet.random.engine.MersenneTwister;
/** A Bloom filter for integers.
 *
 * <P>Instances of this class represent a set of integers (with false positives)
 * using a Bloom filter. Because of the way Bloom filters work,
 * you cannot remove elements.
 *
 * <P>Bloom filters have an expected error rate, depending on the number
 * of hash functions used, on the filter size and on the number of elements in the filter. This implementation
 * uses a variable optimal number of hash functions, depending on the expected
 * number of elements. More precisely, a Bloom
 * filter for <var>n</var> integers with <var>d</var> hash functions will use
 * <var>d</var><var>n</var> / ln 2 &asymp; 1.44 <var>d</var><var>n</var> bits;
 * false positives will happen with probability 2<sup>-<var>d</var></sup>.
 *
 * <P>Hash functions are generated at creation time using universal hashing. Each hash function
 * uses two integers <var>A</var> and <var>B</var>, and the integer <var>x</var> is mapped
 * to (<var>Ax</var>)&oplus;<var>B</var> before taking the remainder modulo the number of bits
 * in the filter.
 *
 * <P>This class exports access methods that are very similar to those of {@link java.util.Set},
 * but it does not implement that interface, as too many non-optional methods
 * would be unimplementable (e.g., iterators).
 *
 * @author Sebastiano Vigna
 */
public class IntBloomFilter implements Serializable {
    private static final long serialVersionUID = 1L;

    /** The number of bits in this filter. */
    final public long m;
    /** The number of hash functions used by this filter. */
    final public int d;
    /** The underlying bit vector. */
    final private LongArrayBitVector bits;
    /** The random integers used multiplicatively. */
    final private int[] a;
    /** The random integers used in exclusive-or. */
    final private int[] b;

    /** The natural logarithm of 2, used in the computation of the number of bits. */
    private final static double NATURAL_LOG_OF_2 = Math.log( 2 );

    /** Whether to print diagnostic information on standard error at construction time. */
    private final static boolean DEBUG = false;

    /** Creates a new Bloom filter with given number of hash functions and expected number of elements.
     *
     * @param n the expected number of elements (must be nonnegative).
     * @param d the number of hash functions (must be nonnegative); if no more than <code>n</code> elements
     * are added to the filter, false positives will happen with probability 2<sup>-<var>d</var></sup>.
     * @throws IllegalArgumentException if <code>n</code> or <code>d</code> is negative.
     */
    public IntBloomFilter( final int n, final int d ) {
        if ( n < 0 ) throw new IllegalArgumentException( "The expected number of elements must be nonnegative: " + n );
        if ( d < 0 ) throw new IllegalArgumentException( "The number of hash functions must be nonnegative: " + d );
        this.d = d;

        // The product is computed in floating point: the int product n * d would silently
        // overflow for large filters. At least one bit is always allocated, so that the
        // modulo operation in hash() can never divide by zero.
        final long numberOfBits = Math.max( 1L, (long)Math.ceil( (double)n * d / NATURAL_LOG_OF_2 ) );
        bits = LongArrayBitVector.getInstance().length( numberOfBits );

        // The vector was sized in bits just above, so its length is the range of valid
        // positions as is; scaling it by Long.SIZE would let hash() return positions
        // past the end of the vector.
        m = bits.length();
        if ( DEBUG ) System.err.println( "Number of bits: " + m );

        // The purpose of Random().nextInt() is to generate a different seed at each invocation.
        final MersenneTwister mersenneTwister = new MersenneTwister( new Random().nextInt() );
        a = new int[ d ];
        b = new int[ d ];
        for( int i = 0; i < d; i++ ) {
            a[ i ] = mersenneTwister.nextInt();
            b[ i ] = mersenneTwister.nextInt();
        }
    }

    /** Hashes the given integer with the given hash function.
     *
     * <P>The multiplication and the exclusive-or are performed on 32-bit integers; the
     * result is then widened (with sign extension) to a long and its sign bit is cleared,
     * so the value reduced modulo {@link #m} is never negative.
     *
     * @param x an integer.
     * @param k a hash function index (smaller than {@link #d}).
     * @return the position in the filter corresponding to <code>x</code> for the hash function <code>k</code>,
     * a value between 0 (inclusive) and {@link #m} (exclusive).
     */
    private long hash( final int x, final int k ) {
        return ( ( ( a[ k ] * x ) ^ b[ k ] ) & 0x7FFFFFFFFFFFFFFFL ) % m;
    }

    /** Checks whether the given integer is in this filter.
     *
     * <P>Note that this method may return true on an integer that has
     * not been added to the filter. This will happen with probability 2<sup>-<var>d</var></sup>,
     * where <var>d</var> is the number of hash functions specified at creation time, if
     * the number of the elements in the filter is less than <var>n</var>, the number
     * of expected elements specified at creation time.
     *
     * @param x an integer.
     * @return true if the integer is in the filter (or if an integer with the
     * same hash sequence is in the filter).
     */
    public boolean contains( final int x ) {
        // A single unset position proves that x was never added.
        for( int k = d; k-- != 0; ) {
            if ( ! bits.getBoolean( hash( x, k ) ) ) return false;
        }
        return true;
    }

    /** Adds an integer to the filter.
     *
     * @param x an integer.
     */
    public void add( final int x ) {
        // Sets the position selected by each of the d hash functions.
        for( int k = d; k-- != 0; ) bits.set( hash( x, k ) );
    }
}