package ch.akuhn.hapax.util;
//TODO: this class is NOT TESTED!
import static java.lang.Math.pow;
import java.util.BitSet;
import ch.akuhn.util.Lorem;
import ch.akuhn.util.Out;
public class BloomFilter {
/** Bit mask ({@code 2^m - 1}) applied to every raw hash value to turn it into a bit index. */
private final long mask;
/** Backing bit vector; grows lazily as bits are set. */
private final BitSet bits;
/** Number of {@link #add(String)} calls so far; an element added twice is counted twice. */
private int count = 0;

/** Creates a filter with the default size exponent of 12, i.e. 4096 addressable bits. */
public BloomFilter() {
    this(12);
}

/**
 * Creates a filter that addresses {@code 2^m} bits.
 *
 * @param m size exponent, from 0 to 31 inclusive; the upper bound keeps every
 *          masked hash value within the non-negative {@code int} range that
 *          {@link BitSet} accepts as an index
 * @throws IllegalArgumentException if {@code m} is outside 0..31
 */
public BloomFilter(int m) {
    if (m < 0 || m > 31) {
        throw new IllegalArgumentException("m must be between 0 and 31, but was " + m);
    }
    // Shift a long, not an int: an int shift only uses the low five bits of the
    // distance, so "1 << 32" is 1 and the mask would silently collapse to 0.
    mask = (1L << m) - 1;
    bits = new BitSet();
}
/**
 * Adds an element to the filter: for every {@link Hash} function the bit
 * addressed by the masked hash of the element is set, then the element
 * counter is incremented.
 *
 * @param element the string to add
 */
public void add(String element) {
    Hash[] functions = Hash.values();
    for (int i = 0; i < functions.length; i++) {
        int index = hash(functions[i], element);
        bits.set(index);
    }
    count += 1;
}
/**
 * Estimates the probability that {@link #contains(String)} answers
 * {@code true} for an element that was never added, using
 * {@code (1 - (1 - 1/m)^(k*n))^k}.
 *
 * <p>Here {@code k} is the number of {@link Hash} functions, {@code m} the
 * number of addressable bits ({@code mask + 1}) and {@code n} the number of
 * {@link #add(String)} calls made so far.
 *
 * @return the estimated false-positive probability
 */
public double falsePositiveProbability() {
    // Derive k from the enum so the estimate stays right if a hash function
    // is ever added or removed.
    double k = Hash.values().length;
    double m = mask + 1;
    double n = count;
    return pow(1 - pow(1 - 1 / m, k * n), k);
}
/**
 * Computes the bit index for an element under one hash function by masking
 * the raw hash value with {@code mask} and narrowing the result to an int.
 *
 * @param hash    the hash function to apply
 * @param element the string to hash
 * @return the masked hash value, used as an index into the bit vector
 */
private int hash(Hash hash, String element) {
    long raw = hash.hash(element);
    long masked = raw & mask;
    return (int) masked;
}
/**
 * Tests whether an element may have been added. Checking stops at the first
 * hash function whose bit is not set.
 *
 * @param element the string to look up
 * @return {@code false} if at least one of the element's bits is unset,
 *         {@code true} if all of them are set
 */
public boolean contains(String element) {
    boolean allSet = true;
    Hash[] functions = Hash.values();
    for (int i = 0; allSet && i < functions.length; i++) {
        allSet = bits.get(hash(functions[i], element));
    }
    return allSet;
}
public enum Hash {
// The hashing function had been distributed under:
//
// Copyright (c) 2008, Zbigniew Jerzak, Dresden University of Technology
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of the Dresden University of Technology nor the
// names of its contributors may be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/**
* An algorithm produced by Arash Partow. Based on ideas from all of the
* above hash functions making a hybrid rotative and additive hash
* function algorithm. Resembles the design as close as possible to a
* simple LFSR. An empirical result which demonstrated the distributive
* abilities of the hash algorithm was obtained using a hash-table with
* 100003 buckets, hashing The Project Gutenberg Etext of Webster's
* Unabridged Dictionary, the longest encountered chain length was 7,
* the average chain length was 2, the number of empty buckets was 4579.
*
* @author Arash Partow
* @author Zbigniew Jerzak
*/
AP {
@Override
public long hash(final String data) {
long hash = 0xAAAAAAAA;
for (int i = 0; i < data.length(); i++) {
if ((i & 1) == 0) {
hash ^= ((hash << 7) ^ data.charAt(i) ^ (hash >> 3));
} else {
hash ^= (~((hash << 11) ^ data.charAt(i) ^ (hash >> 5)));
}
}
return hash;
}
},
/**
* This hash function comes from Brian Kernighan and Dennis Ritchie's
* book "The C Programming Language". It is a simple hash function using
* a strange set of possible seeds which all constitute a pattern of
* 31....31...31 etc, it seems to be very similar to the DJB hash
* function.
*
* @author Zbigniew Jerzak
* @author Arash Partow
*/
BKDR {
private final static long seed = 131;
@Override
public long hash(final String data) {
long hash = 0;
for (int i = 0; i < data.length(); i++) {
hash = (hash * seed) + data.charAt(i);
}
return hash;
}
},
/**
* Yet another hash implementation
*
* @author Zbigniew Jerzak
* @author Arash Partow
*
*/
BP {
@Override
public long hash(final String data) {
long hash = 0;
for (int i = 0; i < data.length(); i++) {
hash = hash << 7 ^ data.charAt(i);
}
return hash;
}
},
/**
* An algorithm proposed by Donald E. Knuth in The Art Of Computer
* Programming Volume 3, under the topic of sorting and search chapter
* 6.4.
*
* @author Zbigniew Jerzak
* @author Arash Partow
*/
DEK {
@Override
public long hash(final String data) {
long hash = data.length();
for (int i = 0; i < data.length(); i++) {
hash = ((hash << 5) ^ (hash >> 27)) ^ data.charAt(i);
}
return hash;
}
},
/**
* An algorithm produced by Professor Daniel J. Bernstein and shown
* first to the world on the usenet newsgroup comp.lang.c. It is one of
* the most efficient hash functions ever published.
*
* @author Zbigniew Jerzak
* @author Arash Partow
*/
DJB {
@Override
public long hash(final String data) {
long hash = 5381;
for (int i = 0; i < data.length(); i++) {
hash = ((hash << 5) + hash) + data.charAt(i);
}
return hash;
}
},
/**
* Yet another hash implementation.
*
* @author Zbigniew Jerzak
* @author Arash Partow
*
*/
FNV {
private final static long fnv_prime = 0x811C9DC5;
@Override
public long hash(final String data) {
long hash = 0;
for (int i = 0; i < data.length(); i++) {
hash *= fnv_prime;
hash ^= data.charAt(i);
}
return hash;
}
},
/**
* A bitwise hash function written by Justin Sobel
*
* @author Zbigniew Jerzak
* @author Arash Partow
*/
JS {
@Override
public long hash(final String data) {
long hash = 1315423911;
for (int i = 0; i < data.length(); i++) {
hash ^= ((hash << 5) + data.charAt(i) + (hash >> 2));
}
return hash;
}
},
/**
* This hash algorithm is based on work by Peter J. Weinberger of AT&T
* Bell Labs. The book Compilers (Principles, Techniques and Tools) by
* Aho, Sethi and Ulman, recommends the use of hash functions that
* employ the hashing methodology found in this particular algorithm.
*
* @author Zbigniew Jerzak
* @author Arash Partow
*/
PJW {
private final static long BitsInUnsignedInt = 32L;
private final static long ThreeQuarters = (long) ((BitsInUnsignedInt * 3) / 4);
private final static long OneEighth = (long) (BitsInUnsignedInt / 8);
private final static long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);
@Override
public long hash(final String data) {
long hash = 0;
long test = 0;
for (int i = 0; i < data.length(); i++) {
hash = (hash << OneEighth) + data.charAt(i);
if ((test = hash & HighBits) != 0) {
hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
}
}
return hash;
}
},
/**
* A simple hash function from Robert Sedgwicks Algorithms in C book.
* Added some simple optimizations to the algorithm in order to speed up
* its hashing process.
*
* @author Zbigniew Jerzak
* @author Arash Partow
*/
RS {
private final static long b = 378551;
/*
* (non-Javadoc)
*
* @see bloomfilter.hashes.IHash#hash(java.lang.String)
*/
@Override
public long hash(final String data) {
long a = 63689;
long hash = 0;
for (int i = 0; i < data.length(); i++) {
hash = hash * a + data.charAt(i);
a = a * b;
}
return hash;
}
},
/**
* This is the algorithm of choice which is used in the open source SDBM
* project. The hash function seems to have a good over-all distribution
* for many different data sets. It seems to work well in situations
* where there is a high variance in the MSBs of the elements in a data
* set.
*
* @author Zbigniew Jerzak
* @author Arash Partow
*/
SDBM {
/*
* (non-Javadoc)
*
* @see bloomfilter.hashes.IHash#hash(java.lang.String)
*/
@Override
public long hash(final String data) {
long hash = 0;
for (int i = 0; i < data.length(); i++) {
hash = data.charAt(i) + (hash << 6) + (hash << 16) - hash;
}
return hash;
}
};
public long hash(String data) {
return -1l;
}
}
/**
 * Manual smoke test: fills a default-sized filter with the strings returned
 * by {@code Lorem.ipsum()}, then prints the lookup result for "Lorem", the
 * lookup result for "Foo" and the estimated false-positive probability.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    BloomFilter filter = new BloomFilter();
    for (String word : Lorem.ipsum()) {
        filter.add(word);
    }
    boolean hasLorem = filter.contains("Lorem");
    Out.puts(hasLorem);
    boolean hasFoo = filter.contains("Foo");
    Out.puts(hasFoo);
    double estimate = filter.falsePositiveProbability();
    Out.puts(estimate);
}
}