// BloomFilter.java example
//
// Explorer
// codemap-master
package ch.akuhn.hapax.util;
//TODO: this class is NOT TESTED!
import static java.lang.Math.pow;

import java.util.BitSet;

import ch.akuhn.util.Lorem;
import ch.akuhn.util.Out;

/**
 * A Bloom filter over strings: a compact, probabilistic set whose
 * {@link #contains(String)} may report false positives but never false
 * negatives (bits are only ever set, never cleared).
 * <p>
 * The filter addresses <code>2^m</code> bit positions. Every added element
 * sets one position per {@link Hash} function; an element is reported as
 * contained when all of its positions are set.
 * <p>
 * This class performs no synchronization of its own.
 */
public class BloomFilter {

    /** Exponent used by the no-argument constructor: 2^12 = 4096 bit positions. */
    private static final int DEFAULT_EXPONENT = 12;

    /**
     * Largest accepted exponent. With a 31-bit mask every masked hash still
     * fits into a non-negative <code>int</code>, which {@link BitSet} requires
     * for its indices.
     */
    private static final int MAX_EXPONENT = 31;

    /**
     * The hash functions applied to every element. Cached because
     * <code>Hash.values()</code> returns a fresh array copy on each call.
     */
    private static final Hash[] HASHES = Hash.values();

    /** Bit mask <code>2^m - 1</code> that reduces a raw hash to a bit position. */
    private final long mask;

    /** Backing bits; grows lazily as higher positions are set. */
    private final BitSet bits;

    /** Number of calls to {@link #add(String)}; duplicates are counted again. */
    private int count = 0;

    /**
     * Creates a filter with <code>2^12</code> bit positions.
     */
    public BloomFilter() {
        this(DEFAULT_EXPONENT);
    }

    /**
     * Creates a filter with <code>2^m</code> bit positions.
     *
     * @param m base-2 logarithm of the number of bit positions, from 0 to 31
     *            inclusive
     * @throws IllegalArgumentException if <code>m</code> is outside that range
     */
    public BloomFilter(int m) {
        if (m < 0 || m > MAX_EXPONENT) {
            throw new IllegalArgumentException(
                    "m must be between 0 and " + MAX_EXPONENT + ", but was " + m);
        }
        // Shift a long: an int shift only uses the low five bits of the shift
        // distance and would silently wrap for m >= 32 (e.g. m == 32 -> mask 0).
        mask = (1L << m) - 1;
        bits = new BitSet();
    }

    /**
     * Adds an element to the filter by setting one bit per hash function.
     *
     * @param element the string to add; must not be null
     */
    public void add(String element) {
        for (Hash each : HASHES) {
            bits.set(hash(each, element));
        }
        count++;
    }

    /**
     * Estimates the probability that {@link #contains(String)} returns
     * <code>true</code> for an element that was never added, using
     * <code>(1 - (1 - 1/m)^(k*n))^k</code> where <code>k</code> is the number
     * of hash functions, <code>m</code> the number of bit positions and
     * <code>n</code> the number of {@link #add(String)} calls so far.
     *
     * @return the estimated false positive probability
     */
    public double falsePositiveProbability() {
        // k follows the enum so the estimate stays right if hashes are added or removed.
        double k = HASHES.length, m = (mask + 1), n = count;
        return pow(1 - pow(1 - 1 / m, k * n), k);
    }

    /**
     * Reduces the raw hash of an element to a bit position.
     *
     * @return a position between 0 and <code>mask</code>; never negative
     *         because <code>mask</code> has at most 31 bits set
     */
    private int hash(Hash hash, String element) {
        return (int) (hash.hash(element) & mask);
    }

    /**
     * Tests whether an element may have been added.
     *
     * @param element the string to look up; must not be null
     * @return <code>false</code> if the element was definitely never added,
     *         <code>true</code> if it was added or if all of its bit positions
     *         happen to be set by other elements
     */
    public boolean contains(String element) {
        for (Hash each : HASHES) {
            if (!bits.get(hash(each, element))) {
                return false;
            }
        }
        return true;
    }

    /**
     * The string hash functions used by the filter.
     * <p>
     * All implementations compute in 64-bit <code>long</code> arithmetic with
     * signed shifts. Note that <code>int</code> hex literals with the top bit
     * set (such as <code>0xAAAAAAAA</code>) are negative and are sign-extended
     * when widened to <code>long</code>.
     */
    public enum Hash {

        // The hashing function had been distributed under:
        //
        // Copyright (c) 2008, Zbigniew Jerzak, Dresden University of Technology
        //
        // All rights reserved.
        //
        // Redistribution and use in source and binary forms, with or without
        // modification, are permitted provided that the following conditions
        // are met:
        //
        // * Redistributions of source code must retain the above copyright
        // notice, this list of conditions and the following disclaimer.
        // * Redistributions in binary form must reproduce the above copyright
        // notice, this list of conditions and the following disclaimer in the
        // documentation and/or other materials provided with the distribution.
        // * Neither the name of the Dresden University of Technology nor the
        // names of its contributors may be used to endorse or promote products
        // derived from this software without specific prior written permission.
        //
        // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
        // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
        // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
        // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
        // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
        // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
        // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
        // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
        // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
        // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
        // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

        /**
         * An algorithm produced by Arash Partow. Based on ideas from all of the
         * above hash functions making a hybrid rotative and additive hash
         * function algorithm. Resembles the design as close as possible to a
         * simple LFSR. An empirical result which demonstrated the distributive
         * abilities of the hash algorithm was obtained using a hash-table with
         * 100003 buckets, hashing The Project Gutenberg Etext of Webster's
         * Unabridged Dictionary, the longest encountered chain length was 7,
         * the average chain length was 2, the number of empty buckets was 4579.
         * 
         * @author Arash Partow
         * @author Zbigniew Jerzak
         */
        AP {
            @Override
            public long hash(final String data) {
                long hash = 0xAAAAAAAA;
                for (int i = 0; i < data.length(); i++) {
                    // Even and odd positions are mixed in with different shifts.
                    if ((i & 1) == 0) {
                        hash ^= ((hash << 7) ^ data.charAt(i) ^ (hash >> 3));
                    } else {
                        hash ^= (~((hash << 11) ^ data.charAt(i) ^ (hash >> 5)));
                    }
                }
                return hash;
            }
        },
        /**
         * This hash function comes from Brian Kernighan and Dennis Ritchie's
         * book "The C Programming Language". It is a simple hash function using
         * a strange set of possible seeds which all constitute a pattern of
         * 31....31...31 etc, it seems to be very similar to the DJB hash
         * function.
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         */
        BKDR {
            private static final long seed = 131;

            @Override
            public long hash(final String data) {
                long hash = 0;
                for (int i = 0; i < data.length(); i++) {
                    hash = (hash * seed) + data.charAt(i);
                }
                return hash;
            }
        },
        /**
         * Yet another hash implementation
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         * 
         */
        BP {
            @Override
            public long hash(final String data) {
                long hash = 0;

                for (int i = 0; i < data.length(); i++) {
                    hash = hash << 7 ^ data.charAt(i);
                }
                return hash;
            }
        },
        /**
         * An algorithm proposed by Donald E. Knuth in The Art Of Computer
         * Programming Volume 3, under the topic of sorting and search chapter
         * 6.4.
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         */
        DEK {
            @Override
            public long hash(final String data) {
                long hash = data.length();
                for (int i = 0; i < data.length(); i++) {
                    hash = ((hash << 5) ^ (hash >> 27)) ^ data.charAt(i);
                }
                return hash;
            }
        },
        /**
         * An algorithm produced by Professor Daniel J. Bernstein and shown
         * first to the world on the usenet newsgroup comp.lang.c. It is one of
         * the most efficient hash functions ever published.
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         */
        DJB {
            @Override
            public long hash(final String data) {
                long hash = 5381;

                for (int i = 0; i < data.length(); i++) {
                    hash = ((hash << 5) + hash) + data.charAt(i);
                }
                return hash;
            }
        },
        /**
         * Yet another hash implementation.
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         * 
         */
        FNV {
            private static final long fnv_prime = 0x811C9DC5;

            @Override
            public long hash(final String data) {
                long hash = 0;

                for (int i = 0; i < data.length(); i++) {
                    hash *= fnv_prime;
                    hash ^= data.charAt(i);
                }

                return hash;
            }
        },
        /**
         * A bitwise hash function written by Justin Sobel
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         */
        JS {
            @Override
            public long hash(final String data) {
                long hash = 1315423911;

                for (int i = 0; i < data.length(); i++) {
                    hash ^= ((hash << 5) + data.charAt(i) + (hash >> 2));
                }

                return hash;
            }
        },
        /**
         * This hash algorithm is based on work by Peter J. Weinberger of AT&T
         * Bell Labs. The book Compilers (Principles, Techniques and Tools) by
         * Aho, Sethi and Ullman, recommends the use of hash functions that
         * employ the hashing methodology found in this particular algorithm.
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         */
        PJW {
            private static final long BitsInUnsignedInt = 32L;
            private static final long ThreeQuarters = (long) ((BitsInUnsignedInt * 3) / 4);
            private static final long OneEighth = (long) (BitsInUnsignedInt / 8);
            private static final long HighBits = (long) (0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);

            @Override
            public long hash(final String data) {
                long hash = 0;
                long test = 0;

                for (int i = 0; i < data.length(); i++) {
                    hash = (hash << OneEighth) + data.charAt(i);

                    // Fold any bits that reached the high region back into
                    // the low bits, then clear the high region.
                    if ((test = hash & HighBits) != 0) {
                        hash = ((hash ^ (test >> ThreeQuarters)) & (~HighBits));
                    }
                }
                return hash;
            }
        },
        /**
         * A simple hash function from Robert Sedgewick's Algorithms in C book.
         * Added some simple optimizations to the algorithm in order to speed up
         * its hashing process.
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         */
        RS {

            private static final long b = 378551;

            @Override
            public long hash(final String data) {
                long a = 63689;
                long hash = 0;

                for (int i = 0; i < data.length(); i++) {
                    hash = hash * a + data.charAt(i);
                    a = a * b;
                }
                return hash;
            }
        },
        /**
         * This is the algorithm of choice which is used in the open source SDBM
         * project. The hash function seems to have a good over-all distribution
         * for many different data sets. It seems to work well in situations
         * where there is a high variance in the MSBs of the elements in a data
         * set.
         * 
         * @author Zbigniew Jerzak
         * @author Arash Partow
         */
        SDBM {

            @Override
            public long hash(final String data) {
                long hash = 0;

                for (int i = 0; i < data.length(); i++) {
                    hash = data.charAt(i) + (hash << 6) + (hash << 16) - hash;
                }

                return hash;
            }
        };

        /**
         * Computes this function's hash of the given string.
         * <p>
         * Declared abstract so that a constant without its own implementation
         * fails to compile instead of silently hashing every string to the
         * same value.
         *
         * @param data the string to hash; must not be null
         * @return the raw 64-bit hash value, which may be negative
         */
        public abstract long hash(String data);

    }

    /**
     * Small demo: fills a filter with the words of {@code Lorem.ipsum()} and
     * prints two membership queries and the estimated false positive
     * probability.
     *
     * @param args ignored
     */
    public static void main(String[] args) {

        BloomFilter bf = new BloomFilter();

        for (String each : Lorem.ipsum()) {
            bf.add(each);
        }

        Out.puts( bf.contains("Lorem") );
        Out.puts( bf.contains("Foo") );

        Out.puts( bf.falsePositiveProbability() );

    }

}