BloomFilter.java example

Explorer
jucy-master
package uc.crypto;


import helpers.GH;


import java.util.BitSet;



import uc.files.filelist.FileList;
import uc.files.filelist.FileListFile;

/**
 * 
Implementation: DC++, ADCH++

Description: Bloom filters allow the hub to filter certain searches using a 
bitmap that represents the hashes of the files in the user's share. For an 
introduction to bloom filters, see Wikipedia. When the user updates the share, 
it must send an INF containing the flag SF. The hub may at any time request 
that the client sends an updated version of its bloom filter by sending a 
GET command to the client. The client will then respond using SND and send 
the bloom filter binary data.

Let b be the number of bits used for file hashes, n the number of files in 
the users share, m the size of the bloom filter in bits, k the number of subhashes 
constructed from the file hash, h the number of bits to use for each subhash 
and p the probability of a false positive. The hub chooses k, h and m. k must 
be chosen so that k*h <= b. h must be chosen <= 64 and such that 2^h > m. m 
must be chosen such that m % 64 == 0. The hub then sends GET specifying "blom" 
as type, "/" as path, "0" as start and "m/8" as number of bytes, k in the flag 
"BK" and h in the flag "BH".

The client constructs the bloom filter by creating a bit array of m bits set to 0. 
For each file it then sets to "1" k positions constructed from the file hash. Seeing 
the file hash as a stream of bits (starting from the lowest bit of the first byte, 
ending at the highest bit of the last byte), the client should use h bits starting 
at the first bit of the first byte to create an integer and apply modulo m to get 
the position in the bit array, then redo the process k times advancing the starting 
position by h each time.

Once the hub has received the bloom filter bit array, for each search command it 
processes, if it contains a hash search term, it can skip broadcasting 
the search to a particular client if at least one of the k bits in that client's 
bit array is "0", calculating positions as the client does when setting bits to 
"1". The hub has to evaluate the filter for each client that it has a bloom filter 
for, for each search.

p = (1 - (1 - 1 / m)^(k * n))^k, thus p becomes smaller as m grows and larger as n 
grows. Larger m means more bits to transfer but also fewer false positives. The 
optimum value for k given m and n is (m / n) * ln 2. The largest k supported by a 
hash of a certain size is b / h, so if the hub wants the smallest p possible, it 
should choose the smallest possible h which gives the largest k, and then calculate 
m = k * n/ln 2, checking that the resulting m < 2^h. 2^h should be much larger than m 
(at least 3-4 times), because of how the modulo operator works. Also, the bandwidth 
required to transfer the bloom filter grows with m, so the hub may wish to cap m. In that 
case, it should still choose k according to m / n * ln 2, but send an h as big as possible 
to alleviate biasing problems.

For TTH roots, b is 192 and a reasonable value for h is 24, giving a maximum k = 8 which 
means that m = 8 * n / ln 2 ≈ 11.5 * n. The required bandwidth then becomes 11.5 * n / 8 bytes, 
so approximately 1.44 bytes per file in the users share. For 20000 files, m should then be 
230016 (taking into account the modulo 64 requirement), giving a p = 0.004, in other words 
~99.6% of all searches that don't match a users share would not be sent, saving valuable upload 
bandwidth for the hub. The client calculates i valid positions, if x is an array of bytes 
containing the hash of the file, on a little-endian machine, by doing 
pos = x[0+i*h/8] | (x[1+i*h/8] << 8) | (x[2+i*h/8] << 16) for i = [0;k). 
This is of course a special case where h % 8 = 0, the actual algorithm has to take into 
consideration values for h where the integer crosses byte boundaries. 
 *
 */
public class BloomFilter {

	/**
	 * Upper bound for k*h: TigerHashValue.digestlength * 8. Presumably the
	 * size of a Tiger hash in bits ("b" in the class description) - assumes
	 * digestlength is given in bytes.
	 */
	private static final int b = TigerHashValue.digestlength * 8;

	/** Bit array backing the filter. */
	private final BitSet bits;

	/**
	 * h: number of hash bits consumed per sub-hash,
	 * k: number of sub-hashes (bit positions) per hash,
	 * m: size of the filter in bits.
	 */
	private final int h, k, m;

	/**
	 * Builds a filter and adds the TTH root of every file obtained by
	 * iterating over the root of the given file list.
	 *
	 * @param fileList file list whose root is iterated for files
	 * @param m size of the filter in bits
	 * @param h number of hash bits used for each sub-hash
	 * @param k number of sub-hashes (bit positions) per hash
	 * @return the populated filter
	 * @throws IllegalArgumentException if m, h or k are rejected by
	 *         {@link #BloomFilter(int, int, int)}
	 */
	public static BloomFilter create(FileList fileList, int m, int h, int k) {
		BloomFilter blom = new BloomFilter(m, h, k);
		for (FileListFile f : fileList.getRoot()) {
			blom.addHashValue(f.getTTHRoot());
		}
		return blom;
	}

	/**
	 * Creates an empty filter.
	 *
	 * @param m the size of the filter in bits; must be a positive multiple of 64
	 * @param h number of hash bits used for each sub-hash; must be in 1..64
	 * @param k number of sub-hashes per hash; must be positive and satisfy k*h <= b
	 * @throws IllegalArgumentException if any of the constraints above is violated
	 */
	public BloomFilter(int m, int h, int k) {
		// m == 0 would later divide by zero in calcPos, m < 0 cannot size a BitSet.
		if (m <= 0) {
			throw new IllegalArgumentException("m:" + m);
		}
		if (h <= 0) {
			throw new IllegalArgumentException("h:" + h);
		}
		if (k <= 0) {
			throw new IllegalArgumentException("k:" + k);
		}
		// Multiply as long so that a huge k cannot overflow int and slip past the check.
		if ((long) k * h > b) {
			throw new IllegalArgumentException("k*h > b!  k:" + k + " h:" + h);
		}
		if (h > 64) {
			throw new IllegalArgumentException("h:" + h);
		}
		if (m % 64 != 0) {
			throw new IllegalArgumentException("m:" + m);
		}

		this.m = m;
		this.h = h;
		this.k = k;
		this.bits = new BitSet(m);
	}

	/**
	 * Constructs a filter from its byte representation. The filter size m is
	 * bytes.length * 8, the bytes are converted with GH.toSet and OR-ed into
	 * the (initially empty) bit array.
	 *
	 * @param bytes the bytes making up the bloom filter
	 * @param h number of hash bits used for each sub-hash
	 * @param k number of sub-hashes per hash
	 * @throws IllegalArgumentException if bytes.length * 8, h or k are rejected by
	 *         {@link #BloomFilter(int, int, int)}
	 */
	public BloomFilter(byte[] bytes, int h, int k) {
		this(bytes.length * 8, h, k);
		bits.or(GH.toSet(bytes));
	}

	/**
	 * Adds a hash to the filter by setting k bit positions.
	 *
	 * The i-th position is derived from the h hash bits starting at bit
	 * offset i*h of the raw hash, reduced modulo m (see calcPos).
	 *
	 * @param hash the hash value to add
	 */
	public void addHashValue(HashValue hash) {
		byte[] raw = hash.getRaw();
		for (int i = 0; i < k; i++) {
			bits.set(calcPos(raw, i * h));
		}
	}

	/**
	 * Tests whether a hash may have been added to this filter.
	 *
	 * @param hash the hash value to look up
	 * @return false if at least one of the k positions derived from the hash
	 *         is not set, true otherwise
	 */
	public boolean possiblyContains(HashValue hash) {
		byte[] raw = hash.getRaw();
		for (int i = 0; i < k; i++) {
			if (!bits.get(calcPos(raw, i * h))) {
				return false;
			}
		}
		return true;
	}

	/**
	 * Computes the bit position for the sub-hash starting at the given bit
	 * offset: the h-bit value read by getLong, taken as an unsigned number,
	 * modulo m.
	 *
	 * @param hash raw hash bytes
	 * @param startpos bit offset of the sub-hash within the hash
	 * @return a position in the range [0, m)
	 */
	private int calcPos(byte[] hash, int startpos) {
		long value = getLong(hash, startpos, h);
		if (value >= 0) {
			return (int) (value % m);
		}
		// Only reachable for h == 64 with the top bit set: the long is negative
		// but stands for an unsigned value. Write it as 2*half + low and reduce
		// each part; (half % m) * 2 + low stays below 2^32, so nothing overflows.
		long half = value >>> 1;
		long low = value & 1L;
		return (int) (((half % m) * 2 + low) % m);
	}

	/**
	 * Assembles an integer from consecutive bits of a byte array. The bit
	 * reported by GH.getBit for offset startpos + i becomes bit i of the result.
	 *
	 * @param source bytes to read from
	 * @param startpos offset of the first bit to read
	 * @param length number of bits to read; callers pass at most 64
	 * @return the assembled value (negative if bit 63 is set)
	 */
	private static long getLong(byte[] source, int startpos, int length) {
		long ret = 0;
		for (int i = 0; i < length; i++) {
			if (GH.getBit(source, startpos + i)) {
				ret |= 1L << i;
			}
		}
		return ret;
	}

	/**
	 * Returns the filter as a byte array, produced by GH.toBytes(bits, m).
	 * Intended layout: the first bit of the filter is the lowest bit of the
	 * first byte of the array.
	 *
	 * @return the byte representation of this filter
	 */
	public byte[] getBytes() {
		return GH.toBytes(bits, m);
	}

	/**
	 * @return the result of BASE32Encoder.encode applied to {@link #getBytes()}
	 */
	@Override
	public String toString() {
		return BASE32Encoder.encode(getBytes());
	}

	@Override
	public int hashCode() {
		final int prime = 31;
		int result = 1;
		// bits is final and always assigned in the constructor, so never null.
		result = prime * result + bits.hashCode();
		result = prime * result + h;
		result = prime * result + k;
		result = prime * result + m;
		return result;
	}

	/**
	 * Two filters are equal if they are of the same class and have the same
	 * bit array and the same h, k and m.
	 */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (obj == null || getClass() != obj.getClass()) {
			return false;
		}
		BloomFilter other = (BloomFilter) obj;
		return h == other.h
				&& k == other.k
				&& m == other.m
				&& bits.equals(other.bits);
	}

}