package org.commoncrawl.service.crawler.util;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.TreeSet;

import junit.framework.Assert;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.util.MurmurHash;
import org.commoncrawl.util.URLUtils;

/**
 * A Bloom filter keyed on URLFP (URL fingerprint) values. Bucket indices are
 * derived from two MurmurHash values combined via double hashing
 * (bucket_i = hash1 + i * hash2, modulo the total number of bits).
 */
public class URLFPBloomFilter implements Cloneable {

  long nbits = 0;
  int numElements = 0;
  int bucketsPerElement = 0;
  int hashCount = 0;
  byte[] bits = null;
  long[] hashResults = null;

  public URLFPBloomFilter(int numElements, int hashCount, int bucketsPerElement) {
    this.numElements = numElements;
    this.bucketsPerElement = bucketsPerElement;
    this.hashCount = hashCount;
    this.nbits = (long) numElements * (long) bucketsPerElement + 20;
    System.out.println("Number of Bits is:" + nbits);
    System.out.println("Number of Bytes is:" + nbits / 8);
    this.bits = new byte[(int) ((nbits + 7) / 8)];
    this.hashResults = new long[hashCount];
  }

  /** Returns true if the key may be in the set (false positives are possible). */
  public final synchronized boolean isPresent(URLFP key) {
    if (key == null) {
      return false;
    }
    for (long bucketIndex : getHashBuckets(key, hashCount, nbits)) {
      if (((bits[(int) (bucketIndex >> 3)] >> (bucketIndex & 0x7)) & 0x1) != 1) {
        return false;
      }
    }
    return true;
  }

  /** Adds the key to the set. */
  public final synchronized void add(URLFP key) {
    if (key != null) {
      for (long bucketIndex : getHashBuckets(key, hashCount, nbits)) {
        bits[(int) (bucketIndex >> 3)] |= (1 << (bucketIndex & 0x7));
      }
    }
  }

  /** Writes the filter parameters followed by the raw bit array. */
  public final synchronized void serialize(OutputStream outputStream) throws IOException {
    DataOutputStream dataOut = new DataOutputStream(outputStream);
    dataOut.writeInt(0); // version placeholder
    dataOut.writeInt(numElements);
    dataOut.writeInt(hashCount);
    dataOut.writeInt(bucketsPerElement);
    outputStream.write(bits);
    outputStream.flush();
  }

  /** Reads a filter previously written by {@link #serialize(OutputStream)}. */
  public static URLFPBloomFilter load(InputStream inputStream) throws IOException {
    DataInputStream dataIn = new DataInputStream(inputStream);
    // skip version bytes ...
    dataIn.readInt();
    // initialize filter ...
    URLFPBloomFilter filter = new URLFPBloomFilter(dataIn.readInt(), dataIn.readInt(), dataIn.readInt());
    if (inputStream instanceof FSDataInputStream) {
      ((FSDataInputStream) inputStream).readFully(filter.bits);
    } else {
      // load bits ...
      // use readFully so short reads from the underlying stream still fill the whole array
      dataIn.readFully(filter.bits);
    }
    return filter;
  }

  @Override
  protected Object clone() throws CloneNotSupportedException {
    URLFPBloomFilter cloned = new URLFPBloomFilter(this.numElements, this.hashCount, this.bucketsPerElement);
    System.arraycopy(this.bits, 0, cloned.bits, 0, this.bits.length);
    return cloned;
  }

  final long[] getHashBuckets(URLFP key, int hashCount, long max) {
    // pack the domain hash and url hash into a byte array, derive two MurmurHash values from it,
    // and combine them (hash1 + i * hash2) to produce hashCount bucket indices
    byte[] b = new byte[16];
    b[0] = (byte) ((key.getDomainHash() >>> 24) & 0xFF);
    b[1] = (byte) ((key.getDomainHash() >>> 16) & 0xFF);
    b[2] = (byte) ((key.getDomainHash() >>> 8) & 0xFF);
    b[3] = (byte) ((key.getDomainHash() >>> 32) & 0xFF);
    b[4] = (byte) ((key.getDomainHash()) & 0xFF);
    b[5] = (byte) ((key.getUrlHash() >>> 56) & 0xFF);
    b[6] = (byte) ((key.getUrlHash() >>> 48) & 0xFF);
    b[7] = (byte) ((key.getUrlHash() >>> 40) & 0xFF);
    b[8] = (byte) ((key.getUrlHash() >>> 32) & 0xFF);
    b[9] = (byte) ((key.getUrlHash() >>> 24) & 0xFF);
    b[10] = (byte) ((key.getUrlHash() >>> 16) & 0xFF);
    b[11] = (byte) ((key.getUrlHash() >>> 8) & 0xFF);
    b[12] = (byte) ((key.getUrlHash()) & 0xFF);
    int hash1 = MurmurHash.hash(b, b.length, 0);
    int hash2 = MurmurHash.hash(b, b.length, hash1);
    for (int i = 0; i < hashCount; i++) {
      hashResults[i] = Math.abs(((long) hash1 + i * (long) hash2) % max);
    }
    return hashResults;
  }

  public static void main(String[] args) {
    Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("hadoop-default.xml");
    conf.addResource("hadoop-site.xml");
    conf.addResource("commoncrawl-default.xml");
    conf.addResource("commoncrawl-site.xml");

    CrawlEnvironment.setHadoopConfig(conf);
    CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");

    simpleTest(args[0]);
  }

  public static void simpleTest(String outputFileName) {
    URLFPBloomFilter bigBloomFilter = new URLFPBloomFilter(750000000, 10, 11);

    TreeSet<URLFP> addedSet = new TreeSet<URLFP>();
    TreeSet<URLFP> notAddedSet = new TreeSet<URLFP>();

    for (int i = 0; i < 100000; ++i) {
      URLFP fingerprint = URLUtils.getURLFPFromURL("http://foo.bar.com/" + i, false);
      URLFP notfingerprint = URLUtils.getURLFPFromURL("http://someother.bar.com/" + i, false);
      addedSet.add(fingerprint);
      notAddedSet.add(notfingerprint);
    }

    System.out.println("Adding " + addedSet.size() + " elements to bloom filter");
    long timeStart = System.currentTimeMillis();
    for (URLFP testFingerprint : addedSet) {
      bigBloomFilter.add(testFingerprint);
    }
    long timeEnd = System.currentTimeMillis();
    System.out.println("Add Took:" + (timeEnd - timeStart) + " MS");

    timeStart = System.currentTimeMillis();
    for (URLFP testFingerprint : addedSet) {
      if (!bigBloomFilter.isPresent(testFingerprint)) {
        Assert.fail("added fingerprint missing from bloom filter");
      }
    }
    timeEnd = System.currentTimeMillis();
    System.out.println("Lookup of " + addedSet.size() + " items in set took:" + (timeEnd - timeStart) + " MS");

    timeStart = System.currentTimeMillis();
    for (URLFP testFingerprint : notAddedSet) {
      if (bigBloomFilter.isPresent(testFingerprint)) {
        Assert.assertTrue(addedSet.contains(testFingerprint));
      }
    }
    timeEnd = System.currentTimeMillis();
    System.out.println("Lookup of " + addedSet.size() + " items not in set took:" + (timeEnd - timeStart) + " MS");

    System.out.println("Cloning");
    URLFPBloomFilter clone = null;
    timeStart = System.currentTimeMillis();
    try {
      clone = (URLFPBloomFilter) bigBloomFilter.clone();
    } catch (CloneNotSupportedException e1) {
      e1.printStackTrace();
    }
    timeEnd = System.currentTimeMillis();
    System.out.println("Clone took:" + (timeEnd - timeStart) + " MS");

    Path outputLocation = new Path(outputFileName);

    // serialize
System.out.println("Serializing to:" + outputLocation); try { timeStart = System.currentTimeMillis(); FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); FSDataOutputStream outputStream = fs.create(outputLocation,true,10240000); clone.serialize(outputStream); outputStream.flush(); outputStream.close(); timeEnd = System.currentTimeMillis(); System.out.println("Seialization took:" + (timeEnd - timeStart) + " MS"); clone = null; bigBloomFilter = null; System.out.println("Reloading"); timeStart = System.currentTimeMillis(); FSDataInputStream inputStream = fs.open(outputLocation); bigBloomFilter = URLFPBloomFilter.load(inputStream); inputStream.close(); timeEnd = System.currentTimeMillis(); System.out.println("Reload took:" + (timeEnd - timeStart) + " MS"); timeStart = System.currentTimeMillis(); for (URLFP testFingerprint : addedSet) { if (!bigBloomFilter.isPresent(testFingerprint)) { Assert.assertFalse(true); } } timeEnd = System.currentTimeMillis(); System.out.println("Lookup of " + addedSet.size() + " items in set took:" + (timeEnd - timeStart) + " MS"); timeStart = System.currentTimeMillis(); for (URLFP testFingerprint : notAddedSet) { if (bigBloomFilter.isPresent(testFingerprint)) { Assert.assertTrue(addedSet.contains(testFingerprint)); } } timeEnd = System.currentTimeMillis(); System.out.println("Lookup of " + addedSet.size() + " items not in set took:" + (timeEnd - timeStart) + " MS"); } catch (IOException e) { e.printStackTrace(); } } public int getNumElements() { return numElements; } public int getBucketsPerElement() { return bucketsPerElement; } public int getHashCount() { return hashCount; } }