/**
* Copyright 2008 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.util;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.TreeSet;
import junit.framework.Assert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.protocol.URLFP;
import org.commoncrawl.protocol.URLFPV2;
/**
*
* @author rana
*
*/
/**
 * A bloom filter keyed by URL fingerprints (both the legacy {@link URLFP} and
 * the {@link URLFPV2} variants). Bucket indices are derived by double hashing
 * (Kirsch-Mitzenmacher style): two Murmur hashes of the big-endian fingerprint
 * bytes are combined as |(h1 + i*h2) mod nbits| for each of the hashCount probes.
 *
 * Thread-safety: all public query/mutation operations are synchronized on this
 * instance. The package-visible getHashBuckets helpers reuse a shared scratch
 * array (hashResults) and are NOT safe to call concurrently outside that lock.
 *
 * Serialization format: int version (currently 0), int numElements,
 * int hashCount, int bucketsPerElement, then the raw bit words
 * (long nbits followed by one long per word).
 *
 * @author rana
 */
public class URLFPBloomFilter implements Cloneable {

  /** total number of bit buckets: numElements * bucketsPerElement + excess padding */
  long nbits = 0;
  /** expected number of elements this filter was sized for */
  int numElements = 0;
  /** bit buckets allocated per expected element */
  int bucketsPerElement = 0;
  /** number of hash probes per key */
  int hashCount = 0;
  /** backing bit storage */
  OpenBitSet bits = null;
  /** scratch buffer reused to hold the bucket indices of the most recent key */
  long hashResults[] = null;

  static final int BUCKETS_PER_WORD = 16;

  /**
   * @param numElements       expected element count
   * @param hashCount         number of hash probes per key
   * @param bucketsPerElement bits allocated per expected element
   */
  public URLFPBloomFilter(int numElements, int hashCount, int bucketsPerElement) {
    this.numElements = numElements;
    this.bucketsPerElement = bucketsPerElement;
    this.hashCount = hashCount;
    // widen to long BEFORE multiplying to avoid int overflow on large filters;
    // the +20 excess bits pad the bit set
    this.nbits = (long) numElements * (long) bucketsPerElement + 20;
    this.bits = new OpenBitSet(nbits, true);
    this.hashResults = new long[hashCount];
  }

  /**
   * @return true if every probe bit for key is set (subject to the bloom
   *         filter's false-positive rate); false for null keys.
   */
  public final synchronized boolean isPresent(URLFP key) {
    if (key == null)
      return false;
    for (long bucketIndex : getHashBuckets(key, hashCount, nbits)) {
      if (!bits.fastGet(bucketIndex))
        return false;
    }
    return true;
  }

  /**
   * @return true if every probe bit for key is set (subject to the bloom
   *         filter's false-positive rate); false for null keys.
   */
  public final synchronized boolean isPresent(URLFPV2 key) {
    if (key == null)
      return false;
    for (long bucketIndex : getHashBuckets(key, hashCount, nbits)) {
      if (!bits.fastGet(bucketIndex))
        return false;
    }
    return true;
  }

  /** Sets all probe bits for key. Null keys are ignored. */
  public final synchronized void add(URLFP key) {
    if (key != null) {
      for (long bucketIndex : getHashBuckets(key, hashCount, nbits)) {
        bits.fastSet(bucketIndex);
      }
    }
  }

  /** Sets all probe bits for key. Null keys are ignored. */
  public final synchronized void add(URLFPV2 key) {
    if (key != null) {
      for (long bucketIndex : getHashBuckets(key, hashCount, nbits)) {
        bits.fastSet(bucketIndex);
      }
    }
  }

  /** Resets the filter to empty. */
  public final synchronized void clear() {
    if (bits != null) {
      bits.clear();
    }
  }

  /**
   * Copies this filter's raw bit pages into destination.
   *
   * @throws IOException if the two filters are sized differently
   */
  public synchronized void copyBitsTo(URLFPBloomFilter destination) throws IOException {
    if (this.nbits != destination.nbits || bits.getNumWords() != destination.bits.getNumWords()) {
      throw new IOException("Source and Destination BloomFilters are sized differently!");
    }
    int pageCount = bits.getPageCount();
    for (int p = 0; p < pageCount; p++) {
      long[] srcData = bits.getPage(p);
      long[] destData = destination.bits.getPage(p);
      System.arraycopy(srcData, 0, destData, 0, srcData.length);
    }
  }

  /**
   * Returns a deep copy of this filter.
   *
   * NOTE: this class previously did not implement Cloneable or override
   * clone(), so Object.clone() always threw CloneNotSupportedException at
   * runtime (and simpleTest then NPE'd on the resulting null clone). This
   * override fixes that by copying into a freshly allocated, identically
   * sized filter. The throws clause is retained only for source
   * compatibility with existing callers that already catch it.
   */
  @Override
  public synchronized URLFPBloomFilter clone() throws CloneNotSupportedException {
    URLFPBloomFilter copy = new URLFPBloomFilter(numElements, hashCount, bucketsPerElement);
    try {
      copyBitsTo(copy);
    } catch (IOException e) {
      // cannot happen: source and copy are sized identically by construction
      throw new RuntimeException(e);
    }
    return copy;
  }

  /** Writes nbits followed by every used bit word, page by page. */
  private void serializeBits(DataOutput dos) throws IOException {
    int bitLengthInWords = bits.getNumWords();
    int pageSize = bits.getPageSize();
    int pageCount = bits.getPageCount();
    dos.writeLong(nbits);
    for (int p = 0; p < pageCount; p++) {
      long[] data = bits.getPage(p);
      // the last page may be partially used, so stop after numWords words total
      for (int i = 0; i < pageSize && bitLengthInWords-- > 0; i++)
        dos.writeLong(data[i]);
    }
  }

  /**
   * Reads the bit words written by serializeBits into this filter's pages.
   *
   * @throws IOException if the serialized bit count does not match this filter
   */
  private void deserializeBits(DataInput dis) throws IOException {
    long nbitsOut = dis.readLong();
    if (nbitsOut != nbits) {
      throw new IOException("Serialized bitCount:" + nbitsOut + " Expected bitCount:" + nbits);
    }
    int bitLengthInWords = bits.getNumWords();
    int pageSize = bits.getPageSize();
    int pageCount = bits.getPageCount();
    for (int p = 0; p < pageCount; p++) {
      long[] data = bits.getPage(p);
      for (int i = 0; i < pageSize && bitLengthInWords-- > 0; i++)
        data[i] = dis.readLong();
    }
  }

  /** Writes the full filter (header + bits) to outputStream and flushes it. */
  public final synchronized void serialize(OutputStream outputStream) throws IOException {
    DataOutputStream dataOut = new DataOutputStream(outputStream);
    dataOut.writeInt(0); // version placeholder
    dataOut.writeInt(numElements);
    dataOut.writeInt(hashCount);
    dataOut.writeInt(bucketsPerElement);
    serializeBits(dataOut);
    // DataOutputStream.flush() propagates to the underlying stream
    dataOut.flush();
  }

  /**
   * Reconstructs a filter previously written via serialize.
   *
   * @throws IOException on a truncated or mismatched stream
   */
  public static URLFPBloomFilter load(InputStream inputStream) throws IOException {
    DataInputStream dataIn = new DataInputStream(inputStream);
    // skip version bytes ...
    dataIn.readInt();
    // initialize filter from the serialized header ...
    URLFPBloomFilter filter = new URLFPBloomFilter(dataIn.readInt(), dataIn.readInt(), dataIn.readInt());
    // read bits: DataInputStream does not buffer, so continuing to read from
    // dataIn works uniformly for any InputStream (including FSDataInputStream,
    // which the old code special-cased for no benefit)
    filter.deserializeBits(dataIn);
    return filter;
  }

  /** Writes value as 4 big-endian bytes into dest starting at offset. */
  private static void packInt(byte[] dest, int offset, int value) {
    for (int i = 0; i < 4; i++)
      dest[offset + i] = (byte) ((value >>> (24 - 8 * i)) & 0xFF);
  }

  /** Writes value as 8 big-endian bytes into dest starting at offset. */
  private static void packLong(byte[] dest, int offset, long value) {
    for (int i = 0; i < 8; i++)
      dest[offset + i] = (byte) ((value >>> (56 - 8 * i)) & 0xFF);
  }

  /**
   * Double hashing: bucket_i = |(hash1 + i * hash2) mod max| where hash1 and
   * hash2 are two Murmur hashes of the key bytes (hash2 seeded with hash1).
   * Fills and returns the shared hashResults scratch array — callers must
   * hold this instance's lock.
   */
  private long[] hashToBuckets(byte[] keyBytes, int hashCount, long max) {
    int hash1 = MurmurHash.hash(keyBytes, keyBytes.length, 0);
    int hash2 = MurmurHash.hash(keyBytes, keyBytes.length, hash1);
    for (int i = 0; i < hashCount; i++) {
      hashResults[i] = Math.abs(((long) hash1 + i * (long) hash2) % max);
    }
    return hashResults;
  }

  /** Bucket indices for a legacy URLFP: 4-byte domain hash + 8-byte url hash. */
  final long[] getHashBuckets(URLFP key, int hashCount, long max) {
    byte[] b = new byte[12];
    packInt(b, 0, (int) key.getDomainHash());
    packLong(b, 4, key.getUrlHash());
    return hashToBuckets(b, hashCount, max);
  }

  /** Bucket indices for a URLFPV2: 8-byte domain hash + 8-byte url hash. */
  final long[] getHashBuckets(URLFPV2 key, int hashCount, long max) {
    byte[] b = new byte[16];
    packLong(b, 0, key.getDomainHash());
    packLong(b, 8, key.getUrlHash());
    return hashToBuckets(b, hashCount, max);
  }

  /** Stand-alone smoke test: populates, round-trips via HDFS, and re-validates. */
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("hadoop-default.xml");
    conf.addResource("hadoop-site.xml");
    conf.addResource("commoncrawl-default.xml");
    conf.addResource("commoncrawl-site.xml");
    CrawlEnvironment.setHadoopConfig(conf);
    CrawlEnvironment.setDefaultHadoopFSURI("hdfs://ccn01:9000/");
    simpleTest(args[0]);
  }

  /**
   * Exercises add/isPresent, clone, and serialize/load round-trip timing
   * against the filesystem path given by outputFileName.
   */
  public static void simpleTest(String outputFileName) {
    URLFPBloomFilter bigBloomFilter = new URLFPBloomFilter(750000000, 10, 11);
    TreeSet<URLFP> addedSet = new TreeSet<URLFP>();
    TreeSet<URLFP> notAddedSet = new TreeSet<URLFP>();
    for (int i = 0; i < 100000; ++i) {
      URLFP fingerprint = URLUtils.getURLFPFromURL("http://foo.bar.com/" + i, false);
      URLFP notfingerprint = URLUtils.getURLFPFromURL("http://someother.bar.com/" + i, false);
      addedSet.add(fingerprint);
      notAddedSet.add(notfingerprint);
    }
    System.out.println("Adding " + addedSet.size() + " elements to bloom filter");
    long timeStart = System.currentTimeMillis();
    for (URLFP testFingerprint : addedSet) {
      bigBloomFilter.add(testFingerprint);
    }
    long timeEnd = System.currentTimeMillis();
    System.out.println("Add Took:" + (timeEnd - timeStart) + " MS");
    timeStart = System.currentTimeMillis();
    for (URLFP testFingerprint : addedSet) {
      if (!bigBloomFilter.isPresent(testFingerprint)) {
        // added keys can never be false negatives
        Assert.fail("added fingerprint not found in filter");
      }
    }
    timeEnd = System.currentTimeMillis();
    System.out.println("Lookup of " + addedSet.size() + " items in set took:" + (timeEnd - timeStart) + " MS");
    timeStart = System.currentTimeMillis();
    for (URLFP testFingerprint : notAddedSet) {
      if (bigBloomFilter.isPresent(testFingerprint)) {
        // a hit on a not-added key is only valid if it was also in addedSet
        Assert.assertTrue(addedSet.contains(testFingerprint));
      }
    }
    timeEnd = System.currentTimeMillis();
    System.out.println("Lookup of " + addedSet.size() + " items not in set took:" + (timeEnd - timeStart) + " MS");
    System.out.println("Cloning");
    URLFPBloomFilter clone = null;
    timeStart = System.currentTimeMillis();
    try {
      clone = bigBloomFilter.clone();
    } catch (CloneNotSupportedException e1) {
      e1.printStackTrace();
    }
    timeEnd = System.currentTimeMillis();
    System.out.println("Clone took:" + (timeEnd - timeStart) + " MS");
    Path outputLocation = new Path(outputFileName);
    // serialize
    System.out.println("Serializing to:" + outputLocation);
    try {
      timeStart = System.currentTimeMillis();
      FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
      FSDataOutputStream outputStream = fs.create(outputLocation, true, 10240000);
      clone.serialize(outputStream);
      outputStream.flush();
      outputStream.close();
      timeEnd = System.currentTimeMillis();
      System.out.println("Serialization took:" + (timeEnd - timeStart) + " MS");
      clone = null;
      bigBloomFilter = null;
      System.out.println("Reloading");
      timeStart = System.currentTimeMillis();
      FSDataInputStream inputStream = fs.open(outputLocation);
      bigBloomFilter = URLFPBloomFilter.load(inputStream);
      inputStream.close();
      timeEnd = System.currentTimeMillis();
      System.out.println("Reload took:" + (timeEnd - timeStart) + " MS");
      timeStart = System.currentTimeMillis();
      for (URLFP testFingerprint : addedSet) {
        if (!bigBloomFilter.isPresent(testFingerprint)) {
          Assert.fail("added fingerprint not found after reload");
        }
      }
      timeEnd = System.currentTimeMillis();
      System.out.println("Lookup of " + addedSet.size() + " items in set took:" + (timeEnd - timeStart) + " MS");
      timeStart = System.currentTimeMillis();
      for (URLFP testFingerprint : notAddedSet) {
        if (bigBloomFilter.isPresent(testFingerprint)) {
          Assert.assertTrue(addedSet.contains(testFingerprint));
        }
      }
      timeEnd = System.currentTimeMillis();
      System.out.println("Lookup of " + addedSet.size() + " items not in set took:" + (timeEnd - timeStart) + " MS");
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

  /** @return the expected element count this filter was sized for */
  public int getNumElements() {
    return numElements;
  }

  /** @return the number of bit buckets per expected element */
  public int getBucketsPerElement() {
    return bucketsPerElement;
  }

  /** @return the number of hash probes per key */
  public int getHashCount() {
    return hashCount;
  }
}