/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.ec2.postprocess.deduper;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.SimHash;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.junit.Assert;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.stream.JsonWriter;
/**
* Various utilities and classes to support dedupe rewrite
* @author rana
*
*/
public class DeduperUtils {
static final Log LOG = LogFactory.getLog(DeduperUtils.class);
/**
 * Composite key packed into a single long: the pattern index occupies the top
 * {@link #PATTERN_BITS} bits and the key bits are stored right-aligned in the
 * remaining 48 bits.
 *
 * @author rana
 *
 */
public static class DeduperKey extends LongWritable {

  /** Selects the low 48 bits, i.e. the key component. */
  static final long KEY_COMPONENT_MASK = 0xFFFFFFFFFFFFL;
  /** Selects the high 16 bits, i.e. the pattern component. */
  static final long PATTERN_COMPONENT_MASK = 0xFFFF000000000000L;
  /** Number of high-order bits reserved for the pattern index. */
  static final int PATTERN_BITS = 16;

  /**
   * Packs the pattern index and key into {@code writableTarget}.
   *
   * @param writableTarget writable that receives the packed value
   * @param patternIndex index into {@code patternKeyMSBits}
   * @param key value whose most significant bits hold the key
   */
  public static void setKey(LongWritable writableTarget, int patternIndex, long key) {
    writableTarget.set(keyToLong(patternIndex, key));
  }

  /**
   * Packs a pattern index and a left-aligned key into one long.
   *
   * @param patternIndex index into {@code patternKeyMSBits}; selects how many
   *        of the most significant bits of {@code keyValue} are kept
   * @param keyValue value whose most significant bits hold the key
   * @return pattern index in the top 16 bits, key bits right-aligned below it
   * @throws ArrayIndexOutOfBoundsException if {@code patternIndex} is not a
   *         valid index into {@code patternKeyMSBits}
   */
  public static long keyToLong(int patternIndex, long keyValue) {
    int keyShift = 64 - patternKeyMSBits[patternIndex];
    // Unsigned shift: a signed shift would smear the sign bit of keyValue into
    // the bits above the key width, and those bits survive the 48-bit mask.
    long keyComponent = (keyValue >>> keyShift) & KEY_COMPONENT_MASK;
    long patternComponent = ((long) patternIndex) << (64 - PATTERN_BITS);
    return patternComponent | keyComponent;
  }

  /**
   * @param longValue packed value produced by {@link #keyToLong(int, long)}
   * @return the low 48 bits (the key component)
   */
  public static long keyFromLong(long longValue) {
    return longValue & KEY_COMPONENT_MASK;
  }

  /**
   * @param longValue packed value produced by {@link #keyToLong(int, long)}
   * @return the top 16 bits (the pattern index)
   */
  public static int patternIndexFromLong(long longValue) {
    return (int) (longValue >>> (64 - PATTERN_BITS));
  }
}
/**
 * Writable record holding a simhash, a root hash, a url hash, a source IP, a
 * source content length and the url text. Fields are serialized in exactly
 * that order.
 *
 * @author rana
 *
 */
public static class DeduperValue implements Writable {

  public long _simHashValue;
  public long _rootHash;
  public long _urlHash;
  public int _srcIP;
  public int _srcContentLen;
  public TextBytes _urlText = new TextBytes();

  /** Creates an empty value to be filled by {@code setValue} or {@code readFields}. */
  public DeduperValue() {
  }

  /** Creates a value and populates it via {@code setValue}. */
  public DeduperValue(long simhashValue, long rootHash, long urlHashValue,
      int srcIP, int srcContentLen, TextBytes urlText) {
    setValue(simhashValue, rootHash, urlHashValue, srcIP, srcContentLen, urlText);
  }

  /**
   * Overwrites every field of this value; the url text is passed to the
   * existing {@code _urlText} instance via its {@code set} method.
   */
  public void setValue(long simHashValue, long rootHash, long urlHashValue,
      int srcIP, int srcContentLen, TextBytes urlText) {
    this._simHashValue = simHashValue;
    this._rootHash = rootHash;
    this._urlHash = urlHashValue;
    this._urlText.set(urlText);
    this._srcIP = srcIP;
    this._srcContentLen = srcContentLen;
  }

  /** Reads the fields in the order written by {@link #write(DataOutput)}. */
  @Override
  public void readFields(DataInput in) throws IOException {
    this._simHashValue = in.readLong();
    this._rootHash = in.readLong();
    this._urlHash = in.readLong();
    this._srcIP = in.readInt();
    this._srcContentLen = in.readInt();
    this._urlText.readFields(in);
  }

  /** Writes three longs, two ints and then the url text. */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeLong(this._simHashValue);
    out.writeLong(this._rootHash);
    out.writeLong(this._urlHash);
    out.writeInt(this._srcIP);
    out.writeInt(this._srcContentLen);
    this._urlText.write(out);
  }
}
/**
 * Writable pair of items "A" and "B", each described by a root hash, a url
 * hash and a url text. The four hashes are serialized first, followed by the
 * two url texts.
 *
 * @author rana
 *
 */
public static class DeduperSetTuple implements Writable {

  public long _rootHashA;
  public long _urlHashA;
  public long _rootHashB;
  public long _urlHashB;
  public TextBytes _textURLA = new TextBytes();
  public TextBytes _textURLB = new TextBytes();

  /** Creates an empty tuple. */
  public DeduperSetTuple() {
  }

  /** Sets the four hash fields; the url texts are left untouched. */
  public void setIntegralValues(long rootHashA, long urlHashA, long rootHashB, long urlHashB) {
    this._rootHashA = rootHashA;
    this._urlHashA = urlHashA;
    this._rootHashB = rootHashB;
    this._urlHashB = urlHashB;
  }

  /** Reads the fields in the order written by {@link #write(DataOutput)}. */
  @Override
  public void readFields(DataInput in) throws IOException {
    this._rootHashA = in.readLong();
    this._urlHashA = in.readLong();
    this._rootHashB = in.readLong();
    this._urlHashB = in.readLong();
    this._textURLA.readFields(in);
    this._textURLB.readFields(in);
  }

  /** Writes the four hashes (A then B) followed by url text A and url text B. */
  @Override
  public void write(DataOutput out) throws IOException {
    out.writeLong(this._rootHashA);
    out.writeLong(this._urlHashA);
    out.writeLong(this._rootHashB);
    out.writeLong(this._urlHashB);
    this._textURLA.write(out);
    this._textURLB.write(out);
  }
}
/** Number of chunks a 64-bit value is divided into. */
static final int TOTAL_CHUNKS = 6;
/** Number of chunks that are "on" (part of the key) in every pattern. */
static final int K = 3;
/** Number of patterns; precomputed for n=6, k=3. */
static final int BINOMIAL_COFF = 20;
/** Width in bits of each chunk, indexed by chunk number. */
static final int[] CHUNK_LENGTHS = {11, 11, 11, 11, 10, 10};

/** All 6-bit values with exactly {@link #K} one bits, in ascending order. */
static final int[] patternArray = new int[BINOMIAL_COFF];
/** For each pattern, the summed width of the chunks it turns on. */
static final int[] patternKeyMSBits = new int[BINOMIAL_COFF];

static {
  int slot = 0;
  // Enumerate every 6-bit combination and keep those with exactly K bits set.
  for (int candidate = 0; candidate < (1 << TOTAL_CHUNKS); candidate++) {
    if (Integer.bitCount(candidate) != K) {
      continue;
    }
    int keyWidth = 0;
    for (int chunk = 0; chunk < TOTAL_CHUNKS; chunk++) {
      // The highest of the six pattern bits corresponds to chunk 0.
      int patternBit = 1 << (TOTAL_CHUNKS - 1 - chunk);
      if ((candidate & patternBit) != 0) {
        keyWidth += CHUNK_LENGTHS[chunk];
      }
    }
    patternArray[slot] = candidate;
    patternKeyMSBits[slot] = keyWidth;
    slot++;
  }
}

/** Mask selecting the low eleven bits. */
static final long ELEVEN_BITS_MASK = (1L << 11) - 1;
/** Mask selecting the low ten bits. */
static final long TEN_BITS_MASK = (1L << 10) - 1;

/** Start position of each chunk, counted in bits from the most significant bit. */
static final int[] CHUNK_POS = {0, 11, 22, 33, 44, 54};

/** Mask matching the width of each chunk, indexed by chunk number. */
static final long[] CHUNK_MASKS = {
  ELEVEN_BITS_MASK, ELEVEN_BITS_MASK, ELEVEN_BITS_MASK, ELEVEN_BITS_MASK,
  TEN_BITS_MASK, TEN_BITS_MASK
};
/**
 * Divides the incoming value into chunks and concatenates the chunks the
 * selected pattern marks as "on", in chunk order, starting at the most
 * significant bit of the result. All remaining low-order bits are zero.
 *
 * Only the key component is generated; the "off" (non-key) chunks are not
 * appended to the result.
 *
 * @param patternIdx index into {@code patternArray} selecting the on/off chunk pattern
 * @param originalValue the 64-bit value to take chunks from
 * @return the selected chunks packed left-aligned into a long
 * @throws ArrayIndexOutOfBoundsException if {@code patternIdx} is not a valid
 *         index into {@code patternArray}
 */
public static long buildKeyForPatternIndex(int patternIdx, long originalValue) {
  // bit pattern specifying key/non-key chunks for the given pattern index
  int pattern = patternArray[patternIdx];
  long keyOut = 0;
  // number of key bits already placed at the top of keyOut
  int keyBitsUsed = 0;

  for (int chunkNumber = 0; chunkNumber < TOTAL_CHUNKS; ++chunkNumber) {
    // the highest of the six pattern bits controls chunk 0
    boolean onChunk = (pattern & (1 << (TOTAL_CHUNKS - (chunkNumber + 1)))) != 0;
    if (!onChunk) {
      continue;
    }
    int chunkLength = CHUNK_LENGTHS[chunkNumber];
    // right-align the chunk, then strip everything above it
    long chunkBits =
        (originalValue >>> (64 - (CHUNK_POS[chunkNumber] + chunkLength))) & CHUNK_MASKS[chunkNumber];
    // place the chunk directly below the key bits collected so far
    keyBitsUsed += chunkLength;
    keyOut |= chunkBits << (64 - keyBitsUsed);
  }
  return keyOut;
}
/**
The various chunk combinations and their bit representations ...
Values: [7, 11, 13, 14, 19, 21, 22, 25, 26, 28, 35, 37, 38, 41, 42, 44, 49, 50, 52, 56]
Bits:
000111
001011
001101
001110
010011
010101
010110
011001
011010
011100
100011
100101
100110
101001
101010
101100
110001
110010
110100
111000
*/
/**
 * BitBuilder helper class: builds a long by appending runs of one bits and
 * zero bits at the low-order end.
 */
static class BitBuilder {

  long bits;
  int count;

  BitBuilder() {
    bits = 0;
    count = 0;
  }

  /**
   * Appends {@code amt} one bits at the low-order end.
   *
   * @param amt number of one bits to append; values of zero or less append nothing
   * @return this builder
   */
  BitBuilder on(int amt) {
    for (int i = 0; i < amt; ++i) {
      bits = (bits << 1) | 1L;
    }
    return this;
  }

  /**
   * Appends {@code amt} zero bits at the low-order end.
   *
   * @param amt number of zero bits to append
   * @return this builder
   */
  BitBuilder off(int amt) {
    // Java only uses the low six bits of a long shift distance, so a plain
    // "bits << 64" would leave the value unchanged instead of clearing it.
    bits = (amt >= Long.SIZE) ? 0L : (bits << amt);
    return this;
  }

  /** @return the bits accumulated so far */
  long bits() {
    return bits;
  }
}
/**
 * TestCase - one (pattern index, input key, expected key) triple used to
 * validate {@code buildKeyForPatternIndex}.
 *
 * @author rana
 *
 */
static class TestCase {

  int _patternIdx;
  long _key;
  long _expectedResult;

  TestCase(int patternIdx, long key, long expectedResult) {
    this._patternIdx = patternIdx;
    this._key = key;
    this._expectedResult = expectedResult;
  }

  /**
   * Masks the expected result down to the pattern's key width (its most
   * significant bits), prints the case, and asserts that the generator
   * produces exactly that value.
   */
  void validate() {
    // mask with ones in the top key-width bits and zeros below
    long keyMask = new BitBuilder()
        .on(patternKeyMSBits[_patternIdx])
        .off(64 - patternKeyMSBits[_patternIdx])
        .bits();
    long maskedExpectation = _expectedResult & keyMask;

    System.out.println("pattern:" + Integer.toHexString(patternArray[_patternIdx]) + " testKey:" + Long.toHexString(_key) + " expectedKey:" + Long.toHexString(maskedExpectation));

    long actual = buildKeyForPatternIndex(_patternIdx, _key);
    Assert.assertEquals(maskedExpectation, actual);
  }
}
// Small, distinct marker values that the test cases below shift into
// individual chunk positions of the input key and of the expected key.
static final long FIRST_VALUE = 10;
static final long SECOND_VALUE = 11;
static final long THIRD_VALUE = 12;
//TODO: NEED A MORE SANE WAY TO DEFINE TEST CASES ...
// Each entry is TestCase(patternIdx, inputKey, expectedKey). The comment above
// an entry shows the six-bit on/off chunk pattern for that pattern index.
// TestCase.validate() masks expectedKey down to its top patternKeyMSBits bits
// before comparing, so any marker OR-ed into the low-order bits of an expected
// value does not take part in the comparison.
static ImmutableSet<TestCase> testCases = new ImmutableSet.Builder<TestCase>()
// 000111
.add(new TestCase(0,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - 11)) | (SECOND_VALUE << (64 - (11+10))) | (THIRD_VALUE << (64 - (11+10+10))))))
// 001011
.add(new TestCase(1,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((SECOND_VALUE << (64 - (11+10))) | (THIRD_VALUE << (64 - (11+10+10))) | FIRST_VALUE)))
// 001101
.add(new TestCase(2,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - (11+11))) | (THIRD_VALUE << (64 - (11+11+10))) | SECOND_VALUE)))
// 001110
.add(new TestCase(3,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - (11 + 11))) | (SECOND_VALUE << (64 - (11+11+10))) | THIRD_VALUE)))
// 010011
.add(new TestCase(4,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((SECOND_VALUE << (64 - (11+10))) | (THIRD_VALUE << (64 - (11+10+10))) |FIRST_VALUE)))
// 010101
.add(new TestCase(5,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - (11 + 11))) | (THIRD_VALUE << (64 - (11+11+10))) |SECOND_VALUE)))
//110010
.add(new TestCase(17,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((SECOND_VALUE << (64 - (11 + 11 + 10))) | (FIRST_VALUE << (10) |THIRD_VALUE))))
//110010 (REPEAT)
// same pattern as the previous entry, but with markers placed in the two leading chunks of the input
.add(new TestCase(17,((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | THIRD_VALUE)))
//110100
.add(new TestCase(18,((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | THIRD_VALUE)))
//111000
.add(new TestCase(19,((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << (11+11+11))),((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << (11+11+11)))))
.build();
/**
 * Runs {@code validate()} on every entry of {@code testCases}, in the set's
 * iteration order.
 */
static void validateGenerator() {
  Iterator<TestCase> pending = testCases.iterator();
  while (pending.hasNext()) {
    pending.next().validate();
  }
}
/**
 * Accumulates set members as a JSON array in an in-memory buffer. Each member
 * is written as an object with the keys "dh", "uh", "url", "ip" and "length".
 * A member is skipped when the filter already reports its fingerprint as
 * present.
 */
public static class JSONSetBuilder {

  public static final int NUM_HASH_FUNCTIONS = 10;
  public static final int NUM_BITS = 11;
  public static final int NUM_ELEMENTS = 1 << 18;

  DataOutputBuffer _outputBuffer = new DataOutputBuffer();
  JsonWriter writer;
  URLFPBloomFilter filter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
  /** Scratch fingerprint reused by every {@code add} call. */
  URLFPV2 fp = new URLFPV2();

  /** Creates a builder that is ready to accept items. */
  public JSONSetBuilder() throws IOException {
    reset();
  }

  /**
   * Clears the filter and the output buffer, then opens a fresh JSON array on
   * a new writer over the buffer.
   */
  public void reset() throws IOException {
    filter.clear();
    _outputBuffer.reset();
    OutputStreamWriter utf8Out = new OutputStreamWriter(_outputBuffer, Charset.forName("UTF-8"));
    writer = new JsonWriter(utf8Out);
    writer.beginArray();
  }

  /**
   * Appends one item unless the filter already reports it as present.
   *
   * @param rootDomainHash used as both root-domain hash and domain hash of the fingerprint
   * @param urlHash url hash of the item
   * @param ipAddressAndLenPacked ip address in the high 32 bits, length in the low 32 bits
   * @param urlData url text of the item
   */
  public void add(long rootDomainHash, long urlHash, long ipAddressAndLenPacked, TextBytes urlData) throws IOException {
    fp.setRootDomainHash(rootDomainHash);
    fp.setDomainHash(rootDomainHash);
    fp.setUrlHash(urlHash);

    if (filter.isPresent(fp)) {
      return;
    }
    filter.add(fp);

    // high word is the ip address, low word is the length
    int ipAddress = (int) (ipAddressAndLenPacked >> 32);
    int length = (int) ipAddressAndLenPacked;

    writer.beginObject();
    writer.name("dh").value(rootDomainHash);
    writer.name("uh").value(urlHash);
    writer.name("url").value(urlData.toString());
    writer.name("ip").value(ipAddress);
    writer.name("length").value(length);
    writer.endObject();
  }

  /**
   * Closes the JSON array, flushes the writer and returns the buffered bytes.
   *
   * @return a new TextBytes set from the buffer's current contents
   */
  public TextBytes flush() throws IOException {
    writer.endArray();
    writer.flush();
    TextBytes result = new TextBytes();
    result.set(_outputBuffer.getData(), 0, _outputBuffer.getLength());
    return result;
  }
}
/**
* Build sets by comparing simhash values
*
* @author rana
*
*/
public static class SimhashMatcher {
private DataOutputBuffer _dataBuffer = new DataOutputBuffer();
private DataOutputBuffer _textDataBuffer = new DataOutputBuffer();
private int[] id;
private int count;
JSONSetBuilder setBuilder;
private static final int SIZEOF_DATABUF_ENTRY = 8 * 5;
public static final int SIMHASH_COMPONENT_IDX = 0;
public static final int ROOTHASH_COMPONENT_IDX = 1;
public static final int URLHASH_COMPONENT_IDX = 2;
public static final int IP_AND_LEN_COMPONENT_IDX = 3;
public static final int TEXT_DATA_COMPONENT_IDX = 4;
/**
* Constructor - slurp in all values associated with current deduper key...
* @param valueIterator
* @throws IOException
*/
public SimhashMatcher() throws IOException {
setBuilder = new JSONSetBuilder();
}
static long readLongComponent(DataOutputBuffer buffer,int index,int componentIndex)throws IOException {
byte readBuffer[] = buffer.getData();
int offset = (index * SIZEOF_DATABUF_ENTRY) + (componentIndex * 8);
return (((long)readBuffer[offset+0] << 56) +
((long)(readBuffer[offset+1] & 255) << 48) +
((long)(readBuffer[offset+2] & 255) << 40) +
((long)(readBuffer[offset+3] & 255) << 32) +
((long)(readBuffer[offset+4] & 255) << 24) +
((readBuffer[offset+5] & 255) << 16) +
((readBuffer[offset+6] & 255) << 8) +
((readBuffer[offset+7] & 255) << 0));
}
TextBytes textFromPackedLongInfo(TextBytes textToPopulate,long packedValue)throws IOException {
int offset = (int)((packedValue >> 32) & 0xFFFFFFFFL);
int length = (int)(packedValue & 0xFFFFFFFFL);
textToPopulate.set(_textDataBuffer.getData(),offset,length);
return textToPopulate;
}
private void collectRoots(Map<Long,TextBytes> rootDomainMap,TextBytes urlSampler,int N,int rootItemIndex)throws IOException {
// iterate the set looking for other items that have the same root
for (int j = 0; j < N; ++j) {
// ok found a match ...
if (id[j] == rootItemIndex && j != rootItemIndex){
long rootDomainA = readLongComponent(_dataBuffer, rootItemIndex, ROOTHASH_COMPONENT_IDX);
// OK .. ONE BIG LAST MINUTE HACK :-( - Need to join by root domain text key, not the long value ... :-(
// so we need to extract the key here... from the first matching hit url ...
if (!rootDomainMap.containsKey(rootDomainA)) {
textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, rootItemIndex,TEXT_DATA_COMPONENT_IDX));
String rootDomainStr = URLUtils.extractRootDomainName(new GoogleURL(urlSampler.toString()).getHost());
if (rootDomainStr != null) {
rootDomainMap.put(rootDomainA, new TextBytes(rootDomainStr));
}
}
// ok now do the same thing for the second component ...
long rootDomainB = readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX);
if (rootDomainA != rootDomainB) {
if (!rootDomainMap.containsKey(rootDomainB)) {
textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX));
String rootDomainStr = URLUtils.extractRootDomainName(new GoogleURL(urlSampler.toString()).getHost());
if (rootDomainStr != null) {
rootDomainMap.put(rootDomainB, new TextBytes(rootDomainStr));
}
}
}
}
}
}
private static final int EXTRA_DOMAIN_MAX_SAMPLE_SIZE = 100;
private static final int OVERFLOW_THRESHOLD = 1 << 18;
/**
* emit any matched sets
*
* @param collector
* @throws IOException
*/
public void emitMatches(int maxHammingDistance,Iterator<DeduperValue> valueIterator,OutputCollector<TextBytes,TextBytes> collector,Reporter reporter) throws IOException {
_dataBuffer.reset();
_textDataBuffer.reset();
int itemCount = 0;
// ok slurp in values ...
while (valueIterator.hasNext()) {
if (++itemCount >= OVERFLOW_THRESHOLD) {
break;
}
DeduperValue value = valueIterator.next();
_dataBuffer.writeLong(value._simHashValue);
_dataBuffer.writeLong(value._rootHash);
_dataBuffer.writeLong(value._urlHash);
_dataBuffer.writeInt(value._srcIP);
_dataBuffer.writeInt(value._srcContentLen);
int originalSize = _textDataBuffer.size();
// write offset
_dataBuffer.writeInt(originalSize);
_textDataBuffer.write(value._urlText.getBytes(),value._urlText.getOffset(),value._urlText.getLength());
// write length
_dataBuffer.writeInt(_textDataBuffer.size() - originalSize);
}
if (itemCount < OVERFLOW_THRESHOLD) {
// count entries in data buffer
int N = count = _dataBuffer.size() / SIZEOF_DATABUF_ENTRY;
// allocate id array
id = new int[N];
// assume all sets are disjoint upfront ...
for (int i = 0; i < N; i++)
id[i] = i;
// ok time to start iteration ...
for (int i=0;i<N;++i) {
// forward scan potential match candidates ...
for (int j=i+1;j<N;++j) {
// if not already matched ...
if (id[i] != id[j]) {
if (SimHash.hammingDistance(
readLongComponent(_dataBuffer, i, SIMHASH_COMPONENT_IDX),
readLongComponent(_dataBuffer, j, SIMHASH_COMPONENT_IDX)) <= maxHammingDistance) {
// match ...
// union it ...
union(j,i);
}
}
}
}
// time to emit sets ...
for (int i = 0; i < N; ++i) {
// see if this is a root item
if (id[i] == i) {
// allocate hash set to contain root Domains
HashMap<Long,TextBytes> rootDomainMap = new HashMap<Long,TextBytes>();
// and a text bytes to collect url data
TextBytes urlSampler = new TextBytes();
// collect roots ...
collectRoots(rootDomainMap, urlSampler, N, i);
// ok walk roots...
for (Map.Entry<Long,TextBytes> rootEntry : rootDomainMap.entrySet()) {
// for each root ... walk items
// reset set builder ...
setBuilder.reset();
// reset extra domain item count
int extraDomainItemCount = 0;
// iterate the set
for (int j = 0; j < N; ++j) {
// if in set ...
if (id[j] == i){
// get root domain of entry ...
long itemRootDomain = readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX);
// IFF pass 0 .. only process documents from our root domain ...
if (itemRootDomain == rootEntry.getKey()) {
// add item no matter what ...
setBuilder.add(
readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX),
readLongComponent(_dataBuffer, j, URLHASH_COMPONENT_IDX),
readLongComponent(_dataBuffer, j, IP_AND_LEN_COMPONENT_IDX),
textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX)));
}
else {
if (extraDomainItemCount++ < EXTRA_DOMAIN_MAX_SAMPLE_SIZE) {
setBuilder.add(
readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX),
readLongComponent(_dataBuffer, j, URLHASH_COMPONENT_IDX),
readLongComponent(_dataBuffer, j, IP_AND_LEN_COMPONENT_IDX),
textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX)));
}
}
}
}
// emit data ...
TextBytes setDataOut = setBuilder.flush();
collector.collect(rootEntry.getValue(), setDataOut);
}
}
}
}
else {
LOG.error("Hit too many items in set! - skipping");
reporter.incrCounter("", "skipping-overflow-set", 1);
int N = count = _dataBuffer.size() / SIZEOF_DATABUF_ENTRY;
for (int i=0;i<100;++i) {
TextBytes urlSampler = new TextBytes();
textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, i,TEXT_DATA_COMPONENT_IDX));
LOG.error("Skipped URL Sample:" + urlSampler.toString());
}
}
}
// Return component identifier for component containing p
int find(int p) {
return id[p];
}
// are elements p and q in the same component?
boolean connected(int p, int q) {
return id[p] == id[q];
}
// merge components containing p and q
void union(int p, int q) {
if (connected(p, q))
return;
int pid = id[p];
for (int i = 0; i < id.length; i++)
if (id[i] == pid)
id[i] = id[q];
count--;
}
}
/**
* union incoming sets
*
* @author rana
*
*/
public static class SetUnionFinder {
public static final int NUM_HASH_FUNCTIONS = 10;
public static final int NUM_BITS = 11;
public static final int NUM_ELEMENTS = 1 << 18;
private DataOutputBuffer _dataBuffer = new DataOutputBuffer();
private DataOutputBuffer _textDataBuffer = new DataOutputBuffer();
private int[] id;
private int count;
private URLFPBloomFilter filter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
private JsonParser parser = new JsonParser();
private URLFPV2 fp = new URLFPV2();
private int lastUsedId=-1;
private TextBytes textBytes = new TextBytes();
private TreeMap<Long,Integer> hashToIdMap = new TreeMap<Long,Integer>();
private JSONSetBuilder setBuilder;
private static final int SIZEOF_DATABUF_ENTRY = 8 * 4;
public static final int ROOTHASH_COMPONENT_IDX = 0;
public static final int URLHASH_COMPONENT_IDX = 1;
public static final int IP_ADDRESS_AND_LEN_COMPONENT = 2;
public static final int TEXT_DATA_COMPONENT_IDX = 3;
private void reset() throws IOException {
_dataBuffer.reset();
_textDataBuffer.reset();
filter.clear();
lastUsedId = -1;
hashToIdMap.clear();
if (setBuilder == null) {
setBuilder = new JSONSetBuilder();
}
else {
setBuilder.reset();
}
}
private int insertItemGetId(long domainHash,long urlHash,int ipAddress,int length,String url)throws IOException {
Integer existingId = hashToIdMap.get(urlHash);
if (existingId == null) {
// make string to utf-8 bytes ...
textBytes.set(url);
// write out id info
_dataBuffer.writeLong(domainHash);
_dataBuffer.writeLong(urlHash);
_dataBuffer.writeInt(ipAddress);
_dataBuffer.writeInt(length);
// and string
int originalSize = _textDataBuffer.size();
// write offset
_dataBuffer.writeInt(originalSize);
_textDataBuffer.write(textBytes.getBytes(),0,textBytes.getLength());
// write length
_dataBuffer.writeInt(_textDataBuffer.size() - originalSize);
hashToIdMap.put(urlHash, ++lastUsedId);
return lastUsedId;
}
return existingId;
}
private TextBytes textFromPackedLongInfo(TextBytes textToPopulate,long packedValue)throws IOException {
int offset = (int)((packedValue >> 32) & 0xFFFFFFFFL);
int length = (int)(packedValue & 0xFFFFFFFFL);
textToPopulate.set(_textDataBuffer.getData(),offset,length);
return textToPopulate;
}
/**
* union incoming sets
*
* @param incomingSets
* @throws IOException
*/
public void union(Iterator<TextBytes> incomingSets)throws IOException {
reset();
ArrayList<ArrayList<Integer>> arrayOfSets = new ArrayList<ArrayList<Integer>>();
while (incomingSets.hasNext()) {
// allocate a new set array
ArrayList<Integer> setIdArray = new ArrayList<Integer>();
TextBytes setJSON = incomingSets.next();
try {
//
JsonArray array = parser.parse(setJSON.toString()).getAsJsonArray();
for (JsonElement element : array) {
JsonObject data = element.getAsJsonObject();
long domainHash = data.get("dh").getAsLong();
long urlHash = data.get("uh").getAsLong();
String url = data.get("url").getAsString();
int ipAddress = data.get("ip").getAsInt();
int length = data.get("length").getAsInt();
// insert the item into meta set, get back an id ...
int id = insertItemGetId(domainHash, urlHash,ipAddress,length, url);
// add id to local set
setIdArray.add(id);
}
// if not disjoint ...
if (setIdArray.size() > 1){
// sort new set first ...
Collections.sort(setIdArray);
// ok add this set to list of sets ...
arrayOfSets.add(setIdArray);
}
}
catch (Exception e) {
LOG.error("Exceptin in UnionFinder:" + CCStringUtils.stringifyException(e));
throw new IOException(e);
}
}
// allocate id array
id = new int[lastUsedId+1];
// assume all sets are disjoint upfront ...
for (int i = 0; i <= lastUsedId; i++)
id[i] = i;
// ok walk individual sets
for (ArrayList<Integer> idSet : arrayOfSets) {
// get root id
int rootId = idSet.get(0);
// walk remaining members and union to root
for (int i=1;i<idSet.size();++i) {
union(idSet.get(i),rootId);
}
}
}
static long readLongComponent(DataOutputBuffer buffer,int index,int componentIndex)throws IOException {
byte readBuffer[] = buffer.getData();
int offset = (index * SIZEOF_DATABUF_ENTRY) + (componentIndex * 8);
return (((long)readBuffer[offset+0] << 56) +
((long)(readBuffer[offset+1] & 255) << 48) +
((long)(readBuffer[offset+2] & 255) << 40) +
((long)(readBuffer[offset+3] & 255) << 32) +
((long)(readBuffer[offset+4] & 255) << 24) +
((readBuffer[offset+5] & 255) << 16) +
((readBuffer[offset+6] & 255) << 8) +
((readBuffer[offset+7] & 255) << 0));
}
public void emit(TextBytes rootKey,OutputCollector<TextBytes,TextBytes> collector,Reporter reporter)throws IOException {
// and a text bytes to collect url data
TextBytes urlSampler = new TextBytes();
// walk all members of the set
for (int i = 0; i < id.length; ++i) {
// see if this is a root item
if (id[i] == i) {
// reset set builder ...
setBuilder.reset();
// iterate the entire set
for (int j = 0; j < id.length; ++j) {
// if current item's root is current root ...
if (id[j] == i){
// add item to set builder
setBuilder.add(
readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX),
readLongComponent(_dataBuffer, j, URLHASH_COMPONENT_IDX),
readLongComponent(_dataBuffer, j, IP_ADDRESS_AND_LEN_COMPONENT),
textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX)));
}
}
// emit data ...
TextBytes setDataOut = setBuilder.flush();
collector.collect(rootKey, setDataOut);
}
}
}
// are elements p and q in the same component?
boolean connected(int p, int q) {
return id[p] == id[q];
}
// merge components containing p and q
void union(int p, int q) {
if (connected(p, q))
return;
int pid = id[p];
for (int i = 0; i < id.length; i++)
if (id[i] == pid)
id[i] = id[q];
count--;
}
}
/**
 * Test helper: parses a JSON set and records the "uh" value of every element
 * in {@code map} under the root domain's string form.
 */
static private void populateTestJSONSetData(Multimap<String,Long> map, TextBytes rootDomain, TextBytes jsonPayload) throws IOException {
  JsonArray tuples = new JsonParser().parse(jsonPayload.toString()).getAsJsonArray();
  for (int i = 0; i < tuples.size(); ++i) {
    JsonObject tuple = tuples.get(i).getAsJsonObject();
    long urlHash = tuple.get("uh").getAsLong();
    map.put(rootDomain.toString(), urlHash);
  }
}
/**
 * Ad-hoc self test. Times URLFPBloomFilter.clear(), prints the pattern table,
 * validates the key generator against testCases, then runs SimhashMatcher and
 * SetUnionFinder over small hand-built inputs, printing what they emit.
 *
 * @param args not used
 * @throws IOException if the matcher or union finder fails
 */
public static void main(String[] args) throws IOException {
// measure how long clearing a filter of the size used by JSONSetBuilder takes (nanoseconds)
URLFPBloomFilter filter = new URLFPBloomFilter(JSONSetBuilder.NUM_ELEMENTS, JSONSetBuilder.NUM_HASH_FUNCTIONS, JSONSetBuilder.NUM_BITS);
DescriptiveStatistics filterClearStats = new DescriptiveStatistics();
for (int i=0;i<1000;++i) {
long timeStart = System.nanoTime();
filter.clear();
long timeEnd = System.nanoTime();
filterClearStats.addValue(timeEnd - timeStart);
}
System.out.println("Mean Clear Time:" + filterClearStats.getMean());
// dump every pattern as a decimal value, its six pattern bits, and its key width
System.out.println("size:" + BINOMIAL_COFF);
for (int j=0;j<BINOMIAL_COFF;++j) {
int value = patternArray[j];
System.out.print("value:" + value + " ");
for (int i=5;i>=0;--i) {
System.out.print(((value & (1 << i)) != 0)? '1':'0');
}
System.out.print(" Key MSBLen:" + Integer.toString(patternKeyMSBits[j]) + "\n");
}
// check buildKeyForPatternIndex against the predefined test cases
validateGenerator();
// hand-built 64-bit simhash values; the assertions below pin their pairwise hamming distances
long key1 = new BitBuilder().on(10).off(1).on(53).bits();
long key2 = new BitBuilder().on(10).off(4).on(50).bits();
long key3 = new BitBuilder().on(10).off(4).on(47).off(3).bits();
long key4 = new BitBuilder().off(10).on(4).off(47).on(3).bits();
long key5 = new BitBuilder().off(10).on(4).off(47).on(1).off(2).bits();
Assert.assertTrue(SimHash.hammingDistance(key1, key2) == 3);
Assert.assertTrue(SimHash.hammingDistance(key1, key3) != 3);
Assert.assertTrue(SimHash.hammingDistance(key2, key3) == 3);
Assert.assertTrue(SimHash.hammingDistance(key1, key4) > 3);
Assert.assertTrue(SimHash.hammingDistance(key2, key4) > 3);
Assert.assertTrue(SimHash.hammingDistance(key3, key4) > 3);
Assert.assertTrue(SimHash.hammingDistance(key4, key5) <= 3);
// first input: five documents, each on its own domain
ImmutableList<DeduperValue> values = new ImmutableList.Builder<DeduperValue>()
.add(new DeduperValue(key1,1000,2000,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.1"),1000,new TextBytes("http://adomain.com/")))
.add(new DeduperValue(key2,1001,2001,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://bdomain.com/")))
.add(new DeduperValue(key3,1002,2002,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.3"),1000,new TextBytes("http://cdomain.com/")))
.add(new DeduperValue(key4,1003,2003,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.4"),1000,new TextBytes("http://ddomain.com/")))
.add(new DeduperValue(key5,1004,2004,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.5"),1000,new TextBytes("http://edomain.com/")))
.build();
SimhashMatcher unionFinder = new SimhashMatcher();
// root domain name -> url hashes seen in the sets emitted for that domain
final Multimap<String,Long> rootDomainToDupes = TreeMultimap.create();
// collect all json set representations ...
final ArrayList<TextBytes> jsonSets = new ArrayList<TextBytes>();
// the reporter argument is null here; emitMatches only dereferences it on its overflow path
unionFinder.emitMatches(3,values.iterator(),new OutputCollector<TextBytes, TextBytes>() {
@Override
public void collect(TextBytes key, TextBytes value)throws IOException {
System.out.println("Root:" + key
+ " JSON: " + value.toString() );
populateTestJSONSetData(rootDomainToDupes,key,value);
// collect all json sets for later disjoint-set join
jsonSets.add(value);
}
},null);
// expected grouping: url hashes 2000-2002 form one set, 2003-2004 another
ImmutableList<Long> hashSuperSet1 = ImmutableList.of(2000L,2001L,2002L);
ImmutableList<Long> hashSuperSet2 = ImmutableList.of(2003L,2004L);
Assert.assertTrue(rootDomainToDupes.get("adomain.com").containsAll(hashSuperSet1));
Assert.assertTrue(rootDomainToDupes.get("bdomain.com").containsAll(hashSuperSet1));
Assert.assertTrue(rootDomainToDupes.get("cdomain.com").containsAll(hashSuperSet1));
Assert.assertTrue(rootDomainToDupes.get("ddomain.com").containsAll(hashSuperSet2));
Assert.assertTrue(rootDomainToDupes.get("edomain.com").containsAll(hashSuperSet2));
// second input: repeats url hash 2000 (adomain.com) next to three new documents
ImmutableList<DeduperValue> secondSetValues = new ImmutableList.Builder<DeduperValue>()
.add(new DeduperValue(key1,1000,2000,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://adomain.com/")))
.add(new DeduperValue(key1,1007,2007,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://z1domain.com/")))
.add(new DeduperValue(key2,1008,2008,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://z2domain.com/")))
.add(new DeduperValue(key3,1009,2009,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://z3domain.com/")))
.build();
// the same matcher instance is reused; emitMatches resets its buffers on entry
unionFinder.emitMatches(3,secondSetValues.iterator(),new OutputCollector<TextBytes, TextBytes>() {
@Override
public void collect(TextBytes key, TextBytes value)throws IOException {
System.out.println("Root:" + key
+ " JSON: " + value.toString() );
// collect all json sets for later disjoint-set join
jsonSets.add(value);
}
},null);
SetUnionFinder unionFinder2 = new SetUnionFinder();
// union all json sets ...
unionFinder2.union(jsonSets.iterator());
// ok emit union of sets ...
// the result is only printed, not asserted
unionFinder2.emit(new TextBytes("test"), new OutputCollector<TextBytes, TextBytes>() {
@Override
public void collect(TextBytes key, TextBytes value) throws IOException {
System.out.println("Root:" + key
+ " JSON: " + value.toString() );
}
},null);
}
}