/** * Copyright 2012 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.mapred.ec2.postprocess.deduper; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.io.OutputStreamWriter; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.util.CCStringUtils; import org.commoncrawl.util.GoogleURL; import org.commoncrawl.util.IPAddressUtils; import org.commoncrawl.util.SimHash; import org.commoncrawl.util.TextBytes; import org.commoncrawl.util.URLFPBloomFilter; import org.commoncrawl.util.URLUtils; import org.junit.Assert; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Multimap; import com.google.common.collect.TreeMultimap; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParser; import com.google.gson.stream.JsonWriter; /** * Various utilities and classes to support dedupe rewrite * @author rana * */ public class DeduperUtils { static final Log LOG = LogFactory.getLog(DeduperUtils.class); /** * key consisting of the pattern index and * the key bits * * @author rana * */ public static class DeduperKey extends LongWritable { static final long KEY_COMPONENT_MASK = 0xFFFFFFFFFFFFL; static final long PATTERN_COMPONENT_MASK = 0xFFFF000000000000L; static final int PATTERN_BITS = 16; public static void setKey(LongWritable writableTarget,int patternIndex,long key) { writableTarget.set(keyToLong(patternIndex,key)); } public static long keyToLong(int patternIndex,long keyValue) { return ( ((long)patternIndex) << (64-PATTERN_BITS)) | (keyValue >> (64-patternKeyMSBits[patternIndex]) & KEY_COMPONENT_MASK); } public static long keyFromLong(long longValue) { return (longValue & KEY_COMPONENT_MASK); } public static int patternIndexFromLong(long longValue) { return (int) (longValue >>> (64-PATTERN_BITS)); } } /** * DeduperValue * * @author rana * */ public static class DeduperValue implements Writable { public long _simHashValue; public long _rootHash; public long _urlHash; public int _srcIP; public int _srcContentLen; public TextBytes _urlText = new TextBytes(); public DeduperValue() { } public DeduperValue(long simhashValue,long rootHash,long urlHashValue,int srcIP,int srcContentLen, TextBytes urlText) { setValue(simhashValue, rootHash, urlHashValue,srcIP,srcContentLen,urlText); } public void setValue(long simHashValue,long rootHash,long urlHashValue,int srcIP,int srcContentLen,TextBytes urlText) { _simHashValue = simHashValue; _rootHash = rootHash; _urlHash = urlHashValue; _urlText.set(urlText); _srcIP = srcIP; _srcContentLen = srcContentLen; } @Override public void readFields(DataInput in) throws IOException { _simHashValue = in.readLong(); _rootHash = in.readLong(); _urlHash = in.readLong(); _srcIP = in.readInt(); _srcContentLen = in.readInt(); _urlText.readFields(in); } @Override public void write(DataOutput out) throws IOException { out.writeLong(_simHashValue); out.writeLong(_rootHash); out.writeLong(_urlHash); out.writeInt(_srcIP); out.writeInt(_srcContentLen); _urlText.write(out); } } /** * * @author rana * */ public static class DeduperSetTuple implements Writable { public long _rootHashA; public long _urlHashA; public long _rootHashB; public long _urlHashB; public TextBytes _textURLA = new TextBytes(); public TextBytes _textURLB = new TextBytes(); public DeduperSetTuple() { } public void setIntegralValues(long rootHashA,long urlHashA, long rootHashB,long urlHashB) { _rootHashA = rootHashA; _urlHashA = urlHashA; _rootHashB = rootHashB; _urlHashB = urlHashB; } @Override public void readFields(DataInput in) throws IOException { _rootHashA = in.readLong(); _urlHashA = in.readLong(); _rootHashB = in.readLong(); _urlHashB = in.readLong(); _textURLA.readFields(in); _textURLB.readFields(in); } @Override public void write(DataOutput out) throws IOException { out.writeLong(_rootHashA); out.writeLong(_urlHashA); out.writeLong(_rootHashB); out.writeLong(_urlHashB); _textURLA.write(out); _textURLB.write(out); } } static final int TOTAL_CHUNKS = 6; static final int K = 3; static final int BINOMIAL_COFF = 20; //PRECOMPUTED - BASED ON (n=6,k=3) static final int CHUNK_LENGTHS[] = { 11, 11, 11, 11, 10, 10 }; // based on n == 6 and k == 3 static final int patternArray[]; static final int patternKeyMSBits[]; static { patternArray = new int[BINOMIAL_COFF]; patternKeyMSBits = new int[BINOMIAL_COFF]; // run through all 64 combinations looking for // the ones where only three out six bits are ones int patternIndex=0; for (int i=0;i<=63;++i) { int test = i; int oneBitsCount = 0; int chunkIndex=TOTAL_CHUNKS-1; int keyMSBits = 0; while (test != 0) { if ((test & 0x01) == 1) { oneBitsCount++; keyMSBits += CHUNK_LENGTHS[chunkIndex]; } test >>= 1; chunkIndex--; } if (oneBitsCount == K) { patternArray[patternIndex] = i; patternKeyMSBits[patternIndex] = keyMSBits; patternIndex++; } } } static final long ELEVEN_BITS_MASK = 0x7FF; static final long TEN_BITS_MASK = 0x3FF; static final int CHUNK_POS[] = { 0, 11, 22, 33, 44, 54 }; static final long CHUNK_MASKS[] = { ELEVEN_BITS_MASK, ELEVEN_BITS_MASK, ELEVEN_BITS_MASK, ELEVEN_BITS_MASK, TEN_BITS_MASK, TEN_BITS_MASK }; /** * Divide incoming key into chunks and then produce a resulting key based on the defined bit pattern * * @param pattern * @param originalValue * @return */ public static long buildKeyForPatternIndex(int patternIdx,long originalValue) { // get the bit pattern specifying key/non-key chunk for the given // pattern index int pattern = patternArray[patternIdx]; long keyOut = 0; int onChunkPos = 0; int offChunkPos = 0; //TODO: GOING WITH THE LESS EFFICIENT ROUTE FOR EXPEDIENCY'S SAKE //TODO: WE ONLY GENERATE THE KEY COMPONENT, AND SKIP THE VALUE BITS ALTOGETHER for (int pass=0;pass<1;++pass) { for (int chunkNumber=0;chunkNumber<TOTAL_CHUNKS;++chunkNumber) { // figure out on or off ... boolean onChunk = ((pattern & (1 << (TOTAL_CHUNKS - (chunkNumber + 1)))) != 0); if (pass == 0 && onChunk) { // get chunk bits ... //System.out.println("Chunk:" + chunkNumber + " is on"); long chunkBits = ((originalValue >>> (64 - (CHUNK_POS[chunkNumber]+ CHUNK_LENGTHS[chunkNumber]))) & CHUNK_MASKS[chunkNumber]); //System.out.println("Chunk Bits are:" + Long.toHexString(chunkBits)); // shift back in keyOut |= (chunkBits << (64 - (onChunkPos+CHUNK_LENGTHS[chunkNumber]))); // increment offset ... onChunkPos += CHUNK_LENGTHS[chunkNumber]; } else if (pass == 1 && !onChunk) { //System.out.println("Chunk:" + chunkNumber + " is off"); // get chunk bits ... long chunkBits = ((originalValue >>> (64 - (CHUNK_POS[chunkNumber]+ CHUNK_LENGTHS[chunkNumber]))) & CHUNK_MASKS[chunkNumber]); //System.out.println("Chunk Bits are:" + Long.toHexString(chunkBits)); // shift back in keyOut |= (chunkBits << (64 - (onChunkPos+offChunkPos+CHUNK_LENGTHS[chunkNumber]))); // increment offset offChunkPos += CHUNK_LENGTHS[chunkNumber]; } } } return keyOut; } /** The various chunk combinations and their bit representations ... Values: [7, 11, 13, 14, 19, 21, 22, 25, 26, 28, 35, 37, 38, 41, 42, 44, 49, 50, 52, 56] Bits: 000111 001011 001101 001110 010011 010101 010110 011001 011010 011100 100011 100101 100110 101001 101010 101100 110001 110010 110100 111000 */ /** * BitBuilder helper class */ static class BitBuilder { long bits; int count; BitBuilder() { bits = 0; count = 0; } BitBuilder on(int amt) { for (int i=0;i<amt;++i) { bits = bits << 1; bits |= 1L; } return this; } BitBuilder off(int amt) { bits = bits << amt; return this; } long bits() { return bits; } } /** * TestCase - pattern generator validator * * @author rana * */ static class TestCase { int _patternIdx; long _key; long _expectedResult; TestCase(int patternIdx,long key,long expectedResult) { _patternIdx = patternIdx; _key = key; _expectedResult = expectedResult; } void validate() { long expectedResult = (_expectedResult & new BitBuilder().on(patternKeyMSBits[_patternIdx]).off(64-patternKeyMSBits[_patternIdx]).bits()); System.out.println("pattern:" + Integer.toHexString(patternArray[_patternIdx]) + " testKey:" + Long.toHexString(_key) + " expectedKey:" + Long.toHexString(expectedResult)); Assert.assertEquals(expectedResult,buildKeyForPatternIndex(_patternIdx,_key)); } } static final long FIRST_VALUE = 10; static final long SECOND_VALUE = 11; static final long THIRD_VALUE = 12; //TODO: NEED A MORE SANE WAY TO DEFINE TEST CASES ... static ImmutableSet<TestCase> testCases = new ImmutableSet.Builder<TestCase>() // 000111 .add(new TestCase(0,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - 11)) | (SECOND_VALUE << (64 - (11+10))) | (THIRD_VALUE << (64 - (11+10+10)))))) // 001011 .add(new TestCase(1,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((SECOND_VALUE << (64 - (11+10))) | (THIRD_VALUE << (64 - (11+10+10))) | FIRST_VALUE))) // 001101 .add(new TestCase(2,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - (11+11))) | (THIRD_VALUE << (64 - (11+11+10))) | SECOND_VALUE))) // 001110 .add(new TestCase(3,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - (11 + 11))) | (SECOND_VALUE << (64 - (11+11+10))) | THIRD_VALUE))) // 010011 .add(new TestCase(4,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((SECOND_VALUE << (64 - (11+10))) | (THIRD_VALUE << (64 - (11+10+10))) |FIRST_VALUE))) // 010101 .add(new TestCase(5,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64 - (11 + 11))) | (THIRD_VALUE << (64 - (11+11+10))) |SECOND_VALUE))) //110010 .add(new TestCase(17,((FIRST_VALUE << (10 + 10)) | (SECOND_VALUE << (10)) | (THIRD_VALUE << 0)),((SECOND_VALUE << (64 - (11 + 11 + 10))) | (FIRST_VALUE << (10) |THIRD_VALUE)))) //110010 (REPEAT) .add(new TestCase(17,((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | THIRD_VALUE))) //110100 .add(new TestCase(18,((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << 0)),((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | THIRD_VALUE))) //111000 .add(new TestCase(19,((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << (11+11+11))),((FIRST_VALUE << (64-11)) | (SECOND_VALUE << (64 - (11+11))) | (THIRD_VALUE << (11+11+11))))) .build(); static void validateGenerator() { for (TestCase testCase : testCases) { testCase.validate(); } } public static class JSONSetBuilder { public static final int NUM_HASH_FUNCTIONS = 10; public static final int NUM_BITS = 11; public static final int NUM_ELEMENTS = 1 << 18; DataOutputBuffer _outputBuffer = new DataOutputBuffer(); JsonWriter writer; URLFPBloomFilter filter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS); public JSONSetBuilder() throws IOException { reset(); } public void reset() throws IOException { filter.clear(); _outputBuffer.reset(); writer = new JsonWriter(new OutputStreamWriter(_outputBuffer, Charset.forName("UTF-8"))); writer.beginArray(); } URLFPV2 fp = new URLFPV2(); public void add(long rootDomainHash,long urlHash,long ipAddressAndLenPacked,TextBytes urlData)throws IOException { fp.setRootDomainHash(rootDomainHash); fp.setDomainHash(rootDomainHash); fp.setUrlHash(urlHash); if (!filter.isPresent(fp)) { filter.add(fp); writer.beginObject(); writer.name("dh").value(rootDomainHash); writer.name("uh").value(urlHash); writer.name("url").value(urlData.toString()); // high word is ip address writer.name("ip").value((int)((ipAddressAndLenPacked >> 32) & 0xFFFFFFFF)); writer.name("length").value((int)(ipAddressAndLenPacked & 0xFFFFFFFF)); writer.endObject(); } } public TextBytes flush() throws IOException { writer.endArray(); writer.flush(); TextBytes textBytes = new TextBytes(); textBytes.set(_outputBuffer.getData(), 0, _outputBuffer.getLength()); return textBytes; } } /** * Build sets by comparing simhash values * * @author rana * */ public static class SimhashMatcher { private DataOutputBuffer _dataBuffer = new DataOutputBuffer(); private DataOutputBuffer _textDataBuffer = new DataOutputBuffer(); private int[] id; private int count; JSONSetBuilder setBuilder; private static final int SIZEOF_DATABUF_ENTRY = 8 * 5; public static final int SIMHASH_COMPONENT_IDX = 0; public static final int ROOTHASH_COMPONENT_IDX = 1; public static final int URLHASH_COMPONENT_IDX = 2; public static final int IP_AND_LEN_COMPONENT_IDX = 3; public static final int TEXT_DATA_COMPONENT_IDX = 4; /** * Constructor - slurp in all values associated with current deduper key... * @param valueIterator * @throws IOException */ public SimhashMatcher() throws IOException { setBuilder = new JSONSetBuilder(); } static long readLongComponent(DataOutputBuffer buffer,int index,int componentIndex)throws IOException { byte readBuffer[] = buffer.getData(); int offset = (index * SIZEOF_DATABUF_ENTRY) + (componentIndex * 8); return (((long)readBuffer[offset+0] << 56) + ((long)(readBuffer[offset+1] & 255) << 48) + ((long)(readBuffer[offset+2] & 255) << 40) + ((long)(readBuffer[offset+3] & 255) << 32) + ((long)(readBuffer[offset+4] & 255) << 24) + ((readBuffer[offset+5] & 255) << 16) + ((readBuffer[offset+6] & 255) << 8) + ((readBuffer[offset+7] & 255) << 0)); } TextBytes textFromPackedLongInfo(TextBytes textToPopulate,long packedValue)throws IOException { int offset = (int)((packedValue >> 32) & 0xFFFFFFFFL); int length = (int)(packedValue & 0xFFFFFFFFL); textToPopulate.set(_textDataBuffer.getData(),offset,length); return textToPopulate; } private void collectRoots(Map<Long,TextBytes> rootDomainMap,TextBytes urlSampler,int N,int rootItemIndex)throws IOException { // iterate the set looking for other items that have the same root for (int j = 0; j < N; ++j) { // ok found a match ... if (id[j] == rootItemIndex && j != rootItemIndex){ long rootDomainA = readLongComponent(_dataBuffer, rootItemIndex, ROOTHASH_COMPONENT_IDX); // OK .. ONE BIG LAST MINUTE HACK :-( - Need to join by root domain text key, not the long value ... :-( // so we need to extract the key here... from the first matching hit url ... if (!rootDomainMap.containsKey(rootDomainA)) { textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, rootItemIndex,TEXT_DATA_COMPONENT_IDX)); String rootDomainStr = URLUtils.extractRootDomainName(new GoogleURL(urlSampler.toString()).getHost()); if (rootDomainStr != null) { rootDomainMap.put(rootDomainA, new TextBytes(rootDomainStr)); } } // ok now do the same thing for the second component ... long rootDomainB = readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX); if (rootDomainA != rootDomainB) { if (!rootDomainMap.containsKey(rootDomainB)) { textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX)); String rootDomainStr = URLUtils.extractRootDomainName(new GoogleURL(urlSampler.toString()).getHost()); if (rootDomainStr != null) { rootDomainMap.put(rootDomainB, new TextBytes(rootDomainStr)); } } } } } } private static final int EXTRA_DOMAIN_MAX_SAMPLE_SIZE = 100; private static final int OVERFLOW_THRESHOLD = 1 << 18; /** * emit any matched sets * * @param collector * @throws IOException */ public void emitMatches(int maxHammingDistance,Iterator<DeduperValue> valueIterator,OutputCollector<TextBytes,TextBytes> collector,Reporter reporter) throws IOException { _dataBuffer.reset(); _textDataBuffer.reset(); int itemCount = 0; // ok slurp in values ... while (valueIterator.hasNext()) { if (++itemCount >= OVERFLOW_THRESHOLD) { break; } DeduperValue value = valueIterator.next(); _dataBuffer.writeLong(value._simHashValue); _dataBuffer.writeLong(value._rootHash); _dataBuffer.writeLong(value._urlHash); _dataBuffer.writeInt(value._srcIP); _dataBuffer.writeInt(value._srcContentLen); int originalSize = _textDataBuffer.size(); // write offset _dataBuffer.writeInt(originalSize); _textDataBuffer.write(value._urlText.getBytes(),value._urlText.getOffset(),value._urlText.getLength()); // write length _dataBuffer.writeInt(_textDataBuffer.size() - originalSize); } if (itemCount < OVERFLOW_THRESHOLD) { // count entries in data buffer int N = count = _dataBuffer.size() / SIZEOF_DATABUF_ENTRY; // allocate id array id = new int[N]; // assume all sets are disjoint upfront ... for (int i = 0; i < N; i++) id[i] = i; // ok time to start iteration ... for (int i=0;i<N;++i) { // forward scan potential match candidates ... for (int j=i+1;j<N;++j) { // if not already matched ... if (id[i] != id[j]) { if (SimHash.hammingDistance( readLongComponent(_dataBuffer, i, SIMHASH_COMPONENT_IDX), readLongComponent(_dataBuffer, j, SIMHASH_COMPONENT_IDX)) <= maxHammingDistance) { // match ... // union it ... union(j,i); } } } } // time to emit sets ... for (int i = 0; i < N; ++i) { // see if this is a root item if (id[i] == i) { // allocate hash set to contain root Domains HashMap<Long,TextBytes> rootDomainMap = new HashMap<Long,TextBytes>(); // and a text bytes to collect url data TextBytes urlSampler = new TextBytes(); // collect roots ... collectRoots(rootDomainMap, urlSampler, N, i); // ok walk roots... for (Map.Entry<Long,TextBytes> rootEntry : rootDomainMap.entrySet()) { // for each root ... walk items // reset set builder ... setBuilder.reset(); // reset extra domain item count int extraDomainItemCount = 0; // iterate the set for (int j = 0; j < N; ++j) { // if in set ... if (id[j] == i){ // get root domain of entry ... long itemRootDomain = readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX); // IFF pass 0 .. only process documents from our root domain ... if (itemRootDomain == rootEntry.getKey()) { // add item no matter what ... setBuilder.add( readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX), readLongComponent(_dataBuffer, j, URLHASH_COMPONENT_IDX), readLongComponent(_dataBuffer, j, IP_AND_LEN_COMPONENT_IDX), textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX))); } else { if (extraDomainItemCount++ < EXTRA_DOMAIN_MAX_SAMPLE_SIZE) { setBuilder.add( readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX), readLongComponent(_dataBuffer, j, URLHASH_COMPONENT_IDX), readLongComponent(_dataBuffer, j, IP_AND_LEN_COMPONENT_IDX), textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX))); } } } } // emit data ... TextBytes setDataOut = setBuilder.flush(); collector.collect(rootEntry.getValue(), setDataOut); } } } } else { LOG.error("Hit too many items in set! - skipping"); reporter.incrCounter("", "skipping-overflow-set", 1); int N = count = _dataBuffer.size() / SIZEOF_DATABUF_ENTRY; for (int i=0;i<100;++i) { TextBytes urlSampler = new TextBytes(); textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, i,TEXT_DATA_COMPONENT_IDX)); LOG.error("Skipped URL Sample:" + urlSampler.toString()); } } } // Return component identifier for component containing p int find(int p) { return id[p]; } // are elements p and q in the same component? boolean connected(int p, int q) { return id[p] == id[q]; } // merge components containing p and q void union(int p, int q) { if (connected(p, q)) return; int pid = id[p]; for (int i = 0; i < id.length; i++) if (id[i] == pid) id[i] = id[q]; count--; } } /** * union incoming sets * * @author rana * */ public static class SetUnionFinder { public static final int NUM_HASH_FUNCTIONS = 10; public static final int NUM_BITS = 11; public static final int NUM_ELEMENTS = 1 << 18; private DataOutputBuffer _dataBuffer = new DataOutputBuffer(); private DataOutputBuffer _textDataBuffer = new DataOutputBuffer(); private int[] id; private int count; private URLFPBloomFilter filter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS); private JsonParser parser = new JsonParser(); private URLFPV2 fp = new URLFPV2(); private int lastUsedId=-1; private TextBytes textBytes = new TextBytes(); private TreeMap<Long,Integer> hashToIdMap = new TreeMap<Long,Integer>(); private JSONSetBuilder setBuilder; private static final int SIZEOF_DATABUF_ENTRY = 8 * 4; public static final int ROOTHASH_COMPONENT_IDX = 0; public static final int URLHASH_COMPONENT_IDX = 1; public static final int IP_ADDRESS_AND_LEN_COMPONENT = 2; public static final int TEXT_DATA_COMPONENT_IDX = 3; private void reset() throws IOException { _dataBuffer.reset(); _textDataBuffer.reset(); filter.clear(); lastUsedId = -1; hashToIdMap.clear(); if (setBuilder == null) { setBuilder = new JSONSetBuilder(); } else { setBuilder.reset(); } } private int insertItemGetId(long domainHash,long urlHash,int ipAddress,int length,String url)throws IOException { Integer existingId = hashToIdMap.get(urlHash); if (existingId == null) { // make string to utf-8 bytes ... textBytes.set(url); // write out id info _dataBuffer.writeLong(domainHash); _dataBuffer.writeLong(urlHash); _dataBuffer.writeInt(ipAddress); _dataBuffer.writeInt(length); // and string int originalSize = _textDataBuffer.size(); // write offset _dataBuffer.writeInt(originalSize); _textDataBuffer.write(textBytes.getBytes(),0,textBytes.getLength()); // write length _dataBuffer.writeInt(_textDataBuffer.size() - originalSize); hashToIdMap.put(urlHash, ++lastUsedId); return lastUsedId; } return existingId; } private TextBytes textFromPackedLongInfo(TextBytes textToPopulate,long packedValue)throws IOException { int offset = (int)((packedValue >> 32) & 0xFFFFFFFFL); int length = (int)(packedValue & 0xFFFFFFFFL); textToPopulate.set(_textDataBuffer.getData(),offset,length); return textToPopulate; } /** * union incoming sets * * @param incomingSets * @throws IOException */ public void union(Iterator<TextBytes> incomingSets)throws IOException { reset(); ArrayList<ArrayList<Integer>> arrayOfSets = new ArrayList<ArrayList<Integer>>(); while (incomingSets.hasNext()) { // allocate a new set array ArrayList<Integer> setIdArray = new ArrayList<Integer>(); TextBytes setJSON = incomingSets.next(); try { // JsonArray array = parser.parse(setJSON.toString()).getAsJsonArray(); for (JsonElement element : array) { JsonObject data = element.getAsJsonObject(); long domainHash = data.get("dh").getAsLong(); long urlHash = data.get("uh").getAsLong(); String url = data.get("url").getAsString(); int ipAddress = data.get("ip").getAsInt(); int length = data.get("length").getAsInt(); // insert the item into meta set, get back an id ... int id = insertItemGetId(domainHash, urlHash,ipAddress,length, url); // add id to local set setIdArray.add(id); } // if not disjoint ... if (setIdArray.size() > 1){ // sort new set first ... Collections.sort(setIdArray); // ok add this set to list of sets ... arrayOfSets.add(setIdArray); } } catch (Exception e) { LOG.error("Exceptin in UnionFinder:" + CCStringUtils.stringifyException(e)); throw new IOException(e); } } // allocate id array id = new int[lastUsedId+1]; // assume all sets are disjoint upfront ... for (int i = 0; i <= lastUsedId; i++) id[i] = i; // ok walk individual sets for (ArrayList<Integer> idSet : arrayOfSets) { // get root id int rootId = idSet.get(0); // walk remaining members and union to root for (int i=1;i<idSet.size();++i) { union(idSet.get(i),rootId); } } } static long readLongComponent(DataOutputBuffer buffer,int index,int componentIndex)throws IOException { byte readBuffer[] = buffer.getData(); int offset = (index * SIZEOF_DATABUF_ENTRY) + (componentIndex * 8); return (((long)readBuffer[offset+0] << 56) + ((long)(readBuffer[offset+1] & 255) << 48) + ((long)(readBuffer[offset+2] & 255) << 40) + ((long)(readBuffer[offset+3] & 255) << 32) + ((long)(readBuffer[offset+4] & 255) << 24) + ((readBuffer[offset+5] & 255) << 16) + ((readBuffer[offset+6] & 255) << 8) + ((readBuffer[offset+7] & 255) << 0)); } public void emit(TextBytes rootKey,OutputCollector<TextBytes,TextBytes> collector,Reporter reporter)throws IOException { // and a text bytes to collect url data TextBytes urlSampler = new TextBytes(); // walk all members of the set for (int i = 0; i < id.length; ++i) { // see if this is a root item if (id[i] == i) { // reset set builder ... setBuilder.reset(); // iterate the entire set for (int j = 0; j < id.length; ++j) { // if current item's root is current root ... if (id[j] == i){ // add item to set builder setBuilder.add( readLongComponent(_dataBuffer, j, ROOTHASH_COMPONENT_IDX), readLongComponent(_dataBuffer, j, URLHASH_COMPONENT_IDX), readLongComponent(_dataBuffer, j, IP_ADDRESS_AND_LEN_COMPONENT), textFromPackedLongInfo(urlSampler,readLongComponent(_dataBuffer, j,TEXT_DATA_COMPONENT_IDX))); } } // emit data ... TextBytes setDataOut = setBuilder.flush(); collector.collect(rootKey, setDataOut); } } } // are elements p and q in the same component? boolean connected(int p, int q) { return id[p] == id[q]; } // merge components containing p and q void union(int p, int q) { if (connected(p, q)) return; int pid = id[p]; for (int i = 0; i < id.length; i++) if (id[i] == pid) id[i] = id[q]; count--; } } static private void populateTestJSONSetData(Multimap<String,Long> map,TextBytes rootDomain,TextBytes jsonPayload) throws IOException { JsonParser parser = new JsonParser(); JsonArray array = parser.parse(jsonPayload.toString()).getAsJsonArray(); for (JsonElement el : array) { JsonObject tuple = el.getAsJsonObject(); long urlHash = tuple.get("uh").getAsLong(); map.put(rootDomain.toString(), urlHash); } } /** * * @param args */ public static void main(String[] args) throws IOException { URLFPBloomFilter filter = new URLFPBloomFilter(JSONSetBuilder.NUM_ELEMENTS, JSONSetBuilder.NUM_HASH_FUNCTIONS, JSONSetBuilder.NUM_BITS); DescriptiveStatistics filterClearStats = new DescriptiveStatistics(); for (int i=0;i<1000;++i) { long timeStart = System.nanoTime(); filter.clear(); long timeEnd = System.nanoTime(); filterClearStats.addValue(timeEnd - timeStart); } System.out.println("Mean Clear Time:" + filterClearStats.getMean()); System.out.println("size:" + BINOMIAL_COFF); for (int j=0;j<BINOMIAL_COFF;++j) { int value = patternArray[j]; System.out.print("value:" + value + " "); for (int i=5;i>=0;--i) { System.out.print(((value & (1 << i)) != 0)? '1':'0'); } System.out.print(" Key MSBLen:" + Integer.toString(patternKeyMSBits[j]) + "\n"); } validateGenerator(); long key1 = new BitBuilder().on(10).off(1).on(53).bits(); long key2 = new BitBuilder().on(10).off(4).on(50).bits(); long key3 = new BitBuilder().on(10).off(4).on(47).off(3).bits(); long key4 = new BitBuilder().off(10).on(4).off(47).on(3).bits(); long key5 = new BitBuilder().off(10).on(4).off(47).on(1).off(2).bits(); Assert.assertTrue(SimHash.hammingDistance(key1, key2) == 3); Assert.assertTrue(SimHash.hammingDistance(key1, key3) != 3); Assert.assertTrue(SimHash.hammingDistance(key2, key3) == 3); Assert.assertTrue(SimHash.hammingDistance(key1, key4) > 3); Assert.assertTrue(SimHash.hammingDistance(key2, key4) > 3); Assert.assertTrue(SimHash.hammingDistance(key3, key4) > 3); Assert.assertTrue(SimHash.hammingDistance(key4, key5) <= 3); ImmutableList<DeduperValue> values = new ImmutableList.Builder<DeduperValue>() .add(new DeduperValue(key1,1000,2000,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.1"),1000,new TextBytes("http://adomain.com/"))) .add(new DeduperValue(key2,1001,2001,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://bdomain.com/"))) .add(new DeduperValue(key3,1002,2002,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.3"),1000,new TextBytes("http://cdomain.com/"))) .add(new DeduperValue(key4,1003,2003,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.4"),1000,new TextBytes("http://ddomain.com/"))) .add(new DeduperValue(key5,1004,2004,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.5"),1000,new TextBytes("http://edomain.com/"))) .build(); SimhashMatcher unionFinder = new SimhashMatcher(); final Multimap<String,Long> rootDomainToDupes = TreeMultimap.create(); // collect all json set representations ... final ArrayList<TextBytes> jsonSets = new ArrayList<TextBytes>(); unionFinder.emitMatches(3,values.iterator(),new OutputCollector<TextBytes, TextBytes>() { @Override public void collect(TextBytes key, TextBytes value)throws IOException { System.out.println("Root:" + key + " JSON: " + value.toString() ); populateTestJSONSetData(rootDomainToDupes,key,value); // collect all json sets for later disjoint-set join jsonSets.add(value); } },null); ImmutableList<Long> hashSuperSet1 = ImmutableList.of(2000L,2001L,2002L); ImmutableList<Long> hashSuperSet2 = ImmutableList.of(2003L,2004L); Assert.assertTrue(rootDomainToDupes.get("adomain.com").containsAll(hashSuperSet1)); Assert.assertTrue(rootDomainToDupes.get("bdomain.com").containsAll(hashSuperSet1)); Assert.assertTrue(rootDomainToDupes.get("cdomain.com").containsAll(hashSuperSet1)); Assert.assertTrue(rootDomainToDupes.get("ddomain.com").containsAll(hashSuperSet2)); Assert.assertTrue(rootDomainToDupes.get("edomain.com").containsAll(hashSuperSet2)); ImmutableList<DeduperValue> secondSetValues = new ImmutableList.Builder<DeduperValue>() .add(new DeduperValue(key1,1000,2000,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://adomain.com/"))) .add(new DeduperValue(key1,1007,2007,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://z1domain.com/"))) .add(new DeduperValue(key2,1008,2008,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://z2domain.com/"))) .add(new DeduperValue(key3,1009,2009,IPAddressUtils.IPV4AddressStrToInteger("10.0.0.2"),1000,new TextBytes("http://z3domain.com/"))) .build(); unionFinder.emitMatches(3,secondSetValues.iterator(),new OutputCollector<TextBytes, TextBytes>() { @Override public void collect(TextBytes key, TextBytes value)throws IOException { System.out.println("Root:" + key + " JSON: " + value.toString() ); // collect all json sets for later disjoint-set join jsonSets.add(value); } },null); SetUnionFinder unionFinder2 = new SetUnionFinder(); // union all json sets ... unionFinder2.union(jsonSets.iterator()); // ok emit union of sets ... unionFinder2.emit(new TextBytes("test"), new OutputCollector<TextBytes, TextBytes>() { @Override public void collect(TextBytes key, TextBytes value) throws IOException { System.out.println("Root:" + key + " JSON: " + value.toString() ); } },null); } }