/** * Copyright 2012 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.mapred.ec2.postprocess.crawldb; import java.io.DataInput; import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.io.DataInputBuffer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableUtils; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Partitioner; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.util.ByteArrayUtils; import org.commoncrawl.util.FlexBuffer; import org.commoncrawl.util.MurmurHash; import org.commoncrawl.util.TextBytes; import org.commoncrawl.util.URLUtils; import org.commoncrawl.util.Tuples.Pair; import org.junit.Assert; import org.junit.Test; /** * composite key encoded as a utf-8 string * contains various components, including root domain hash, domain hash, url hash, key type, and extra datum * * @author rana * */ public class CrawlDBKey extends TextBytes { private static final Log LOG = LogFactory.getLog(CrawlDBKey.class); long rootDomainHash; long domainHash; long urlHash; long type; long timestamp; FlexBuffer extraData = null; static FlexBuffer[] scanArray = allocateScanArray(); @Override public void readFields(DataInput in) throws IOException { // delegate to super class super.readFields(in); synchronized (scanArray) { // read components scanForComponents(this, ':', scanArray); // populate fields .. rootDomainHash = getLongComponentFromComponentArray(scanArray,ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID); domainHash = getLongComponentFromComponentArray(scanArray,ComponentId.DOMAIN_HASH_COMPONENT_ID); urlHash = getLongComponentFromComponentArray(scanArray,ComponentId.URL_HASH_COMPONENT_ID); type = getLongComponentFromComponentArray(scanArray,ComponentId.TYPE_COMPONENT_ID); if (type == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) timestamp = getLongComponentFromComponentArray(scanArray,ComponentId.EXTRA_DATA_COMPONENT_ID); else extraData = getByteArrayFromComponentArray(scanArray, CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID); } } public enum ComponentId { ROOT_DOMAIN_HASH_COMPONENT_ID, DOMAIN_HASH_COMPONENT_ID, URL_HASH_COMPONENT_ID, TYPE_COMPONENT_ID, EXTRA_DATA_COMPONENT_ID } public enum Type { KEY_TYPE_CRAWL_STATUS, KEY_TYPE_HTML_LINK, KEY_TYPE_ATOM_LINK, KEY_TYPE_RSS_LINK, KEY_TYPE_INCOMING_URLS_SAMPLE, KEY_TYPE_MERGED_RECORD, KEY_TYPE_ROOTDOMAIN_METADATA_RECORD, KEY_TYPE_SUBDOMAIN_METADATA_RECORD } public static final int TypeSortOrder[] = { 11, // KEY_TYPE_CRAWL_STATUS 12, // KEY_TYPE_HTML_LINK 13, // KEY_TYPE_ATOM_LINK 14, // KEY_TYPE_RSS_LINK 15, // KEY_TYPE_INCOMING_URLS_SAMPLE 10, // KEY_TYPE_MERGED_RECORD 0, // KEY_TYPE_ROOTDOMAIN_METADATA_RECORD 1, // KEY_TYPE_SUBDOMAIN_METADATA_RECORD }; public static FlexBuffer[] allocateScanArray() { FlexBuffer[] array = new FlexBuffer[ComponentId.values().length]; for (int i=0;i<array.length;++i) { array[i] = new FlexBuffer(); } return array; } public static TextBytes generateLinkKey(TextBytes url,CrawlDBKey.Type recordType,String md5Bytes) throws IOException { URLFPV2 fp = URLUtils.getURLFPV2FromURL(url.toString()); if (fp != null) { String key = fp.getRootDomainHash() +":"+fp.getDomainHash() +":"+fp.getUrlHash() +":"+recordType.ordinal() + ":" + ((md5Bytes != null) ? md5Bytes : ""); return new TextBytes(key); } return null; } public static TextBytes generateLinkKey(URLFPV2 fp,CrawlDBKey.Type recordType,String md5Bytes) throws IOException { if (fp != null) { String key = fp.getRootDomainHash() +":"+fp.getDomainHash() +":"+fp.getUrlHash() +":"+recordType.ordinal() +":" + ((md5Bytes != null) ? md5Bytes : ""); return new TextBytes(key); } return null; } public static Pair<TextBytes,TextBytes> generateMinMaxKeysForDomain(long rootDomainId,long subDomainId) throws IOException { String minKey = rootDomainId +":"+((subDomainId == -1) ? Long.MIN_VALUE : subDomainId) +":"+Long.MIN_VALUE +":"+Long.MIN_VALUE +":"; String maxKey = rootDomainId +":"+((subDomainId == -1) ? Long.MAX_VALUE : subDomainId) +":"+Long.MAX_VALUE +":"+Long.MAX_VALUE +":"; return new Pair<TextBytes,TextBytes>(new TextBytes(minKey),new TextBytes(maxKey)); } public static TextBytes generateCrawlStatusKey(Text url,long timestamp) throws IOException { URLFPV2 fp = URLUtils.getURLFPV2FromURL(url.toString()); if (fp != null) { String key = fp.getRootDomainHash() +":"+fp.getDomainHash() +":"+fp.getUrlHash() +":"+Type.KEY_TYPE_CRAWL_STATUS.ordinal() + ":" + timestamp; return new TextBytes(key); } return null; } public static TextBytes generateCrawlStatusKey(URLFPV2 fp,long timestamp) throws IOException { if (fp != null) { String key = fp.getRootDomainHash() +":"+fp.getDomainHash() +":"+fp.getUrlHash() +":"+Type.KEY_TYPE_CRAWL_STATUS.ordinal() + ":" + timestamp; return new TextBytes(key); } return null; } public static TextBytes generateKey(URLFPV2 fp,CrawlDBKey.Type type,long timestamp) throws IOException { if (fp != null) { String key = fp.getRootDomainHash() +":"+fp.getDomainHash() +":"+fp.getUrlHash() +":"+type.ordinal() + ":" + timestamp; return new TextBytes(key); } return null; } public static int scanForComponents(TextBytes key,int terminator,FlexBuffer[] parts) { int scanPos = key.getOffset(); int endPos = key.getOffset() + key.getLength() - 1; int partCount = 0; int tokenStart = key.getOffset(); byte[] data = key.getBytes(); do { if (scanPos == endPos || data[scanPos] == terminator) { if(data[scanPos] == terminator) parts[partCount++].set(data,tokenStart,scanPos-tokenStart); else parts[partCount++].set(data,tokenStart,scanPos-tokenStart + 1); tokenStart = scanPos + 1; } scanPos++; }while (scanPos <= endPos && partCount < parts.length); return partCount; } public static Pair<Integer,Integer> scanAndTerminateOn(byte[] data,int offset,int length,int terminator,int targetHitCount) { int scanPos = offset; int endPos = offset + length; int hitCount = 0; Pair<Integer,Integer> tupleOut = new Pair<Integer, Integer>(scanPos,0); while (scanPos != endPos) { if (data[scanPos] == terminator) { if (++hitCount == targetHitCount) break; else { tupleOut.e0 = scanPos + 1; scanPos++; } } else { scanPos++; } } tupleOut.e1 = scanPos - 1; return tupleOut; } public static class CrawlDBKeyPartitioner implements Partitioner<TextBytes, TextBytes> { static int hashCodeFromKey(TextBytes key) { int result = 1; result = MurmurHash.hashLong(getLongComponentFromKey(key, ComponentId.DOMAIN_HASH_COMPONENT_ID),result); result = MurmurHash.hashLong(getLongComponentFromKey(key, ComponentId.URL_HASH_COMPONENT_ID),result); return result; } @Override public int getPartition(TextBytes key, TextBytes value, int numPartitions) { return (hashCodeFromKey(key) & Integer.MAX_VALUE) % numPartitions; } @Override public void configure(JobConf job) { } } public static class PartitionBySuperDomainPartitioner implements Partitioner<TextBytes, TextBytes> { static int hashCodeFromKey(TextBytes key) { int result = 1; result = MurmurHash.hashLong(getLongComponentFromKey(key, ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID),result); return result; } @Override public int getPartition(TextBytes key, TextBytes value, int numPartitions) { return (hashCodeFromKey(key) & Integer.MAX_VALUE) % numPartitions; } @Override public void configure(JobConf job) { } } public static class CrawlDBKeyGroupByURLComparator implements RawComparator<TextBytes> { TextBytes key1 = new TextBytes(); TextBytes key2 = new TextBytes(); FlexBuffer scanArray1[] = allocateScanArray(); FlexBuffer scanArray2[] = allocateScanArray(); DataInputBuffer inputBuffer = new DataInputBuffer(); @Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int keyLen; try { inputBuffer.reset(b1,s1,l1); keyLen = WritableUtils.readVInt(inputBuffer); key1.set(b1,inputBuffer.getPosition(), keyLen); inputBuffer.reset(b2,s2,l2); keyLen = WritableUtils.readVInt(inputBuffer); key2.set(b2,inputBuffer.getPosition(), keyLen); return compare(key1,key2); } catch (IOException e) { throw new RuntimeException(e); } } @Override public int compare(TextBytes o1, TextBytes o2) { scanForComponents(o1, ':',scanArray1); scanForComponents(o2, ':',scanArray2); long domain1Key = getLongComponentFromComponentArray(scanArray1,ComponentId.DOMAIN_HASH_COMPONENT_ID); long domain2Key = getLongComponentFromComponentArray(scanArray2,ComponentId.DOMAIN_HASH_COMPONENT_ID); int result = (domain1Key < domain2Key) ? -1 : (domain1Key > domain2Key) ? 1 : 0; if (result == 0) { long hash1Key = getLongComponentFromComponentArray(scanArray1,ComponentId.URL_HASH_COMPONENT_ID); long hash2Key = getLongComponentFromComponentArray(scanArray2,ComponentId.URL_HASH_COMPONENT_ID); result = (hash1Key < hash2Key) ? -1 : (hash1Key > hash2Key) ? 1 : 0; } return result; } } public static class CrawlDBKeyGroupByRootDomainComparator implements RawComparator<TextBytes> { TextBytes key1 = new TextBytes(); TextBytes key2 = new TextBytes(); FlexBuffer scanArray1[] = allocateScanArray(); FlexBuffer scanArray2[] = allocateScanArray(); DataInputBuffer inputBuffer = new DataInputBuffer(); @Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int keyLen; try { inputBuffer.reset(b1,s1,l1); keyLen = WritableUtils.readVInt(inputBuffer); key1.set(b1,inputBuffer.getPosition(), keyLen); inputBuffer.reset(b2,s2,l2); keyLen = WritableUtils.readVInt(inputBuffer); key2.set(b2,inputBuffer.getPosition(), keyLen); return compare(key1,key2); } catch (IOException e) { throw new RuntimeException(e); } } @Override public int compare(TextBytes o1, TextBytes o2) { scanForComponents(o1, ':',scanArray1); scanForComponents(o2, ':',scanArray2); long rootDomain1Key = getLongComponentFromComponentArray(scanArray1,ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID); long rootDomain2Key = getLongComponentFromComponentArray(scanArray2,ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID); int result = (rootDomain1Key < rootDomain2Key) ? -1 : (rootDomain1Key > rootDomain2Key) ? 1 : 0; return result; } } public static class CrawlDBKeyComparator implements java.util.Comparator<CrawlDBKey> { @Override public int compare(CrawlDBKey o1, CrawlDBKey o2) { int result = (o1.rootDomainHash < o2.rootDomainHash) ? -1 : (o1.rootDomainHash > o2.rootDomainHash) ? 1 : 0; if (result == 0) { result = (o1.domainHash < o2.domainHash) ? -1 : (o1.domainHash > o2.domainHash) ? 1 : 0; if (result == 0) { result = (o1.urlHash < o2.urlHash) ? -1 : (o1.urlHash > o2.urlHash) ? 1 : 0; } if (result == 0) { long type1SortOrder = TypeSortOrder[(int)o1.type]; long type2SortOrder = TypeSortOrder[(int)o2.type]; result = (type1SortOrder < type2SortOrder) ? -1 : (type1SortOrder > type2SortOrder) ? 1 : 0; if (result == 0) { if (o1.type == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) { result = (o1.timestamp < o2.timestamp) ? -1 : (o1.timestamp > o2.timestamp) ? 1 : 0; } else { result = o1.extraData.compareTo(o2.extraData); } } } } return result; } } public static class LinkKeyComparator implements RawComparator<TextBytes> { TextBytes key1 = new TextBytes(); TextBytes key2 = new TextBytes(); DataInputBuffer inputBuffer = new DataInputBuffer(); FlexBuffer scanArray1[] = allocateScanArray(); FlexBuffer scanArray2[] = allocateScanArray(); @Override public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int keyLen; try { inputBuffer.reset(b1,s1,l1); keyLen = WritableUtils.readVInt(inputBuffer); key1.set(b1,inputBuffer.getPosition(), keyLen); inputBuffer.reset(b2,s2,l2); keyLen = WritableUtils.readVInt(inputBuffer); key2.set(b2,inputBuffer.getPosition(), keyLen); return compare(key1,key2); } catch (IOException e) { throw new RuntimeException(e); } } @Override public int compare(TextBytes o1, TextBytes o2) { scanForComponents(o1, ':',scanArray1); scanForComponents(o2, ':',scanArray2); long rootdomain1Key = getLongComponentFromComponentArray(scanArray1,ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID); long rootdomain2Key = getLongComponentFromComponentArray(scanArray2,ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID); int result = (rootdomain1Key < rootdomain2Key) ? -1 : (rootdomain1Key > rootdomain2Key) ? 1 : 0; if (result == 0) { long domain1Key = getLongComponentFromComponentArray(scanArray1,ComponentId.DOMAIN_HASH_COMPONENT_ID); long domain2Key = getLongComponentFromComponentArray(scanArray2,ComponentId.DOMAIN_HASH_COMPONENT_ID); result = (domain1Key < domain2Key) ? -1 : (domain1Key > domain2Key) ? 1 : 0; if (result == 0) { long hash1Key = getLongComponentFromComponentArray(scanArray1,ComponentId.URL_HASH_COMPONENT_ID); long hash2Key = getLongComponentFromComponentArray(scanArray2,ComponentId.URL_HASH_COMPONENT_ID); result = (hash1Key < hash2Key) ? -1 : (hash1Key > hash2Key) ? 1 : 0; } if (result == 0) { long type1 = getLongComponentFromComponentArray(scanArray1,ComponentId.TYPE_COMPONENT_ID); long type2 = getLongComponentFromComponentArray(scanArray2,ComponentId.TYPE_COMPONENT_ID); long type1SortOrder = TypeSortOrder[(int)type1]; long type2SortOrder = TypeSortOrder[(int)type2]; result = (type1SortOrder < type2SortOrder) ? -1 : (type1SortOrder > type2SortOrder) ? 1 : 0; if (result == 0) { if (type1 == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) { long timestamp1 = getLongComponentFromComponentArray(scanArray1,ComponentId.EXTRA_DATA_COMPONENT_ID); long timestamp2 = getLongComponentFromComponentArray(scanArray2,ComponentId.EXTRA_DATA_COMPONENT_ID); result = (timestamp1 < timestamp2) ? -1 : (timestamp1 > timestamp2) ? 1 : 0; } else { FlexBuffer bytes1 = getByteArrayFromComponentArray(scanArray1, CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID); FlexBuffer bytes2 = getByteArrayFromComponentArray(scanArray2, CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID); result = bytes1.compareTo(bytes2); } } } } return result; } } public static long getLongComponentFromComponentArray(FlexBuffer[] array,ComponentId componentId) { int index = componentId.ordinal(); return ByteArrayUtils.parseLong(array[index].get(),array[index].getOffset(), array[index].getCount(), 10); } public static long getLongComponentFromKey(TextBytes key,ComponentId componentId) { byte[] data = key.getBytes(); int offset = key.getOffset(); int length = key.getLength(); //long startTime = System.nanoTime(); Pair<Integer,Integer> scanResult = scanAndTerminateOn(data, offset, length, ':', componentId.ordinal() + 1); long result = ByteArrayUtils.parseLong(data, scanResult.e0, scanResult.e1 - scanResult.e0 + 1, 10); //long endTime = System.nanoTime(); return result; } public static FlexBuffer getByteArrayFromComponentArray(FlexBuffer[] array,ComponentId componentId) { return array[componentId.ordinal()]; } public static FlexBuffer getByteArrayComponentFromKey(TextBytes key,ComponentId componentId) { byte[] data = key.getBytes(); int offset = key.getOffset(); int length = key.getLength(); Pair<Integer,Integer> scanResult = scanAndTerminateOn(data, offset, length, ':', componentId.ordinal() + 1); return new FlexBuffer(data, scanResult.e0, scanResult.e1 - scanResult.e0 + 1); } private static void compareKeys(RawComparator<TextBytes> comparator,TextBytes key1,TextBytes key2,int expectedResult) { long nanoStart = System.nanoTime(); Assert.assertEquals(comparator.compare(key1, key2),expectedResult); long nanoEnd = System.nanoTime(); System.out.println("Object Comparison Took:" + (nanoEnd-nanoStart)); DataOutputBuffer outputBuffer1 = new DataOutputBuffer(); DataOutputBuffer outputBuffer2 = new DataOutputBuffer(); try { key1.write(outputBuffer1); key2.write(outputBuffer2); nanoStart = System.nanoTime(); Assert.assertEquals(comparator.compare(outputBuffer1.getData(), 0, outputBuffer1.getLength(), outputBuffer2.getData(), 0, outputBuffer2.getLength()),expectedResult); nanoEnd = System.nanoTime(); System.out.println("Raw Comparison Took:" + (nanoEnd-nanoStart)); int offset1 = outputBuffer1.getLength(); int offset2 = outputBuffer2.getLength(); key1.write(outputBuffer1); key2.write(outputBuffer2); Assert.assertEquals(comparator.compare(outputBuffer1.getData(), offset1, outputBuffer1.getLength() - offset1, outputBuffer2.getData(), offset2, outputBuffer2.getLength() - offset2),expectedResult); if (comparator instanceof LinkKeyComparator) { DataInputBuffer inputStream1 = new DataInputBuffer(); DataInputBuffer inputStream2 = new DataInputBuffer(); inputStream1.reset(outputBuffer1.getData(), outputBuffer1.getLength()); inputStream2.reset(outputBuffer2.getData(), outputBuffer2.getLength()); CrawlDBKey cdbkey1 = new CrawlDBKey(); CrawlDBKey cdbkey2 = new CrawlDBKey(); cdbkey1.readFields(inputStream1); cdbkey2.readFields(inputStream2); CrawlDBKeyComparator altComparator = new CrawlDBKeyComparator(); System.out.println("*Comparing Using CrawlDBKey Comparator"); nanoStart = System.nanoTime(); Assert.assertEquals(altComparator.compare(cdbkey1, cdbkey2),expectedResult); nanoEnd = System.nanoTime(); System.out.println("Typed Comparison Took:" + (nanoEnd-nanoStart)); } } catch (IOException e) { e.printStackTrace(); throw new RuntimeException(e); } } @Test public void validateLinkKey()throws Exception { // allocate scan arrays FlexBuffer[] scanArray = allocateScanArray(); URLFPV2 fp = URLUtils.getURLFPV2FromURL("http://www.google.com/"); if (fp != null) { TextBytes key = generateLinkKey(fp,CrawlDBKey.Type.KEY_TYPE_HTML_LINK,"FOOBAR"); // get it the hard way scanForComponents(key,':',scanArray); System.out.println("Key is:" + key.toString()); System.out.println("Check Root Domain Key"); Assert.assertTrue(fp.getRootDomainHash() == getLongComponentFromKey(key, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID)); Assert.assertTrue(fp.getRootDomainHash() == getLongComponentFromComponentArray(scanArray,CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID)); System.out.println("Check Domain Key"); Assert.assertTrue(fp.getDomainHash() == getLongComponentFromKey(key, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID)); Assert.assertTrue(fp.getDomainHash() == getLongComponentFromComponentArray(scanArray,CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID)); System.out.println("Check URL Hash Key"); Assert.assertTrue(fp.getUrlHash() == getLongComponentFromKey(key, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID)); Assert.assertTrue(fp.getUrlHash() == getLongComponentFromComponentArray(scanArray,CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID)); System.out.println("Check Type"); Assert.assertTrue(CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() == getLongComponentFromKey(key, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID)); Assert.assertTrue(CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() == getLongComponentFromComponentArray(scanArray,CrawlDBKey.ComponentId.TYPE_COMPONENT_ID)); System.out.println("Check ExtraData"); Assert.assertTrue(new FlexBuffer("FOOBAR".getBytes()).compareTo(getByteArrayComponentFromKey(key, CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID)) == 0); Assert.assertTrue(new FlexBuffer("FOOBAR".getBytes()).compareTo(getByteArrayFromComponentArray(scanArray, CrawlDBKey.ComponentId.EXTRA_DATA_COMPONENT_ID)) == 0); TextBytes statusKey1 = generateCrawlStatusKey(new Text("http://www.google.com/"),12345L); TextBytes statusKey2 = generateCrawlStatusKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),12345L); TextBytes statusKey3 = generateCrawlStatusKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),12346L); TextBytes linkKey1 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),CrawlDBKey.Type.KEY_TYPE_HTML_LINK,MD5Hash.digest("123").toString()); TextBytes linkKey2 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),CrawlDBKey.Type.KEY_TYPE_HTML_LINK,MD5Hash.digest("1234").toString()); URLFPV2 fpLink3 = URLUtils.getURLFPV2FromURL("http://www.google.com/"); fpLink3.setUrlHash(fpLink3.getUrlHash() + 1); TextBytes linkKey3 = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_HTML_LINK,"12345"); TextBytes linkKey4 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),CrawlDBKey.Type.KEY_TYPE_ATOM_LINK,"1234"); TextBytes linkKey5 = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_ATOM_LINK,"12345"); TextBytes mergeKey3 = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD,"12345"); TextBytes rootDomainKey3 = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD,"12345"); TextBytes subDomainKey3 = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD,"12345"); LinkKeyComparator comparator = new LinkKeyComparator(); CrawlDBKeyGroupByURLComparator gcomparator = new CrawlDBKeyGroupByURLComparator(); System.out.println("Comparing Similar status Keys"); compareKeys(comparator,statusKey1,statusKey2,0); compareKeys(comparator,statusKey2,statusKey1,0); System.out.println("Comparing Similar status Keys w/Grouping C"); compareKeys(gcomparator,statusKey1,statusKey2,0); compareKeys(gcomparator,statusKey2,statusKey1,0); System.out.println("Comparing Similar status Keys with different timestamps"); compareKeys(comparator,statusKey2,statusKey3,-1); compareKeys(comparator,statusKey3,statusKey2,1); System.out.println("Comparing Similar status Keys with different timestamps w/Grouping C"); compareKeys(gcomparator,statusKey2,statusKey3,0); compareKeys(gcomparator,statusKey3,statusKey2,0); System.out.println("Comparing Status Key to Link Key"); compareKeys(comparator,statusKey1,linkKey1,-1); compareKeys(comparator,linkKey1,statusKey1,1); System.out.println("Comparing Status Key to Link Key Grouping C"); compareKeys(gcomparator,statusKey1,linkKey1,0); compareKeys(gcomparator,linkKey1,statusKey1,0); System.out.println("Comparing TWO Link Keys with same hash value"); compareKeys(comparator,linkKey1,linkKey1,0); compareKeys(comparator,linkKey1,linkKey1,0); System.out.println("Comparing TWO Link Keys with same type but different hash values"); compareKeys(comparator,linkKey2,linkKey3,-1); compareKeys(comparator,linkKey3,linkKey2,1); System.out.println("Comparing TWO Link Keys with same type but different hash values - Grouping C"); compareKeys(gcomparator,linkKey2,linkKey3,-1); compareKeys(gcomparator,linkKey3,linkKey2,1); System.out.println("Comparing TWO Link Keys with different types but same hash values"); compareKeys(comparator,linkKey2,linkKey4,-1); compareKeys(comparator,linkKey4,linkKey2,1); System.out.println("Comparing TWO Link Keys with different types but same hash values - Grouping C "); compareKeys(gcomparator,linkKey2,linkKey4,0); compareKeys(gcomparator,linkKey4,linkKey2,0); System.out.println("Comparing TWO Link Keys with similar types but different hash values"); compareKeys(comparator,linkKey4,linkKey5,-1); compareKeys(comparator,linkKey5,linkKey4,1); System.out.println("Comparing TWO Link Keys with similar types but different hash values - Grouping C"); compareKeys(gcomparator,linkKey4,linkKey5,-1); compareKeys(gcomparator,linkKey5,linkKey4,1); compareKeys(comparator,mergeKey3,linkKey3,-1); compareKeys(comparator,rootDomainKey3,mergeKey3,-1); compareKeys(comparator,subDomainKey3,mergeKey3,-1); compareKeys(comparator,rootDomainKey3,subDomainKey3,-1); compareKeys(comparator,subDomainKey3,rootDomainKey3,1); compareKeys(comparator,rootDomainKey3,rootDomainKey3,0); TextBytes mergeKey = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD,"12345"); TextBytes rootDomainKey = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD,"12345"); TextBytes subDomainKey = generateLinkKey(fpLink3,CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD,"12345"); TextBytes linkKeyTest = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),CrawlDBKey.Type.KEY_TYPE_HTML_LINK,""); Assert.assertTrue(scanForComponents(linkKeyTest, ':',scanArray) == scanArray.length -1); for (FlexBuffer buffer : scanArray) LOG.info("Scan Item:" + buffer.toString()); TextBytes linkKeyTest2 = generateLinkKey(URLUtils.getURLFPV2FromURL("http://www.google.com/"),CrawlDBKey.Type.KEY_TYPE_HTML_LINK,MD5Hash.digest("REALLY LONG SOMETHING OR ANOTHER").toString()); Assert.assertTrue(scanForComponents(linkKeyTest2, ':',scanArray) == scanArray.length); for (FlexBuffer buffer : scanArray) LOG.info("Scan Item:" + buffer.toString()); } } }