package org.commoncrawl.mapred.ec2.postprocess.crawldb;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import org.junit.Assert;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Counters.Counter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.LinkKeyComparator;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.Tuples.Pair;
import org.junit.Test;
import static org.commoncrawl.util.JSONUtils.*;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.collect.Sets;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.util.StringTokenizer;
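/**
* Unit tests for {@link CrawlDBMergingReducer} - validates source input tracking
* and the end-to-end merge of crawl status, link, and redirect records against
* an independently maintained in-memory reference model.
*/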
public class CrawlDBMergingReducerTests extends CrawlDBMergingReducer {
@Test
public void testSourceInputOutputWriters() throws IOException {
_sourceInputsBuffer = new DataOutputBuffer(16384*4);
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
String sourceDomainURL = "http://sourcedomain.com/foo";
URLFPV2 sourceFP = URLUtils.getURLFPV2FromCanonicalURL(sourceDomainURL);
String[] urls = {
"http://somedomain.com/foo",
"http://someother.com/bar"
};
for (String url : urls) {
URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(url);
// insert each url twice; the bloom filter should suppress the duplicate
trackPotentialLinkSource(fp,url,sourceFP);
trackPotentialLinkSource(fp,url,sourceFP);
}
// validate data ...
TextBytes firstVersion = new TextBytes();
firstVersion.set(_sourceInputsBuffer.getData(),0,_sourceInputsBuffer.getLength());
StringTokenizer tokenizer = new StringTokenizer(firstVersion.toString(), "\n");
int itemIndex = 0;
while (tokenizer.hasMoreElements()) {
String nextLine = tokenizer.nextToken();
String splits[] = nextLine.split("\t");
// validate fp
URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(urls[itemIndex]);
Assert.assertEquals(fp.getDomainHash(),Long.parseLong(splits[0]));
// validate actual url ...
Assert.assertEquals(urls[itemIndex], splits[1]);
itemIndex++;
}
// the duplicate inserts should have been suppressed - expect exactly one line per url
Assert.assertEquals(urls.length, itemIndex);
// reset output buffer ...
_sourceInputsBuffer = new DataOutputBuffer(16384*4);
// and source bloom filter ...
_sourceInputsTrackingFilter = new URLFPBloomFilter(10000000, NUM_HASH_FUNCTIONS, NUM_BITS);
importLinkSourceData(sourceFP,firstVersion);
// the re-imported (second) version should match the first ...
TextBytes secondVersion = new TextBytes();
secondVersion.set(_sourceInputsBuffer.getData(),0,_sourceInputsBuffer.getLength());
Assert.assertEquals(firstVersion,secondVersion);
}
/**
* Mock OutputCollector / Reporter that captures collected tuples for inspection
*/
static class MockCollectorReporter implements OutputCollector<TextBytes,TextBytes>, Reporter {
public ArrayList<Pair<TextBytes,TextBytes>> items = new ArrayList<Pair<TextBytes,TextBytes>>();
@Override
public void progress() {
}
// @Override intentionally omitted: getProgress() is not declared on all Hadoop Reporter versions
public float getProgress() { return 0; }
@Override
public void setStatus(String status) {
}
@Override
public Counter getCounter(Enum<?> name) {
return null;
}
@Override
public Counter getCounter(String group, String name) {
return null;
}
@Override
public void incrCounter(Enum<?> key, long amount) {
}
@Override
public void incrCounter(String group, String counter, long amount) {
}
@Override
public InputSplit getInputSplit() throws UnsupportedOperationException {
return null;
}
@Override
public void collect(TextBytes key, TextBytes value) throws IOException {
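// deep-copy key and value - the reducer may reuse the underlying TextBytes buffers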
items.add(new Pair<TextBytes,TextBytes>(new TextBytes(key.toString()),new TextBytes(value.toString())));
}
}
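// canned values used to synthesize test crawl status and link records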
private static final int CANNED_SIMHASH_VALUE = 1234;
private static final String CANNED_MD5_VALUE = "5d41402abc4b2a76b9719d911017c592";
private static final String CANNED_MIME_TYPE = "text/html";
private static final int CANNED_CONTENT_LEN = 100;
private static final String CANNED_IP = "1.1.1.1";
private static final String CANNED_TITLE = "title";
private static final String CANNED_META_PROPERTY_NAME = "metaProperty";
private static final String CANNED_META_PROPERTY_VALUE = "metaValue";
private static final String CANNED_LINKING_HOST_1 = "link.host.one.com";
private static final String CANNED_LINKING_HOST_2 = "link.host.two.com";
private static final String CANNED_LINKING_HOST_3 = "link.host.three.com";
private static final String EXTRA_PROPERTY_REDIRECT_SOURCE = "redirectSource";
private static final String EXTRA_PROPERTY_LINK_SOURCE = "linkSource";
private static final String CANNED_CRAWL_URL_1 = "http://cannedone.com/";
//private static final String CANNED_URL_1_HOST = "cannedone.com";
private static final String CANNED_CRAWL_URL_2 = "http://cannedtwo.com/";
//private static final String CANNED_URL_2_HOST = "cannedtwo.com";
private static final String CANNED_CRAWL_URL_3 = "http://cannedthree.com/";
private static final String CANNED_URL_3_HOST = "cannedthree.com";
private static final String CANNED_CRAWL_URL_4 = "http://cannedfour.com/";
//private static final String CANNED_URL_4_HOST = "cannedfour.com";
private static final String CANNED_CRAWL_URL_5 = "http://cannedfive.com/";
//private static final String CANNED_URL_5_HOST = "cannedfive.com";
//private static final long CANNED_TIMESTAMP_0 = 123456700L;
private static final long CANNED_TIMESTAMP_1 = 123456789L;
private static final long CANNED_TIMESTAMP_2 = 123456799L;
private static final long CANNED_TIMESTAMP_3 = 123466799L;
private static final String CANNED_FAILURE_REASON = "FailureReason";
private static final String CANNED_FAILURE_DETAIL = "FailureDetail";
static String sourceURLToLinkingHostURL(String sourceURL, String linkingHostName) {
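// salt with nanoTime so each generated link URL is unique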
return "http://" + linkingHostName + "/" + MD5Hash.digest(sourceURL + Long.toString(System.nanoTime())).toString();
}
enum TestRecordType {
CRAWL_STATUS,
CRAWL_STATUS_WITH_REDIRECT,
INLINK
}
public static class TestModel implements Comparable<TestModel> {
TreeMap<URLFPV2,URLStateModel> fpToModelMap = new TreeMap<URLFPV2, URLStateModel>();
/**
* update the model from a single raw (generated) input tuple
* @param tuple
* @throws Exception
*/
void updateModelFromInputTuple(Pair<TextBytes,TextBytes> tuple) throws Exception {
URLFPV2 fp = new URLFPV2();
// get key ...
fp.setRootDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
fp.setDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
fp.setUrlHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
long recordType = CrawlDBKey.getLongComponentFromKey(tuple.e0,CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal() || recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) {
// update model given key ...
URLStateModel urlModel = fpToModelMap.get(fp);
if (urlModel == null) {
urlModel = new URLStateModel();
urlModel.fp = fp;
fpToModelMap.put(fp, urlModel);
}
if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
JsonObject redirectJSON = urlModel.updateModelGivenCrawlStatus(tuple.e1);
if (redirectJSON != null) {
URLFPV2 redirectFP = URLUtils.getURLFPV2FromURL(redirectJSON.get("source_url").getAsString());
TextBytes key = CrawlDBKey.generateKey(redirectFP,CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS,redirectJSON.get("attempt_time").getAsLong());
Pair<TextBytes,TextBytes> redirectTuple = new Pair<TextBytes, TextBytes>(key,new TextBytes(redirectJSON.toString()));
updateModelFromInputTuple(redirectTuple);
}
}
else if (recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) {
urlModel.updateModelGivenLinkRecord(tuple.e1);
}
}
}
/**
* build a model from a set of output tuples
* (captured by the mock collector)
* @param tuples
*/
void buildModelFromOutputTuples(List<Pair<TextBytes,TextBytes>> tuples) throws IOException {
for (Pair<TextBytes,TextBytes> tuple : tuples) {
URLFPV2 fp = new URLFPV2();
// get key ...
fp.setRootDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
fp.setDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
fp.setUrlHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
// get type ...
long recordType = CrawlDBKey.getLongComponentFromKey(tuple.e0,CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
// validate that the record type is one of the two varieties supported in the final output ...
Assert.assertTrue(recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal() || recordType == CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE.ordinal());
// create url model object if necessary ...
URLStateModel urlModel = fpToModelMap.get(fp);
if (urlModel == null) {
urlModel = new URLStateModel();
urlModel.fp = fp;
fpToModelMap.put(fp, urlModel);
}
// ok do stuff based on record type ...
if (recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
urlModel.updateModelGivenMergedRecord(tuple.e1);
}
else {
urlModel.updateModelGivenUrlsSampleRecord(tuple.e1);
}
}
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (Map.Entry<URLFPV2,URLStateModel> urlRecord : fpToModelMap.entrySet()) {
sb.append(urlRecord.getKey() + "\n" + urlRecord.getValue().toString());
}
return sb.toString();
}
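// one-directional comparison: detects entries missing from, or differing in, the other model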
@Override
public int compareTo(TestModel other) {
int result = 0;
for (URLFPV2 fp : fpToModelMap.keySet()) {
if (!other.fpToModelMap.containsKey(fp)) {
System.out.println("FP:" + fp.getKey() + " not Found in Other Model");
result = -1;
break;
}
else {
URLStateModel leftSideModel = fpToModelMap.get(fp);
URLStateModel rightSideModel = other.fpToModelMap.get(fp);
result = leftSideModel.compareTo(rightSideModel);
if (result != 0) {
System.out.println("URLModels for FP:" + fp.getKey() + " did not match");
break;
}
}
}
return result;
}
}
public static class URLStateModel implements Comparable<URLStateModel> {
URLFPV2 fp;
public String source_url = null;
public boolean has_crawl_status = false;
public long latest_attempt_time = -1;
public long latest_crawl_time = -1;
public int attempt_count = 0;
public int crawl_count = 0;
public String parsed_as = null;
public int http_result = -1;
public String redirect_url = null;
public SortedSet<String> ext_urls = new TreeSet<String>();
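// crawl detail records, ordered by MD5 of their serialized JSON for a stable, content-based ordering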
public SortedSet<JsonObject> details = new TreeSet<JsonObject>(new Comparator<JsonObject>() {
@Override
public int compare(JsonObject o1, JsonObject o2) {
String md51 = MD5Hash.digest(o1.toString()).toString();
String md52 = MD5Hash.digest(o2.toString()).toString();
return md51.compareTo(md52);
}
});
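// incoming link sample: linking domain hash -> one sample source URL from that domain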
public TreeMap<Long,String> incoming = new TreeMap<Long, String>();
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("source:"+ source_url + "\n");
sb.append("has_crawl_status:" + has_crawl_status + "\n");
sb.append("latest_crawl_time:" + latest_crawl_time + "\n");
sb.append("attempt_count:" + attempt_count + "\n");
sb.append("crawl_count:" + crawl_count + "\n");
sb.append("parsed_as:" + parsed_as + "\n");
sb.append("http_result:" + http_result + "\n");
sb.append("redirect_url:" + redirect_url + "\n");
for (String extURL : ext_urls) {
sb.append("extURL:" + extURL + "\n");
}
for (JsonObject jsonObj : details) {
sb.append("crawlStatus:" + jsonObj.toString() + "\n");
}
for (String value : incoming.values()) {
sb.append("incoming:" + value + "\n");
}
sb.append("\n");
return sb.toString();
}
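/**
* rebuild model state from a merged (summary) record emitted by the reducer
*/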
void updateModelGivenMergedRecord(TextBytes mergedJSON) throws IOException {
JsonObject mergeObject = new JsonParser().parse(mergedJSON.toString()).getAsJsonObject();
source_url = mergeObject.get(TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();
has_crawl_status = mergeObject.has(TOPLEVEL_SUMMARYRECORD_PROPRETY);
if (has_crawl_status) {
JsonObject crawlStatusObj = mergeObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
latest_attempt_time = crawlStatusObj.get(SUMMARYRECORD_LATEST_ATTEMPT_PROPERTY).getAsLong();
latest_crawl_time = safeGetLong(crawlStatusObj, SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY);
attempt_count = crawlStatusObj.get(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt();
crawl_count = (crawlStatusObj.has(SUMMARYRECORD_CRAWLCOUNT_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_CRAWLCOUNT_PROPERTY).getAsInt() : 0;
parsed_as = (crawlStatusObj.has(SUMMARYRECORD_PARSEDAS_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_PARSEDAS_PROPERTY).getAsString() : null;
http_result = (crawlStatusObj.has(SUMMARYRECORD_HTTP_RESULT_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_HTTP_RESULT_PROPERTY).getAsInt() : -1;
redirect_url = (crawlStatusObj.has(SUMMARYRECORD_REDIRECT_URL_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_REDIRECT_URL_PROPERTY).getAsString() : null;
safeJsonArrayToStringCollection(crawlStatusObj, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, ext_urls);
if (crawlStatusObj.has(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
for (JsonElement crawlDetail : crawlStatusObj.getAsJsonArray(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
details.add(crawlDetail.getAsJsonObject());
}
}
}
}
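/**
* parse an incoming-urls sample record - newline-delimited lines of the form
* domainHash TAB sourceURL
*/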
void updateModelGivenUrlsSampleRecord(TextBytes inputData) {
int curpos = inputData.getOffset();
int endpos = inputData.getOffset() + inputData.getLength();
byte[] lfPattern = { 0xA };
byte[] tabPattern = { 0x9 };
TextBytes urlText = new TextBytes();
while (curpos < endpos) {
int tabIndex = ByteArrayUtils.indexOf(inputData.getBytes(), curpos, endpos - curpos, tabPattern);
if (tabIndex == -1) {
break;
}
else {
int lfIndex = ByteArrayUtils.indexOf(inputData.getBytes(), tabIndex + 1, endpos - (tabIndex + 1), lfPattern);
if (lfIndex == -1) {
break;
}
else {
long sourceDomainHash = ByteArrayUtils.parseLong(inputData.getBytes(),curpos, tabIndex-curpos, 10);
urlText.set(inputData.getBytes(),tabIndex + 1,lfIndex - (tabIndex + 1));
incoming.put(sourceDomainHash, urlText.toString());
curpos = lfIndex + 1;
}
}
}
}
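/**
* update model state from a raw crawl status record; when the record carries a
* redirect_from object, returns a synthesized crawl status record for the
* redirect source, otherwise returns null
*/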
JsonObject updateModelGivenCrawlStatus(TextBytes statusJSON) throws IOException {
has_crawl_status = true;
JsonParser parser = new JsonParser();
JsonObject jsonObj = parser.parse(statusJSON.toString()).getAsJsonObject();
if (source_url == null) {
source_url = jsonObj.get("source_url").getAsString();
}
HashSet<String> extHrefs = new HashSet<String>();
JsonObject crawlDetailRecord = crawlDetailRecordFromCrawlStatusRecord(jsonObj, fp, extHrefs, new MockCollectorReporter());
long attemptTime = safeGetLong(crawlDetailRecord,CRAWLDETAIL_ATTEMPT_TIME_PROPERTY);
latest_attempt_time = Math.max(attemptTime, latest_attempt_time);
attempt_count++;
int httpResult = safeGetInteger(crawlDetailRecord, CRAWLDETAIL_HTTPRESULT_PROPERTY);
if (httpResult != -1) {
if (latest_attempt_time == attemptTime) {
this.http_result = httpResult;
}
if (httpResult >= 200 && httpResult <= 299) {
latest_crawl_time = Math.max(attemptTime, latest_crawl_time);
crawl_count++;
if (latest_crawl_time == attemptTime) {
this.parsed_as = crawlDetailRecord.get(CRAWLDETAIL_PARSEDAS_PROPERTY).getAsString();
this.ext_urls.clear();
this.ext_urls.addAll(extHrefs);
}
}
else if (httpResult >= 300 && httpResult <= 399) {
this.redirect_url = (crawlDetailRecord.get(CRAWLDETAIL_REDIRECT_URL) != null) ? crawlDetailRecord.get(CRAWLDETAIL_REDIRECT_URL).getAsString() : null;
}
}
this.details.add(crawlDetailRecord);
if (jsonObj.has("redirect_from")) {
JsonObject redirectObject = jsonObj.get("redirect_from").getAsJsonObject();
JsonObject redirectJSON = new JsonObject();
int redirectHttpResult = redirectObject.get("http_result").getAsInt();
redirectJSON.addProperty("disposition","SUCCESS");
redirectJSON.addProperty("http_result",redirectHttpResult);
redirectJSON.addProperty("server_ip",redirectObject.get("server_ip").getAsString());
redirectJSON.addProperty("attempt_time",jsonObj.get("attempt_time").getAsLong());
redirectJSON.addProperty("target_url",jsonObj.get("source_url").getAsString());
redirectJSON.addProperty("source_url",redirectObject.get("source_url").getAsString());
return redirectJSON;
}
return null;
}
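/**
* record an incoming link, keeping at most one sample URL per external linking
* root domain
*/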
public void updateModelGivenLinkRecord(TextBytes linkJSON) {
JsonParser parser = new JsonParser();
JsonObject jsonObj = parser.parse(linkJSON.toString()).getAsJsonObject();
if (source_url == null) {
source_url = jsonObj.get("href").getAsString();
}
String sourceURL = jsonObj.get("source_url").getAsString();
URLFPV2 urlfp = URLUtils.getURLFPV2FromURL(sourceURL);
if (urlfp != null) {
if (urlfp.getRootDomainHash() != fp.getRootDomainHash()) {
if (!incoming.containsKey(urlfp.getRootDomainHash())) {
incoming.put(urlfp.getRootDomainHash(), sourceURL);
}
}
}
}
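// null-safe comparison - mismatched nullness compares as unequal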
static int compareStrings(String string1,String string2) {
int result = 0;
if (string1 == null && string2 != null || string1 != null && string2 == null) {
result = -1;
}
if (result == 0 && string1 != null && string2 != null) {
result = string1.compareTo(string2);
}
return result;
}
void printMismatch(String mismatchDetail) {
System.out.println("FP:" + fp.getKey() + " SourceURL:" + source_url + " Detail:" + mismatchDetail);
}
@Override
public int compareTo(URLStateModel o) {
if (compareStrings(source_url,o.source_url) != 0) {
printMismatch("source_url:"+source_url+ " other:" + o.source_url);
return -1;
}
if (has_crawl_status != o.has_crawl_status) {
printMismatch("has_crawl_status:"+has_crawl_status+ " other:" + o.has_crawl_status);
return -1;
}
if (latest_attempt_time != o.latest_attempt_time) {
printMismatch("latest_attempt_time:"+latest_attempt_time+ " other:" + o.latest_attempt_time);
return -1;
}
if (latest_crawl_time != o.latest_crawl_time) {
printMismatch("latest_crawl_time:"+latest_crawl_time+ " other:" + o.latest_crawl_time);
return -1;
}
if (attempt_count != o.attempt_count) {
printMismatch("attempt_count:"+attempt_count+ " other:" + o.attempt_count);
return -1;
}
if (crawl_count != o.crawl_count) {
printMismatch("crawl_count:"+crawl_count+ " other:" + o.crawl_count);
return -1;
}
if (compareStrings(parsed_as,o.parsed_as) !=0) {
printMismatch("parsed_as:"+parsed_as+ " other:" + o.parsed_as);
return -1;
}
if (http_result != o.http_result) {
printMismatch("http_result:"+http_result+ " other:" + o.http_result);
return -1;
}
if (compareStrings(redirect_url,o.redirect_url) != 0) {
printMismatch("redirect_url:"+redirect_url+ " other:" + o.redirect_url);
return -1;
}
Set<String> urlDiff = Sets.difference(ext_urls, o.ext_urls);
if (urlDiff.size() != 0) {
System.out.println("URLS:" + urlDiff + " missing from other Model");
return -1;
}
Set<JsonObject> jsonDiff = Sets.difference(this.details, o.details);
if (jsonDiff.size() != 0) {
System.out.println("JSON Objects:" + jsonDiff + " missing from other Model");
return -1;
}
return 0;
}
}
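/**
* synthesize a single input tuple of the given type and register it with the
* reference model
*/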
static Pair<TextBytes,TextBytes> generateTestRecord(TestModel model,String url, long timestamp,boolean success,TestRecordType recordType,Map<String,String> extraProperties)throws Exception {
Pair<TextBytes,TextBytes> result = null;
if (recordType == TestRecordType.CRAWL_STATUS || recordType == TestRecordType.CRAWL_STATUS_WITH_REDIRECT) {
JsonObject topObject = new JsonObject();
topObject.addProperty("source_url", url);
topObject.addProperty("disposition", (success) ? "SUCCESS" : "FAILURE");
topObject.addProperty("attempt_time", timestamp);
if (success) {
topObject.addProperty("http_result",200);
topObject.addProperty("server_ip",CANNED_IP);
topObject.addProperty("content_len",CANNED_CONTENT_LEN);
topObject.addProperty("mime_type",CANNED_MIME_TYPE);
topObject.addProperty("md5",CANNED_MD5_VALUE);
topObject.addProperty("text_simhash",CANNED_SIMHASH_VALUE);
JsonObject headers = new JsonObject();
headers.addProperty("date", timestamp);
topObject.add("http_headers",headers);
topObject.addProperty("parsed_as","html");
JsonObject content = new JsonObject();
content.addProperty("title", CANNED_TITLE);
JsonArray metaTags = new JsonArray();
JsonObject metaTag = new JsonObject();
metaTag.addProperty(CANNED_META_PROPERTY_NAME, CANNED_META_PROPERTY_VALUE);
metaTags.add(metaTag);
content.add("meta_tags", metaTags);
topObject.add("content", content);
JsonArray links = new JsonArray();
JsonObject link1 = new JsonObject();
link1.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_1));
JsonObject link2 = new JsonObject();
link2.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_2));
links.add(link1);
links.add(link2);
topObject.add("links", links);
if (recordType == TestRecordType.CRAWL_STATUS_WITH_REDIRECT) {
JsonObject redirectObject = new JsonObject();
redirectObject.addProperty("source_url", extraProperties.get(EXTRA_PROPERTY_REDIRECT_SOURCE));
redirectObject.addProperty("http_result", 301);
redirectObject.addProperty("server_ip",CANNED_IP);
topObject.add("redirect_from", redirectObject);
}
}
else {
topObject.addProperty("failure_reason",CANNED_FAILURE_REASON);
topObject.addProperty("failure_detail",CANNED_FAILURE_DETAIL);
}
TextBytes keyOut = new TextBytes(CrawlDBKey.generateCrawlStatusKey(new Text(url), timestamp));
TextBytes valueOut = new TextBytes(topObject.toString());
result = new Pair<TextBytes, TextBytes>(keyOut, valueOut);
}
else {
JsonObject linkData = new JsonObject();
linkData.addProperty("href",url);
linkData.addProperty("source_url", extraProperties.get(EXTRA_PROPERTY_LINK_SOURCE));
linkData.addProperty("http_result", 200);
linkData.addProperty("server_ip",CANNED_IP);
linkData.addProperty("source_type","html");
linkData.addProperty("type","a");
linkData.addProperty("rel","text/html");
TextBytes keyOut = new TextBytes(
CrawlDBKey.generateLinkKey(
new TextBytes(url),
CrawlDBKey.Type.KEY_TYPE_HTML_LINK,
MD5Hash.digest(extraProperties.get(EXTRA_PROPERTY_LINK_SOURCE)).toString()));
TextBytes valueOut = new TextBytes(linkData.toString());
result = new Pair<TextBytes,TextBytes> (keyOut,valueOut);
}
model.updateModelFromInputTuple(result);
return result;
}
static class TestDataComparator implements java.util.Comparator<Pair<TextBytes,TextBytes>> {
CrawlDBKey.LinkKeyComparator comparator = new LinkKeyComparator();
@Override
public int compare(Pair<TextBytes,TextBytes> o1, Pair<TextBytes,TextBytes> o2) {
return comparator.compare(o1.e0, o2.e0);
}
}
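// sort tuples with the production key comparator, then feed each one through reduce()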
void reduceTestTuples(ArrayList<Pair<TextBytes,TextBytes>> tupleList,OutputCollector<TextBytes, TextBytes> collector, Reporter reporter) throws Exception {
Collections.sort(tupleList, new TestDataComparator());
for (Pair<TextBytes,TextBytes> tuple : tupleList) {
reduce(tuple.e0, Iterators.forArray(tuple.e1),collector,reporter);
}
}
static void dumpTuples(List<Pair<TextBytes,TextBytes>> items) {
for (Pair<TextBytes,TextBytes> outputTuple : items) {
System.out.println("Key:" + outputTuple.e0);
try {
JsonParser parser = new JsonParser();
JsonElement e = parser.parse(outputTuple.e1.toString());
JSONUtils.prettyPrintJSON(e);
}
catch (Exception e) {
System.out.println("Value:" + outputTuple.e1);
}
}
}
@Test
public void testMerge() throws Exception {
_sourceInputsBuffer = new DataOutputBuffer(16384*4);
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
MockCollectorReporter collector = new MockCollectorReporter();
TestModel inputModel = new TestModel();
TestModel outputModel = new TestModel();
ArrayList<Pair<TextBytes,TextBytes>> initialTupleList = new ArrayList<Pair<TextBytes,TextBytes>>();
///////////////////////////
// STEP:1
///////////////////////////
// populate an initial list of tuples ...
// crawl url 1 with a failed crawl attempt
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_2,false,TestRecordType.CRAWL_STATUS,null));
// crawl url 1 with a successful (http 200) crawl status
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_1,true,TestRecordType.CRAWL_STATUS,null));
// inlinks into crawl url 1 ...
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_1, CANNED_LINKING_HOST_1))
.build()));
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_1, CANNED_LINKING_HOST_2))
.build()));
// crawl url 2 crawl status
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_2,CANNED_TIMESTAMP_1,true,TestRecordType.CRAWL_STATUS,null));
// inlink into crawl url 3 with no crawl status
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_3,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_3, CANNED_LINKING_HOST_1))
.build()));
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_3,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_3, CANNED_LINKING_HOST_2))
.build()));
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_3,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_3, CANNED_URL_3_HOST))
.build()));
// run reducer ...
reduceTestTuples(initialTupleList, collector, collector);
close();
System.out.println("STEP:1 DONE#########################################");
///////////////////////////
// STEP:2
///////////////////////////
// reset bloom filter ...
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
// swap items ...
ArrayList<Pair<TextBytes,TextBytes>> tuples = collector.items;
collector.items = new ArrayList<Pair<TextBytes,TextBytes>>();
// tuples for step 2.
// a later, successful crawl status for crawl url 1
tuples.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_3,true,TestRecordType.CRAWL_STATUS,null));
// inlink to crawl url 1
tuples.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_3,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_1, CANNED_LINKING_HOST_3))
.build()));
// crawl status from crawl url4 with a redirect from crawl url 5
tuples.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_4,CANNED_TIMESTAMP_1,true,TestRecordType.CRAWL_STATUS_WITH_REDIRECT,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_REDIRECT_SOURCE, CANNED_CRAWL_URL_5)
.build()));
// reduce
reduceTestTuples(tuples, collector, collector);
close();
System.out.println("STEP:2 DONE#########################################");
///////////////////////////
// STEP:3
///////////////////////////
// reset bloom filter ...
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
// swap items ...
tuples = collector.items;
collector.items = new ArrayList<Pair<TextBytes,TextBytes>>();
// run reducer again to pick up redirect record
reduceTestTuples(tuples, collector, collector);
close();
System.out.println("STEP:3 DONE#########################################");
// dump tuples
System.out.println("TUPLES DUMP STARTING#########################################");
dumpTuples(collector.items);
System.out.println("TUPLES DUMP COMPLETE#########################################");
// feed final tuple set to output model ...
outputModel.buildModelFromOutputTuples(collector.items);
// model comparison
System.out.println("MODEL COMP STARTING#########################################");
Assert.assertEquals(0, inputModel.compareTo(outputModel));
System.out.println("MODEL COMP DONE #########################################");
}
}