package org.commoncrawl.mapred.ec2.postprocess.crawldb;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import org.junit.Assert;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.Counters.Counter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.LinkKeyComparator;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.Tuples.Pair;
import org.junit.Test;
import static org.commoncrawl.util.JSONUtils.*;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterators;
import com.google.common.collect.Sets;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import java.util.StringTokenizer;
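/**
* Unit tests for {@link CrawlDBMergingReducer} - validates source input tracking
* and the end-to-end merge of crawl status, link, and redirect records against
* an independently maintained in-memory reference model.
*/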
public class CrawlDBMergingReducerTests extends CrawlDBMergingReducer {
@Test
public void testSourceInputOutputWriters() throws IOException {
_sourceInputsBuffer = new DataOutputBuffer(16384*4);
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
String sourceDomainURL = "http://sourcedomain.com/foo";
URLFPV2 sourceFP = URLUtils.getURLFPV2FromCanonicalURL(sourceDomainURL);
String[] urls = {
"http://somedomain.com/foo",
"http://someother.com/bar"
};
for (String url : urls) {
URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(url);
// insert each url twice; the bloom filter should suppress the duplicate
trackPotentialLinkSource(fp,url,sourceFP);
trackPotentialLinkSource(fp,url,sourceFP);
}
// validate data ...
TextBytes firstVersion = new TextBytes();
firstVersion.set(_sourceInputsBuffer.getData(),0,_sourceInputsBuffer.getLength());
StringTokenizer tokenizer = new StringTokenizer(firstVersion.toString(), "\n");
int itemIndex = 0;
while (tokenizer.hasMoreElements()) {
String nextLine = tokenizer.nextToken();
String splits[] = nextLine.split("\t");
// validate fp
URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(urls[itemIndex]);
Assert.assertEquals(fp.getDomainHash(),Long.parseLong(splits[0]));
// validate actual url ...
Assert.assertEquals(urls[itemIndex], splits[1]);
itemIndex++;
}
// the duplicate inserts should have been suppressed - expect exactly one line per url
Assert.assertEquals(urls.length, itemIndex);
// reset output buffer ...
_sourceInputsBuffer = new DataOutputBuffer(16384*4);
// and source bloom filter ...
_sourceInputsTrackingFilter = new URLFPBloomFilter(10000000, NUM_HASH_FUNCTIONS, NUM_BITS);
importLinkSourceData(sourceFP,firstVersion);
// the re-imported (second) version should match the first ...
TextBytes secondVersion = new TextBytes();
secondVersion.set(_sourceInputsBuffer.getData(),0,_sourceInputsBuffer.getLength());
Assert.assertEquals(firstVersion,secondVersion);
}
/**
* Mock OutputCollector / Reporter that captures collected tuples for inspection
*/
static class MockCollectorReporter implements OutputCollector<TextBytes,TextBytes>, Reporter {
public ArrayList<Pair<TextBytes,TextBytes>> items = new ArrayList<Pair<TextBytes,TextBytes>>();
@Override
public void progress() {
}
// @Override intentionally omitted: getProgress() is not declared on all Hadoop Reporter versions
public float getProgress() { return 0; }
@Override
public void setStatus(String status) {
}
@Override
public Counter getCounter(Enum<?> name) {
return null;
}
@Override
public Counter getCounter(String group, String name) {
return null;
}
@Override
public void incrCounter(Enum<?> key, long amount) {
}
@Override
public void incrCounter(String group, String counter, long amount) {
}
@Override
public InputSplit getInputSplit() throws UnsupportedOperationException {
return null;
}
@Override
public void collect(TextBytes key, TextBytes value) throws IOException {
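// deep-copy key and value - the reducer may reuse the underlying TextBytes buffers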
items.add(new Pair<TextBytes,TextBytes>(new TextBytes(key.toString()),new TextBytes(value.toString())));
}
}
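// canned values used to synthesize test crawl status and link records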
private static final int CANNED_SIMHASH_VALUE = 1234;
private static final String CANNED_MD5_VALUE = "5d41402abc4b2a76b9719d911017c592";
private static final String CANNED_MIME_TYPE = "text/html";
private static final int CANNED_CONTENT_LEN = 100;
private static final String CANNED_IP = "1.1.1.1";
private static final String CANNED_TITLE = "title";
private static final String CANNED_META_PROPERTY_NAME = "metaProperty";
private static final String CANNED_META_PROPERTY_VALUE = "metaValue";
private static final String CANNED_LINKING_HOST_1 = "link.host.one.com";
private static final String CANNED_LINKING_HOST_2 = "link.host.two.com";
private static final String CANNED_LINKING_HOST_3 = "link.host.three.com";
private static final String EXTRA_PROPERTY_REDIRECT_SOURCE = "redirectSource";
private static final String EXTRA_PROPERTY_LINK_SOURCE = "linkSource";
private static final String CANNED_CRAWL_URL_1 = "http://cannedone.com/";
//private static final String CANNED_URL_1_HOST = "cannedone.com";
private static final String CANNED_CRAWL_URL_2 = "http://cannedtwo.com/";
//private static final String CANNED_URL_2_HOST = "cannedtwo.com";
private static final String CANNED_CRAWL_URL_3 = "http://cannedthree.com/";
private static final String CANNED_URL_3_HOST = "cannedthree.com";
private static final String CANNED_CRAWL_URL_4 = "http://cannedfour.com/";
//private static final String CANNED_URL_4_HOST = "cannedfour.com";
private static final String CANNED_CRAWL_URL_5 = "http://cannedfive.com/";
//private static final String CANNED_URL_5_HOST = "cannedfive.com";
//private static final long CANNED_TIMESTAMP_0 = 123456700L;
private static final long CANNED_TIMESTAMP_1 = 123456789L;
private static final long CANNED_TIMESTAMP_2 = 123456799L;
private static final long CANNED_TIMESTAMP_3 = 123466799L;
private static final String CANNED_FAILURE_REASON = "FailureReason";
private static final String CANNED_FAILURE_DETAIL = "FailureDetail";
static String sourceURLToLinkingHostURL(String sourceURL, String linkingHostName) {
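// salt with nanoTime so each generated link URL is unique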
return "http://" + linkingHostName + "/" + MD5Hash.digest(sourceURL + Long.toString(System.nanoTime())).toString();
}
enum TestRecordType {
CRAWL_STATUS,
CRAWL_STATUS_WITH_REDIRECT,
INLINK
}
public static class TestModel implements Comparable<TestModel> {
TreeMap<URLFPV2,URLStateModel> fpToModelMap = new TreeMap<URLFPV2, URLStateModel>();
/**
* update the model from a single raw (generated) input tuple
* @param tuple
* @throws Exception
*/
void updateModelFromInputTuple(Pair<TextBytes,TextBytes> tuple) throws Exception {
URLFPV2 fp = new URLFPV2();
// get key ...
fp.setRootDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
fp.setDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
fp.setUrlHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
long recordType = CrawlDBKey.getLongComponentFromKey(tuple.e0,CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal() || recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) {
// update model given key ...
URLStateModel urlModel = fpToModelMap.get(fp);
if (urlModel == null) {
urlModel = new URLStateModel();
urlModel.fp = fp;
fpToModelMap.put(fp, urlModel);
}
if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
JsonObject redirectJSON = urlModel.updateModelGivenCrawlStatus(tuple.e1);
if (redirectJSON != null) {
URLFPV2 redirectFP = URLUtils.getURLFPV2FromURL(redirectJSON.get("source_url").getAsString());
TextBytes key = CrawlDBKey.generateKey(redirectFP,CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS,redirectJSON.get("attempt_time").getAsLong());
Pair<TextBytes,TextBytes> redirectTuple = new Pair<TextBytes, TextBytes>(key,new TextBytes(redirectJSON.toString()));
updateModelFromInputTuple(redirectTuple);
}
}
else if (recordType == CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal()) {
urlModel.updateModelGivenLinkRecord(tuple.e1);
}
}
}
/**
* build a model from a set of output tuples
* (captured by the mock collector)
* @param tuples
*/
void buildModelFromOutputTuples(List<Pair<TextBytes,TextBytes>> tuples) throws IOException {
for (Pair<TextBytes,TextBytes> tuple : tuples) {
URLFPV2 fp = new URLFPV2();
// get key ...
fp.setRootDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
fp.setDomainHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
fp.setUrlHash(CrawlDBKey.getLongComponentFromKey(tuple.e0, CrawlDBKey.ComponentId.URL_HASH_COMPONENT_ID));
// get type ...
long recordType = CrawlDBKey.getLongComponentFromKey(tuple.e0,CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
// validate that the record type is one of the two varieties supported in the final output ...
Assert.assertTrue(recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal() || recordType == CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE.ordinal());
// create url model object if necessary ...
URLStateModel urlModel = fpToModelMap.get(fp);
if (urlModel == null) {
urlModel = new URLStateModel();
urlModel.fp = fp;
fpToModelMap.put(fp, urlModel);
}
// ok do stuff based on record type ...
if (recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
urlModel.updateModelGivenMergedRecord(tuple.e1);
}
else {
urlModel.updateModelGivenUrlsSampleRecord(tuple.e1);
}
}
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
for (Map.Entry<URLFPV2,URLStateModel> urlRecord : fpToModelMap.entrySet()) {
sb.append(urlRecord.getKey() + "\n" + urlRecord.getValue().toString());
}
return sb.toString();
}
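// one-directional comparison: detects entries missing from, or differing in, the other model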
@Override
public int compareTo(TestModel other) {
int result = 0;
for (URLFPV2 fp : fpToModelMap.keySet()) {
if (!other.fpToModelMap.containsKey(fp)) {
System.out.println("FP:" + fp.getKey() + " not Found in Other Model");
result = -1;
break;
}
else {
URLStateModel leftSideModel = fpToModelMap.get(fp);
URLStateModel rightSideModel = other.fpToModelMap.get(fp);
result = leftSideModel.compareTo(rightSideModel);
if (result != 0) {
System.out.println("URLModels for FP:" + fp.getKey() + " did not match");
break;
}
}
}
return result;
}
}
public static class URLStateModel implements Comparable<URLStateModel> {
URLFPV2 fp;
public String source_url = null;
public boolean has_crawl_status = false;
public long latest_attempt_time = -1;
public long latest_crawl_time = -1;
public int attempt_count = 0;
public int crawl_count = 0;
public String parsed_as = null;
public int http_result = -1;
public String redirect_url = null;
public SortedSet<String> ext_urls = new TreeSet<String>();
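// crawl detail records, ordered by MD5 of their serialized JSON for a stable, content-based ordering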
public SortedSet<JsonObject> details = new TreeSet<JsonObject>(new Comparator<JsonObject>() {
@Override
public int compare(JsonObject o1, JsonObject o2) {
String md51 = MD5Hash.digest(o1.toString()).toString();
String md52 = MD5Hash.digest(o2.toString()).toString();
return md51.compareTo(md52);
}
});
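// incoming link sample: linking domain hash -> one sample source URL from that domain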
public TreeMap<Long,String> incoming = new TreeMap<Long, String>();
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("source:"+ source_url + "\n");
sb.append("has_crawl_status:" + has_crawl_status + "\n");
sb.append("latest_crawl_time:" + latest_crawl_time + "\n");
sb.append("attempt_count:" + attempt_count + "\n");
sb.append("crawl_count:" + crawl_count + "\n");
sb.append("parsed_as:" + parsed_as + "\n");
sb.append("http_result:" + http_result + "\n");
sb.append("redirect_url:" + redirect_url + "\n");
for (String extURL : ext_urls) {
sb.append("extURL:" + extURL + "\n");
}
for (JsonObject jsonObj : details) {
sb.append("crawlStatus:" + jsonObj.toString() + "\n");
}
for (String value : incoming.values()) {
sb.append("incoming:" + value + "\n");
}
sb.append("\n");
return sb.toString();
}
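/**
* rebuild model state from a merged (summary) record emitted by the reducer
*/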
void updateModelGivenMergedRecord(TextBytes mergedJSON) throws IOException {
JsonObject mergeObject = new JsonParser().parse(mergedJSON.toString()).getAsJsonObject();
source_url = mergeObject.get(TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();
has_crawl_status = mergeObject.has(TOPLEVEL_SUMMARYRECORD_PROPRETY);
if (has_crawl_status) {
JsonObject crawlStatusObj = mergeObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
latest_attempt_time = crawlStatusObj.get(SUMMARYRECORD_LATEST_ATTEMPT_PROPERTY).getAsLong();
latest_crawl_time = safeGetLong(crawlStatusObj, SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY);
attempt_count = crawlStatusObj.get(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt();
crawl_count = (crawlStatusObj.has(SUMMARYRECORD_CRAWLCOUNT_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_CRAWLCOUNT_PROPERTY).getAsInt() : 0;
parsed_as = (crawlStatusObj.has(SUMMARYRECORD_PARSEDAS_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_PARSEDAS_PROPERTY).getAsString() : null;
http_result = (crawlStatusObj.has(SUMMARYRECORD_HTTP_RESULT_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_HTTP_RESULT_PROPERTY).getAsInt() : -1;
redirect_url = (crawlStatusObj.has(SUMMARYRECORD_REDIRECT_URL_PROPERTY)) ? crawlStatusObj.get(SUMMARYRECORD_REDIRECT_URL_PROPERTY).getAsString() : null;
safeJsonArrayToStringCollection(crawlStatusObj, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, ext_urls);
if (crawlStatusObj.has(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
for (JsonElement crawlDetail : crawlStatusObj.getAsJsonArray(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
details.add(crawlDetail.getAsJsonObject());
}
}
}
}
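/**
* parse an incoming-urls sample record - newline-delimited lines of the form
* domainHash TAB sourceURL
*/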
void updateModelGivenUrlsSampleRecord(TextBytes inputData) {
int curpos = inputData.getOffset();
int endpos = inputData.getOffset() + inputData.getLength();
byte[] lfPattern = { 0xA };
byte[] tabPattern = { 0x9 };
TextBytes urlText = new TextBytes();
while (curpos < endpos) {
int tabIndex = ByteArrayUtils.indexOf(inputData.getBytes(), curpos, endpos - curpos, tabPattern);
if (tabIndex == -1) {
break;
}
else {
int lfIndex = ByteArrayUtils.indexOf(inputData.getBytes(), tabIndex + 1, endpos - (tabIndex + 1), lfPattern);
if (lfIndex == -1) {
break;
}
else {
long sourceDomainHash = ByteArrayUtils.parseLong(inputData.getBytes(),curpos, tabIndex-curpos, 10);
urlText.set(inputData.getBytes(),tabIndex + 1,lfIndex - (tabIndex + 1));
incoming.put(sourceDomainHash, urlText.toString());
curpos = lfIndex + 1;
}
}
}
}
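/**
* update model state from a raw crawl status record; when the record carries a
* redirect_from object, returns a synthesized crawl status record for the
* redirect source, otherwise returns null
*/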
JsonObject updateModelGivenCrawlStatus(TextBytes statusJSON) throws IOException {
has_crawl_status = true;
JsonParser parser = new JsonParser();
JsonObject jsonObj = parser.parse(statusJSON.toString()).getAsJsonObject();
if (source_url == null) {
source_url = jsonObj.get("source_url").getAsString();
}
HashSet<String> extHrefs = new HashSet<String>();
JsonObject crawlDetailRecord = crawlDetailRecordFromCrawlStatusRecord(jsonObj, fp, extHrefs, new MockCollectorReporter());
long attemptTime = safeGetLong(crawlDetailRecord,CRAWLDETAIL_ATTEMPT_TIME_PROPERTY);
latest_attempt_time = Math.max(attemptTime, latest_attempt_time);
attempt_count++;
int httpResult = safeGetInteger(crawlDetailRecord, CRAWLDETAIL_HTTPRESULT_PROPERTY);
if (httpResult != -1) {
if (latest_attempt_time == attemptTime) {
this.http_result = httpResult;
}
if (httpResult >= 200 && httpResult <= 299) {
latest_crawl_time = Math.max(attemptTime, latest_crawl_time);
crawl_count++;
if (latest_crawl_time == attemptTime) {
this.parsed_as = crawlDetailRecord.get(CRAWLDETAIL_PARSEDAS_PROPERTY).getAsString();
this.ext_urls.clear();
this.ext_urls.addAll(extHrefs);
}
}
else if (httpResult >= 300 && httpResult <= 399) {
this.redirect_url = (crawlDetailRecord.get(CRAWLDETAIL_REDIRECT_URL) != null) ? crawlDetailRecord.get(CRAWLDETAIL_REDIRECT_URL).getAsString() : null;
}
}
this.details.add(crawlDetailRecord);
if (jsonObj.has("redirect_from")) {
JsonObject redirectObject = jsonObj.get("redirect_from").getAsJsonObject();
JsonObject redirectJSON = new JsonObject();
int redirectHttpResult = redirectObject.get("http_result").getAsInt();
redirectJSON.addProperty("disposition","SUCCESS");
redirectJSON.addProperty("http_result",redirectHttpResult);
redirectJSON.addProperty("server_ip",redirectObject.get("server_ip").getAsString());
redirectJSON.addProperty("attempt_time",jsonObj.get("attempt_time").getAsLong());
redirectJSON.addProperty("target_url",jsonObj.get("source_url").getAsString());
redirectJSON.addProperty("source_url",redirectObject.get("source_url").getAsString());
return redirectJSON;
}
return null;
}
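/**
* record an incoming link, keeping at most one sample URL per external linking
* root domain
*/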
public void updateModelGivenLinkRecord(TextBytes linkJSON) {
JsonParser parser = new JsonParser();
JsonObject jsonObj = parser.parse(linkJSON.toString()).getAsJsonObject();
if (source_url == null) {
source_url = jsonObj.get("href").getAsString();
}
String sourceURL = jsonObj.get("source_url").getAsString();
URLFPV2 urlfp = URLUtils.getURLFPV2FromURL(sourceURL);
if (urlfp != null) {
if (urlfp.getRootDomainHash() != fp.getRootDomainHash()) {
if (!incoming.containsKey(urlfp.getRootDomainHash())) {
incoming.put(urlfp.getRootDomainHash(), sourceURL);
}
}
}
}
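// null-safe comparison - mismatched nullness compares as unequal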
static int compareStrings(String string1,String string2) {
int result = 0;
if (string1 == null && string2 != null || string1 != null && string2 == null) {
result = -1;
}
if (result == 0 && string1 != null && string2 != null) {
result = string1.compareTo(string2);
}
return result;
}
void printMismatch(String mismatchDetail) {
System.out.println("FP:" + fp.getKey() + " SourceURL:" + source_url + " Detail:" + mismatchDetail);
}
@Override
public int compareTo(URLStateModel o) {
if (compareStrings(source_url,o.source_url) != 0) {
printMismatch("source_url:"+source_url+ " other:" + o.source_url);
return -1;
}
if (has_crawl_status != o.has_crawl_status) {
printMismatch("has_crawl_status:"+has_crawl_status+ " other:" + o.has_crawl_status);
return -1;
}
if (latest_attempt_time != o.latest_attempt_time) {
printMismatch("latest_attempt_time:"+latest_attempt_time+ " other:" + o.latest_attempt_time);
return -1;
}
if (latest_crawl_time != o.latest_crawl_time) {
printMismatch("latest_crawl_time:"+latest_crawl_time+ " other:" + o.latest_crawl_time);
return -1;
}
if (attempt_count != o.attempt_count) {
printMismatch("attempt_count:"+attempt_count+ " other:" + o.attempt_count);
return -1;
}
if (crawl_count != o.crawl_count) {
printMismatch("crawl_count:"+crawl_count+ " other:" + o.crawl_count);
return -1;
}
if (compareStrings(parsed_as,o.parsed_as) !=0) {
printMismatch("parsed_as:"+parsed_as+ " other:" + o.parsed_as);
return -1;
}
if (http_result != o.http_result) {
printMismatch("http_result:"+http_result+ " other:" + o.http_result);
return -1;
}
if (compareStrings(redirect_url,o.redirect_url) != 0) {
printMismatch("redirect_url:"+redirect_url+ " other:" + o.redirect_url);
return -1;
}
Set<String> urlDiff = Sets.difference(ext_urls, o.ext_urls);
if (urlDiff.size() != 0) {
System.out.println("URLS:" + urlDiff + " missing from other Model");
return -1;
}
Set<JsonObject> jsonDiff = Sets.difference(this.details, o.details);
if (jsonDiff.size() != 0) {
System.out.println("JSON Objects:" + jsonDiff + " missing from other Model");
return -1;
}
return 0;
}
}
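/**
* synthesize a single input tuple of the given type and register it with the
* reference model
*/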
static Pair<TextBytes,TextBytes> generateTestRecord(TestModel model,String url, long timestamp,boolean success,TestRecordType recordType,Map<String,String> extraProperties)throws Exception {
Pair<TextBytes,TextBytes> result = null;
if (recordType == TestRecordType.CRAWL_STATUS || recordType == TestRecordType.CRAWL_STATUS_WITH_REDIRECT) {
JsonObject topObject = new JsonObject();
topObject.addProperty("source_url", url);
topObject.addProperty("disposition", (success) ? "SUCCESS" : "FAILURE");
topObject.addProperty("attempt_time", timestamp);
if (success) {
topObject.addProperty("http_result",200);
topObject.addProperty("server_ip",CANNED_IP);
topObject.addProperty("content_len",CANNED_CONTENT_LEN);
topObject.addProperty("mime_type",CANNED_MIME_TYPE);
topObject.addProperty("md5",CANNED_MD5_VALUE);
topObject.addProperty("text_simhash",CANNED_SIMHASH_VALUE);
JsonObject headers = new JsonObject();
headers.addProperty("date", timestamp);
topObject.add("http_headers",headers);
topObject.addProperty("parsed_as","html");
JsonObject content = new JsonObject();
content.addProperty("title", CANNED_TITLE);
JsonArray metaTags = new JsonArray();
JsonObject metaTag = new JsonObject();
metaTag.addProperty(CANNED_META_PROPERTY_NAME, CANNED_META_PROPERTY_VALUE);
metaTags.add(metaTag);
content.add("meta_tags", metaTags);
topObject.add("content", content);
JsonArray links = new JsonArray();
JsonObject link1 = new JsonObject();
link1.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_1));
JsonObject link2 = new JsonObject();
link2.addProperty("href", sourceURLToLinkingHostURL(url, CANNED_LINKING_HOST_2));
links.add(link1);
links.add(link2);
topObject.add("links", links);
if (recordType == TestRecordType.CRAWL_STATUS_WITH_REDIRECT) {
JsonObject redirectObject = new JsonObject();
redirectObject.addProperty("source_url", extraProperties.get(EXTRA_PROPERTY_REDIRECT_SOURCE));
redirectObject.addProperty("http_result", 301);
redirectObject.addProperty("server_ip",CANNED_IP);
topObject.add("redirect_from", redirectObject);
}
}
else {
topObject.addProperty("failure_reason",CANNED_FAILURE_REASON);
topObject.addProperty("failure_detail",CANNED_FAILURE_DETAIL);
}
TextBytes keyOut = new TextBytes(CrawlDBKey.generateCrawlStatusKey(new Text(url), timestamp));
TextBytes valueOut = new TextBytes(topObject.toString());
result = new Pair<TextBytes, TextBytes>(keyOut, valueOut);
}
else {
JsonObject linkData = new JsonObject();
linkData.addProperty("href",url);
linkData.addProperty("source_url", extraProperties.get(EXTRA_PROPERTY_LINK_SOURCE));
linkData.addProperty("http_result", 200);
linkData.addProperty("server_ip",CANNED_IP);
linkData.addProperty("source_type","html");
linkData.addProperty("type","a");
linkData.addProperty("rel","text/html");
TextBytes keyOut = new TextBytes(
CrawlDBKey.generateLinkKey(
new TextBytes(url),
CrawlDBKey.Type.KEY_TYPE_HTML_LINK,
MD5Hash.digest(extraProperties.get(EXTRA_PROPERTY_LINK_SOURCE)).toString()));
TextBytes valueOut = new TextBytes(linkData.toString());
result = new Pair<TextBytes,TextBytes> (keyOut,valueOut);
}
model.updateModelFromInputTuple(result);
return result;
}
static class TestDataComparator implements java.util.Comparator<Pair<TextBytes,TextBytes>> {
CrawlDBKey.LinkKeyComparator comparator = new LinkKeyComparator();
@Override
public int compare(Pair<TextBytes,TextBytes> o1, Pair<TextBytes,TextBytes> o2) {
return comparator.compare(o1.e0, o2.e0);
}
}
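// sort tuples with the production key comparator, then feed each one through reduce()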
void reduceTestTuples(ArrayList<Pair<TextBytes,TextBytes>> tupleList,OutputCollector<TextBytes, TextBytes> collector, Reporter reporter) throws Exception {
Collections.sort(tupleList, new TestDataComparator());
for (Pair<TextBytes,TextBytes> tuple : tupleList) {
reduce(tuple.e0, Iterators.forArray(tuple.e1),collector,reporter);
}
}
static void dumpTuples(List<Pair<TextBytes,TextBytes>> items) {
for (Pair<TextBytes,TextBytes> outputTuple : items) {
System.out.println("Key:" + outputTuple.e0);
try {
JsonParser parser = new JsonParser();
JsonElement e = parser.parse(outputTuple.e1.toString());
JSONUtils.prettyPrintJSON(e);
}
catch (Exception e) {
System.out.println("Value:" + outputTuple.e1);
}
}
}
@Test
public void testMerge() throws Exception {
_sourceInputsBuffer = new DataOutputBuffer(16384*4);
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
MockCollectorReporter collector = new MockCollectorReporter();
TestModel inputModel = new TestModel();
TestModel outputModel = new TestModel();
ArrayList<Pair<TextBytes,TextBytes>> initialTupleList = new ArrayList<Pair<TextBytes,TextBytes>>();
///////////////////////////
// STEP:1
///////////////////////////
// populate an initial list of tuples ...
// crawl url 1 with a failed crawl attempt
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_2,false,TestRecordType.CRAWL_STATUS,null));
// crawl url 1 with a successful (http 200) crawl status
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_1,true,TestRecordType.CRAWL_STATUS,null));
// inlinks into crawl url 1 ...
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_1, CANNED_LINKING_HOST_1))
.build()));
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_1, CANNED_LINKING_HOST_2))
.build()));
// crawl url 2 crawl status
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_2,CANNED_TIMESTAMP_1,true,TestRecordType.CRAWL_STATUS,null));
// inlink into crawl url 3 with no crawl status
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_3,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_3, CANNED_LINKING_HOST_1))
.build()));
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_3,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_3, CANNED_LINKING_HOST_2))
.build()));
initialTupleList.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_3,CANNED_TIMESTAMP_1,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_3, CANNED_URL_3_HOST))
.build()));
// run reducer ...
reduceTestTuples(initialTupleList, collector, collector);
close();
System.out.println("STEP:1 DONE#########################################");
///////////////////////////
// STEP:2
///////////////////////////
// reset bloom filter ...
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
// swap items ...
ArrayList<Pair<TextBytes,TextBytes>> tuples = collector.items;
collector.items = new ArrayList<Pair<TextBytes,TextBytes>>();
// tuples for step 2.
// a later, successful crawl status for crawl url 1
tuples.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_3,true,TestRecordType.CRAWL_STATUS,null));
// inlink to crawl url 1
tuples.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_1,CANNED_TIMESTAMP_3,true,TestRecordType.INLINK,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_LINK_SOURCE, sourceURLToLinkingHostURL(CANNED_CRAWL_URL_1, CANNED_LINKING_HOST_3))
.build()));
// crawl status from crawl url4 with a redirect from crawl url 5
tuples.add(generateTestRecord(inputModel,CANNED_CRAWL_URL_4,CANNED_TIMESTAMP_1,true,TestRecordType.CRAWL_STATUS_WITH_REDIRECT,
new ImmutableMap.Builder<String, String>()
.put(EXTRA_PROPERTY_REDIRECT_SOURCE, CANNED_CRAWL_URL_5)
.build()));
// reduce
reduceTestTuples(tuples, collector, collector);
close();
System.out.println("STEP:2 DONE#########################################");
///////////////////////////
// STEP:3
///////////////////////////
// reset bloom filter ...
_sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);
// swap items ...
tuples = collector.items;
collector.items = new ArrayList<Pair<TextBytes,TextBytes>>();
// run reducer again to pick up redirect record
reduceTestTuples(tuples, collector, collector);
close();
System.out.println("STEP:3 DONE#########################################");
// dump tuples
System.out.println("TUPLES DUMP STARTING#########################################");
dumpTuples(collector.items);
System.out.println("TUPLES DUMP COMPLETE#########################################");
// feed final tuple set to output model ...
outputModel.buildModelFromOutputTuples(collector.items);
// model comparison
System.out.println("MODEL COMP STARTING#########################################");
Assert.assertEquals(0, inputModel.compareTo(outputModel));
System.out.println("MODEL COMP DONE #########################################");
}
}