/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.ec2.postprocess.crawldb;
import java.io.IOException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.ComponentId;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.ByteArrayUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.HttpHeaderInfoExtractor;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import static org.commoncrawl.util.JSONUtils.*;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
* Map-reduce job that produces a crawl database from the link graph / crawl status
* data emitted by both the LinkGraphDataEmitter job and previous runs of the
* CrawlDBWriter itself.
*
* @author rana
*
*/
public class CrawlDBMergingReducer implements Reducer<TextBytes, TextBytes, TextBytes, TextBytes>, CrawlDBCommon {
static final Log LOG = LogFactory.getLog(CrawlDBMergingReducer.class);
// The crawldb job emits data in the form of a JSON data structure.
// The top-level JSON object optionally contains a link_status object, a summary
// object, and a source_url string.
// The summary object has the properties defined by the SUMMARYRECORD_ prefixed
// constants.
// The link_status object has the properties defined by the LINKSTATUS_ prefixed
// constants.
// The summary object can contain zero to N CrawlDetail objects, one for each
// crawl attempt. The properties of a CrawlDetail object are defined by the
// CRAWLDETAIL_ prefixed constants.
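//
// A hedged sketch of the shape of an emitted top-level record; the property
// names shown are illustrative stand-ins for the CrawlDBCommon constants,
// and all values are made up:
//
//   {
//     "source_url"  : "http://example.com/page",
//     "summary"     : {                 // SUMMARYRECORD_ properties
//       "attempt_count" : 2,
//       "http_result"   : 200,
//       "crawl_detail"  : [ { /* CRAWLDETAIL_ properties */ }, ... ]
//     },
//     "link_status" : {                 // LINKSTATUS_ properties
//       "intradomain_sources" : 12,
//       "extradomain_sources" : 3
//     }
//   }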
///////////////////////////////////////////////////////////////////////////
// EC2 PATHS
///////////////////////////////////////////////////////////////////////////
static final String S3N_BUCKET_PREFIX = "s3n://aws-publicdatasets";
static final String MERGE_INTERMEDIATE_OUTPUT_PATH = "/common-crawl/crawl-db/intermediate/";
static final String MERGE_DB_PATH = "/common-crawl/crawl-db/mergedDB/";
///////////////////////////////////////////////////////////////////////////
// CONSTANTS
///////////////////////////////////////////////////////////////////////////
static final int MAX_TYPE_SAMPLES = 5;
static final int DEFAULT_OUTGOING_URLS_BUFFER_SIZE = 1 << 18; // 262K
static final int DEFAULT_OUTGOING_URLS_BUFFER_PAD_AMOUNT = 16384;
static final int DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_SIZE = 1 << 27; // 134 MB
static final int DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT = 16384;
static final int MAX_EXTERNALLY_REFERENCED_URLS = 100;
//private int OUTGOING_URLS_BUFFER_SIZE = DEFAULT_OUTGOING_URLS_BUFFER_SIZE;
//private int OUTGOING_URLS_BUFFER_PAD_AMOUNT =DEFAULT_OUTGOING_URLS_BUFFER_PAD_AMOUNT;
private int EXT_SOURCE_SAMPLE_BUFFER_SIZE = DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_SIZE;
private int EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT = DEFAULT_EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT;
///////////////////////////////////////////////////////////////////////////
// Counters
///////////////////////////////////////////////////////////////////////////
enum Counters {
FAILED_TO_GET_LINKS_FROM_HTML,
NO_HREF_FOR_HTML_LINK,
EXCEPTION_IN_MAP,
GOT_HTML_METADATA,
GOT_FEED_METADATA,
EMITTED_ATOM_LINK,
EMITTED_HTML_LINK,
EMITTED_RSS_LINK,
GOT_PARSED_AS_ATTRIBUTE,
GOT_LINK_OBJECT,
NULL_CONTENT_OBJECT,
NULL_LINKS_ARRAY,
FP_NULL_IN_EMBEDDED_LINK,
SKIPPED_ALREADY_EMITTED_LINK,
FOUND_HTTP_DATE_HEADER,
FOUND_HTTP_AGE_HEADER,
FOUND_HTTP_LAST_MODIFIED_HEADER,
FOUND_HTTP_EXPIRES_HEADER,
FOUND_HTTP_CACHE_CONTROL_HEADER,
FOUND_HTTP_PRAGMA_HEADER,
REDUCER_GOT_LINK,
REDUCER_GOT_STATUS,
ONE_REDUNDANT_LINK_IN_REDUCER,
TWO_REDUNDANT_LINKS_IN_REDUCER,
THREE_REDUNDANT_LINKS_IN_REDUCER,
GT_THREE_REDUNDANT_LINKS_IN_REDUCER,
ONE_REDUNDANT_STATUS_IN_REDUCER,
TWO_REDUNDANT_STATUS_IN_REDUCER,
THREE_REDUNDANT_STATUS_IN_REDUCER,
GT_THREE_REDUNDANT_STATUS_IN_REDUCER,
GOT_RSS_FEED,
GOT_ATOM_FEED,
GOT_ALTERNATE_LINK_FOR_ATOM_ITEM,
GOT_CONTENT_FOR_ATOM_ITEM,
GOT_ITEM_LINK_FROM_RSS_ITEM,
GOT_TOP_LEVEL_LINK_FROM_RSS_ITEM,
GOT_TOP_LEVEL_LINK_FROM_ATOM_ITEM,
EMITTED_REDIRECT_RECORD,
DISCOVERED_NEW_LINK,
GOT_LINK_FOR_ITEM_WITH_STATUS,
FAILED_TO_GET_SOURCE_HREF,
GOT_CRAWL_STATUS_RECORD,
GOT_EXTERNAL_DOMAIN_SOURCE,
NO_SOURCE_URL_FOR_CRAWL_STATUS,
OUTPUT_KEY_FROM_INTERNAL_LINK,
OUTPUT_KEY_FROM_EXTERNAL_LINK,
GOT_HTTP_200_CRAWL_STATUS,
GOT_REDIRECT_CRAWL_STATUS,
BAD_REDIRECT_URL,
GOT_MERGED_RECORD,
MERGED_OBJECT_FIRST_OBJECT,
ADOPTED_SOURCE_SUMMARY_RECORD,
MERGED_SOURCE_SUMMARY_RECORD_INTO_DEST,
ADOPTED_SOURCE_LINKSUMMARY_RECORD,
MERGED_SOURCE_LINKSUMMARY_RECORD_INTO_DEST,
ALLOCATED_TOP_LEVEL_OBJECT_IN_FLUSH,
ENCOUNTERED_EXISTING_TOP_LEVEL_OBJECT_IN_FLUSH,
ENCOUNTERED_SUMMARY_RECORD_IN_FLUSH,
ENCOUNTERED_LINKSUMMARY_RECORD_IN_FLUSH,
EMITTED_SOURCEINPUTS_RECORD,
GOT_NULL_REDIRECT_URL,
INTERDOMAIN_LINKS_LTEQ_100,
INTERDOMAIN_LINKS_LTEQ_1000,
INTERDOMAIN_LINKS_GT_1000,
EMITTED_SOURCEINPUTS_DATA_BYTES_EMITTED,
INPUT_RECORD_COUNT,
ADOPTED_NEW_BLEKKO_METADATA_RECORD,
BLEKKO_METADATA_WITH_NO_SOURCE_CC_RECORD,
MERGE_RECORD_HAS_BLEKKO_METADATA,
EMITTED_RECORD_WITH_BLEKKO_METADATA,
BLEKKO_RECORD_ALREADY_IN_DATABASE,
BLEKKO_CRAWLED_CC_CRAWLED,
BLEKKO_NOT_CRAWLED_CC_CRAWLED
}
///////////////////////////////////////////////////////////////////////////
// Data Members
///////////////////////////////////////////////////////////////////////////
public static final int NUM_HASH_FUNCTIONS = 10;
public static final int NUM_BITS = 11;
public static final int NUM_ELEMENTS = 1 << 26;
public static final int FLUSH_INTERVAL = 1 << 17;
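// Sizing note (an estimate, assuming the URLFPBloomFilter constructor takes
// bits-per-element as its third argument): 2^26 elements * 11 bits is roughly
// 92 MB of filter state; the filter is cleared every FLUSH_INTERVAL (131,072)
// flushed urls to bound the false-positive rate.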
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
// parser
JsonParser _parser = new JsonParser();
// the top level object
JsonObject _topLevelJSONObject;
// the current summary record ...
JsonObject _summaryRecord = null;
// the current link summary record
JsonObject _linkSummaryRecord = null;
// collection of types detected for current url
HashSet<String> _types = new HashSet<String>();
// collection of external references urls in current document
HashSet<String> _extHrefs = new HashSet<String>();
// the url string to use as the output key ...
String _outputKeyString = null;
// freeze url key ...
boolean _urlKeyFrozen = false;
// url object representing the current key
GoogleURL _outputKeyURLObj = null;
// source inputs tracking bloomfilter
URLFPBloomFilter _sourceInputsTrackingFilter;
// a count of the number of urls processed
long _urlsProcessed = 0;
// key used to test bloomfilter
URLFPV2 _bloomFilterKey = new URLFPV2();
// captured job conf
JobConf _conf;
// file system
FileSystem _fs;
// partition id
int _partitionId;
//SequenceFile.Writer _redirectWriter = null;
// input buffer used to collect referencing urls
DataOutputBuffer _sourceInputsBuffer;
// count of referencing domains
int _sourceSampleSize = 0;
// current input key
URLFPV2 _currentKey = null;
// temporary key used to transition input keys
URLFPV2 _tempKey = new URLFPV2();
// cached collector pointer ...
OutputCollector<TextBytes, TextBytes> _outputCollector;
Reporter _reporter;
@Override
public void reduce(TextBytes keyBytes, Iterator<TextBytes> values,OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException {
if (_outputCollector == null) {
_outputCollector = output;
_reporter = reporter;
}
// potentially transition to new url
readFPCheckForTransition(keyBytes,output,reporter);
// extract link type ..
long linkType = CrawlDBKey.getLongComponentFromKey(keyBytes,CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
while (values.hasNext()) {
reporter.incrCounter(Counters.INPUT_RECORD_COUNT, 1);
TextBytes valueBytes = values.next();
//LOG.debug("ValueBytes:"+ valueBytes.toString());
if (linkType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
reporter.incrCounter(Counters.GOT_MERGED_RECORD, 1);
JsonObject mergedObject = _parser.parse(valueBytes.toString()).getAsJsonObject();
if (mergedObject != null) {
setSourceURLFromJSONObject(mergedObject,linkType);
processMergedRecord(mergedObject,_currentKey,reporter);
}
}
else if (linkType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
reporter.incrCounter(Counters.GOT_CRAWL_STATUS_RECORD,1);
try {
JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
if (object != null) {
// update url key if necessary ...
setSourceURLFromJSONObject(object,linkType);
// emit a redirect record if necessary ...
JsonElement redirectObject = object.get("redirect_from");
if (redirectObject != null) {
emitRedirectRecord(object, redirectObject.getAsJsonObject(),output, reporter);
}
// get latest crawl time
long latestCrawlTime = (_summaryRecord != null) ? safeGetLong(_summaryRecord,SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY) : -1;
long attemptTime = safeGetLong(object, "attempt_time");
// if this is the latest crawl event, then we want to track the links associated with this crawl status ...
HashSet<String> extHrefs = (attemptTime > latestCrawlTime) ? _extHrefs : null;
// create a crawl detail record from incoming JSON
JsonObject crawlDetail = crawlDetailRecordFromCrawlStatusRecord(object,_currentKey,extHrefs,reporter);
// add to our list of crawl detail records ...
safeAddCrawlDetailToSummaryRecord(crawlDetail);
// ok, now update summary stats based on incoming crawl detail record ...
updateSummaryRecordFromCrawlDetailRecord(crawlDetail,_currentKey,reporter);
}
}
catch (Exception e) {
LOG.error("Error Parsing JSON:" + valueBytes.toString());
throw new IOException(e);
}
// stop after the first crawl status value for this key; any further
// values under the same key would be duplicates of the same record
break;
}
else if (linkType >= CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() && linkType <= CrawlDBKey.Type.KEY_TYPE_RSS_LINK.ordinal()) {
JsonObject object = _parser.parse(valueBytes.toString()).getAsJsonObject();
if (object != null) {
setSourceURLFromJSONObject(object,linkType);
// LOG.debug("Got LinkData:" + JSONUtils.prettyPrintJSON(object));
// ok this is a link ...
updateLinkStatsFromLinkJSONObject(object,_currentKey,reporter);
}
}
else if (linkType == CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE.ordinal()) {
importLinkSourceData(_currentKey, valueBytes);
}
reporter.progress();
}
}
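// Note on ordering: each reduce invocation sees values for a single
// CrawlDBKey (hence one linkType), while readFPCheckForTransition stitches
// together the successive invocations that share a url fingerprint. The
// MERGED_OBJECT_FIRST_OBJECT counter suggests the key sort is expected to
// deliver a previously merged record ahead of fresh status/link records for
// the same url; this is an inference from the counters, not enforced here.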
@Override
public void configure(JobConf job) {
_sourceInputsBuffer = new DataOutputBuffer(EXT_SOURCE_SAMPLE_BUFFER_SIZE);
_sourceInputsTrackingFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
_conf = job;
try {
_fs = FileSystem.get(_conf);
_partitionId = _conf.getInt("mapred.task.partition", 0);
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void close() throws IOException {
flushCurrentRecord(_outputCollector,_reporter);
}
/**
* internal helper - emit a redirect record given a source crawl status record
*
* @param jsonObject
* @param redirectObj
* @param output
* @param reporter
* @throws IOException
*/
void emitRedirectRecord(JsonObject jsonObject,JsonObject redirectObj,OutputCollector<TextBytes, TextBytes> output,Reporter reporter)throws IOException {
// ok first things first, generate a fingerprint for redirect SOURCE
URLFPV2 redirectFP = URLUtils.getURLFPV2FromURL(redirectObj.get("source_url").getAsString());
if (redirectFP == null) {
reporter.incrCounter(Counters.BAD_REDIRECT_URL, 1);
}
else {
int httpResult = redirectObj.get("http_result").getAsInt();
JsonObject redirectJSON = new JsonObject();
redirectJSON.addProperty("disposition","SUCCESS");
redirectJSON.addProperty("http_result",httpResult);
redirectJSON.addProperty("server_ip",redirectObj.get("server_ip").getAsString());
redirectJSON.addProperty("attempt_time",jsonObject.get("attempt_time").getAsLong());
redirectJSON.addProperty("target_url",jsonObject.get("source_url").getAsString());
redirectJSON.addProperty("source_url",redirectObj.get("source_url").getAsString());
// ok emit the redirect record ...
TextBytes key = CrawlDBKey.generateKey(redirectFP,CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS,jsonObject.get("attempt_time").getAsLong());
LOG.debug("!!!!!!Emitting Redirect Record:" + redirectJSON.toString());
output.collect(key, new TextBytes(redirectJSON.toString()));
reporter.incrCounter(Counters.EMITTED_REDIRECT_RECORD, 1);
//_redirectWriter.append(new TextBytes(redirectObj.get("source_url").getAsString()), new TextBytes(redirectJSON.toString()));
}
}
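// Illustrative sketch for emitRedirectRecord above (all values made up):
// an incoming crawl status record such as
//
//   { "source_url"    : "http://example.com/new",
//     "attempt_time"  : 1325376000000,
//     "redirect_from" : { "source_url"  : "http://example.com/old",
//                         "http_result" : 301,
//                         "server_ip"   : "10.0.0.1" } }
//
// yields a synthetic crawl status record, keyed by the fingerprint of the
// redirect source:
//
//   { "disposition" : "SUCCESS", "http_result" : 301,
//     "server_ip"   : "10.0.0.1", "attempt_time" : 1325376000000,
//     "target_url"  : "http://example.com/new",
//     "source_url"  : "http://example.com/old" }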
/**
* grab date headers and incorporate them into the crawl detail object
*
* @param jsonObject
* @param crawlStatsJSON
*/
static void populateDateHeadersFromJSONObject(JsonObject jsonObject,JsonObject crawlStatsJSON) {
JsonObject headers = jsonObject.getAsJsonObject("http_headers");
if (headers != null) {
JsonElement httpDate = headers.get("date");
JsonElement age = headers.get("age");
JsonElement lastModified = headers.get("last-modified");
JsonElement expires = headers.get("expires");
JsonElement cacheControl = headers.get("cache-control");
JsonElement pragma = headers.get("pragma");
JsonElement etag = headers.get("etag");
if (httpDate != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_DATE_PROPERTY, HttpHeaderInfoExtractor.getTime(httpDate.getAsString()));
}
if (age != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_AGE_PROPERTY, age);
}
if (lastModified != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_LAST_MODIFIED_PROPERTY, HttpHeaderInfoExtractor.getTime(lastModified.getAsString()));
}
if (expires != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTP_EXPIRES_PROPERTY, HttpHeaderInfoExtractor.getTime(expires.getAsString()));
}
if (cacheControl != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_CACHE_CONTROL_PROPERTY, cacheControl);
}
if (pragma != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_PRAGMA_PROPERTY, pragma);
}
if (etag != null) {
crawlStatsJSON.add(CRAWLDETAIL_HTTP_ETAG_PROPERTY, etag);
}
}
}
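// Example for populateDateHeadersFromJSONObject above (hedged; the header
// values are made up and the CRAWLDETAIL_ names are paraphrased): given
//
//   "http_headers" : { "date"          : "Tue, 01 May 2012 00:00:00 GMT",
//                      "cache-control" : "max-age=3600",
//                      "etag"          : "\"abc123\"" }
//
// the date header is parsed to epoch millis via
// HttpHeaderInfoExtractor.getTime() (as are last-modified and expires),
// while age, cache-control, pragma, and etag are copied through as-is.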
/**
* compute the min/max publication times and item count across feed items and
* record them on the given crawl stats record
*
* @param contentObj
* @param crawlStatsJSON
*/
static void addMinMaxFeedItemTimes(JsonObject contentObj,JsonObject crawlStatsJSON) {
JsonArray items = contentObj.getAsJsonArray("items");
if (items != null) {
long minPubDate = -1L;
long maxPubDate = -1L;
int itemCount = 0;
for (JsonElement item : items) {
long pubDateValue = -1;
JsonElement pubDate = item.getAsJsonObject().get("published");
if (pubDate != null) {
pubDateValue = pubDate.getAsLong();
}
JsonElement updateDate = item.getAsJsonObject().get("updated");
if (updateDate != null) {
if (updateDate.getAsLong() > pubDateValue) {
pubDateValue = updateDate.getAsLong();
}
}
if (minPubDate == -1L || pubDateValue < minPubDate) {
minPubDate = pubDateValue;
}
if (maxPubDate == -1L || pubDateValue > maxPubDate) {
maxPubDate = pubDateValue;
}
itemCount++;
}
crawlStatsJSON.addProperty(RSS_MIN_PUBDATE_PROPERTY,minPubDate);
crawlStatsJSON.addProperty(RSS_MAX_PUBDATE_PROPERTY,maxPubDate);
crawlStatsJSON.addProperty(RSS_ITEM_COUNT_PROPERTY,itemCount);
}
}
/**
* we need to extract the source url from the JSON because it is not available
* via the key
*
* @param jsonObject
* @param keyType
*/
void setSourceURLFromJSONObject(JsonObject jsonObject, long keyType) {
if (!_urlKeyFrozen) {
JsonElement sourceElement = jsonObject.get("source_url");
if (keyType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
_outputKeyString = sourceElement.getAsString();
_outputKeyURLObj = new GoogleURL(_outputKeyString);
JsonElement httpResultElem = jsonObject.get("http_result");
if (httpResultElem != null) {
int httpResult = httpResultElem.getAsInt();
// a successful (2xx) fetch carries an authoritative source url, so
// freeze the key once it parses as a valid url
if (httpResult >= 200 && httpResult <= 299 && _outputKeyURLObj.isValid()) {
_urlKeyFrozen = true;
}
}
}
else if (keyType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
_outputKeyString = sourceElement.getAsString();
_outputKeyURLObj = new GoogleURL(_outputKeyString);
_urlKeyFrozen = true;
}
else if (keyType >= CrawlDBKey.Type.KEY_TYPE_HTML_LINK.ordinal() && keyType <= CrawlDBKey.Type.KEY_TYPE_RSS_LINK.ordinal()) {
if (_outputKeyString == null) {
JsonElement hrefElement = jsonObject.get("href");
if (sourceElement != null && hrefElement != null) {
GoogleURL hrefSource = new GoogleURL(sourceElement.getAsString());
if (hrefSource.isValid()) {
_outputKeyString = hrefElement.getAsString();
_outputKeyURLObj = new GoogleURL(_outputKeyString);
}
}
}
}
}
}
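// Key selection precedence implemented above: a previously merged record
// freezes the output key immediately; a crawl status record always supplies
// a key and freezes it on a valid 2xx fetch; link records only supply a key
// (the link href) when nothing else has set one yet.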
void mergeBlekkoMetadata(JsonObject newBlekkoMetadata,JsonObject existingTopLevelObj,Reporter reporter) {
if (newBlekkoMetadata != null) {
if (!existingTopLevelObj.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
existingTopLevelObj.add(TOPLEVEL_BLEKKO_METADATA_PROPERTY,newBlekkoMetadata);
}
else {
JsonObject existingBlekkoMetadata = existingTopLevelObj.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY);
long existingTimestamp = existingBlekkoMetadata.get(BLEKKO_METADATA_TIMESTAMP_PROPERTY).getAsLong();
long newTimestamp = newBlekkoMetadata.get(BLEKKO_METADATA_TIMESTAMP_PROPERTY).getAsLong();
if (newTimestamp > existingTimestamp){
existingTopLevelObj.add(TOPLEVEL_BLEKKO_METADATA_PROPERTY, newBlekkoMetadata);
reporter.incrCounter(Counters.ADOPTED_NEW_BLEKKO_METADATA_RECORD, 1);
}
}
}
}
void mergeLinkRecords(JsonObject sourceRecord,JsonObject topLevelJSONObject,Reporter reporter) {
JsonElement destRecord = topLevelJSONObject.get(TOPLEVEL_LINKSTATUS_PROPERTY);
if (destRecord == null) {
if (sourceRecord != null) {
reporter.incrCounter(Counters.ADOPTED_SOURCE_LINKSUMMARY_RECORD, 1);
topLevelJSONObject.add(TOPLEVEL_LINKSTATUS_PROPERTY,sourceRecord);
JsonArray typeAndRels = sourceRecord.getAsJsonArray(LINKSTATUS_TYPEANDRELS_PROPERTY);
if (typeAndRels != null) {
for (JsonElement typeAndRel : typeAndRels) {
_types.add(typeAndRel.getAsString());
}
}
}
}
else {
if (sourceRecord != null) {
reporter.incrCounter(Counters.MERGED_SOURCE_LINKSUMMARY_RECORD_INTO_DEST, 1);
safeIncrementJSONCounter(destRecord.getAsJsonObject(),LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY,sourceRecord.get(LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY));
safeIncrementJSONCounter(destRecord.getAsJsonObject(),LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY,sourceRecord.get(LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY));
safeSetMinLongValue(destRecord.getAsJsonObject(),LINKSTATUS_EARLIEST_DATE_PROPERTY,sourceRecord.get(LINKSTATUS_EARLIEST_DATE_PROPERTY));
safeSetMaxLongValue(destRecord.getAsJsonObject(),LINKSTATUS_LATEST_DATE_PROPERTY,sourceRecord.get(LINKSTATUS_LATEST_DATE_PROPERTY));
JsonArray typeAndRels = sourceRecord.getAsJsonArray(LINKSTATUS_TYPEANDRELS_PROPERTY);
if (typeAndRels != null) {
for (JsonElement typeAndRel : typeAndRels) {
_types.add(typeAndRel.getAsString());
}
}
}
}
}
/**
* merge two crawl summary records
* @param incomingRecord
* @param topLevelJSONObject
* @param reporter
* @throws IOException
*/
void mergeSummaryRecords(JsonObject incomingRecord,JsonObject topLevelJSONObject,Reporter reporter)throws IOException {
JsonObject destinationSummaryRecord = topLevelJSONObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
if (destinationSummaryRecord == null) {
if (incomingRecord != null) {
reporter.incrCounter(Counters.ADOPTED_SOURCE_SUMMARY_RECORD, 1);
// adopt source ...
topLevelJSONObject.add(TOPLEVEL_SUMMARYRECORD_PROPRETY,incomingRecord);
_summaryRecord = incomingRecord;
}
}
else {
if (incomingRecord != null) {
reporter.incrCounter(Counters.MERGED_SOURCE_SUMMARY_RECORD_INTO_DEST, 1);
// walk crawl detail records in incoming record and merge them into destination record ...
JsonElement crawlStatsArray = incomingRecord.get(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
if (crawlStatsArray != null) {
for (JsonElement crawlDetail : crawlStatsArray.getAsJsonArray()) {
// add to our list of crawl detail records ...
safeAddCrawlDetailToSummaryRecord(crawlDetail.getAsJsonObject());
// ok, now update summary stats based on incoming crawl detail record ...
updateSummaryRecordFromCrawlDetailRecord(crawlDetail.getAsJsonObject(),_currentKey,reporter);
}
}
}
}
}
/**
* for the current url, merge the currently accumulated information with a previously generated crawl summary record
* @param jsonObject
* @param destFP
* @param reporter
* @throws IOException
*/
void processMergedRecord(JsonObject jsonObject,URLFPV2 destFP,Reporter reporter)throws IOException {
if (jsonObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
reporter.incrCounter(Counters.MERGE_RECORD_HAS_BLEKKO_METADATA, 1);
}
if (_topLevelJSONObject == null) {
reporter.incrCounter(Counters.MERGED_OBJECT_FIRST_OBJECT, 1);
_topLevelJSONObject = jsonObject;
_summaryRecord = jsonObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY);
_linkSummaryRecord = jsonObject.getAsJsonObject(TOPLEVEL_LINKSTATUS_PROPERTY);
if (_linkSummaryRecord != null) {
// read in type and rels collection ...
safeJsonArrayToStringCollection(_linkSummaryRecord,LINKSTATUS_TYPEANDRELS_PROPERTY, _types);
}
// and ext hrefs ..
if (_summaryRecord != null) {
safeJsonArrayToStringCollection(_summaryRecord, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS,_extHrefs);
}
// special blekko import stats
if (_topLevelJSONObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
if (_summaryRecord == null && _linkSummaryRecord == null) {
reporter.incrCounter(Counters.BLEKKO_METADATA_WITH_NO_SOURCE_CC_RECORD, 1);
}
}
}
else {
mergeSummaryRecords(jsonObject.getAsJsonObject(TOPLEVEL_SUMMARYRECORD_PROPRETY),_topLevelJSONObject,reporter);
mergeLinkRecords(jsonObject.getAsJsonObject(TOPLEVEL_LINKSTATUS_PROPERTY),_topLevelJSONObject,reporter);
mergeBlekkoMetadata(jsonObject.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY),_topLevelJSONObject,reporter);
}
}
/**
* given an incoming link record, track the link source, update stats, and
* capture document type information (if available via the href).
*
* @param jsonObject
* @param destFP
* @param reporter
* @throws IOException
*/
void updateLinkStatsFromLinkJSONObject(JsonObject jsonObject,URLFPV2 destFP,Reporter reporter) throws IOException {
JsonElement sourceElement = jsonObject.get("source_url");
JsonElement hrefElement = jsonObject.get("href");
if (sourceElement != null && hrefElement != null) {
//LOG.info("source:" + sourceElement.getAsString() + " href:" + hrefElement.getAsString());
GoogleURL sourceURLObj = new GoogleURL(sourceElement.getAsString());
if (sourceURLObj.isValid()) {
if (_linkSummaryRecord == null) {
_linkSummaryRecord = new JsonObject();
}
// ok, first compare known host name with incoming link host name ...
// if not a match then ...
if (!_outputKeyURLObj.getHost().equals(sourceURLObj.getHost())) {
// ok now deeper check ...
URLFPV2 sourceFP = URLUtils.getURLFPV2FromURLObject(sourceURLObj);
if (sourceFP != null) {
reporter.incrCounter(Counters.GOT_EXTERNAL_DOMAIN_SOURCE, 1);
// increment external source count
safeIncrementJSONCounter(_linkSummaryRecord,LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY);
//LOG.info("sourceFP:" + sourceFP.getKey() + " hrefFP:" + destFP.getKey());
// ok track sources if from a different root domain (for now)
if (sourceFP.getRootDomainHash() != destFP.getRootDomainHash()) {
trackPotentialLinkSource(sourceFP,sourceElement.getAsString(),destFP);
}
}
}
// otherwise, count it as an internal link
else {
// internal for sure ...
safeIncrementJSONCounter(_linkSummaryRecord,LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY);
}
JsonObject sourceHeaders = jsonObject.getAsJsonObject("source_headers");
if (sourceHeaders != null) {
long httpDate = safeGetHttpDate(sourceHeaders, "date");
long lastModified = safeGetHttpDate(sourceHeaders, "last-modified");
// take the earlier of the date / last-modified headers, falling back to
// last-modified when no date header is present
if (lastModified != -1 && (httpDate == -1 || lastModified < httpDate))
httpDate = lastModified;
if (httpDate != -1L) {
safeSetMinLongValue(_linkSummaryRecord, LINKSTATUS_EARLIEST_DATE_PROPERTY, httpDate);
safeSetMaxLongValue(_linkSummaryRecord, LINKSTATUS_LATEST_DATE_PROPERTY, httpDate);
}
}
JsonElement typeElement = jsonObject.get("type");
JsonElement relElement = jsonObject.get("rel");
String sourceTypeAndRel = jsonObject.get("source_type").getAsString() + ":";
if (typeElement != null) {
sourceTypeAndRel += typeElement.getAsString();
}
if (relElement != null) {
sourceTypeAndRel += ":" + relElement.getAsString();
}
if (_types.size() < MAX_TYPE_SAMPLES)
_types.add(sourceTypeAndRel);
}
}
}
/**
* take linking href data and add it to our list of incoming hrefs
* (used during the intermediate merge process)
*
* @param destFP
* @param inputData
* @throws IOException
*/
void importLinkSourceData(URLFPV2 destFP,TextBytes inputData) throws IOException {
TextBytes urlText = new TextBytes();
int curpos = inputData.getOffset();
int endpos = inputData.getOffset() + inputData.getLength();
byte lfPattern[] = { 0xA };
byte tabPattern[] = { 0x9 };
while (curpos != endpos) {
int tabIndex = ByteArrayUtils.indexOf(inputData.getBytes(), curpos, endpos - curpos, tabPattern);
if (tabIndex == -1) {
break;
}
else {
int lfIndex = ByteArrayUtils.indexOf(inputData.getBytes(), tabIndex + 1, endpos - (tabIndex + 1), lfPattern);
if (lfIndex == -1) {
break;
}
else {
long sourceDomainHash = ByteArrayUtils.parseLong(inputData.getBytes(),curpos, tabIndex-curpos, 10);
urlText.set(inputData.getBytes(),tabIndex + 1,lfIndex - (tabIndex + 1));
URLFPV2 bloomKey = sourceKeyFromSourceAndDest(sourceDomainHash,destFP.getUrlHash());
if (!_sourceInputsTrackingFilter.isPresent(bloomKey)) {
// if not, check to see that we are not about to overflow sample buffer ...
if (_sourceInputsBuffer.getLength() < EXT_SOURCE_SAMPLE_BUFFER_SIZE - EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT) {
_sourceInputsBuffer.write(inputData.getBytes(),curpos,(lfIndex + 1) - curpos);
_sourceSampleSize++;
}
}
curpos = lfIndex + 1;
}
}
}
}
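// Wire format of an incoming-urls sample, as parsed above and as written by
// trackPotentialLinkSource below: newline-delimited records of the form
//
//   <source domain hash (decimal long)> TAB <source url (UTF-8)> LF
//
// e.g. (made-up values): "-1234567890123456789\thttp://example.org/linker\n"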
/**
* given an incoming link for a given url, store it in an accumulation buffer
* IFF we have not seen a url from the given domain before
*
* @param sourceFP
* @param sourceURL
* @param destFP
* @throws IOException
*/
void trackPotentialLinkSource(URLFPV2 sourceFP,String sourceURL,URLFPV2 destFP) throws IOException {
URLFPV2 bloomKey = sourceKeyFromSourceAndDest(sourceFP.getDomainHash(),destFP.getUrlHash());
// check to see if we have collected a sample for this source domain / destination url combo or not ...
if (!_sourceInputsTrackingFilter.isPresent(bloomKey)) {
LOG.debug("sourceFP:" + sourceFP.getKey() + " passed BloomFilter Test");
// if not, check to see that we are not about to overflow sample buffer ...
if (_sourceInputsBuffer.getLength() < EXT_SOURCE_SAMPLE_BUFFER_SIZE - EXT_SOURCE_SAMPLE_BUFFER_PAD_AMOUNT) {
// ok store the external reference sample ...
// write source domain hash
_sourceInputsBuffer.write(Long.toString(sourceFP.getDomainHash()).getBytes());
// delimiter
_sourceInputsBuffer.write(0x09);// TAB
// and source url ...
_sourceInputsBuffer.write(sourceURL.getBytes(Charset.forName("UTF-8")));
_sourceInputsBuffer.write(0x0A);// LF
_sourceSampleSize++;
// add to bloom filter ...
_sourceInputsTrackingFilter.add(bloomKey);
}
}
else {
LOG.debug("sourceFP:" + sourceFP.getKey() + " failed BloomFilter Test");
}
}
/**
* construct a (hacked) fingerprint key combining a source domain hash and a
* destination url hash, used solely for setting bits in a bloomfilter
*
* @param sourceDomain
* @param destURLHash
* @return
*/
private URLFPV2 sourceKeyFromSourceAndDest(long sourceDomain,long destURLHash) {
_bloomFilterKey.setDomainHash(sourceDomain);
_bloomFilterKey.setUrlHash(destURLHash);
return _bloomFilterKey;
}
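// NOTE: sourceKeyFromSourceAndDest reuses the single _bloomFilterKey
// instance and only populates its domain and url hash fields, so callers
// must consume the returned key immediately and never retain it across
// calls.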
/**
* construct a crawl detail record from an incoming crawl status JSON record
*
* @param jsonObject
* @param fpSource
* @param extHRefs
* @param reporter
* @return
* @throws IOException
*/
static JsonObject crawlDetailRecordFromCrawlStatusRecord(JsonObject jsonObject,URLFPV2 fpSource,HashSet<String> extHRefs,Reporter reporter)throws IOException {
String disposition = jsonObject.get("disposition").getAsString();
long attemptTime = jsonObject.get("attempt_time").getAsLong();
// inject all the details into a JSONObject
JsonObject crawlStatsJSON = new JsonObject();
crawlStatsJSON.addProperty(CRAWLDETAIL_ATTEMPT_TIME_PROPERTY, attemptTime);
if (disposition.equals("SUCCESS")) {
// basic stats ... starting with crawl time ...
int httpResult = jsonObject.get("http_result").getAsInt();
crawlStatsJSON.addProperty(CRAWLDETAIL_HTTPRESULT_PROPERTY,httpResult);
crawlStatsJSON.addProperty(CRAWLDETAIL_SERVERIP_PROPERTY, jsonObject.get("server_ip").getAsString());
//populate date headers ...
populateDateHeadersFromJSONObject(jsonObject,crawlStatsJSON);
// if http 200 ...
if (httpResult >= 200 && httpResult <= 299) {
reporter.incrCounter(Counters.GOT_HTTP_200_CRAWL_STATUS,1);
crawlStatsJSON.addProperty(CRAWLDETAIL_CONTENTLEN_PROPERTY,jsonObject.get("content_len").getAsInt());
if (jsonObject.get("mime_type") != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_MIMETYPE_PROPERTY,jsonObject.get("mime_type").getAsString());
}
if (jsonObject.get("md5") != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_MD5_PROPERTY,jsonObject.get("md5").getAsString());
}
if (jsonObject.get("text_simhash") != null) {
crawlStatsJSON.addProperty(CRAWLDETAIL_TEXTSIMHASH_PROPERTY,jsonObject.get("text_simhash").getAsLong());
}
JsonElement parsedAs = jsonObject.get("parsed_as");
if (parsedAs != null) {
// populate some info based on type ...
crawlStatsJSON.addProperty(CRAWLDETAIL_PARSEDAS_PROPERTY,parsedAs.getAsString());
String parsedAsString = parsedAs.getAsString();
// if html ...
if (parsedAsString.equals("html")) {
JsonObject content = jsonObject.get("content").getAsJsonObject();
if (content != null) {
JsonElement titleElement = content.get("title");
JsonElement metaElement = content.get("meta_tags");
if (titleElement != null) {
crawlStatsJSON.add(CRAWLDETAIL_TITLE_PROPERTY, titleElement);
}
if (metaElement != null) {
crawlStatsJSON.add(CRAWLDETAIL_METATAGS_PROPERTY, metaElement);
}
// collect link stats for json ...
updateLinkStatsFromHTMLContent(crawlStatsJSON,jsonObject,extHRefs,fpSource,reporter);
}
}
// if feed ...
else if (parsedAsString.equals("feed")) {
// get content ...
JsonObject content = jsonObject.get("content").getAsJsonObject();
JsonElement titleElement = content.get("title");
if (titleElement != null) {
crawlStatsJSON.add(CRAWLDETAIL_TITLE_PROPERTY, titleElement);
}
// set update time ...
long updateTime = safeGetLong(content, "updated");
if (updateTime != -1) {
crawlStatsJSON.addProperty(CRAWLDETAIL_UPDATED_PROPERTY, updateTime);
}
addMinMaxFeedItemTimes(content,crawlStatsJSON);
}
}
}
// redirect ...
else if (httpResult >=300 && httpResult <= 399) {
reporter.incrCounter(Counters.GOT_REDIRECT_CRAWL_STATUS,1);
// get the target url ...
JsonElement targetURL = jsonObject.get("target_url");
if (targetURL != null) {
// redirect details ...
crawlStatsJSON.addProperty(CRAWLDETAIL_REDIRECT_URL, targetURL.getAsString());
}
else {
reporter.incrCounter(Counters.GOT_NULL_REDIRECT_URL, 1);
}
}
}
else {
// inject all the details into a JSONObject
// basic stats ... starting with crawl time ...
crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE,true);
crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE_REASON,safeGetStringFromElement(jsonObject,"failure_reason"));
crawlStatsJSON.addProperty(CRAWLDETAIL_FAILURE_DETAIL,safeGetStringFromElement(jsonObject,"failure_detail"));
}
return crawlStatsJSON;
}
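// Hedged example of the detail record built above for a successful html
// fetch (the CRAWLDETAIL_ property names are paraphrased, values made up):
//
//   { "attempt_time" : 1325376000000, "http_result" : 200,
//     "server_ip" : "10.0.0.1", "content_len" : 53812,
//     "mime_type" : "text/html", "parsed_as" : "html",
//     "title" : "Example Page",
//     "intradomain_links" : 40, "intraroot_links" : 5,
//     "interdomain_links" : 12 }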
/**
* given a crawl detail json record, update summary record stats
*
* @param crawlDetailRecord
* @param fpSource
* @param reporter
* @throws IOException
*/
void updateSummaryRecordFromCrawlDetailRecord(JsonObject crawlDetailRecord,URLFPV2 fpSource,Reporter reporter) throws IOException {
if (_summaryRecord == null) {
_summaryRecord = new JsonObject();
}
boolean failure = safeGetBoolean(crawlDetailRecord,CRAWLDETAIL_FAILURE);
long attemptTime = crawlDetailRecord.get(CRAWLDETAIL_ATTEMPT_TIME_PROPERTY).getAsLong();
// set latest attempt time ...
long latestAttemptTime = safeSetMaxLongValue(_summaryRecord,SUMMARYRECORD_LATEST_ATTEMPT_PROPERTY,attemptTime);
// increment attempt count
safeIncrementJSONCounter(_summaryRecord,SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY);
// if this is the latest attempt ...
if (latestAttemptTime == attemptTime) {
// add latest http result to summary
if (!failure && crawlDetailRecord.has(CRAWLDETAIL_HTTPRESULT_PROPERTY)) {
int httpResult = crawlDetailRecord.get(CRAWLDETAIL_HTTPRESULT_PROPERTY).getAsInt();
// set last http result
_summaryRecord.addProperty(SUMMARYRECORD_HTTP_RESULT_PROPERTY,httpResult);
if (httpResult >= 200 && httpResult <= 299) {
// update the crawl timestamp
_summaryRecord.addProperty(SUMMARYRECORD_LATEST_CRAWLTIME_PROPERTY,attemptTime);
// and the crawl count ....
safeIncrementJSONCounter(_summaryRecord,SUMMARYRECORD_CRAWLCOUNT_PROPERTY);
// update parsed as
if (crawlDetailRecord.has(CRAWLDETAIL_PARSEDAS_PROPERTY)) {
_summaryRecord.addProperty(SUMMARYRECORD_PARSEDAS_PROPERTY, safeGetStringFromElement(crawlDetailRecord,CRAWLDETAIL_PARSEDAS_PROPERTY));
}
}
else if (httpResult >=300 && httpResult <= 399) {
if (crawlDetailRecord.has(CRAWLDETAIL_REDIRECT_URL)) {
_summaryRecord.addProperty(SUMMARYRECORD_REDIRECT_URL_PROPERTY, safeGetStringFromElement(crawlDetailRecord,CRAWLDETAIL_REDIRECT_URL));
}
}
}
}
}
/**
* given html content (as a json object), extract out-of-domain hrefs, cache
* them, and update stats
* @param crawlStats
* @param incomingJSONObject
* @param extHRefs
* @param fpSource
* @param reporter
*/
static void updateLinkStatsFromHTMLContent(JsonObject crawlStats,JsonObject incomingJSONObject,HashSet<String> extHRefs,URLFPV2 fpSource,Reporter reporter) {
JsonArray links = incomingJSONObject.getAsJsonArray("links");
if (links == null) {
reporter.incrCounter(Counters.NULL_LINKS_ARRAY, 1);
}
else {
// clear our snapshot of externally referenced urls
// we only want to capture this information from
// the links extracted via the latest content
if (extHRefs != null)
extHRefs.clear();
int intraDomainLinkCount = 0;
int intraRootLinkCount = 0;
int interDomainLinkCount = 0;
for (JsonElement link : links) {
JsonObject linkObj = link.getAsJsonObject();
if (linkObj != null && linkObj.has("href")) {
String href = linkObj.get("href").getAsString();
GoogleURL urlObject = new GoogleURL(href);
if (urlObject.isValid()) {
URLFPV2 linkFP = URLUtils.getURLFPV2FromURLObject(urlObject);
if (linkFP != null) {
if (linkFP.getRootDomainHash() == fpSource.getRootDomainHash()) {
if (linkFP.getDomainHash() == fpSource.getDomainHash()) {
intraDomainLinkCount ++;
}
else {
intraRootLinkCount ++;
}
}
else {
interDomainLinkCount++;
// track domains we link to
if (extHRefs != null) {
if (extHRefs.size() <= MAX_EXTERNALLY_REFERENCED_URLS) {
extHRefs.add(urlObject.getCanonicalURL());
}
}
}
}
}
}
}
// update counts in crawl stats data structure ...
crawlStats.addProperty(CRAWLDETAIL_INTRADOMAIN_LINKS, intraDomainLinkCount);
crawlStats.addProperty(CRAWLDETAIL_INTRAROOT_LINKS, intraRootLinkCount);
crawlStats.addProperty(CRAWLDETAIL_INTERDOMAIN_LINKS, interDomainLinkCount);
if (interDomainLinkCount <= 100) {
reporter.incrCounter(Counters.INTERDOMAIN_LINKS_LTEQ_100, 1);
}
else if (interDomainLinkCount <= 1000) {
reporter.incrCounter(Counters.INTERDOMAIN_LINKS_LTEQ_1000, 1);
}
else {
reporter.incrCounter(Counters.INTERDOMAIN_LINKS_GT_1000, 1);
}
}
}
/**
* flush currently accumulated JSON record
*
* @param output
* @param reporter
* @throws IOException
*/
private void flushCurrentRecord(OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException {
_urlsProcessed++;
if (_outputKeyString == null || !_outputKeyURLObj.isValid()) {
if (reporter != null) {
reporter.incrCounter(Counters.FAILED_TO_GET_SOURCE_HREF, 1);
}
}
else {
if (_topLevelJSONObject != null || _summaryRecord != null || _linkSummaryRecord != null) {
if (_topLevelJSONObject == null) {
reporter.incrCounter(Counters.ALLOCATED_TOP_LEVEL_OBJECT_IN_FLUSH, 1);
_topLevelJSONObject = new JsonObject();
_topLevelJSONObject.addProperty(TOPLEVEL_SOURCE_URL_PROPRETY,_outputKeyString);
}
else {
reporter.incrCounter(Counters.ENCOUNTERED_EXISTING_TOP_LEVEL_OBJECT_IN_FLUSH, 1);
}
if (_summaryRecord != null) {
_summaryRecord.remove(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS);
_summaryRecord.remove(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS_TRUNCATED);
if (_extHrefs.size() != 0) {
// output links in the top level object ...
stringCollectionToJsonArrayWithMax(_summaryRecord, SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS, _extHrefs,MAX_EXTERNALLY_REFERENCED_URLS);
if (_extHrefs.size() > MAX_EXTERNALLY_REFERENCED_URLS) {
_summaryRecord.addProperty(SUMMARYRECORD_EXTERNALLY_REFERENCED_URLS_TRUNCATED,true);
}
}
reporter.incrCounter(Counters.ENCOUNTERED_SUMMARY_RECORD_IN_FLUSH, 1);
_topLevelJSONObject.add(TOPLEVEL_SUMMARYRECORD_PROPRETY, _summaryRecord);
}
if (_linkSummaryRecord != null) {
reporter.incrCounter(Counters.ENCOUNTERED_LINKSUMMARY_RECORD_IN_FLUSH, 1);
if (_types != null && _types.size() != 0) {
stringCollectionToJsonArray(_linkSummaryRecord,LINKSTATUS_TYPEANDRELS_PROPERTY,_types);
}
_topLevelJSONObject.add(TOPLEVEL_LINKSTATUS_PROPERTY, _linkSummaryRecord);
}
//System.out.println("Emitting Key:" + CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, 0));
if (_topLevelJSONObject.has(TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
JsonObject blekkoMetadata = _topLevelJSONObject.getAsJsonObject(TOPLEVEL_BLEKKO_METADATA_PROPERTY);
reporter.incrCounter(Counters.EMITTED_RECORD_WITH_BLEKKO_METADATA, 1);
if (_linkSummaryRecord != null || _summaryRecord != null ) {
reporter.incrCounter(Counters.BLEKKO_RECORD_ALREADY_IN_DATABASE, 1);
if (_summaryRecord != null) {
if (_summaryRecord.has(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY)
&& _summaryRecord.get(SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY).getAsInt() != 0) {
String status = blekkoMetadata.get(BLEKKO_METADATA_STATUS).getAsString();
if (status.equalsIgnoreCase("crawled")) {
reporter.incrCounter(Counters.BLEKKO_CRAWLED_CC_CRAWLED, 1);
}
else {
reporter.incrCounter(Counters.BLEKKO_NOT_CRAWLED_CC_CRAWLED, 1);
}
}
}
}
}
// output top level record ...
output.collect(CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD, 0),new TextBytes(_topLevelJSONObject.toString()));
// if there is link status available ...
if (_sourceSampleSize != 0) {
reporter.incrCounter(Counters.EMITTED_SOURCEINPUTS_RECORD, 1);
TextBytes sourceInputsText = new TextBytes();
sourceInputsText.set(_sourceInputsBuffer.getData(),0,_sourceInputsBuffer.getLength());
//System.out.println("Emitting Key:" + CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE, 0));
output.collect(CrawlDBKey.generateKey(_currentKey, CrawlDBKey.Type.KEY_TYPE_INCOMING_URLS_SAMPLE, 0),sourceInputsText);
reporter.incrCounter(Counters.EMITTED_SOURCEINPUTS_DATA_BYTES_EMITTED, sourceInputsText.getLength());
}
}
if (_urlsProcessed % FLUSH_INTERVAL == 0) {
_sourceInputsTrackingFilter.clear();
}
}
_sourceInputsBuffer.reset();
_sourceSampleSize = 0;
_topLevelJSONObject = null;
_summaryRecord = null;
_linkSummaryRecord = null;
_types.clear();
_extHrefs.clear();
_outputKeyString = null;
_urlKeyFrozen = false;
_outputKeyURLObj = null;
}
/**
* Extract the fingerprint from the incoming key and potentially trigger a flush if it is indicative of a
* primary key transition
* @param key
* @param output
* @param reporter
* @throws IOException
*/
private void readFPCheckForTransition(TextBytes key,OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException {
if (_tempKey == null) {
_tempKey = new URLFPV2();
}
_tempKey.setRootDomainHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
_tempKey.setDomainHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.DOMAIN_HASH_COMPONENT_ID));
_tempKey.setUrlHash(CrawlDBKey.getLongComponentFromKey(key, ComponentId.URL_HASH_COMPONENT_ID));
if (_currentKey == null) {
_currentKey = _tempKey;
_tempKey = null;
}
else {
// check for key transition ...
if (_currentKey.compareTo(_tempKey) != 0) {
// transition
flushCurrentRecord(output,reporter);
// swap keys ...
URLFPV2 oldKey = _currentKey;
_currentKey = _tempKey;
_tempKey = oldKey;
}
}
}
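// Illustrative flow for readFPCheckForTransition above (fingerprints
// abbreviated): given successive reduce keys
//
//   (fpA, KEY_TYPE_MERGED_RECORD), (fpA, KEY_TYPE_CRAWL_STATUS),
//   (fpA, KEY_TYPE_HTML_LINK), (fpB, KEY_TYPE_CRAWL_STATUS)
//
// the first three share fingerprint fpA and accumulate into the current
// record; the fpB key triggers flushCurrentRecord() and the key swap.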
/**
* add a crawl detail to the summary record, constructing a summary record if none exists ...
*
* @param crawlStatsJSON
*/
void safeAddCrawlDetailToSummaryRecord(JsonObject crawlStatsJSON) {
if (_summaryRecord == null) {
_summaryRecord = new JsonObject();
}
// construct crawl stats array if necessary
JsonArray crawlStatsArray = _summaryRecord.getAsJsonArray(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
if (crawlStatsArray == null) {
crawlStatsArray = new JsonArray();
_summaryRecord.add(SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY, crawlStatsArray);
}
// add crawl stats to it
crawlStatsArray.add(crawlStatsJSON);
}
/**
* scan the merge db path and find the latest crawl database timestamp
*
* @param fs
* @param conf
* @return
* @throws IOException
*/
static long findLatestMergeDBTimestamp(FileSystem fs,Configuration conf)throws IOException {
long timestampOut = -1L;
FileStatus files[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_DB_PATH,"[0-9]*"));
for (FileStatus candidate : files) {
Path successPath = new Path(candidate.getPath(),"_SUCCESS");
if (fs.exists(successPath)) {
long timestamp = Long.parseLong(candidate.getPath().getName());
timestampOut = Math.max(timestamp, timestampOut);
}
}
return timestampOut;
}
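// Expected layout, sketched from the glob and _SUCCESS check above
// (timestamps are made-up example values):
//
//   s3n://aws-publicdatasets/common-crawl/crawl-db/mergedDB/1335000000000/_SUCCESS
//   s3n://aws-publicdatasets/common-crawl/crawl-db/mergedDB/1336000000000/_SUCCESS
//
// findLatestMergeDBTimestamp would return 1336000000000 here.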
/**
* iterate the intermediate link graph data and extract the unmerged set ...
*
* @param fs
* @param conf
* @param latestMergeDBTimestamp
* @return
* @throws IOException
*/
static List<Path> filterMergeCandidates(FileSystem fs, Configuration conf, long latestMergeDBTimestamp) throws IOException {
ArrayList<Path> list = new ArrayList<Path>();
FileStatus candidates[] = fs.globStatus(new Path(S3N_BUCKET_PREFIX + MERGE_INTERMEDIATE_OUTPUT_PATH,"[0-9]*"));
for (FileStatus candidate : candidates) {
LOG.info("Found Merge Candidate:" + candidate.getPath());
long candidateTimestamp = Long.parseLong(candidate.getPath().getName());
if (candidateTimestamp > latestMergeDBTimestamp) {
Path successPath = new Path(candidate.getPath(),"_SUCCESS");
if (fs.exists(successPath)) {
list.add(candidate.getPath());
}
else {
LOG.info("Rejected Merge Candidate:" + candidate.getPath());
}
}
}
return list;
}
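// Hypothetical driver usage of the two helpers above (a sketch; the actual
// job setup lives outside this class):
//
//   FileSystem fs = FileSystem.get(conf);
//   long latest = findLatestMergeDBTimestamp(fs, conf);
//   List<Path> inputs = filterMergeCandidates(fs, conf, latest);
//   // inputs now holds every completed intermediate output newer than the
//   // last merged database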
///////////////////////////////////////////////////////////////////////////
// TEST CODE
///////////////////////////////////////////////////////////////////////////
/*
// PARK THIS CODE FOR NOW SINCE WE ARE TRANSFERRING DATA PROCESSING TO EC2
if (_skipPartition)
return;
// collect all incoming paths first
Vector<Path> incomingPaths = new Vector<Path>();
while(values.hasNext()){
String path = values.next().toString();
LOG.info("Found Incoming Path:" + path);
incomingPaths.add(new Path(path));
}
FlexBuffer scanArray[] = LinkKey.allocateScanArray();
// set up merge attributes
Configuration localMergeConfig = new Configuration(_conf);
localMergeConfig.setClass(
MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
LinkKeyGroupingComparator.class, RawComparator.class);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS,
TextBytes.class, WritableComparable.class);
// ok now spawn merger
MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(
_fs, incomingPaths, localMergeConfig);
TextBytes keyBytes = new TextBytes();
TextBytes valueBytes = new TextBytes();
DataInputBuffer inputBuffer = new DataInputBuffer();
int processedKeysCount = 0;
Pair<KeyAndValueData<TextBytes>,Iterable<RawRecordValue>> nextItem = null;
while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
urlsProcessed++;
_sourceInputsBuffer.reset();
_sourceSampleSize = 0;
summaryRecord = null;
linkSummaryRecord = null;
types.clear();
outputKeyString = null;
outputKeyFromInternalLink = false;
outputKeyURLObj = null;
extLinkedDomains.clear();
int statusCount = 0;
int linkCount = 0;
// scan key components
LinkKey.scanForComponents(nextItem.e0._keyObject, ':',scanArray);
// pick up source fp from key ...
URLFPV2 fpSource = new URLFPV2();
fpSource.setRootDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,LinkKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID));
fpSource.setDomainHash(LinkKey.getLongComponentFromComponentArray(scanArray,LinkKey.ComponentId.DOMAIN_HASH_COMPONENT_ID));
fpSource.setUrlHash(LinkKey.getLongComponentFromComponentArray(scanArray,LinkKey.ComponentId.URL_HASH_COMPONENT_ID));
for (RawRecordValue rawValue: nextItem.e1) {
inputBuffer.reset(rawValue.key.getData(),0,rawValue.key.getLength());
int length = WritableUtils.readVInt(inputBuffer);
keyBytes.set(rawValue.key.getData(),inputBuffer.getPosition(),length);
inputBuffer.reset(rawValue.data.getData(),0,rawValue.data.getLength());
length = WritableUtils.readVInt(inputBuffer);
valueBytes.set(rawValue.data.getData(),inputBuffer.getPosition(),length);
*/
}