package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.io.IOException;
import java.net.URI;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.SegmentGeneratorBundleKey;
import org.commoncrawl.mapred.SegmentGeneratorItem;
import org.commoncrawl.mapred.SegmentGeneratorItemBundle;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

public class NewGenBundlesStep extends CrawlPipelineStep {

  private static final Log LOG = LogFactory.getLog(NewGenBundlesStep.class);

  public static final String OUTPUT_DIR_NAME = "bundlesGenerator";

  static final int NUM_BITS = 11;
  static final int NUM_ELEMENTS = 1 << 28;
  static final int FLUSH_THRESHOLD = 1 << 23;

  public static final int SPILL_THRESHOLD = 250;

  enum Counters {
    SPILLED_1_MILLION_SKIPPED_REST,
    DOMAIN_WITH_GT_10MILLION_URLS, DOMAIN_WITH_GT_1MILLION_URLS, DOMAIN_WITH_GT_100K_URLS,
    DOMAIN_WITH_GT_50K_URLS, DOMAIN_WITH_GT_10K_URLS, DOMAIN_WITH_GT_1K_URLS,
    DOMAIN_WITH_GT_100_URLS, DOMAIN_WITH_GT_10_URLS, DOMAIN_WITH_LT_10_URLS, DOMAIN_WITH_1_URL,
    INVALID_SCHEME, INVALID_URL_OBJECT, SKIPPING_ALREADY_EMITTED_URL, NULL_FP_FOR_URL,
    NO_SOURCE_URL_IN_JSON, GENERATING_HOME_PAGE_URL, EMITTING_URL_OBJECT,
    GOT_RAW_RECORD_ITERATOR, GET_NEXT_RECORD_FROM_MERGER, GOT_RAW_RECORD_FROM_ITERATOR
  }

  public NewGenBundlesStep(CrawlPipelineTask task) {
    super(task, "Generate Bundles", OUTPUT_DIR_NAME);
  }

  @Override
  public Log getLogger() {
    return LOG;
  }
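  // formats a partition id as a five-digit, ungrouped number (e.g. 7 -> "00007") so it can be
  // appended to the "part-" prefix used by the partitioned input file names below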
  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();

  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  static final String SINGLE_PARTITION_PROPERTY = "bundlegen.singlePartition";

  public void addCrawlListPaths(Configuration conf, int restrictedPartitionId, ArrayList<Path> pathsOut)
      throws IOException {
    // get partitioned list path
    Path partitionedListPath = getOutputDirForStep(NewPartitionUrlsStep.class);
    FileSystem fs = FileSystem.get(partitionedListPath.toUri(), conf);
    Path filterPath = new Path(partitionedListPath, "[0-9]*");
    for (FileStatus partitionPath : fs.globStatus(filterPath)) {
      if (restrictedPartitionId != -1) {
        pathsOut.add(new Path(partitionPath.getPath(), "part-" + NUMBER_FORMAT.format(restrictedPartitionId)));
      } else {
        for (FileStatus part : fs.globStatus(new Path(partitionPath.getPath(), "part-*"))) {
          pathsOut.add(part.getPath());
        }
      }
    }
  }

  public void addWikipediaPaths(Configuration conf, int restrictedPartitionId, ArrayList<Path> pathsOut)
      throws IOException {
    // get partitioned list path
    Path wikipediaURLSPath = getOutputDirForStep(PartitionWikipediaUrlsStep.class);
    FileSystem fs = FileSystem.get(wikipediaURLSPath.toUri(), conf);
    if (restrictedPartitionId != -1) {
      pathsOut.add(new Path(wikipediaURLSPath, "part-" + NUMBER_FORMAT.format(restrictedPartitionId)));
    } else {
      for (FileStatus part : fs.globStatus(new Path(wikipediaURLSPath, "part-*"))) {
        pathsOut.add(part.getPath());
      }
    }
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    LOG.info("Task Identity Path is:" + getTaskIdentityPath());
    LOG.info("Temp Path is:" + outputPathLocation);

    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();

    Configuration conf = new Configuration(rootTask.getConf());

    // check for restricted partition id ...
    int restrictedPartitionId = rootTask.getConf().getInt(SINGLE_PARTITION_PROPERTY, -1);

    // collect paths ...
    ArrayList<Path> paths = new ArrayList<Path>();
    addCrawlListPaths(conf, restrictedPartitionId, paths);
    addWikipediaPaths(conf, restrictedPartitionId, paths);

    JobConf jobConf = new JobBuilder("Generate Bundles", getConf())
        .inputs(paths)
        .inputFormat(MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(SegmentGeneratorBundleKey.class, SegmentGeneratorItemBundle.class)
        .outputIsSeqFile()
        .reducer(BundleGenerator.class, false)
        .partition(MultiFileMergePartitioner.class)
        .speculativeExecution(false)
        .output(outputPathLocation)
        .compressMapOutput(false)
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .build();

    jobConf.setBoolean(MultiFileMergeInputFormat.PARTS_ARE_FILES_PROPERTY, true);

    if (restrictedPartitionId != -1) {
      jobConf.setNumReduceTasks(1);
    } else {
      jobConf.setNumReduceTasks(CrawlListGenCommon.NUM_LIST_PARTITIONS);
    }

    LOG.info("Starting JOB");
    JobClient.runJob(jobConf);
    LOG.info("Finished JOB");
  }

  public static class BundleGenerator implements
      Reducer<IntWritable, Text, SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> {

    Configuration _conf;

    boolean _skipDomain = false;
    boolean _currentRootDomainIdValid = false;
    long _currentRootDomainId = -1;
    boolean _currentSubDomainIdValid = false;
    boolean _genHomePageURLForSubDomain = false;
    long _currentSubDomainId = -1;
    int _currentRootDomainURLCount = 0;
    int _currentRootDomainSpilledItemCount = 0;
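    // Records arrive at the reducer merge-sorted by CrawlListKey, so all urls for a given root
    // domain are contiguous: they are buffered in items, spilled into SegmentGeneratorItemBundle
    // records of up to SPILL_THRESHOLD urls each, and each bundle is routed to a crawler by
    // hashing the domain fingerprint into the crawler count.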
    // spill state ...
    ArrayList<SegmentGeneratorItem> items = new ArrayList<SegmentGeneratorItem>();
    int currentDomainCrawlIdx = -1;
    SegmentGeneratorItemBundle currentBundle = null;
    double accumulatedRank = 0.0;
    int currentBundleId = 0;
    OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> _collector = null;
    int crawlerCount = CrawlEnvironment.NUM_CRAWLERS;

    static final int NUM_HASH_FUNCTIONS = 10;
    static final int NUM_BITS = 11;
    static final int NUM_ELEMENTS = 1 << 28;
    static final int FLUSH_THRESHOLD = 1 << 23;

    URLFPBloomFilter emittedTuplesFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    long urlsInFilter = 0;

    @Override
    public void configure(JobConf job) {
      _conf = job;
    }

    @Override
    public void close() throws IOException {
    }

    private static FileSystem getFileSystemForMergePath(Path path, Configuration conf) throws IOException {
      // override S3N
      if (path.toUri().getScheme().equalsIgnoreCase("s3n")) {
        FileSystem fs = new S3NFileSystem();
        fs.initialize(path.toUri(), conf);
        return fs;
      }
      // conf.setClass("fs.s3n.impl", S3NFileSystem.class, FileSystem.class);
      return FileSystem.get(path.toUri(), conf);
    }

    @SuppressWarnings("resource")
    Pair<FileSystem, List<Path>> buildInputPathList(Configuration conf, Iterator<Text> values) throws IOException {
      // collect all incoming paths first
      ArrayList<Path> incomingPaths = Lists.newArrayList();
      Set<String> fsType = new HashSet<String>();

      while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
        // convert to uri ...
        URI uri = new Path(path).toUri();
        // get scheme if present ...
        String scheme = uri.getScheme();
        if (scheme == null || scheme.length() == 0) {
          fsType.add("default");
        } else {
          fsType.add(scheme);
        }
      }

      if (fsType.size() != 1) {
        throw new IOException("Only One Input Scheme at a time supported!");
      }

      // determine filesystem
      FileSystem fs = null;
      if (fsType.contains("s3n")) {
        fs = new S3NFileSystem();
        fs.initialize(incomingPaths.get(0).toUri(), conf);
      } else {
        fs = FileSystem.get(incomingPaths.get(0).toUri(), conf);
      }
      return new Pair<FileSystem, List<Path>>(fs, incomingPaths);
    }

    static class RawValueIterator implements Iterator<TextBytes> {

      CrawlListKey key = new CrawlListKey();
      TextBytes valueBytes = new TextBytes();
      DataInputBuffer keyInputBuffer = new DataInputBuffer();
      DataInputBuffer inputBuffer = new DataInputBuffer();
      Path currentSource = null;

      Iterator<RawRecordValue> rawIterator;

      void reset(Iterable<RawRecordValue> rawIterable) {
        this.rawIterator = rawIterable.iterator();
      }

      @Override
      public boolean hasNext() {
        return rawIterator.hasNext();
      }

      CrawlListKey currentKey() {
        return key;
      }

      Path currentSource() {
        return currentSource;
      }
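      /**
       * decode the next raw merge record: the record's key bytes are deserialized into the
       * CrawlListKey, and the value is exposed as a TextBytes wrapper over the VInt
       * length-prefixed payload in the record's data buffer
       **/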
      @Override
      public TextBytes next() {
        try {
          RawRecordValue nextRawValue = rawIterator.next();
          // read in text bytes key ...
          keyInputBuffer.reset(nextRawValue.key.getData(), 0, nextRawValue.key.getLength());
          inputBuffer.reset(nextRawValue.data.getData(), 0, nextRawValue.data.getLength());
          int valueTextLen = WritableUtils.readVInt(inputBuffer);
          valueBytes.set(nextRawValue.data.getData(), inputBuffer.getPosition(), valueTextLen);
          key.readFields(keyInputBuffer);
          currentSource = nextRawValue.source;
          return valueBytes;
        } catch (IOException e) {
          LOG.error(CCStringUtils.stringifyException(e));
          throw new RuntimeException(e);
        }
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException("remove");
      }
    }

    /** helper method **/
    private SegmentGeneratorItemBundle getBundleForDomain(long domainFP) throws IOException {
      currentBundle = new SegmentGeneratorItemBundle();
      currentBundle.setHostFP(domainFP);
      return currentBundle;
    }

    /** generate a bundle from the given list of items and simultaneously flush it **/
    private void generateABundle(long domainFP, List<SegmentGeneratorItem> items, Reporter reporter)
        throws IOException {
      SegmentGeneratorItemBundle bundle = getBundleForDomain(domainFP);
      // LOG.info("Generating Bundle:" + currentBundleId + " for DH:" + domainFP);
      float maxPageRank = 0.0f;
      for (SegmentGeneratorItem item : items) {
        // LOG.info("URL:" + item.getUrl() + " Status:" +
        // CrawlDatum.getStatusName(item.getStatus()) + " PR:" +
        // item.getMetadata().getPageRank());
        bundle.getUrls().add(item);
        _currentRootDomainURLCount++;
        maxPageRank = Math.max(maxPageRank, item.getPageRank());
      }
      // LOG.info("Done Generating Bundle - PR is:" + maxPageRank);
      // set page rank for bundle
      bundle.setMaxPageRank(maxPageRank);
      flushCurrentBundle(reporter);
    }

    /** flush the currently active bundle **/
    private void flushCurrentBundle(Reporter reporter) throws IOException {
      if (currentBundle != null && currentBundle.getUrls().size() != 0) {
        int crawlerIndex = (((Long) currentBundle.getHostFP()).hashCode() & Integer.MAX_VALUE) % crawlerCount;

        // generate a bundle key
        SegmentGeneratorBundleKey bundleKey = new SegmentGeneratorBundleKey();

        bundleKey.setRecordType(0);
        bundleKey.setCrawlerId(crawlerIndex);
        bundleKey.setDomainFP(_currentRootDomainId);
        // and increment bundle id ...
        bundleKey.setBundleId(currentBundleId++);
        bundleKey.setAvgPageRank((float) accumulatedRank / (float) currentBundle.getUrls().size());

        if (reporter != null) {
          reporter.incrCounter("CRAWLER_", Long.toString(crawlerIndex) + "_BUNDLE_COUNT", 1);
        }

        // ok spill bundle ...
        _collector.collect(bundleKey, currentBundle);
      }
      // current bundle is now null
      currentBundle = null;
      accumulatedRank = 0.0;
    }

    /** spill cached items **/
    private void spillItems(Reporter reporter) throws IOException {
      // if item count exceeds spill threshold .. or we ran out of data ...
      if (items.size() != 0) {
        // LOG.info("Spilling Bundle:" + currentBundleId + " for DH:" +
        // currentDomain + " ItemCount:" + subList.size());

        // flush items
        generateABundle(_currentRootDomainId, items, reporter);

        if (reporter != null) {
          reporter.progress();
        }

        // ok, increment counts ...
        _currentRootDomainSpilledItemCount += items.size();
        //if (_currentRootDomainSpilledItemCount >= 1000000) {
        reporter.incrCounter(Counters.SPILLED_1_MILLION_SKIPPED_REST, 1);
        //_skipDomain = true;
        //}
      }
      // reset list ...
      items.clear();
    }
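    /**
     * flush any buffered items for the current root domain, bump a size-bucketed histogram
     * counter for the domain, and reset all per-domain state in preparation for the next domain
     **/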
    private void flushRootDomain(Reporter reporter) throws IOException {
      if (items.size() != 0) {
        spillItems(reporter);
      }

      if (reporter != null) {
        if (_currentRootDomainSpilledItemCount >= 10000000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_10MILLION_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount >= 1000000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_1MILLION_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount >= 100000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_100K_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount >= 50000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_50K_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount >= 10000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_10K_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount >= 1000) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_1K_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount >= 100) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_100_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount >= 10) {
          reporter.incrCounter(Counters.DOMAIN_WITH_GT_10_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount > 1) {
          reporter.incrCounter(Counters.DOMAIN_WITH_LT_10_URLS, 1);
        } else if (_currentRootDomainSpilledItemCount == 1) {
          reporter.incrCounter(Counters.DOMAIN_WITH_1_URL, 1);
        }
      }
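      // map the root domain to a crawler by hashing its fingerprint into the crawler count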
      int crawlerIndex = (((Long) _currentRootDomainId).hashCode() & Integer.MAX_VALUE) % crawlerCount;
      if (reporter != null) {
        reporter.incrCounter("CRAWLER_", Long.toString(crawlerIndex), 1);
      }

      _currentRootDomainIdValid = false;
      _currentRootDomainId = -1;
      _currentSubDomainIdValid = false;
      _genHomePageURLForSubDomain = true;
      _currentSubDomainId = -1;
      currentDomainCrawlIdx = -1;
      _currentRootDomainSpilledItemCount = 0;
      _currentRootDomainURLCount = 0;
    }

    /** potentially reset state based on domain id transition **/
    private void rootDomainTransition(long newDomainFP, Reporter reporter) throws IOException {
      if (_currentRootDomainIdValid) {
        flushRootDomain(reporter);
      }

      _skipDomain = false;

      // zero out item count ...
      items.clear();
      // reset domain id
      _currentRootDomainId = newDomainFP;
      _currentRootDomainIdValid = true;
      currentDomainCrawlIdx = (((int) _currentRootDomainId & Integer.MAX_VALUE) % crawlerCount);
      // reset current domain url count
      _currentRootDomainURLCount = 0;
      // and reset last bundle id
      currentBundleId = 0;
      // reset spill count for domain
      _currentRootDomainSpilledItemCount = 0;
    }

    Set<String> validSchemes = new ImmutableSet.Builder<String>()
        .add("http")
        .add("https")
        .build();
    //static Set<String> = ImmutableSet.Builder<String>

    static String makeHomePageURLFromUrlObject(GoogleURL urlObject) {
      String urlOut = urlObject.getScheme();
      urlOut += ("://");
      if (urlObject.getUserName() != GoogleURL.emptyString) {
        urlOut += (urlObject.getUserName());
        if (urlObject.getPassword() != GoogleURL.emptyString) {
          urlOut += (":");
          urlOut += (urlObject.getPassword());
        }
        urlOut += ("@");
      }
      String host = urlObject.getHost();
      if (host.endsWith(".")) {
        host = host.substring(0, host.length() - 1);
      }
      urlOut += (host);
      urlOut += "/";

      return urlOut;
    }

    void emitURLObject(GoogleURL urlObject, JsonObject originalJSON, float rank, Reporter reporter)
        throws IOException {
      URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
      if (fp != null) {
        if (emittedTuplesFilter.isPresent(fp)) {
          reporter.incrCounter(Counters.SKIPPING_ALREADY_EMITTED_URL, 1);
        } else {
          reporter.incrCounter(Counters.EMITTING_URL_OBJECT, 1);

          emittedTuplesFilter.add(fp);
          urlsInFilter++;

          SegmentGeneratorItem itemValue = new SegmentGeneratorItem();

          itemValue.setDomainFP(fp.getDomainHash());
          itemValue.setRootDomainFP(fp.getRootDomainHash());
          itemValue.setUrlFP(fp.getUrlHash());
          itemValue.setUrl(urlObject.getCanonicalURL());
          itemValue.setPageRank(rank);
          itemValue.setModifiedStatus((byte) 0);

          if (originalJSON != null) {
            if (originalJSON.has(CrawlListGenCommon.CRAWLLIST_METADATA_ETAG)) {
              itemValue.setEtag(originalJSON.get(CrawlListGenCommon.CRAWLLIST_METADATA_ETAG).getAsString());
            }
            if (originalJSON.has(CrawlListGenCommon.CRAWLLIST_METADATA_LAST_MODIFIED_TIME)) {
              itemValue.setLastModifiedTime(
                  originalJSON.get(CrawlListGenCommon.CRAWLLIST_METADATA_LAST_MODIFIED_TIME).getAsLong());
            }
          }

          items.add(itemValue);

          if (items.size() >= SPILL_THRESHOLD) {
            spillItems(reporter);
          }
        }
      } else {
        reporter.incrCounter(Counters.NULL_FP_FOR_URL, 1);
      }
    }
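    /**
     * validate and emit a single url: rejects anything that is not http/https, and the first time
     * a url is seen for the current subdomain also synthesizes and emits that subdomain's home
     * page url (with a fixed, high page rank) before emitting the url itself
     **/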
    void emitURL(String url, float rank, JsonObject originalJSON, Reporter reporter) throws IOException {
      GoogleURL urlObject = new GoogleURL(url);
      if (urlObject.isValid()) {
        String scheme = urlObject.getScheme().toLowerCase();
        if (!validSchemes.contains(scheme)) {
          reporter.incrCounter(Counters.INVALID_SCHEME, 1);
        } else {
          if (_genHomePageURLForSubDomain) {
            reporter.incrCounter(Counters.GENERATING_HOME_PAGE_URL, 1);
            _genHomePageURLForSubDomain = false;
            // generate homepage url ...
            String homePageURL = makeHomePageURLFromUrlObject(urlObject);
            if (homePageURL != null) {
              GoogleURL homePageURLObj = new GoogleURL(homePageURL);
              if (homePageURLObj.isValid()) {
                emitURLObject(homePageURLObj, null, 10000.00f, reporter);
              }
            }
          }
          emitURLObject(urlObject, originalJSON, rank, reporter);
        }
      } else {
        reporter.incrCounter(Counters.INVALID_URL_OBJECT, 1);
      }
    }

    @Override
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<SegmentGeneratorBundleKey, SegmentGeneratorItemBundle> output, Reporter reporter)
        throws IOException {

      // set up merge attributes
      Configuration localMergeConfig = new Configuration(_conf);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
          CrawlListKey.CrawListKeyComparator.class, RawComparator.class);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, CrawlListKey.class,
          WritableComparable.class);

      // ingest input paths ...
      Pair<FileSystem, List<Path>> fileSystemPathTuple = buildInputPathList(localMergeConfig, values);

      RawValueIterator rawValueIterator = new RawValueIterator();
      JsonParser parser = new JsonParser();

      _collector = output;

      // startup merger ...
      LOG.info("FileSystem is:" + fileSystemPathTuple.e0);
      LOG.info("Merger Input Paths are:" + fileSystemPathTuple.e1);
      MultiFileInputReader<CrawlListKey> multiFileInputReader = new MultiFileInputReader<CrawlListKey>(
          fileSystemPathTuple.e0, fileSystemPathTuple.e1, localMergeConfig);

      try {
        Pair<KeyAndValueData<CrawlListKey>, Iterable<RawRecordValue>> nextItem = null;

        // walk tuples and feed them to the actual reducer ...
        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
          reporter.incrCounter(Counters.GET_NEXT_RECORD_FROM_MERGER, 1);

          // check the current domain id to see if we need to do a domain transition
          long newRootDomainId = nextItem.e0._keyObject.partitionDomainKey;
          if (!_currentRootDomainIdValid || newRootDomainId != _currentRootDomainId) {
            // domain transition detected ...
            rootDomainTransition(newRootDomainId, reporter);
          }

          long newSubDomainId = nextItem.e0._keyObject.comparisonDomainKey;
          // now check for subdomain transition ...
          if (!_currentSubDomainIdValid || newSubDomainId != _currentSubDomainId) {
            _currentSubDomainId = newSubDomainId;
            _genHomePageURLForSubDomain = true;
          }

          // reset values iterator ...
          rawValueIterator.reset(nextItem.e1);
          while (rawValueIterator.hasNext()) {
            reporter.incrCounter(Counters.GOT_RAW_RECORD_FROM_ITERATOR, 1);
            // LOG.info("Got Record From Source:" + rawValueIterator.currentSource);
            String json = rawValueIterator.next().toString();
            JsonObject jsonObj = parser.parse(json).getAsJsonObject();
            if (jsonObj.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
              emitURL(
                  jsonObj.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString(),
                  (float) rawValueIterator.currentKey().rank0,
                  jsonObj,
                  reporter);
            } else {
              reporter.incrCounter(Counters.NO_SOURCE_URL_IN_JSON, 1);
            }
          }
          reporter.progress();
        }
        // flush trailing domain
        rootDomainTransition(Long.MAX_VALUE, reporter);
      } finally {
        multiFileInputReader.close();
      }
    }
  }
}