package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.io.IOException;
import java.net.MalformedURLException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

public class NewPartitionUrlsStep extends CrawlPipelineStep {

  public NewPartitionUrlsStep(CrawlPipelineTask task) {
    super(task, "Partition URLS", OUTPUT_DIR_NAME);
  }

  public static final String ROOTDOMAIN_METADATA_PATH = "root.meta.path";
  public static final String SUBDOMAIN_METADATA_PATH = "subdomain.meta.path";

  public static final int NUM_PARTITIONS = 100;

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  private static final Log LOG = LogFactory.getLog(NewPartitionUrlsStep.class);

  public static final String OUTPUT_DIR_NAME = "paritionUrlsStep";

  @Override
  public Log getLogger() {
    return LOG;
  }

  static final String PARTITION_ID_START_PROPERTY = "listgen.partitionIdStart";
  static final String NUM_PARTITIONS_PROPERTY = "listgen.numPartitions";

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    // get entire set of input crawl db paths ...
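    // Note: Iterables.partition below sizes each group via integer division
    // (crawlDBPaths.size() / numPartitions), so the final group may be smaller,
    // the total group count can exceed numPartitions, and if there are fewer
    // input paths than partitions the computed size is zero and partition()
    // throws an IllegalArgumentException.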
    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();
    Configuration conf = new Configuration(rootTask.getConf());

    int partitionStartId = conf.getInt(PARTITION_ID_START_PROPERTY, 0);
    int numPartitions = conf.getInt(NUM_PARTITIONS_PROPERTY, NUM_PARTITIONS);

    LOG.info("Partition Id Start:" + partitionStartId);
    LOG.info("Num Partitions:" + numPartitions);

    List<Path> crawlDBPaths = rootTask.getRestrictedMergeDBDataPaths();

    LOG.info("Input Paths are:" + crawlDBPaths);

    // partition ...
    Iterable<List<Path>> partitions = Iterables.partition(crawlDBPaths, crawlDBPaths.size() / numPartitions);

    // get a file system object ...
    FileSystem fs = FileSystem.get(outputPathLocation.toUri(), conf);

    // iterate partitions
    int partitionIndex = partitionStartId;
    for (List<Path> partitionPaths : partitions) {
      // construct output path ...
      Path partitionOutputPath = new Path(outputPathLocation, NUMBER_FORMAT.format(partitionIndex));
      // skip partitions whose output already exists, so the step can resume ...
      if (!fs.exists(partitionOutputPath)) {
        runStepForPartition(rootTask, partitionIndex, partitionPaths, partitionOutputPath);
      }
      ++partitionIndex;
    }
  }

  void runStepForPartition(DomainMetadataTask rootTask, int partitionIndex, List<Path> inputPaths, Path outputPath)
      throws IOException {
    // build the basic job config ...
    JobConf job = new JobBuilder("Partition URL List", new Configuration())
      .inputFormat(PartitionJoinInputFormat.class)
      .mapper(RankAndFilterMapper.class)
      .keyValue(CrawlListKey.class, TextBytes.class)
      .sort(CrawlListKey.CrawListKeyComparator.class)
      .partition(CrawlListKey.CrawlListKeyPartitioner.class)
      .numReducers(CrawlListGenCommon.NUM_LIST_PARTITIONS)
      .compressor(CompressionType.BLOCK, SnappyCodec.class)
      .output(outputPath)
      .outputIsSeqFile()
      .build();

    // four hour task timeout
    job.setInt("mapred.task.timeout", 4 * 60 * 60 * 1000);

    // write partition paths ...
    PartitionJoinInputFormat.writeSinglePathPerPartition(inputPaths, job);

    // ok, figure out locations of dependent metadata ...
    job.set(ROOTDOMAIN_METADATA_PATH, rootTask.getOutputDirForStep(ShardRootDomainClassificationStep.class).toString());
    job.set(SUBDOMAIN_METADATA_PATH, rootTask.getOutputDirForStep(ShardSubDomainMetadataStep.class).toString());

    // run it ...
    JobClient.runJob(job);
  }
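  // the per-partition completeness check below is commented out; as written,
  // this step always reports itself complete and relies on runStep's
  // fs.exists() guard to skip partitions that already have output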
  @Override
  public boolean isComplete() throws IOException {
    /*
    // get entire set of input crawl db paths ...
    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();
    Configuration conf = new Configuration(rootTask.getConf());

    List<Path> crawlDBPaths = rootTask.getMergeDBDataPaths();

    int partitionStartId = conf.getInt(PARTITION_ID_START_PROPERTY, 0);
    int numPartitions = conf.getInt(NUM_PARTITIONS_PROPERTY, NUM_PARTITIONS);

    Path outputPath = getOutputDir();

    // get a file system object ...
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);

    // iterate partitions
    for (int partitionIndex = partitionStartId; partitionIndex < partitionStartId + numPartitions; ++partitionIndex) {
      // construct output path ...
      Path partitionOutputPath = new Path(outputPath, NUMBER_FORMAT.format(partitionIndex));
      // if not present ...
      if (!fs.exists(partitionOutputPath)) {
        LOG.info("Partition output path:" + partitionOutputPath + " not found!");
        return false;
      } else {
        LOG.info("Found partition output path:" + partitionOutputPath);
      }
    }
    */
    return true;
  }

  public static class RankAndFilterMapper implements Mapper<IntWritable, Text, CrawlListKey, TextBytes> {

    Path rootDomainMetaPath;
    Path subDomainMetaPath;
    JobConf _conf;

    @Override
    public void configure(JobConf job) {
      rootDomainMetaPath = new Path(job.get(ROOTDOMAIN_METADATA_PATH));
      subDomainMetaPath = new Path(job.get(SUBDOMAIN_METADATA_PATH));
      _conf = job;
    }

    @Override
    public void close() throws IOException {
    }

    OutputCollector<CrawlListKey, TextBytes> _collector;

    enum Counters {
      ROOT_DOMAIN_ID_MISMATCH,
      SKIPPING_BLACKLISTED_URL,
      SKIPPING_LIMITED_CRAWL_URL,
      SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA,
      URL_METADATA_MISSING_URL,
      FOUND_LINK_RECORD,
      FILTERED_OUT_URL,
      SKIPPING_NON_RANKED_URL,
      BLEKKO_URL,
      CC_URL,
      EMITTED_URL_WITH_QUERY,
      COULD_NOT_RESOLVE_WWW_PREFIX,
      USING_WWW_PREFIX,
      SKIPPING_QUERY_URL,
      NO_SOURCE_URL
    }

    static void extractHTTPHeaderData(JsonObject metadataObject, JsonObject jsonOut) throws IOException {
      if (metadataObject.has(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY)) {
        JsonObject summaryRecord = metadataObject.getAsJsonObject(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY);
        if (summaryRecord.has(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
          JsonArray crawlDetails = summaryRecord.getAsJsonArray(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);

          long latestAttemptTime = -1;
          long lastModifiedTime = -1;
          String etag = null;

          for (JsonElement detailElement : crawlDetails) {
            JsonObject detailRecord = detailElement.getAsJsonObject();
            int httpResult = JSONUtils.safeGetInteger(detailRecord, "http_result");
            if (httpResult >= 200 && httpResult < 300) {
              long attemptTime = JSONUtils.safeGetLong(detailRecord, "attempt_time");
              if (attemptTime != -1 && attemptTime > latestAttemptTime) {
                // track the most recent successful attempt, and take its
                // last-modified / etag values
                latestAttemptTime = attemptTime;
                if (detailRecord.has("last-modified")) {
                  lastModifiedTime = detailRecord.get("last-modified").getAsLong();
                } else {
                  lastModifiedTime = -1L;
                }
                if (detailRecord.has("etag")) {
                  etag = detailRecord.get("etag").getAsString();
                } else {
                  etag = null;
                }
              }
            }
          }

          if (lastModifiedTime != -1L) {
            jsonOut.addProperty(CrawlListGenCommon.CRAWLLIST_METADATA_LAST_MODIFIED_TIME, lastModifiedTime);
          }
          if (etag != null) {
            jsonOut.addProperty(CrawlListGenCommon.CRAWLLIST_METADATA_ETAG, etag);
          }
        }
      }
    }

    static String stripWWW(String host) {
      if (host.startsWith("www.")) {
        return host.substring("www.".length());
      }
      return host;
    }
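    /**
     * Canonicalizes a parsed URL for crawl-list output: strips any trailing
     * dot from the host, optionally restores a "www." prefix, drops path
     * parameters after ';', omits the default port 80, strips common
     * session-id patterns from the query, and re-appends the fragment only
     * for "#!" (AJAX-crawlable) URLs.
     */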
    public static String canonicalizeURL(GoogleURL urlObject, boolean prefixWithWWW) throws MalformedURLException {
      StringBuilder urlOut = new StringBuilder();

      urlOut.append(urlObject.getScheme());
      urlOut.append("://");
      if (urlObject.getUserName() != GoogleURL.emptyString) {
        urlOut.append(urlObject.getUserName());
        if (urlObject.getPassword() != GoogleURL.emptyString) {
          urlOut.append(":");
          urlOut.append(urlObject.getPassword());
        }
        urlOut.append("@");
      }

      String host = urlObject.getHost();
      if (host.endsWith(".")) {
        host = host.substring(0, host.length() - 1);
      }
      // and if we should prefix with www. add it back in ...
      if (!host.startsWith("www.") && prefixWithWWW) {
        host = "www." + host;
      }
      urlOut.append(host);

      if (urlObject.getPort() != GoogleURL.emptyString && !urlObject.getPort().equals("80")) {
        urlOut.append(":");
        urlOut.append(urlObject.getPort());
      }
      if (urlObject.getPath() != GoogleURL.emptyString) {
        int indexOfSemiColon = urlObject.getPath().indexOf(';');
        if (indexOfSemiColon != -1) {
          urlOut.append(urlObject.getPath().substring(0, indexOfSemiColon));
        } else {
          urlOut.append(urlObject.getPath());
        }
      }
      if (urlObject.getQuery() != GoogleURL.emptyString) {
        urlOut.append("?");
        urlOut.append(urlObject.getQuery());
      }

      String canonicalizedURL = urlOut.toString();

      // phase 2 - remove common session id patterns
      canonicalizedURL = URLUtils.sessionIdNormalizer.normalize(canonicalizedURL, "");

      // phase 3 - stir back in ref if #!
      if (urlObject.getRef().length() != 0 && urlObject.getRef().charAt(0) == '!') {
        canonicalizedURL += "#" + urlObject.getRef();
      }

      return canonicalizedURL;
    }

    static void addPartFileGivenPath(List<Path> paths, FileSystem fs, Path path) throws IOException {
      FileStatus files[] = fs.globStatus(new Path(path, "part-*"));
      for (FileStatus file : files) {
        paths.add(file.getPath());
      }
    }

    private static FileSystem getFileSystemForMergePath(Path path, Configuration conf) throws IOException {
      // override S3N
      if (path.toUri().getScheme().equalsIgnoreCase("s3n")) {
        FileSystem fs = new S3NFileSystem();
        fs.initialize(path.toUri(), conf);
        return fs;
      }
      // conf.setClass("fs.s3n.impl", S3NFileSystem.class, FileSystem.class);
      return FileSystem.get(path.toUri(), conf);
    }

    @Override
    public void map(IntWritable key, Text value, OutputCollector<CrawlListKey, TextBytes> output, Reporter reporter)
        throws IOException {

      // set up merge attributes
      Configuration localMergeConfig = new Configuration(_conf);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
          CrawlDBKey.CrawlDBKeyComparator.class, Comparator.class);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, CrawlDBKey.class, WritableComparable.class);

      // get the single input path...
      Path inputPath = new Path(value.toString());
      // get fs based on path ...
      FileSystem fs = FileSystem.get(inputPath.toUri(), _conf);

      ArrayList<Path> paths = Lists.newArrayList();
      // add join paths
      addPartFileGivenPath(paths, fs, rootDomainMetaPath);
      addPartFileGivenPath(paths, fs, subDomainMetaPath);
      paths.add(inputPath);

      LOG.info("Input Paths for Shard:" + key.get() + " Are:" + paths);

      // replace emr s3n for inputs ...
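      // the merge reader gets a single filesystem object, resolved from the
      // first join path; on EMR this swaps in S3NFileSystem in place of the
      // stock s3n implementation (all merge inputs are assumed to share the
      // same scheme)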
      FileSystem mergefs = getFileSystemForMergePath(paths.get(0), localMergeConfig);

      // ok now spawn merger
      MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(mergefs, paths,
          localMergeConfig);

      try {
        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

        TextBytes valueText = new TextBytes();
        DataInputBuffer valueStream = new DataInputBuffer();
        JsonParser parser = new JsonParser();

        _collector = output;

        long _rootDomainId = -1L;
        JsonObject _rootDomainMetadata = null;
        long _subDomainId = -1L;
        boolean _isSuperDomain = false;
        boolean _isBlacklisted = false;
        boolean _limitedCrawl = false;
        boolean _prefixWithWWW = false;

        URLFilter filter = new URLFilter();

        CrawlListKey keyOut = new CrawlListKey();
        TextBytes valueOut = new TextBytes();

        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
          // LOG.info("Key:" + nextItem.e0._keyObject.toString());
          long recordType = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
              CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

          if (recordType == CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD.ordinal()) {
            RawRecordValue rawValue = Iterables.getFirst(nextItem.e1, null);
            valueStream.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            valueText.setFromRawTextBytes(valueStream);

            _rootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            // LOG.info("Got Root Domain Record:" + _rootDomainId);
            _rootDomainMetadata = parser.parse(valueText.toString()).getAsJsonObject();
            _isSuperDomain = JSONUtils.safeGetBoolean(_rootDomainMetadata,
                CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN);
            _isBlacklisted = JSONUtils.safeGetBoolean(_rootDomainMetadata,
                CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_BLACKLISTED);
            _limitedCrawl = JSONUtils.safeGetBoolean(_rootDomainMetadata,
                CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_LIMITED_CRAWL);
            _prefixWithWWW = false;
          } else if (recordType == CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD.ordinal()) {
            _subDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);

            RawRecordValue rawValue = Iterables.getFirst(nextItem.e1, null);
            valueStream.reset(rawValue.data.getData(), 0, rawValue.data.getLength());

            long rootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            if (rootDomainId != _rootDomainId) {
              LOG.error("SubDomain:" + _subDomainId + " Root Id:" + rootDomainId
                  + " did not match current root domain id:" + _rootDomainId);
              reporter.incrCounter(Counters.SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA, 1);
              _isSuperDomain = false;
              _isBlacklisted = false;
              _limitedCrawl = false;
              _prefixWithWWW = false;
            }
            _prefixWithWWW = valueStream.readBoolean();
          } else if (recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
            long currentRootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            long currentDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);

            if (currentRootDomainId == _rootDomainId && currentDomainId == _subDomainId) {
              if (_isBlacklisted) {
                reporter.incrCounter(Counters.SKIPPING_BLACKLISTED_URL, 1);
              } else if (_limitedCrawl) {
                reporter.incrCounter(Counters.SKIPPING_LIMITED_CRAWL_URL, 1);
              } else {
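                // this merge-join assumes CrawlDBKey ordering delivers the
                // root-domain and subdomain metadata records ahead of the
                // merged URL records for the same domain, so the flags
                // captured above apply to the URL record being processed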
                // get first record, which will be merge record ...
                RawRecordValue firstRawValue = Iterables.getFirst(nextItem.e1, null);

                // convert to json object ...
                valueStream.reset(firstRawValue.data.getData(), 0, firstRawValue.data.getLength());
                valueText.setFromRawTextBytes(valueStream);

                JsonObject mergedCrawlDBRecord = parser.parse(valueText.toString()).getAsJsonObject();

                int extRefCount = 0;
                int intRefCount = 0;

                // get external ref count
                if (mergedCrawlDBRecord.has(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY)) {
                  extRefCount = JSONUtils.safeGetInteger(
                      mergedCrawlDBRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY),
                      CrawlDBCommon.LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY, 0);
                  intRefCount = JSONUtils.safeGetInteger(
                      mergedCrawlDBRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY),
                      CrawlDBCommon.LINKSTATUS_INTRADOMAIN_SOURCES_COUNT_PROPERTY, 0);
                }

                String sourceURL = JSONUtils.safeGetStringFromElement(mergedCrawlDBRecord,
                    CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY);

                if (sourceURL == null || sourceURL.length() == 0) {
                  reporter.incrCounter(Counters.URL_METADATA_MISSING_URL, 1);
                } else {
                  GoogleURL urlObject = new GoogleURL(sourceURL);
                  if (filter.isURLCrawlable(urlObject, mergedCrawlDBRecord)) {
                    double ccRank = calculateScore(intRefCount, extRefCount);

                    double blekkoRank = 0.0;
                    boolean inBlekkoFrontier = false;
                    boolean crawledByBlekko = false;

                    if (mergedCrawlDBRecord.has(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
                      inBlekkoFrontier = true;
                      JsonObject blekkoStatus = mergedCrawlDBRecord
                          .getAsJsonObject(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY);
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_STATUS)) {
                        String statusStr = blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_STATUS).getAsString();
                        if (statusStr.equalsIgnoreCase("crawled")) {
                          crawledByBlekko = true;
                        }
                      }
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_RANK_10)) {
                        blekkoRank = blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_RANK_10).getAsDouble();
                      }
                    }

                    // if (inBlekkoFrontier || crawledByBlekko || blekkoRank != 0 || ((intRefCount > 2 || extRefCount != 0))) {
                    boolean allowQueryURL = ((inBlekkoFrontier && blekkoRank != 0) || crawledByBlekko);

                    if (urlObject.has_query() && !allowQueryURL) {
                      reporter.incrCounter(Counters.SKIPPING_QUERY_URL, 1);
                      /*
                      LOG.info("SKIPPED QueryURL Flags[" + " intRef:" + intRefCount + " extRef:" + extRefCount
                          + "] URL:" + urlObject.getCanonicalURL());
                      */
                    } else {
                      if (crawledByBlekko || blekkoRank != 0 || inBlekkoFrontier) {
                        reporter.incrCounter(Counters.BLEKKO_URL, 1);
                      } else {
                        reporter.incrCounter(Counters.CC_URL, 1);
                      }

                      double cumulativeRank = 10000 * (crawledByBlekko ? 0 : 1);
                      cumulativeRank += 1000 * blekkoRank;
                      cumulativeRank += 10 * ccRank;

                      // build output json
                      JsonObject outputJSON = new JsonObject();
                      // extract http metadata ...
                      extractHTTPHeaderData(mergedCrawlDBRecord, outputJSON);

                      if (urlObject.has_query()) {
                        reporter.incrCounter(Counters.EMITTED_URL_WITH_QUERY, 1);
                        /*
                        LOG.info("Query URL Flags[" + " BlekkoF:" + inBlekkoFrontier + " CByBlekko:" + crawledByBlekko
                            + " BR:" + blekkoRank + " intRef:" + intRefCount + " extRef:" + extRefCount
                            + "] URL:" + urlObject.getCanonicalURL());
                        */
                      }

                      // canonicalize url ...
                      String canonicalURL = canonicalizeURL(urlObject, _prefixWithWWW);

                      // append it to output json
                      outputJSON.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, canonicalURL);

                      // figure out partition domain: by default, partition on the root domain id ...
                      long partitionDomain = currentRootDomainId;
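                      // super-domains (very large multi-subdomain sites) are
                      // spread across list partitions by subdomain hash instead
                      // of being funneled into a single root-domain partition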
                      if (_isSuperDomain)
                        partitionDomain = currentDomainId;

                      // construct output key ...
                      CrawlListKey.generateKey(keyOut, partitionDomain, currentDomainId, CrawlListKey.KEY_TYPE_URL,
                          cumulativeRank, 0);
                      // set value text ...
                      valueOut.set(outputJSON.toString());
                      // emit
                      output.collect(keyOut, valueOut);
                    }
                    /*
                    else {
                      reporter.incrCounter(Counters.SKIPPING_NON_RANKED_URL, 1);
                      LOG.info("SKIPPED NonRankedURL Flags[" + " intRef:" + intRefCount + " extRef:" + extRefCount
                          + "] URL:" + urlObject.getCanonicalURL());
                    }
                    */
                  } else {
                    reporter.incrCounter(Counters.FILTERED_OUT_URL, 1);
                  }
                }
              }
            } else {
              reporter.incrCounter(Counters.ROOT_DOMAIN_ID_MISMATCH, 1);
              LOG.error("RootDomain Id Mismatch: Expected RH:" + _rootDomainId + " DH:" + _subDomainId + " Got:"
                  + currentRootDomainId + ":" + currentDomainId);
            }
          }
          // keep pump primed...
          reporter.progress();
        }
      } finally {
        multiFileInputReader.close();
      }
    }
  }

  /**
   * Maps inlink counts to a score in [0, 10]: takes the euclidean norm of the
   * logs of the two (incremented) counts, with same-root inlinks damped by a
   * factor of 0.2, caps the norm at 14, and rescales to a 0-10 range.
   */
  static double calculateScore(int inlinksFromSameRoot, int inlinksFromDifferentRoot) {
    inlinksFromDifferentRoot = inlinksFromDifferentRoot + 1;
    inlinksFromSameRoot = inlinksFromSameRoot + 1;
    return (Math.min(
        Math.sqrt(Math.pow(Math.log(inlinksFromSameRoot) * .2, 2) + Math.pow(Math.log(inlinksFromDifferentRoot), 2)),
        14) / 14.0) * 10.0;
  }
}