package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.Tuples.Pair;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

public class PartitionWikipediaUrlsStep extends CrawlPipelineStep {

  public static final String ROOTDOMAIN_METADATA_PATH = "root.meta.path";
  public static final String SUBDOMAIN_METADATA_PATH = "subdomain.meta.path";

  public static final String OUTPUT_DIR_NAME = "wikipedaURLS";

  public PartitionWikipediaUrlsStep(CrawlPipelineTask task) {
    super(task, "Partition Wikipedia", OUTPUT_DIR_NAME);
  }

  private static final Log LOG = LogFactory.getLog(PartitionWikipediaUrlsStep.class);

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    // get entire set of input crawl db paths ...
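    // Summary (added for clarity): as written, this step configures three chained
    // MapReduce jobs over the DBpedia 3.8 dump on S3:
    //   1. "Pre-Partition Wikipedia URLS" - parse each N-Triples line, fingerprint the
    //      extracted URL, and emit a CrawlDBKey -> JSON record carrying the source URL.
    //   2. "Join to Root Domain Metadata" - join those records against the output of
    //      ShardRootDomainClassificationStep to pick up the super-domain flag and re-key
    //      the records as CrawlListKeys (see JoinRootDomainMetadataEmitLinkKeyReducer).
    //   3. "Sort by ListKey" - partition and sort the joined records into
    //      CrawlListGenCommon.NUM_LIST_PARTITIONS shards at outputPathLocation.
    // Note that the JobClient.runJob(...) calls below are commented out, so the jobs are
    // built here but not submitted.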
    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();

    Configuration conf = new Configuration();

    List<Path> inputPaths = Lists.newArrayList();

    Path dbpediaDataPath = new Path("s3n://aws-publicdatasets/common-crawl/wikipedia/dbpedia/3.8");

    FileSystem fs = FileSystem.get(dbpediaDataPath.toUri(), conf);
    for (FileStatus file : fs.globStatus(new Path(dbpediaDataPath, "*.nt"))) {
      inputPaths.add(file.getPath());
    }

    Path tempPath = new Path("s3n://aws-publicdatasets/common-crawl/wikipedia/dbpedia/3.8/partitioned");

    JobConf job = new JobBuilder("Pre-Partition Wikipedia URLS", new Configuration())
        .inputFormat(TextInputFormat.class)
        .inputs(inputPaths)
        .mapper(DBPediaEntryParser.class)
        .keyValue(TextBytes.class, TextBytes.class)
        .sort(CrawlDBKey.LinkKeyComparator.class)
        .numReducers(100)
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .output(tempPath)
        .outputIsSeqFile()
        .build();

    // JobClient.runJob(job);

    Path tempPath2 = new Path("s3n://aws-publicdatasets/common-crawl/wikipedia/dbpedia/3.8/joined");

    // join root domain metadata and wikipedia data
    job = new JobBuilder("Join to Root Domain Metadata", new Configuration())
        .input(tempPath)
        .input(rootTask.getOutputDirForStep(ShardRootDomainClassificationStep.class))
        .inputIsSeqFile()
        .mapperKeyValue(TextBytes.class, TextBytes.class)
        .outputKeyValue(CrawlListKey.class, TextBytes.class)
        .sort(CrawlDBKey.CrawlDBKeyGroupByRootDomainComparator.class)
        .partition(CrawlDBKey.PartitionBySuperDomainPartitioner.class)
        .reducer(JoinRootDomainMetadataEmitLinkKeyReducer.class, false)
        .outputIsSeqFile()
        .output(tempPath2)
        .build();

    // JobClient.runJob(job);

    // partition and sort by list key
    job = new JobBuilder("Sort by ListKey", new Configuration())
        .input(tempPath2)
        .inputIsSeqFile()
        .keyValue(CrawlListKey.class, TextBytes.class)
        .sort(CrawlListKey.CrawListKeyComparator.class)
        .partition(CrawlListKey.CrawlListKeyPartitioner.class)
        .outputIsSeqFile()
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .output(outputPathLocation)
        .jarByClass(PartitionWikipediaUrlsStep.class)
        .numReducers(CrawlListGenCommon.NUM_LIST_PARTITIONS)
        .build();

    // JobClient.runJob(job);

    /*
    // collect input paths from first stage
    List<Path> secondStageInputs = Lists.newArrayList();
    for (FileStatus file : fs.globStatus(new Path(tempPath, "part-*"))) {
      secondStageInputs.add(file.getPath());
    }

    // build the basic job config ...
    job = new JobBuilder("Partition Wikipedia URLS", new Configuration())
        .inputFormat(PartitionJoinInputFormat.class)
        .mapper(WikipediaURLPartitioner.class)
        .keyValue(CrawlListKey.class, TextBytes.class)
        .sort(CrawlListKey.CrawListKeyComparator.class)
        .partition(CrawlListKey.CrawlListKeyPartitioner.class)
        .numReducers(CrawlListGenCommon.NUM_LIST_PARTITIONS)
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .output(outputPathLocation)
        .outputIsSeqFile()
        .build();

    job.setInt("mapred.task.timeout", 4 * (60 * (60 * 1000)));

    // write partition paths ...
    PartitionJoinInputFormat.writeSinglePathPerPartition(secondStageInputs, job);

    // ok, figure out locations of dependent metadata ...
    job.set(ROOTDOMAIN_METADATA_PATH, rootTask.getOutputDirForStep(ShardRootDomainClassificationStep.class).toString());

    // run it ...
    JobClient.runJob(job);
    */
  }

  // extracts the text between the last '<' and the last '>' on an N-Triples line
  static String parseDBPediaLine(String str) {
    int lastIndexOfGT = str.lastIndexOf('>');
    if (lastIndexOfGT >= 0) {
      int lastIndexOfLT = str.lastIndexOf('<', lastIndexOfGT);
      if (lastIndexOfLT < lastIndexOfGT) {
        return str.substring(lastIndexOfLT + 1, lastIndexOfGT);
      }
    }
    return null;
  }

  /**
   *
   * @author rana
   *
   */
  public static class DBPediaEntryParser implements Mapper<LongWritable, Text, TextBytes, TextBytes> {

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    enum Counters {
      FAILED_TO_PARSE_ENTRY, INVALID_URL, NULL_FP
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
        throws IOException {
      String url = parseDBPediaLine(value.toString());
      if (url == null) {
        reporter.incrCounter(Counters.FAILED_TO_PARSE_ENTRY, 1);
      } else {
        GoogleURL urlObject = new GoogleURL(url);
        if (!urlObject.isValid()) {
          reporter.incrCounter(Counters.INVALID_URL, 1);
        } else {
          // generate a fingerprint
          URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
          if (fp == null) {
            reporter.incrCounter(Counters.NULL_FP, 1);
          } else {
            JsonObject outputJSON = new JsonObject();
            // append it to output json
            outputJSON.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, urlObject.getCanonicalURL());
            // emit a CrawlDBKey
            TextBytes outputKey = CrawlDBKey.generateCrawlStatusKey(fp, 0);
            // write out
            output.collect(outputKey, new TextBytes(outputJSON.toString()));
          }
        }
      }
    }
  }

  private static FileSystem getFileSystemForMergePath(Path path, Configuration conf) throws IOException {
    // override S3N
    if (path.toUri().getScheme().equalsIgnoreCase("s3n")) {
      FileSystem fs = new S3NFileSystem();
      fs.initialize(path.toUri(), conf);
      return fs;
    }
    // conf.setClass("fs.s3n.impl", S3NFileSystem.class, FileSystem.class);
    return FileSystem.get(path.toUri(), conf);
  }

  static void addPartFileGivenPath(List<Path> paths, FileSystem fs, Path path) throws IOException {
    FileStatus files[] = fs.globStatus(new Path(path, "part-*"));
    for (FileStatus file : files) {
      paths.add(file.getPath());
    }
  }

  public static class JoinRootDomainMetadataEmitLinkKeyReducer implements
      Reducer<TextBytes, TextBytes, CrawlListKey, TextBytes> {

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    JsonParser parser = new JsonParser();

    enum Counters {
      FOUND_ROOT_DOMAIN_RECORD, DID_NOT_FIND_ROOT_DOMAIN_RECORD, BAD_URL, BAD_FP, EMITTED_HOMEPAGE_URL,
      JOINED_ROOT_DOMAIN_AND_WIKI_URL, BAD_JOIN_MORE_THAN_ONE_ROOT_DOMAIN, PARTITIONING_URL_WITH_SUBDOMAIN
    }

    static final int NUM_HASH_FUNCTIONS = 10;
    static final int NUM_BITS = 11;
    static final int NUM_ELEMENTS = 1 << 28;
    static final int FLUSH_THRESHOLD = 1 << 23;

    URLFPBloomFilter emittedTuplesFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);

    static String makeHomePageURLFromUrlObject(GoogleURL urlObject) {
      String urlOut = urlObject.getScheme();
      urlOut += "://";
      if (urlObject.getUserName() != GoogleURL.emptyString) {
        urlOut += urlObject.getUserName();
        if (urlObject.getPassword() != GoogleURL.emptyString) {
          urlOut += ":";
          urlOut += urlObject.getPassword();
        }
        urlOut += "@";
      }
      String host = urlObject.getHost();
      if (host.endsWith(".")) {
        host = host.substring(0, host.length() - 1);
      }
      urlOut += host;
      urlOut += "/";
      return urlOut;
    }

    @Override
    public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<CrawlListKey, TextBytes> output,
        Reporter reporter) throws IOException {
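      // Note (added for clarity): the values in each group are a mix of two record types
      // sharing the same key: DBPediaEntryParser output carrying a source URL, and root
      // domain classification records carrying the super-domain flag. The loop below
      // separates the two so each URL can be re-keyed with the appropriate partition domain.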
      ArrayList<String> urls = new ArrayList<String>();
      boolean isSuperDomain = false;
      int rootDomainRecordCount = 0;

      while (values.hasNext()) {
        TextBytes nextValue = values.next();
        JsonObject object = parser.parse(nextValue.toString()).getAsJsonObject();
        if (object.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
          urls.add(object.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString());
        } else {
          if (object.has(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN)) {
            rootDomainRecordCount++;
            reporter.incrCounter(Counters.FOUND_ROOT_DOMAIN_RECORD, 1);
            isSuperDomain = object.get(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN).getAsBoolean();
          }
        }
      }

      if (urls.size() != 0 && rootDomainRecordCount != 0) {
        reporter.incrCounter(Counters.JOINED_ROOT_DOMAIN_AND_WIKI_URL, 1);
        if (rootDomainRecordCount > 1) {
          reporter.incrCounter(Counters.BAD_JOIN_MORE_THAN_ONE_ROOT_DOMAIN, 1);
        }
      }

      JsonObject objectOut = new JsonObject();
      CrawlListKey keyOut = new CrawlListKey();
      TextBytes valueOut = new TextBytes();
      URLFPV2 testKey = new URLFPV2();

      for (String url : urls) {
        GoogleURL urlObject = new GoogleURL(url);
        if (!urlObject.isValid()) {
          reporter.incrCounter(Counters.BAD_URL, 1);
        } else {
          URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
          if (fp == null) {
            reporter.incrCounter(Counters.BAD_FP, 1);
          } else {
            // if not a super domain, partition on the root domain hash
            long partitionDomain = fp.getRootDomainHash();
            // if it is a super domain, partition on the subdomain (domain hash) instead ...
            if (isSuperDomain) {
              partitionDomain = fp.getDomainHash();
              reporter.incrCounter(Counters.PARTITIONING_URL_WITH_SUBDOMAIN, 1);
            }
            // populate json ...
            objectOut.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, url);
            // set into text output
            valueOut.set(objectOut.toString());
            // construct output key ...
            CrawlListKey.generateKey(keyOut, partitionDomain, fp.getDomainHash(), CrawlListKey.KEY_TYPE_URL, 100000, 0);
            // output
            output.collect(keyOut, valueOut);
            // ok check to see if we already emitted a homepage entry for this domain
            // (the bloom filter key uses the domain hash for both hash fields) ...
            testKey.setDomainHash(fp.getDomainHash());
            testKey.setUrlHash(fp.getDomainHash());
            if (!emittedTuplesFilter.isPresent(testKey)) {
              // add to bloom
              emittedTuplesFilter.add(testKey);
              // emit home page entry
              String homePageURL = makeHomePageURLFromUrlObject(urlObject);
              // construct output key ...
              CrawlListKey.generateKey(keyOut, partitionDomain, fp.getDomainHash(), CrawlListKey.KEY_TYPE_HOMEPAGE_URL,
                  1, 0);
              // populate json ...
              objectOut.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, homePageURL);
              // set into text output
              valueOut.set(objectOut.toString());

              output.collect(keyOut, valueOut);

              reporter.incrCounter(Counters.EMITTED_HOMEPAGE_URL, 1);
            }
          }
        }
      }
    }
  }

  public static class WikipediaURLPartitioner implements Mapper<IntWritable, Text, CrawlListKey, TextBytes> {

    Path rootDomainMetaPath;
    JobConf _conf;
    OutputCollector<CrawlListKey, TextBytes> _collector;

    @Override
    public void configure(JobConf job) {
      rootDomainMetaPath = new Path(job.get(ROOTDOMAIN_METADATA_PATH));
      _conf = job;
    }

    @Override
    public void close() throws IOException {
      // TODO Auto-generated method stub
    }

    enum Counters {
      SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA, NO_SOURCE_URL, FILTERED_OUT_URL, ROOT_DOMAIN_RECORD,
      CRAWL_STATUS_RECORD, INVALID_URL, EMITTED_HOMEPAGE_URL, SKIPPPED_ALREADY_EMITTED_HOMEPAGE_URL
    }

    static final int NUM_HASH_FUNCTIONS = 10;
    static final int NUM_BITS = 11;
    static final int NUM_ELEMENTS = 1 << 28;
    static final int FLUSH_THRESHOLD = 1 << 23;

    URLFPBloomFilter emittedTuplesFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);

    static String makeHomePageURLFromUrlObject(GoogleURL urlObject) {
      String urlOut = urlObject.getScheme();
      urlOut += "://";
      if (urlObject.getUserName() != GoogleURL.emptyString) {
        urlOut += urlObject.getUserName();
        if (urlObject.getPassword() != GoogleURL.emptyString) {
          urlOut += ":";
          urlOut += urlObject.getPassword();
        }
        urlOut += "@";
      }
      String host = urlObject.getHost();
      if (host.endsWith(".")) {
        host = host.substring(0, host.length() - 1);
      }
      urlOut += host;
      urlOut += "/";
      return urlOut;
    }

    @Override
    public void map(IntWritable key, Text value, OutputCollector<CrawlListKey, TextBytes> output, Reporter reporter)
        throws IOException {
      // set up merge attributes
      Configuration localMergeConfig = new Configuration(_conf);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, CrawlDBKey.CrawlDBKeyComparator.class,
          Comparator.class);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, CrawlDBKey.class, WritableComparable.class);

      // get the single input path ...
      Path inputPath = new Path(value.toString());
      // get fs based on path ...
      FileSystem fs = FileSystem.get(inputPath.toUri(), _conf);

      ArrayList<Path> paths = Lists.newArrayList();
      // add join paths
      addPartFileGivenPath(paths, fs, rootDomainMetaPath);
      paths.add(inputPath);

      LOG.info("Input Paths for Shard:" + key.get() + " Are:" + paths);

      // replace emr s3n for inputs ...
      FileSystem mergefs = getFileSystemForMergePath(paths.get(0), localMergeConfig);

      // ok now spawn merger
      MultiFileInputReader<TextBytes> multiFileInputReader = new MultiFileInputReader<TextBytes>(mergefs, paths,
          localMergeConfig);

      try {
        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;

        TextBytes valueText = new TextBytes();
        DataInputBuffer valueStream = new DataInputBuffer();
        JsonParser parser = new JsonParser();

        _collector = output;

        long _rootDomainId = -1L;
        JsonObject _rootDomainMetadata = null;
        boolean _isSuperDomain = false;

        CrawlListKey keyOut = new CrawlListKey();
        TextBytes valueOut = new TextBytes();
        URLFPV2 testKey = new URLFPV2();
        JsonObject jsonObjOut = new JsonObject();

        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
          // LOG.info("Key:" + nextItem.e0._keyObject.toString());
          long recordType = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
              CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

          if (recordType == CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD.ordinal()) {
            RawRecordValue rawValue = Iterables.getFirst(nextItem.e1, null);

            valueStream.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
            valueText.setFromRawTextBytes(valueStream);

            _rootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            // LOG.info("Got Root Domain Record:" + _rootDomainId);
            _rootDomainMetadata = parser.parse(valueText.toString()).getAsJsonObject();
            _isSuperDomain = JSONUtils.safeGetBoolean(_rootDomainMetadata,
                CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN);

            reporter.incrCounter(Counters.ROOT_DOMAIN_RECORD, 1);
          } else if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
            reporter.incrCounter(Counters.CRAWL_STATUS_RECORD, 1);

            long currentRootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
            long currentDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
                CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);

            // get first record, which will be merge record ...
            RawRecordValue firstRawValue = Iterables.getFirst(nextItem.e1, null);
            // convert to json object ...
            valueStream.reset(firstRawValue.data.getData(), 0, firstRawValue.data.getLength());
            valueText.setFromRawTextBytes(valueStream);
            JsonObject jsonObject = parser.parse(valueText.toString()).getAsJsonObject();

            // extract url ...
            if (jsonObject.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
              String url = jsonObject.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();

              if (currentRootDomainId != _rootDomainId) {
                reporter.incrCounter(Counters.SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA, 1);
                _isSuperDomain = false;
                _rootDomainId = currentRootDomainId;
                LOG.error("No Root Domain Info for URL:" + url);
              }

              // figure out partition domain ...
              // if not a super domain, partition on the root domain id
              long partitionDomain = currentRootDomainId;
              // if it is a super domain, partition on the subdomain (domain hash) instead ...
              if (_isSuperDomain)
                partitionDomain = currentDomainId;

              // construct output key ...
              CrawlListKey.generateKey(keyOut, partitionDomain, currentDomainId, CrawlListKey.KEY_TYPE_URL, 100000, 0);
              // set
              output.collect(keyOut, valueText);

              // generate home page url
              GoogleURL urlObject = new GoogleURL(url);
              if (urlObject.isValid()) {
                // ok check to see if we emitted this tuple ...
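                // (added note: the url hash is deliberately set to the domain hash, so the
                // bloom filter records at most one homepage emission per domain rather
                // than per URL)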
                testKey.setDomainHash(currentDomainId);
                testKey.setUrlHash(currentDomainId);

                if (!emittedTuplesFilter.isPresent(testKey)) {
                  // add to bloom
                  emittedTuplesFilter.add(testKey);
                  // emit home page entry
                  String homePageURL = makeHomePageURLFromUrlObject(urlObject);
                  // construct output key ...
                  CrawlListKey.generateKey(keyOut, partitionDomain, currentDomainId,
                      CrawlListKey.KEY_TYPE_HOMEPAGE_URL, 1, 0);
                  // and proper JSON
                  jsonObjOut.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, homePageURL);
                  valueOut.set(jsonObjOut.toString());

                  output.collect(keyOut, valueOut);

                  reporter.incrCounter(Counters.EMITTED_HOMEPAGE_URL, 1);
                } else {
                  reporter.incrCounter(Counters.SKIPPPED_ALREADY_EMITTED_HOMEPAGE_URL, 1);
                }
              } else {
                reporter.incrCounter(Counters.INVALID_URL, 1);
              }
            } else {
              reporter.incrCounter(Counters.NO_SOURCE_URL, 1);
            }
          }
        }
      } finally {
        multiFileInputReader.close();
      }
    }
  }
}