package org.commoncrawl.mapred.pipelineV3.crawllistgen;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon;
import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.JoinDomainMetadataStep;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Pipeline step that shards the joined subdomain metadata by CrawlDBKey and
 * emits, for each host, a boolean indicating whether its URLs should be
 * prefixed with "www." based on observed redirect statistics.
 */
public class ShardSubDomainMetadataStep extends CrawlPipelineStep {

  public static final String OUTPUT_DIR_NAME = "subDomainMetadata";

  private static final Log LOG = LogFactory.getLog(ShardSubDomainMetadataStep.class);

  public ShardSubDomainMetadataStep(CrawlPipelineTask task) {
    super(task, "Shard Subdomain Metadata", OUTPUT_DIR_NAME);
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    // Collect the part files produced by the upstream domain-metadata join.
    Path joinDataPath = getRootTask().getOutputDirForStep(JoinDomainMetadataStep.class);
    ArrayList<Path> partFiles = Lists.newArrayList();
    FileSystem fs = FileSystem.get(joinDataPath.toUri(), new Configuration());
    FileStatus[] files = fs.globStatus(new Path(joinDataPath, "part-*"));
    for (FileStatus file : files) {
      partFiles.add(file.getPath());
    }

    // Map each metadata record to a CrawlDBKey-sorted (key, boolean) pair and
    // shard the output across 16 reducers.
    JobConf job = new JobBuilder("Shard Subdomain Metadata", new Configuration())
        .inputs(partFiles)
        .inputIsSeqFile()
        .mapper(SubDomainMetadataMapper.class)
        .keyValue(TextBytes.class, BooleanWritable.class)
        .numReducers(16)
        .output(outputPathLocation)
        .sort(CrawlDBKey.LinkKeyComparator.class)
        .outputIsSeqFile()
        .build();

    JobClient.runJob(job);
  }

  public static class SubDomainMetadataMapper implements
      Mapper<TextBytes, TextBytes, TextBytes, BooleanWritable> {

    enum Counters {
      BAD_HOSTNAME, USING_WWW_PREFIX, COULD_NOT_RESOLVE_WWW_PREFIX
    }

    JsonParser parser = new JsonParser();

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void map(TextBytes key, TextBytes value,
        OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter)
        throws IOException {
      // The input key is a host name; derive its URL fingerprint.
      URLFPV2 fp = URLUtils.getURLFPV2FromHost(key.toString());
      if (fp == null) {
        reporter.incrCounter(Counters.BAD_HOSTNAME, 1);
      } else {
        // Pin the URL hash to the minimum value so this metadata record sorts
        // ahead of all URL records for the same host.
        fp.setUrlHash(Long.MIN_VALUE);
        TextBytes outputKey = CrawlDBKey.generateKey(fp,
            CrawlDBKey.Type.KEY_TYPE_SUBDOMAIN_METADATA_RECORD, 0L);

        JsonObject subDomainMetadata = parser.parse(value.toString()).getAsJsonObject();
        JsonObject crawlStats = subDomainMetadata
            .getAsJsonObject(CrawlStatsCommon.JOINEDMETDATA_PROPERTY_CRAWLSTATS);

        boolean prefixWithWWW = false;
        if (crawlStats != null) {
          int urlCount = JSONUtils.safeGetInteger(crawlStats,
              CrawlStatsCommon.CRAWLSTATS_URL_COUNT, 0);
          // Redirects from the bare domain to www suggest the site prefers the
          // www form, and vice versa.
          int wwwCount = JSONUtils.safeGetInteger(crawlStats,
              CrawlStatsCommon.CRAWLSTATS_NON_WWW_TO_WWW_REDIRECT, 0);
          int nonWWWCount = JSONUtils.safeGetInteger(crawlStats,
              CrawlStatsCommon.CRAWLSTATS_WWW_TO_NON_WWW_REDIRECT, 0);

          if (wwwCount > nonWWWCount) {
            if ((double) wwwCount / (double) (wwwCount + nonWWWCount) >= .30) {
              reporter.incrCounter(Counters.USING_WWW_PREFIX, 1);
              prefixWithWWW = true;
            } else {
              reporter.incrCounter(Counters.COULD_NOT_RESOLVE_WWW_PREFIX, 1);
              LOG.error("DName:" + key.toString() + " WWWCount:" + wwwCount
                  + " NonWWWCount:" + nonWWWCount + " URLCount:" + urlCount);
            }
          }
        }
        output.collect(outputKey, new BooleanWritable(prefixWithWWW));
      }
    }
  }
}