package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;

import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.GenSuperDomainListStep;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Pipeline step that scans the merged crawl database and emits one JSON
 * stats record per host: URL/attempt/crawl counts, redirect behavior,
 * server IPs, externally-linked URL counts, and blekko overlap.
 */
public class NewCrawlStatsCollectorStep extends CrawlPipelineStep {

  private static final Log LOG = LogFactory.getLog(NewCrawlStatsCollectorStep.class);

  public static final String OUTPUT_DIR_NAME = "crawlDBStatsV2";
  public static final String SUPER_DOMAIN_FILE_PATH = "super-domain-list";

  public NewCrawlStatsCollectorStep(CrawlPipelineTask task) {
    super(task, "Crawl DB Stats Collector", OUTPUT_DIR_NAME);
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    LOG.info("Task Identity Path is:" + getTaskIdentityPath());

    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();

    ImmutableList<Path> paths = new ImmutableList.Builder<Path>()
        .addAll(rootTask.getRestrictedMergeDBDataPaths())
        .build();

    Path superDomainListPath = new Path(getOutputDirForStep(GenSuperDomainListStep.class), "part-00000");
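    /*
     * Wiring note (a sketch of the data flow, inferred from the reducer
     * below): MultiFileMergeInputFormat does not stream crawl records to the
     * reducers. Each reduce key is a partition number (IntWritable) and each
     * value is the string form of a merge-DB file path assigned to that
     * partition, e.g. (hypothetical path):
     *
     *   key:   57
     *   value: "hdfs://namenode/crawl/mergedDB/part-00057"
     *
     * The reducer re-opens those files itself and merge-sorts them via
     * MultiFileInputReader, which is why the job is configured with
     * MultiFileMergePartitioner and a large reducer count but no mapper logic.
     */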
    JobConf jobConf = new JobBuilder("New Domain Stats Collector", getConf())
        .inputs(paths)
        .inputFormat(MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(SequenceFileOutputFormat.class)
        .reducer(CrawlDBStatsCollectingReducer.class, false)
        .partition(MultiFileMergePartitioner.class)
        .numReducers(1000)
        .maxReduceAttempts(4)
        .maxReduceTaskFailures(10)
        .speculativeExecution(true)
        .output(outputPathLocation)
        .compressMapOutput(false)
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .set(SUPER_DOMAIN_FILE_PATH, superDomainListPath.toString())
        .build();

    LOG.info("Starting JOB");
    JobClient.runJob(jobConf);
    LOG.info("Finished JOB");
  }

  public static class CrawlDBStatsCollectingReducer implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

    // id set of known super-domains, loaded from the GenSuperDomainListStep output
    Set<Long> superDomainIdSet;
    JobConf _jobConf;

    @Override
    public void configure(JobConf job) {
      _jobConf = job;
      Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));
      try {
        superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }

    @Override
    public void close() throws IOException {
    }

    // per-host aggregation state: stats accumulate in activeRecord until the
    // incoming host name changes, at which point the record is flushed
    String activeDomain = "";
    JsonObject activeRecord = new JsonObject();
    OutputCollector<TextBytes, TextBytes> _collector;
    TextBytes activeDomainKeyBytes = new TextBytes();
    TextBytes activeDomainValueBytes = new TextBytes();
    HashSet<String> activeDomainIPs = new HashSet<String>();

    private void setActiveDomain(String hostName) throws IOException {
      if (!activeDomain.equalsIgnoreCase(hostName)) {
        // host changed - emit the accumulated record for the previous host
        if (activeDomain.length() != 0) {
          if (activeDomainIPs.size() != 0) {
            JSONUtils.stringCollectionToJsonArray(activeRecord, CrawlStatsCommon.CRAWLSTATS_IPS, activeDomainIPs);
          }
          activeDomainKeyBytes.set(activeDomain);
          activeDomainValueBytes.set(activeRecord.toString());
          _collector.collect(activeDomainKeyBytes, activeDomainValueBytes);
        }
        activeDomain = hostName;
        activeRecord = new JsonObject();
        activeDomainIPs.clear();
      }
    }

    static final String stripWWW(String host) {
      if (host.startsWith("www.")) {
        return host.substring("www.".length());
      }
      return host;
    }

    enum Counters {
      HIT_EXCEPTION_PROCESSING_RECORD, NO_SOURCE_URL_PROPERTY, INVALID_URL_DETECTED, HIT_SUMMARY_RECORD,
      HAD_SUMMARY_DETAILS_ARRAY, HAD_ATTEMPT_COUNT, HAD_CRAWL_COUNT, HAD_REDIRECT_URL_PROPERTY,
      REDIRECT_WENT_OUT_OF_DOMAIN, WWW_TO_NON_WWW_DETECTED, NON_WWW_TO_WWW_DETECTED, GOT_LINK_STATUS_RECORD,
      GOT_EXTRADOMAIN_SOURCES_COUNT, GOT_BLEKKO_RECORD, GOT_BLEKKO_GT1_RECORD, GOT_MERGED_RECORD
    }

    @Override
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {

      // collect all incoming paths first
      Vector<Path> incomingPaths = new Vector<Path>();

      while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
      }

      // set up merge attributes
      Configuration localMergeConfig = new Configuration(_jobConf);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
          CrawlDBKey.LinkKeyComparator.class, RawComparator.class);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS,
          TextBytes.class, WritableComparable.class);

      FileSystem fs = FileSystem.get(incomingPaths.get(0).toUri(), _jobConf);

      // ok now spawn merger
      MultiFileInputReader<TextBytes> multiFileInputReader =
          new MultiFileInputReader<TextBytes>(fs, incomingPaths, localMergeConfig);

      try {
        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
        TextBytes valueText = new TextBytes();
        DataInputBuffer valueStream = new DataInputBuffer();
        JsonParser parser = new JsonParser();

        _collector = output;
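        /*
         * Merge-iteration note: the MultiFileInputReader is configured with
         * CrawlDBKey.LinkKeyComparator, so records come back in CrawlDB key
         * order and all records belonging to one host arrive contiguously
         * (assuming that comparator groups by domain, which the flush logic
         * in setActiveDomain relies on). Sketch of the stream:
         *
         *   example.com/a -> accumulate into activeRecord
         *   example.com/b -> accumulate into activeRecord
         *   example.org/x -> setActiveDomain flushes "example.com", resets state
         */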
        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
          long recordType = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
              CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

          if (recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
            reporter.incrCounter(Counters.GOT_MERGED_RECORD, 1);
            // walk records
            RawRecordValue rawValue = Iterators.getNext(nextItem.e1.iterator(), null);
            if (rawValue != null) {
              valueStream.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
              valueText.setFromRawTextBytes(valueStream);
              try {
                JsonObject mergeRecord = parser.parse(valueText.toString()).getAsJsonObject();
                if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
                  String sourceURL = mergeRecord.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();
                  GoogleURL urlObject = new GoogleURL(sourceURL);
                  if (urlObject.isValid()) {
                    String hostName = stripWWW(urlObject.getHost());
                    setActiveDomain(hostName);
                    JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_URL_COUNT);

                    boolean crawled = false;
                    boolean inCrawlDB = false;

                    if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY)) {
                      reporter.incrCounter(Counters.HIT_SUMMARY_RECORD, 1);
                      inCrawlDB = true;
                      JsonObject crawlStatus = mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY);

                      // collect the server IPs seen across individual crawl attempts
                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_SUMMARY_DETAILS_ARRAY, 1);
                        JsonArray crawlDetails = crawlStatus.getAsJsonArray(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
                        for (JsonElement arrayElement : crawlDetails) {
                          JsonObject crawlDetail = arrayElement.getAsJsonObject();
                          if (crawlDetail.has(CrawlDBCommon.CRAWLDETAIL_SERVERIP_PROPERTY)) {
                            activeDomainIPs.add(crawlDetail.get(CrawlDBCommon.CRAWLDETAIL_SERVERIP_PROPERTY).getAsString());
                          }
                        }
                      }

                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_ATTEMPT_COUNT, 1);
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_ATTEMPTED_COUNT);
                      }

                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_CRAWLCOUNT_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_CRAWL_COUNT, 1);
                        if (crawlStatus.get(CrawlDBCommon.SUMMARYRECORD_CRAWLCOUNT_PROPERTY).getAsInt() > 0) {
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_CRAWLED_COUNT);
                          crawled = true;
                        }
                      }
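                      /*
                       * Redirect classification sketch: a redirect whose target
                       * root domain differs from the source root domain counts
                       * as redirected-out; within the same root domain, only
                       * www <-> non-www flips are tallied. For example (hosts
                       * are illustrative):
                       *
                       *   www.example.com -> example.com      WWW_TO_NON_WWW_DETECTED
                       *   example.com     -> www.example.com  NON_WWW_TO_WWW_DETECTED
                       *   example.com     -> other.net        REDIRECT_WENT_OUT_OF_DOMAIN
                       */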
                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_REDIRECT_URL_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_REDIRECT_URL_PROPERTY, 1);
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_REDIRECTED_COUNT);
                        String redirectURL = crawlStatus.get(CrawlDBCommon.SUMMARYRECORD_REDIRECT_URL_PROPERTY).getAsString();
                        GoogleURL redirectURLObj = new GoogleURL(redirectURL);
                        if (redirectURLObj.isValid()) {
                          String originalRootHost = URLUtils.extractRootDomainName(urlObject.getHost());
                          String redirectRootHost = URLUtils.extractRootDomainName(redirectURLObj.getHost());
                          if (originalRootHost != null && redirectRootHost != null) {
                            if (!originalRootHost.equalsIgnoreCase(redirectRootHost)) {
                              reporter.incrCounter(Counters.REDIRECT_WENT_OUT_OF_DOMAIN, 1);
                              JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_REDIRECTED_OUT_COUNT);
                            } else {
                              if (!redirectURLObj.getHost().startsWith("www.") && urlObject.getHost().startsWith("www.")) {
                                reporter.incrCounter(Counters.WWW_TO_NON_WWW_DETECTED, 1);
                                JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_WWW_TO_NON_WWW_REDIRECT);
                              } else if (redirectURLObj.getHost().startsWith("www.") && !urlObject.getHost().startsWith("www.")) {
                                reporter.incrCounter(Counters.NON_WWW_TO_WWW_DETECTED, 1);
                                JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_NON_WWW_TO_WWW_REDIRECT);
                              }
                            }
                          }
                        }
                      }
                    } // end summary-record handling

                    if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY)) {
                      reporter.incrCounter(Counters.GOT_LINK_STATUS_RECORD, 1);
                      inCrawlDB = true;
                      JsonObject linkStatus = mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY);
                      if (linkStatus.has(CrawlDBCommon.LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY)) {
                        reporter.incrCounter(Counters.GOT_EXTRADOMAIN_SOURCES_COUNT, 1);
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_EXTERNALLY_LINKED_URLS);
                        if (!crawled) {
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_EXTERNALLY_LINKED_NOT_CRAWLED_URLS);
                        }
                      }
                    }

                    if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
                      reporter.incrCounter(Counters.GOT_BLEKKO_RECORD, 1);
                      JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL);
                      if (!inCrawlDB) {
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL_NOT_IN_CC);
                      }
                      JsonObject blekkoStatus = mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY);
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_STATUS)) {
                        String statusStr = blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_STATUS).getAsString();
                        if (statusStr.equalsIgnoreCase("crawled")) {
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_CRAWLED_COUNT);
                          if (crawled) {
                            JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_AND_CC_CRAWLED_COUNT);
                          }
                        }
                      }
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_RANK_10)) {
                        if (blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_RANK_10).getAsDouble() >= 1.0) {
                          reporter.incrCounter(Counters.GOT_BLEKKO_GT1_RECORD, 1);
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL_HAD_GT_1_RANK);
                        }
                      }
                    }
                  } else {
                    reporter.incrCounter(Counters.INVALID_URL_DETECTED, 1);
                  }
                } else {
                  reporter.incrCounter(Counters.NO_SOURCE_URL_PROPERTY, 1);
                }
              } catch (Exception e) {
                reporter.incrCounter(Counters.HIT_EXCEPTION_PROCESSING_RECORD, 1);
                LOG.error(CCStringUtils.stringifyException(e));
              }
            }
          }
        }
        // flush the accumulated record for the last host seen
        setActiveDomain("");
      } finally {
        multiFileInputReader.close();
      }
    }
  }
}
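/*
 * Illustrative output (hypothetical values; the real JSON property names come
 * from the CrawlStatsCommon constants referenced above): for each host the
 * reducer emits one SequenceFile pair such as
 *
 *   key:   "example.com"
 *   value: { "<urlCount>": 1523, "<attemptedCount>": 1301,
 *            "<crawledCount>": 1288, "<redirectedCount>": 42,
 *            "<ips>": ["203.0.113.7"] }
 */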