/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.GenSuperDomainListStep;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;

import com.google.gson.JsonArray;
import com.google.gson.JsonPrimitive;

/**
 * Pipeline step that collects the subdomains of every non-super root domain.
 * The mapper walks URLs, skips super domains, www-style host variants, and
 * duplicates (via a Bloom filter), and emits (root domain, subdomain) pairs.
 * The reducer emits, per root domain, a JSON array of its unique subdomains,
 * dropping any domain that hits the MAX_SUBDOMAINS_ALLOWED cap.
 *
 * @author rana
 */
public class NonSuperSubdomainCollectorStep extends CrawlPipelineStep implements
    Mapper<TextBytes, TextBytes, TextBytes, TextBytes>, Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

  enum Counters {
    HIT_MAXSUBDOMAIN_LIMIT,
    SKIPPED_SUBDOMAIN_SAME_AS_ROOT_VIA_ID,
    SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PREFIX,
    SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PATTERN_MATCH
  }

  private static final Log LOG = LogFactory.getLog(NonSuperSubdomainCollectorStep.class);

  // Bloom filter sizing: 2^29 elements, 10 hash functions, 11 bits per element
  static final int NUM_HASH_FUNCTIONS = 10;
  static final int NUM_BITS = 11;
  static final int NUM_ELEMENTS = 1 << 29;
  static final int FLUSH_THRESHOLD = 1 << 23;

  public static final String SUPER_DOMAIN_FILE_PATH = "super-domain-list";

  URLFPBloomFilter subDomainFilter;

  public static final String OUTPUT_DIR_NAME = "nonsuper-subdomains";

  URLFPV2 bloomKey = new URLFPV2();
  TextBytes emptyTextBytes = new TextBytes();

  // matches www-style host prefixes such as "www.", "www2.", "www-3."
  Pattern wwwMatchPattern = Pattern.compile("www[\\-0-9]*\\.");

  Set<Long> superDomainIdSet;

  HashSet<String> domains = new HashSet<String>();

  static final int MAX_SUBDOMAINS_ALLOWED = 100;

  public NonSuperSubdomainCollectorStep() {
    super(null, null, null);
  }

  public NonSuperSubdomainCollectorStep(CrawlPipelineTask task) {
    super(task, "SubDomain Collector", OUTPUT_DIR_NAME);
  }

  @Override
  public void close() throws IOException {
  }

  @Override
  public void configure(JobConf job) {
    // map-side only: load the super-domain id list and allocate the Bloom filter
    if (job.getBoolean("mapred.task.is.map", false)) {
      Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));
      try {
        superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
      subDomainFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    }
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
      throws IOException {
    String url = key.toString();
    GoogleURL urlObject = new GoogleURL(url);
    if (urlObject.isValid()) {
      String rootDomain = URLUtils.extractRootDomainName(urlObject.getHost());
      if (rootDomain != null) {
        long rootDomainId = SuperDomainList.domainFingerprintGivenName(rootDomain);
        // only collect subdomains for non-super domains
        if (!superDomainIdSet.contains(rootDomainId)) {
          long subDomainId = SuperDomainList.domainFingerprintGivenName(urlObject.getHost());
          if (subDomainId == rootDomainId) {
            reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_VIA_ID, 1);
            return;
          }
          // extract the host prefix preceding the root domain ...
          String prefix = urlObject.getHost().substring(0, urlObject.getHost().length() - rootDomain.length());
          // straight match ...
          if (prefix.equals("www.")) {
            reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PREFIX, 1);
            return; // skip
          }
          // pattern match ("www2.", "www-3.", etc.) ...
          else if (prefix.startsWith("www") && wwwMatchPattern.matcher(prefix).matches()) {
            reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PATTERN_MATCH, 1);
            return;
          }

          bloomKey.setDomainHash(subDomainId);
          bloomKey.setUrlHash(subDomainId);

          // dedupe via the Bloom filter: hacky, but oh well; pressed for time
          if (subDomainFilter.isPresent(bloomKey)) {
            return;
          }
          // add it to the Bloom filter NOW
          subDomainFilter.add(bloomKey);

          // emit as (root domain, sub domain)
          output.collect(new TextBytes(rootDomain), new TextBytes(urlObject.getHost()));
        }
      }
    }
  }

  @Override
  public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {
    // accumulate unique subdomains for this root domain, up to the cap
    while (values.hasNext()) {
      domains.add(values.next().toString());
      if (domains.size() >= MAX_SUBDOMAINS_ALLOWED) {
        reporter.incrCounter(Counters.HIT_MAXSUBDOMAIN_LIMIT, 1);
        break;
      }
    }
    // emit a JSON array of subdomains only for domains under the cap
    if (domains.size() != 0 && domains.size() < MAX_SUBDOMAINS_ALLOWED) {
      JsonArray array = new JsonArray();
      for (String domain : domains) {
        array.add(new JsonPrimitive(domain));
      }
      output.collect(key, new TextBytes(array.toString()));
    }
    domains.clear();
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    DomainMetadataTask rootTask = findTaskOfType(DomainMetadataTask.class);

    Path superDomainListPath = new Path(getOutputDirForStep(GenSuperDomainListStep.class), "part-00000");

    JobConf job = new JobBuilder(getDescription(), getConf())
        .inputs(rootTask.getRestrictedMergeDBDataPaths())
        .inputIsSeqFile()
        .mapper(NonSuperSubdomainCollectorStep.class)
        .reducer(NonSuperSubdomainCollectorStep.class, false)
        .numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2)
        .keyValue(TextBytes.class, TextBytes.class)
        .output(outputPathLocation)
        .outputIsSeqFile()
        .set(SUPER_DOMAIN_FILE_PATH, superDomainListPath.toString())
        .reuseJVM(1000)
        .build();

    JobClient.runJob(job);
  }
}
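// Usage sketch (illustrative): a parent DomainMetadataTask would register this
// step in its pipeline after the super-domain list generator it depends on.
// The addStep(...) call below is an assumption about the CrawlPipelineTask
// registration API, following the pattern used by other steps in this package.
//
//   public DomainMetadataTask() throws IOException {
//     // ... task setup ...
//     addStep(new GenSuperDomainListStep(this));      // produces super-domain-list
//     addStep(new NonSuperSubdomainCollectorStep(this)); // consumes it via runStep()
//   }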