GenDomainRankStep.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.pipelineV3.domainmeta.rank;

import java.io.IOException;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;

import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Hopelessly simplistic hack to get a domain ranking to minimally help
 * prioritize the crawl
 * 
 * @author rana
 * 
 */
public class GenDomainRankStep extends CrawlPipelineStep {

  public static class GenerateRankReducer implements Reducer<TextBytes, TextBytes, TextBytes, DoubleWritable> {

    JsonParser parser = new JsonParser();

    static final int NUM_HASH_FUNCTIONS = 10;
    static final int NUM_BITS = 11;
    static final int NUM_ELEMENTS = 1 << 28;
    static final int FLUSH_THRESHOLD = 1 << 23;
    Set<Long> superDomainIdSet = null;
    URLFPBloomFilter emittedTuplesFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    int bloomEntriesCount = 0;
    URLFPV2 testKey = new URLFPV2();

    static final double SAME_ROOT_WEIGHT = .20;

    static double calcualteScore(int inlinksFromSameRoot, int inlinksFromDifferentRoot) {
      inlinksFromDifferentRoot = Math.max(1, inlinksFromDifferentRoot);
      inlinksFromSameRoot = Math.max(1, inlinksFromSameRoot);
      return (Math.min(Math.sqrt(Math.pow(Math.log(inlinksFromSameRoot) * SAME_ROOT_WEIGHT, 2)
          + Math.pow(Math.log(inlinksFromDifferentRoot), 2)), 14) / 14.0) * 10.0;
    }

    @Override
    public void close() throws IOException {

    }

    @Override
    public void configure(JobConf job) {

      Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));

      try {
        superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }

    @Override
    public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, DoubleWritable> output,
        Reporter reporter) throws IOException {

      int inlinksFromDifferentRoot = 0;
      int inlinksFromSameRoot = 0;

      String currentDomain = key.toString();
      String rootDomain = URLUtils.extractRootDomainName(currentDomain);

      if (rootDomain != null) {
        // get root domain fp
        long rootDomainFP = SuperDomainList.domainFingerprintGivenName(rootDomain);
        // is super domain
        boolean isSuperDomain = superDomainIdSet.contains(rootDomainFP);

        while (values.hasNext()) {
          JsonObject linkObject = parser.parse(values.next().toString()).getAsJsonObject();
          if (linkObject.has("from")) {
            String fromHost = linkObject.get("from").getAsString();
            String fromRoot = URLUtils.extractRootDomainName(fromHost);
            if (fromRoot != null) {
              if (!fromRoot.equals(rootDomain)) {
                // get the domain's fp
                long rootFP = SuperDomainList.domainFingerprintGivenName(fromRoot);
                // is it a super domain ... ?
                if (superDomainIdSet.contains(rootFP)) {
                  // ok every entry counts
                  inlinksFromDifferentRoot++;
                } else {
                  // ok check to see if we emitted this tuple ...
                  testKey.setDomainHash(SuperDomainList.domainFingerprintGivenName(fromRoot + "->" + rootDomain));
                  testKey.setUrlHash(testKey.getDomainHash());

                  if (!emittedTuplesFilter.isPresent(testKey)) {
                    inlinksFromDifferentRoot++;
                    // add to bloom
                    emittedTuplesFilter.add(testKey);
                    // increment count ...
                    bloomEntriesCount++;
                  }
                }
              } else {
                if (isSuperDomain) {
                  inlinksFromSameRoot++;
                }
              }
            }
          }
        }
        output.collect(key, new DoubleWritable(calcualteScore(inlinksFromSameRoot, inlinksFromDifferentRoot)));
      }

      if (bloomEntriesCount >= FLUSH_THRESHOLD) {
        emittedTuplesFilter.clear();
        bloomEntriesCount = 0;
      }
    }
  }

  public static final String OUTPUT_DIR_NAME = "domainRank";

  public static final String SUPER_DOMAIN_FILE_PATH = "superDomainFilePath";

  private static final Log LOG = LogFactory.getLog(GenDomainRankStep.class);

  public GenDomainRankStep(CrawlPipelineTask task) {
    super(task, "Domain Rank Generator", OUTPUT_DIR_NAME);
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {

    Path superDomainPath = new Path(makeUniqueOutputDirPath(_task
        .getOutputDirForStep(GenSuperDomainListStep.OUTPUT_DIR_NAME), getTaskIdentityId()), "part-00000");

    LOG.info("Super Domain File Path is:" + superDomainPath);

    JobConf job = new JobBuilder(getDescription(), getConf())

        .input(
            makeUniqueOutputDirPath(_task.getOutputDirForStep(DedupedDomainLinksStep.OUTPUT_DIR_NAME),
                getTaskIdentityId()))
        .inputIsSeqFile()
        .mapperKeyValue(TextBytes.class, TextBytes.class)
        .reducer(GenerateRankReducer.class, false)
        .numReducers(CrawlEnvironment.NUM_DB_SHARDS)
        .outputKeyValue(TextBytes.class, DoubleWritable.class)
        .output(outputPathLocation).outputIsSeqFile()
        .set(SUPER_DOMAIN_FILE_PATH, superDomainPath.toString())

        .build();

    JobClient.runJob(job);
  }

}