/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.postfrequency;

import java.io.IOException;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.HitsByMonth;
import org.commoncrawl.mapred.PostFrequencyInfo;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.ImmutableList;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Phase 1 of the blog post-frequency pipeline: scans the merged metadata
 * database, matches each URL against known blog permalink patterns, and emits
 * a HitsByMonth record keyed by the blog's root URL.
 *
 * @author rana
 *
 */
public class ScanDatabaseStep extends CrawlPipelineStep implements
    Mapper<TextBytes, TextBytes, TextBytes, HitsByMonth> {

  enum Counters {
    MATCHED_NESTED_INDEX_HTML_PATTERN, MATCHED_NESTED_INDEX_PATTERN, MATCHED_INDEX_PATTERN,
    MATCHED_INDEX_HTML_PATTERN, MATCHED_TOP_LEVEL_POST_PATTERN, MATCHED_TOP_PATTERN,
    MATCHED_NESTED_POST_PATTERN, MATCHED_TUMBLR_BLOG_POST_PATTERN,
    CAUGHT_EXCEPTION_DURING_METADATA_PARSE, DETECTED_WORDPRESS_DURING_METADATA_PARSE,
    DETECTED_BLOGGER_DURING_METADATA_PARSE, DETECTED_TYPEPAD_DURING_METADATA_PARSE,
    CAUGHT_EXCEPTION_DURING_TUMBLR_POST_PARSE
  }

  public static final String OUTPUT_DIR_NAME = "phase-1";

  private static final Log LOG = LogFactory.getLog(ScanDatabaseStep.class);
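  // URL shapes recognized as blog permalinks: date-based posts at the host
  // root (http://host/YYYY/MM/...), nested blogs one path level down
  // (http://host/blog/YYYY/MM/...), the corresponding month-index pages
  // (trailing "/" or "index.html"), and tumblr-style post URLs
  // (http://host/post/<id>/...).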
  static Pattern topLevelBlogPattern = Pattern.compile("http://([^/?]*)/([0-9]{4})/([0-9]{2})/.*$");

  static Pattern nestedBlogPattern = Pattern.compile("http://([^/?]*)/([^/?]*)/([0-9]{4})/([0-9]{2})/.*$");

  static Pattern indexHTMLBlogPattern = Pattern.compile("http://([^/?]*)/([0-9]{4})/([0-9]{2})/index.html$");

  static Pattern indexBlogPattern = Pattern.compile("http://([^/?]*)/([0-9]{4})/([0-9]{2})/$");

  static Pattern nestedIndexHTMLBlogPattern = Pattern
      .compile("http://([^/?]*)/([^/?]*)/([0-9]{4})/([0-9]{2})/index.html$");

  static Pattern nestedIndexBlogPattern = Pattern.compile("http://([^/?]*)/([^/?]*)/([0-9]{4})/([0-9]{2})/$");

  static Pattern tumblrStyleBlogPattern = Pattern.compile("http://([^/?]*)/post/([0-9]{5,})/[^/]*[/]*$");

  JsonParser parser = new JsonParser();

  /** default constructor (for mapper) **/
  public ScanDatabaseStep() {
    super(null, null, null);
  }

  /** step constructor **/
  public ScanDatabaseStep(CrawlPipelineTask task) throws IOException {
    super(task, task.getDescription() + " - Phase 1", OUTPUT_DIR_NAME);
  }

  @Override
  public void close() throws IOException {
  }

  @Override
  public void configure(JobConf job) {
  }

  @Override
  public Log getLogger() {
    return LOG;
  }
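  /**
   * Classify a single URL. Month-index pages contribute only a flag keyed by
   * the blog's root URL; dated post URLs contribute a single hit for the
   * year/month extracted from the path; tumblr post URLs get an approximate
   * month derived from the numeric post id.
   */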
  @Override
  public void map(TextBytes key, TextBytes jsonMetadata, OutputCollector<TextBytes, HitsByMonth> collector,
      Reporter reporter) throws IOException {

    String url = key.toString();

    Matcher topLevelMatcher = topLevelBlogPattern.matcher(url);
    Matcher nestedBlogMatcher = nestedBlogPattern.matcher(url);
    Matcher indexHTMLBlogMatcher = indexHTMLBlogPattern.matcher(url);
    Matcher indexBlogMatcher = indexBlogPattern.matcher(url);
    Matcher nestedIndexHTMLBlogMatcher = nestedIndexHTMLBlogPattern.matcher(url);
    Matcher nestedIndexBlogMatcher = nestedIndexBlogPattern.matcher(url);
    Matcher tumblrPostMatcher = tumblrStyleBlogPattern.matcher(url);

    if (indexHTMLBlogMatcher.matches() && indexHTMLBlogMatcher.groupCount() >= 1) {
      reporter.incrCounter(Counters.MATCHED_INDEX_HTML_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_INDEX_HTML_AFTER_DATE);
      collector.collect(new TextBytes("http://" + indexHTMLBlogMatcher.group(1) + "/"), hits);
    } else if (indexBlogMatcher.matches() && indexBlogMatcher.groupCount() >= 1) {
      reporter.incrCounter(Counters.MATCHED_INDEX_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_YEAR_MONTH_SLASH_INDEX);
      collector.collect(new TextBytes("http://" + indexBlogMatcher.group(1) + "/"), hits);
    } else if (nestedIndexHTMLBlogMatcher.matches() && nestedIndexHTMLBlogMatcher.groupCount() >= 2) {
      reporter.incrCounter(Counters.MATCHED_NESTED_INDEX_HTML_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_INDEX_HTML_AFTER_DATE);
      collector.collect(new TextBytes("http://" + nestedIndexHTMLBlogMatcher.group(1) + "/"
          + nestedIndexHTMLBlogMatcher.group(2) + "/"), hits);
    } else if (nestedIndexBlogMatcher.matches() && nestedIndexBlogMatcher.groupCount() >= 2) {
      reporter.incrCounter(Counters.MATCHED_NESTED_INDEX_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_YEAR_MONTH_SLASH_INDEX);
      collector.collect(new TextBytes("http://" + nestedIndexBlogMatcher.group(1) + "/"
          + nestedIndexBlogMatcher.group(2) + "/"), hits);
    } else if (tumblrPostMatcher.matches() && tumblrPostMatcher.groupCount() >= 2) {
      reporter.incrCounter(Counters.MATCHED_TUMBLR_BLOG_POST_PATTERN, 1);
      String uniqueURL = "http://" + tumblrPostMatcher.group(1) + "/";
      try {
        // HACK: approximate the post date from the tumblr post id by treating
        // every 1,000,000,000 ids as one 30-day month elapsed since 2010-07-01
        // (the epoch encoded by new Date(110, 6, 1)).
        long postId = Long.parseLong(tumblrPostMatcher.group(2));
        long relativeMonth = postId / 1000000000L;
        Date dateStart = new Date(110, 6, 1);
        Date dateOfPost = new Date(dateStart.getTime() + (relativeMonth * 30 * 24 * 60 * 60 * 1000));

        HitsByMonth hits = new HitsByMonth();
        hits.setHitCount(1);
        hits.setYear(dateOfPost.getYear() + 1900);
        hits.setMonth(dateOfPost.getMonth() + 1);
        collector.collect(new TextBytes(uniqueURL), hits);
      } catch (Exception e) {
        reporter.incrCounter(Counters.CAUGHT_EXCEPTION_DURING_TUMBLR_POST_PARSE, 1);
        LOG.error("Exception parsing url:" + url + " Exception:" + StringUtils.stringifyException(e));
      }
    } else if (topLevelMatcher.matches() && topLevelMatcher.groupCount() >= 3) {
      reporter.incrCounter(Counters.MATCHED_TOP_LEVEL_POST_PATTERN, 1);
      String uniqueURL = "http://" + topLevelMatcher.group(1) + "/";
      int year = Integer.parseInt(topLevelMatcher.group(2));
      int month = Integer.parseInt(topLevelMatcher.group(3));

      HitsByMonth hits = new HitsByMonth();
      hits.setHitCount(1);
      hits.setYear(year);
      hits.setMonth(month);
      hits.setFlags(scanForGenerator(key, jsonMetadata, reporter));
      collector.collect(new TextBytes(uniqueURL), hits);
    } else if (nestedBlogMatcher.matches() && nestedBlogMatcher.groupCount() >= 4) {
      reporter.incrCounter(Counters.MATCHED_NESTED_POST_PATTERN, 1);
      // don't treat paths on tumblr.com hosts as nested blogs
      if (!nestedBlogMatcher.group(1).endsWith("tumblr.com")) {
        String uniqueURL = "http://" + nestedBlogMatcher.group(1) + "/" + nestedBlogMatcher.group(2) + "/";
        int year = Integer.parseInt(nestedBlogMatcher.group(3));
        int month = Integer.parseInt(nestedBlogMatcher.group(4));

        HitsByMonth hits = new HitsByMonth();
        hits.setHitCount(1);
        hits.setYear(year);
        hits.setMonth(month);
        hits.setFlags(scanForGenerator(key, jsonMetadata, reporter));
        collector.collect(new TextBytes(uniqueURL), hits);
      }
    }
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {

    LOG.info("Task Identity Path is:" + getTaskIdentityPath());
    LOG.info("Temp Path is:" + outputPathLocation);

    ImmutableList<Path> paths = new ImmutableList.Builder<Path>().addAll(
        ((DomainMetadataTask) getTask().getTask()).getMergeDBDataPaths()).build();

    JobConf job = new JobBuilder(getDescription() + " - Phase 1", getConf()).inputIsSeqFile().inputs(paths).mapper(
        ScanDatabaseStep.class).keyValue(TextBytes.class, HitsByMonth.class).numReducers(
        CrawlEnvironment.NUM_DB_SHARDS / 2).output(outputPathLocation).outputIsSeqFile().build();

    JobClient.runJob(job);
  }
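  /**
   * Look for a generator meta tag in the page's crawl metadata JSON and return
   * the corresponding PostFrequencyInfo generator flag (WordPress, Blogger, or
   * TypePad), or 0 if none was detected or the metadata could not be parsed.
   */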
  int scanForGenerator(TextBytes key, TextBytes value, Reporter reporter) {
    try {
      JsonObject containerObj = parser.parse(value.toString()).getAsJsonObject();
      GoogleURL urlObject = new GoogleURL(key.toString());
      if (urlObject.isValid()) {
        String sourceRootDomain = URLUtils.extractRootDomainName(urlObject.getHost());
        if (sourceRootDomain != null) {
          URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
          JsonObject objectOut = new JsonObject();
          if (fp != null) {
            objectOut.addProperty("dh", fp.getDomainHash());
          }
          JsonObject crawlStatus = containerObj.getAsJsonObject("crawl_status");
          if (crawlStatus != null) {
            if (crawlStatus.has("http_result")) {
              int httpResult = crawlStatus.get("http_result").getAsInt();
              if (httpResult == 200) {
                JsonArray crawlStatsArray = crawlStatus.getAsJsonArray("crawl_stats");
                if (crawlStatsArray != null && crawlStatsArray.size() != 0) {
                  JsonObject crawlStats = crawlStatsArray.get(0).getAsJsonObject();
                  if (crawlStats != null) {
                    JsonArray metaTags = crawlStats.getAsJsonArray("meta_tags");
                    if (metaTags != null) {
                      for (JsonElement metaObject : metaTags) {
                        String metaValue = metaObject.getAsJsonObject().get("value").getAsString();
                        if (metaValue.contains("Wordpress")) {
                          reporter.incrCounter(Counters.DETECTED_WORDPRESS_DURING_METADATA_PARSE, 1);
                          return PostFrequencyInfo.Flags.FLAG_GENERATOR_IS_WORDPRESS;
                        } else if (metaValue.contains("blogger")) {
                          reporter.incrCounter(Counters.DETECTED_BLOGGER_DURING_METADATA_PARSE, 1);
                          return PostFrequencyInfo.Flags.FLAG_GENERATOR_IS_BLOGGER;
                        } else if (metaValue.contains("http://www.typepad.com/")) {
                          reporter.incrCounter(Counters.DETECTED_TYPEPAD_DURING_METADATA_PARSE, 1);
                          return PostFrequencyInfo.Flags.FLAG_GENERATOR_IS_TYPEPAD;
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    } catch (Exception e) {
      reporter.incrCounter(Counters.CAUGHT_EXCEPTION_DURING_METADATA_PARSE, 1);
      LOG.error("Key:" + key.toString() + " Value:" + value.toString() + "\n" + StringUtils.stringifyException(e));
    }

    return 0;
  }
}