FeedUrlIdStep.java example

Explorer
commoncrawl-crawler-master
- src
  - com
    - dappit
      - Dapper
        parser
        CompressedDomBuilder.java
        DebugDocumentBuilder.java
        DocumentBuilder.java
        DomDocumentBuilder.java
        EnviromentController.java
        HTMLParser.java
        InstructionsPool.java
        LinkExtractionDocumentBuilder.java
        MozillaParser.java
        ParserException.java
        ParserInitializationException.java
        ParserInstruction.java
  - org
    - commoncrawl
/**
 * Copyright 2012 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.feedurlid;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;

import com.google.common.collect.MinMaxPriorityQueue;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 *  @author rana
 */
public class FeedUrlIdStep extends CrawlPipelineStep implements Mapper<TextBytes, TextBytes, TextBytes, TextBytes>,
    Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

  /**
   * Hadoop counters reported by this step.
   *
   * Within this file, GOT_EXCEPTION_PROCESSING_MAP, SKIPPING_BLOG_PLATFORM_URL,
   * REJECTED_NESTED_BLOG_POST_FEED_ITEM and REJECTED_TAG_FEED_ITEM are incremented by map(), while
   * GOT_EXCEPTION_PROCESSING_REDUCE and DETECTED_TOO_MANY_HOST_KEYS are incremented by reduce().
   * REJECTED_HOST_TOO_MANY_KEYS and REJECTED_EXCEEDED_MAX_COLLAPSED_CANDIDATES are not referenced
   * anywhere in this file.
   */
  enum Counters {
    GOT_EXCEPTION_PROCESSING_MAP, GOT_EXCEPTION_PROCESSING_REDUCE, REJECTED_HOST_TOO_MANY_KEYS,
    REJECTED_NESTED_BLOG_POST_FEED_ITEM, DETECTED_TOO_MANY_HOST_KEYS, REJECTED_EXCEEDED_MAX_COLLAPSED_CANDIDATES,
    REJECTED_TAG_FEED_ITEM, SKIPPING_BLOG_PLATFORM_URL

  }

  /**
   * A candidate feed URL together with its path-and-query split into '/' separated parts.
   *
   * Candidates order first by number of parts (fewest first) and then by comparing the parts
   * pairwise as strings, so the natural ordering puts the shallowest URLs first.
   */
  static class URLCandidate implements Comparable<URLCandidate> {

    /** path-and-query of the URL split on '/', after stripping a single leading '/' */
    String[] parts;

    /** the URL object this candidate was built from */
    GoogleURL urlObject;

    /**
     * @param urlObject the URL to wrap; its path-and-query string is split into {@link #parts}
     */
    URLCandidate(GoogleURL urlObject) {
      this.urlObject = urlObject;
      String path = urlObject.getPathAndQuery();
      // startsWith (rather than charAt(0)) so that an empty path yields a single empty part
      // instead of throwing StringIndexOutOfBoundsException
      String withoutLeadingSlash = path.startsWith("/") ? path.substring(1) : path;
      this.parts = withoutLeadingSlash.split("/");
    }

    /**
     * Orders by part count, then by the first differing part (String natural order).
     */
    @Override
    public int compareTo(URLCandidate o) {
      if (parts.length != o.parts.length) {
        return parts.length < o.parts.length ? -1 : 1;
      }
      // same depth: the first differing part decides
      for (int i = 0; i < parts.length; ++i) {
        int partOrder = parts[i].compareTo(o.parts[i]);
        if (partOrder != 0) {
          return partOrder;
        }
      }
      return 0;
    }

    /** @return the canonical form of the wrapped URL */
    @Override
    public String toString() {
      return urlObject.getCanonicalURL();
    }
  }

  /** output directory name passed to the CrawlPipelineStep constructor */
  public static final String OUTPUT_DIR_NAME = "feedURLIdentifier";

  private static final Log LOG = LogFactory.getLog(FeedUrlIdStep.class);

  // shared JSON parser used by both map() and reduce()
  JsonParser parser = new JsonParser();

  // NOTE(review): HAS_ATOM_XML / HAS_FEED_XML are not referenced anywhere in this file
  static final int HAS_ATOM_XML = 1 << 0;

  static final int HAS_FEED_XML = 1 << 1;

  // reused output object: map() overwrites its "url" and "type" properties for every emitted record
  JsonObject objectOut = new JsonObject();

  // coarse pre-filter applied to the raw value text in map() before it is parsed as JSON; the '.'
  // between "atom"/"rss" and "xml" is a regex wildcard, so it matches the '+' of the real MIME
  // types (as well as any other single character)
  Pattern atomRSSPattern = Pattern.compile(".*(application/atom.xml|application/rss.xml).*");
  // URLs containing a /dddd/dd/ path segment pair; map() counts these as nested blog post feeds
  Pattern blogPostFeedPattern = Pattern.compile(".*/[0-9]{4}/[0-9]{2}/.*");

  // URLs containing a /tag/ path segment; map() rejects these
  Pattern tagPattern = Pattern.compile(".*/tag/.*");

  // http:// URLs on the listed blog platforms; map() skips these. NOTE(review): the dots are
  // unescaped wildcards and https:// URLs do not match -- confirm whether that is intended
  Pattern blogPlatformURLPattern = Pattern.compile("http://[^/]*.(blogspot|wordpress|tumblr|typepad).com/.*");
  // NOTE(review): treeMap is not referenced anywhere in this file
  TreeMap<String, JsonObject> treeMap = new TreeMap<String, JsonObject>();
  // maximum size of the per-host candidate queue (see candidateList)
  static final int MAX_SAMPLES = 100;
  // reduce() emits nothing for a host when more than this many additional candidates share the
  // shallowest candidate's part count
  static final int MAX_SAME_LEVEL_SAMPLES = 4;

  // NOTE(review): SIMILARITY_THRESHOLD is not referenced anywhere in this file
  private static final int SIMILARITY_THRESHOLD = 7;

  /**
   * Collapses groups of sibling ".../&lt;name&gt;/feed" candidates into a single ".../feed" candidate
   * and then drops candidates whose path starts with the prefix of an earlier candidate.
   *
   * Pass 1: for every candidate whose last part is "feed" and that has at least two parts, later
   * candidates count as similar when they have the same number of parts, also end in "feed" and
   * (for more than two parts) agree on the part two positions before "feed". When at least
   * similarityThreshold similar candidates exist, the head and exactly those similar candidates
   * are removed and one collapsed candidate (head's parts minus the one before "feed") is appended.
   *
   * Pass 2: for every remaining candidate, later candidates whose path-and-query starts with
   * getPrefixGivenPath(head path) are removed (skipped when that prefix is empty).
   *
   * The list is modified in place.
   *
   * @param candidateList candidates to collapse; mutated by this call
   * @param similarityThreshold minimum number of similar later candidates required to collapse
   * @return the same list instance that was passed in
   */
  static ArrayList<URLCandidate> collapseCandidates(ArrayList<URLCandidate> candidateList, int similarityThreshold) {

    int i = 0;
    while (i < candidateList.size()) {
      URLCandidate head = candidateList.get(i);
      int trailingItemIndex = head.parts.length - 1;

      if (head.parts.length < 2 || !head.parts[trailingItemIndex].equals("feed")) {
        ++i;
        continue;
      }

      // remember WHICH later candidates matched (ascending indices); they are not guaranteed to
      // sit directly behind the head, so counting alone is not enough to remove the right ones
      ArrayList<Integer> similarIndices = new ArrayList<Integer>();
      for (int j = i + 1; j < candidateList.size(); ++j) {
        URLCandidate other = candidateList.get(j);
        if (other.parts.length == head.parts.length
            && other.parts[trailingItemIndex].equals("feed")
            && (head.parts.length == 2
                || head.parts[trailingItemIndex - 2].equals(other.parts[trailingItemIndex - 2]))) {
          similarIndices.add(j);
        }
      }

      if (similarIndices.size() < similarityThreshold) {
        ++i;
        continue;
      }

      boolean endsWithSlash = head.urlObject.getCanonicalURL().endsWith("/");

      // remove back to front so the remaining recorded indices stay valid; intValue() selects
      // remove(int index) rather than remove(Object)
      for (int k = similarIndices.size() - 1; k >= 0; --k) {
        candidateList.remove(similarIndices.get(k).intValue());
      }
      candidateList.remove(i);

      // rebuild the path without the part that preceded "feed"
      StringBuilder finalString = new StringBuilder("/");
      for (int k = 0; k <= trailingItemIndex - 2; ++k) {
        finalString.append(head.parts[k]);
        finalString.append("/");
      }
      finalString.append("feed");
      if (endsWithSlash) {
        finalString.append("/");
      }

      candidateList.add(new URLCandidate(new GoogleURL(head.urlObject.getScheme() + "://"
          + head.urlObject.getHost() + finalString.toString())));

      // i is deliberately NOT advanced: a not-yet-examined candidate has shifted into slot i
    }

    // NOTE(review): the prefix test below is a plain startsWith, so prefix "/blog" also matches
    // "/blogroll/..." -- confirm whether a path-segment boundary check is wanted
    for (int head = 0; head < candidateList.size(); ++head) {
      String prefix = getPrefixGivenPath(candidateList.get(head).urlObject.getPathAndQuery());
      if (prefix.length() != 0) {
        for (int j = head + 1; j < candidateList.size();) {
          if (candidateList.get(j).urlObject.getPathAndQuery().startsWith(prefix)) {
            candidateList.remove(j);
          } else {
            ++j;
          }
        }
      }
    }
    return candidateList;
  }

  /**
   * Empties the given queue into a new list by repeatedly taking its first element.
   *
   * @param queue the queue to drain; it is empty when this method returns
   * @return the removed candidates, in the order removeFirst() produced them
   */
  public static ArrayList<URLCandidate> drainToArrayList(MinMaxPriorityQueue<URLCandidate> queue) {
    ArrayList<URLCandidate> drained = new ArrayList<URLCandidate>(queue.size());
    while (!queue.isEmpty()) {
      drained.add(queue.removeFirst());
    }
    return drained;
  }

  /**
   * Returns the given path with its last segment removed.
   *
   * A single trailing '/' is ignored when locating the last segment, so "/blog/feed/" and
   * "/blog/feed" both yield "/blog". When no separating '/' is found the path is returned
   * unchanged (e.g. "/" yields "/", "feed" yields "feed"); a single top-level segment such as
   * "/feed" yields the empty string.
   *
   * @param path the path to shorten
   * @return everything before the '/' that starts the last segment, or path itself if there is none
   */
  static String getPrefixGivenPath(String path) {
    final int lastCharIndex = path.length() - 1;
    int cutAt = path.lastIndexOf('/');
    if (cutAt == lastCharIndex) {
      // path ends with '/': look for the separator in front of the trailing one
      cutAt = path.lastIndexOf('/', lastCharIndex - 1);
    }
    return cutAt == -1 ? path : path.substring(0, cutAt);
  }

  /**
   * Manual smoke test: runs collapseCandidates over two hard-coded URL sets and prints the lists.
   *
   * @param args unused
   */
  public static void main(String[] args) {
    // first sample: three sibling top-level feeds
    String[] flatFeedUrls = {
        "http://2010.goldenplains.com.au/info/feed/",
        "http://2010.goldenplains.com.au/supernatural-amphitheatre/feed/",
        "http://2010.goldenplains.com.au/tickets-pre-ballot/feed/" };

    ArrayList<URLCandidate> candidates = new ArrayList<URLCandidate>();
    for (String url : flatFeedUrls) {
      candidates.add(new URLCandidate(new GoogleURL(url)));
    }

    collapseCandidates(candidates, 2);

    System.out.println(candidates.toString());

    // second sample: feeds nested under /blog and /vlog, ordered through a priority queue first
    String[] nestedFeedUrls = {
        "http://2010.goldenplains.com.au/blog/feed/",
        "http://2010.goldenplains.com.au/blog/supernatural-amphitheatre/feed/",
        "http://2010.goldenplains.com.au/vlog/tickets-pre-ballot/feed/",
        "http://2010.goldenplains.com.au/blog/tickets-pre-ballot/feed/",
        "http://2010.goldenplains.com.au/vlog/tickets-pre-ballot-2/feed/",
        "http://2010.goldenplains.com.au/vlog/tickets-pre-ballot-3/feed/" };

    MinMaxPriorityQueue<URLCandidate> deque2 = MinMaxPriorityQueue.create();
    for (String url : nestedFeedUrls) {
      deque2.add(new URLCandidate(new GoogleURL(url)));
    }
    ArrayList<URLCandidate> test = drainToArrayList(deque2);
    System.out.println(test);

    collapseCandidates(test, 2);

    System.out.println(test.toString());
  }

  // per-key working state for reduce(); both collections are cleared at the start of every call.
  // candidateList is capped at MAX_SAMPLES entries via MinMaxPriorityQueue.maximumSize
  MinMaxPriorityQueue<URLCandidate> candidateList = MinMaxPriorityQueue.maximumSize(MAX_SAMPLES).create();
  // candidates that have the same number of parts as the first (shallowest) candidate
  ArrayList<URLCandidate> lookAhead = new ArrayList<URLCandidate>();

  /**
   * No-argument constructor; passes null for every superclass constructor argument.
   * NOTE(review): presumably exists so the framework can instantiate this class as the
   * mapper/reducer registered in runStep -- confirm.
   */
  public FeedUrlIdStep() {
    super(null, null, null);
  }

  /**
   * Creates the step for the given task, using "Feed URL Identifier" and OUTPUT_DIR_NAME as the
   * remaining superclass constructor arguments.
   *
   * @param task the pipeline task this step belongs to
   */
  public FeedUrlIdStep(CrawlPipelineTask task) {
    super(task, "Feed URL Identifier", OUTPUT_DIR_NAME);
  }

  /** Intentionally empty: this step does nothing on close. */
  @Override
  public void close() throws IOException {

  }

  /**
   * Intentionally empty: nothing is read from the job configuration.
   *
   * @param job the job configuration (unused)
   */
  @Override
  public void configure(JobConf job) {

  }

  /** @return the class-level logger for this step */
  @Override
  public Log getLogger() {
    return LOG;
  }

  /**
   * Emits (host, {"url", "type"}) for keys that are linked to with an atom/rss MIME type.
   *
   * The key is treated as a URL string and the value as JSON text. Keys matching
   * blogPlatformURLPattern are skipped. Otherwise, if the raw value text passes atomRSSPattern,
   * the entries of link_status.typeAndRels are scanned: each entry is split on ':' and, when it has
   * three fields and the middle one is application/atom+xml or application/rss+xml, the key is
   * emitted unless it matches blogPostFeedPattern or tagPattern or is not a valid URL. At most one
   * record is emitted per input. Any exception is counted and otherwise ignored.
   *
   * @param key the URL of the record
   * @param value JSON text for the URL
   * @param output collector receiving host -> JSON object text
   * @param reporter used for counter updates
   */
  @Override
  public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
      throws IOException {
    try {
      final String sourceUrl = key.toString();

      if (blogPlatformURLPattern.matcher(sourceUrl).matches()) {
        reporter.incrCounter(Counters.SKIPPING_BLOG_PLATFORM_URL, 1);
        return;
      }

      final String json = value.toString();
      // cheap textual check before paying for a JSON parse
      if (!atomRSSPattern.matcher(json).matches()) {
        return;
      }

      JsonObject linkStatus = parser.parse(json).getAsJsonObject().getAsJsonObject("link_status");
      if (linkStatus == null) {
        return;
      }

      JsonArray typeAndRels = linkStatus.getAsJsonArray("typeAndRels");
      if (typeAndRels == null) {
        return;
      }

      for (JsonElement entry : typeAndRels) {
        String[] fields = entry.getAsString().split(":");
        if (fields.length != 3) {
          continue;
        }

        String mimeType = fields[1];
        boolean isFeedType = mimeType.equalsIgnoreCase("application/atom+xml")
            || mimeType.equalsIgnoreCase("application/rss+xml");
        if (!isFeedType) {
          continue;
        }

        if (blogPostFeedPattern.matcher(sourceUrl).matches()) {
          reporter.incrCounter(Counters.REJECTED_NESTED_BLOG_POST_FEED_ITEM, 1);
          continue;
        }
        if (tagPattern.matcher(sourceUrl).matches()) {
          reporter.incrCounter(Counters.REJECTED_TAG_FEED_ITEM, 1);
          continue;
        }

        GoogleURL urlObject = new GoogleURL(sourceUrl);
        if (!urlObject.isValid()) {
          continue;
        }

        objectOut.addProperty("url", urlObject.getCanonicalURL());
        objectOut.addProperty("type", mimeType);
        output.collect(new TextBytes(urlObject.getHost()), new TextBytes(objectOut.toString()));

        // one record per input is enough
        break;
      }
    } catch (Exception e) {
      reporter.incrCounter(Counters.GOT_EXCEPTION_PROCESSING_MAP, 1);
    }
  }

  /**
   * Emits the shallowest feed URL(s) collected for one key.
   *
   * Every value is parsed as a JSON object and its "url" member is added to candidateList as a
   * URLCandidate. A single candidate is emitted directly. Otherwise the first candidate is taken
   * along with all following candidates that have the same number of parts; if more than
   * MAX_SAME_LEVEL_SAMPLES of those followers exist nothing is emitted and
   * DETECTED_TOO_MANY_HOST_KEYS is incremented, else the first candidate and its followers are
   * emitted under the input key. Exceptions are counted and logged with the key.
   *
   * @param key the key the values were grouped under (map() emits the URL host)
   * @param values JSON object texts containing a "url" member
   * @param output collector receiving key -> canonical URL
   * @param reporter used for counter updates
   */
  @Override
  public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {

    // reset the per-key working state
    candidateList.clear();
    lookAhead.clear();

    try {
      while (values.hasNext()) {
        String json = values.next().toString();
        String url = parser.parse(json).getAsJsonObject().get("url").getAsString();
        candidateList.add(new URLCandidate(new GoogleURL(url)));
      }

      if (candidateList.isEmpty()) {
        return;
      }

      if (candidateList.size() == 1) {
        output.collect(key, new TextBytes(candidateList.peek().urlObject.getCanonicalURL()));
        return;
      }

      // take the first candidate and everything that shares its part count
      URLCandidate firstCandidate = candidateList.removeFirst();
      int firstDepth = firstCandidate.parts.length;
      URLCandidate upcoming = candidateList.peek();
      while (upcoming != null && upcoming.parts.length == firstDepth) {
        lookAhead.add(candidateList.removeFirst());
        upcoming = candidateList.peek();
      }

      // too many candidates at the same depth: emit nothing for this key
      if (lookAhead.size() > MAX_SAME_LEVEL_SAMPLES) {
        reporter.incrCounter(Counters.DETECTED_TOO_MANY_HOST_KEYS, 1);
        return;
      }

      output.collect(key, new TextBytes(firstCandidate.urlObject.getCanonicalURL()));
      for (URLCandidate sameDepth : lookAhead) {
        output.collect(key, new TextBytes(sameDepth.urlObject.getCanonicalURL()));
      }
    } catch (Exception e) {
      reporter.incrCounter(Counters.GOT_EXCEPTION_PROCESSING_REDUCE, 1);
      LOG.error("Key:" + key + "\n" + StringUtils.stringifyException(e));
    }
  }

  /**
   * Builds and runs the job for this step: sequence-file input from the task's merge DB data
   * paths, this class as both mapper and reducer, TextBytes keys and values, sequence-file output
   * at the given location and CrawlEnvironment.NUM_DB_SHARDS / 2 reducers.
   *
   * @param outputPathLocation where the job output is written
   * @throws IOException if the job fails
   */
  @Override
  public void runStep(Path outputPathLocation) throws IOException {

    JobBuilder builder = new JobBuilder(getDescription(), getConf());
    DomainMetadataTask metadataTask = (DomainMetadataTask) getTask();

    JobConf job = builder
        .inputs(metadataTask.getMergeDBDataPaths())
        .inputIsSeqFile()
        .mapper(FeedUrlIdStep.class)
        .reducer(FeedUrlIdStep.class, false)
        .outputIsSeqFile()
        .output(outputPathLocation)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2)
        .build();

    JobClient.runJob(job);
  }

}