/** * Copyright 2012 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.feedurlid; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.TreeMap; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; import org.apache.hadoop.util.StringUtils; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask; import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask; import org.commoncrawl.util.GoogleURL; import org.commoncrawl.util.JobBuilder; import org.commoncrawl.util.TextBytes; import com.google.common.collect.MinMaxPriorityQueue; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParser; /** * @author rana */ public class FeedUrlIdStep extends CrawlPipelineStep implements Mapper<TextBytes, TextBytes, TextBytes, TextBytes>, Reducer<TextBytes, TextBytes, TextBytes, TextBytes> { enum Counters { GOT_EXCEPTION_PROCESSING_MAP, GOT_EXCEPTION_PROCESSING_REDUCE, REJECTED_HOST_TOO_MANY_KEYS, REJECTED_NESTED_BLOG_POST_FEED_ITEM, DETECTED_TOO_MANY_HOST_KEYS, REJECTED_EXCEEDED_MAX_COLLAPSED_CANDIDATES, REJECTED_TAG_FEED_ITEM, SKIPPING_BLOG_PLATFORM_URL } static class URLCandidate implements Comparable<URLCandidate> { String[] parts; GoogleURL urlObject; URLCandidate(GoogleURL urlObject) { this.urlObject = urlObject; String path = urlObject.getPathAndQuery(); if (path.charAt(0) == '/') { this.parts = path.substring(1).split("/"); } else { this.parts = path.split("/"); } } @Override public int compareTo(URLCandidate o) { int result = parts.length < o.parts.length ? -1 : (parts.length > o.parts.length) ? 1 : 0; if (result == 0) { for (int i = 0; i < parts.length; ++i) { result = parts[i].compareTo(o.parts[i]); if (result != 0) break; } } return result; } @Override public String toString() { return urlObject.getCanonicalURL(); } } public static final String OUTPUT_DIR_NAME = "feedURLIdentifier"; private static final Log LOG = LogFactory.getLog(FeedUrlIdStep.class); JsonParser parser = new JsonParser(); static final int HAS_ATOM_XML = 1 << 0; static final int HAS_FEED_XML = 1 << 1; JsonObject objectOut = new JsonObject(); Pattern atomRSSPattern = Pattern.compile(".*(application/atom.xml|application/rss.xml).*"); Pattern blogPostFeedPattern = Pattern.compile(".*/[0-9]{4}/[0-9]{2}/.*"); Pattern tagPattern = Pattern.compile(".*/tag/.*"); Pattern blogPlatformURLPattern = Pattern.compile("http://[^/]*.(blogspot|wordpress|tumblr|typepad).com/.*"); TreeMap<String, JsonObject> treeMap = new TreeMap<String, JsonObject>(); static final int MAX_SAMPLES = 100; static final int MAX_SAME_LEVEL_SAMPLES = 4; private static final int SIMILARITY_THRESHOLD = 7; static ArrayList<URLCandidate> collapseCandidates(ArrayList<URLCandidate> candidateList, int similarityThreshold) { for (int i = 0; i < candidateList.size(); ++i) { URLCandidate head = candidateList.get(i); if (head.parts.length >= 2 && head.parts[head.parts.length - 1].equals("feed")) { boolean endsWithSlash = head.urlObject.getCanonicalURL().endsWith("/"); int trailingItemIndex = head.parts.length - 1; int similar = 0; for (int j = i + 1; j < candidateList.size(); ++j) { URLCandidate other = candidateList.get(j); if (other.parts.length == head.parts.length) { if (other.parts[trailingItemIndex].equals("feed")) { if (head.parts.length == 2 || head.parts[trailingItemIndex - 2].equals(other.parts[trailingItemIndex - 2])) { similar++; } } } } if (similar >= similarityThreshold) { for (int k = 0; k < similar; ++k) { candidateList.remove(i + 1); } candidateList.remove(i); StringBuffer finalString = new StringBuffer("/"); for (int k = 0; k <= trailingItemIndex - 2; ++k) { finalString.append(head.parts[k]); finalString.append("/"); } finalString.append("feed"); if (endsWithSlash) finalString.append("/"); candidateList.add(new URLCandidate(new GoogleURL(head.urlObject.getScheme() + "://" + head.urlObject.getHost() + finalString.toString()))); } } } for (int i = 0; i < candidateList.size(); ++i) { URLCandidate head = candidateList.get(i); String prefix = getPrefixGivenPath(head.urlObject.getPathAndQuery()); if (prefix.length() != 0) { for (int j = i + 1; j < candidateList.size();) { if (candidateList.get(j).urlObject.getPathAndQuery().startsWith(prefix)) { candidateList.remove(j); } else { ++j; } } } } return candidateList; } public static ArrayList<URLCandidate> drainToArrayList(MinMaxPriorityQueue<URLCandidate> queue) { int queueSize = queue.size(); ArrayList<URLCandidate> list = new ArrayList<URLCandidate>(queueSize); for (int i = 0; i < queueSize; ++i) { list.add(queue.removeFirst()); } return list; } static String getPrefixGivenPath(String path) { int lastIndexOfSlash; if (path.lastIndexOf('/') == path.length() - 1) { lastIndexOfSlash = path.lastIndexOf('/', path.length() - 2); } else { lastIndexOfSlash = path.lastIndexOf('/'); } if (lastIndexOfSlash != -1) { return path.substring(0, lastIndexOfSlash); } return path; } public static void main(String[] args) { ArrayList<URLCandidate> candidates = new ArrayList<URLCandidate>(); candidates.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/info/feed/"))); candidates.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/supernatural-amphitheatre/feed/"))); candidates.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/tickets-pre-ballot/feed/"))); collapseCandidates(candidates, 2); System.out.println(candidates.toString()); MinMaxPriorityQueue<URLCandidate> deque2 = MinMaxPriorityQueue.create(); deque2.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/blog/feed/"))); deque2.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/blog/supernatural-amphitheatre/feed/"))); deque2.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/vlog/tickets-pre-ballot/feed/"))); deque2.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/blog/tickets-pre-ballot/feed/"))); deque2.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/vlog/tickets-pre-ballot-2/feed/"))); deque2.add(new URLCandidate(new GoogleURL("http://2010.goldenplains.com.au/vlog/tickets-pre-ballot-3/feed/"))); ArrayList<URLCandidate> test = drainToArrayList(deque2); System.out.println(test); collapseCandidates(test, 2); System.out.println(test.toString()); } MinMaxPriorityQueue<URLCandidate> candidateList = MinMaxPriorityQueue.maximumSize(MAX_SAMPLES).create(); ArrayList<URLCandidate> lookAhead = new ArrayList<URLCandidate>(); public FeedUrlIdStep() { super(null, null, null); } public FeedUrlIdStep(CrawlPipelineTask task) { super(task, "Feed URL Identifier", OUTPUT_DIR_NAME); } @Override public void close() throws IOException { } @Override public void configure(JobConf job) { } @Override public Log getLogger() { return LOG; } @Override public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException { try { if (blogPlatformURLPattern.matcher(key.toString()).matches()) { reporter.incrCounter(Counters.SKIPPING_BLOG_PLATFORM_URL, 1); } else { if (atomRSSPattern.matcher(value.toString()).matches()) { JsonObject containerObj = parser.parse(value.toString()).getAsJsonObject(); JsonObject linkStatus = containerObj.getAsJsonObject("link_status"); if (linkStatus != null) { JsonArray typeAndRels = linkStatus.getAsJsonArray("typeAndRels"); if (typeAndRels != null) { for (JsonElement e : typeAndRels) { String parts[] = e.getAsString().split(":"); if (parts.length == 3 && (parts[1].equalsIgnoreCase("application/atom+xml") || parts[1] .equalsIgnoreCase("application/rss+xml"))) { if (blogPostFeedPattern.matcher(key.toString()).matches()) { reporter.incrCounter(Counters.REJECTED_NESTED_BLOG_POST_FEED_ITEM, 1); } else if (tagPattern.matcher(key.toString()).matches()) { reporter.incrCounter(Counters.REJECTED_TAG_FEED_ITEM, 1); } else { GoogleURL urlObject = new GoogleURL(key.toString()); if (urlObject.isValid()) { objectOut.addProperty("url", urlObject.getCanonicalURL()); objectOut.addProperty("type", parts[1]); output.collect(new TextBytes(urlObject.getHost()), new TextBytes(objectOut.toString())); break; } } } } } } } } } catch (Exception e) { reporter.incrCounter(Counters.GOT_EXCEPTION_PROCESSING_MAP, 1); } } @Override public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException { candidateList.clear(); lookAhead.clear(); try { while (values.hasNext()) { JsonObject object = parser.parse(values.next().toString()).getAsJsonObject(); String url = object.get("url").getAsString(); candidateList.add(new URLCandidate(new GoogleURL(url))); } if (candidateList.size() != 0) { if (candidateList.size() == 1) { output.collect(key, new TextBytes(candidateList.peek().urlObject.getCanonicalURL())); } else { // pop first candidate ... URLCandidate firstCandidate = candidateList.removeFirst(); while (candidateList.peek() != null && candidateList.peek().parts.length == firstCandidate.parts.length) { // ok // reject outright if too many candidates of this length ... lookAhead.add(candidateList.removeFirst()); } if (lookAhead.size() > MAX_SAME_LEVEL_SAMPLES) { reporter.incrCounter(Counters.DETECTED_TOO_MANY_HOST_KEYS, 1); return; } else { output.collect(key, new TextBytes(firstCandidate.urlObject.getCanonicalURL())); for (URLCandidate nextCandidate : lookAhead) { output.collect(key, new TextBytes(nextCandidate.urlObject.getCanonicalURL())); } } } } } catch (Exception e) { reporter.incrCounter(Counters.GOT_EXCEPTION_PROCESSING_REDUCE, 1); LOG.error("Key:" + key + "\n" + StringUtils.stringifyException(e)); } } @Override public void runStep(Path outputPathLocation) throws IOException { JobConf job = new JobBuilder(getDescription(), getConf()) .inputs(((DomainMetadataTask) getTask()).getMergeDBDataPaths()).inputIsSeqFile().mapper(FeedUrlIdStep.class) .reducer(FeedUrlIdStep.class, false).outputIsSeqFile().output(outputPathLocation).outputKeyValue( TextBytes.class, TextBytes.class).numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2).build(); JobClient.runJob(job); } }