/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.postfrequency;

import java.io.IOException;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.HitsByMonth;
import org.commoncrawl.mapred.PostFrequencyInfo;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.ImmutableList;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Phase 1 of the blog post-frequency pipeline: scans the merged metadata
 * database, matches each URL against known blog permalink patterns, and emits
 * a HitsByMonth record keyed by the blog's root URL.
 *
 * @author rana
 *
 */
public class ScanDatabaseStep extends CrawlPipelineStep implements
    Mapper<TextBytes, TextBytes, TextBytes, HitsByMonth> {

  enum Counters {
    MATCHED_NESTED_INDEX_HTML_PATTERN, MATCHED_NESTED_INDEX_PATTERN, MATCHED_INDEX_PATTERN,
    MATCHED_INDEX_HTML_PATTERN, MATCHED_TOP_LEVEL_POST_PATTERN, MATCHED_TOP_PATTERN,
    MATCHED_NESTED_POST_PATTERN, MATCHED_TUMBLR_BLOG_POST_PATTERN,
    CAUGHT_EXCEPTION_DURING_METADATA_PARSE, DETECTED_WORDPRESS_DURING_METADATA_PARSE,
    DETECTED_BLOGGER_DURING_METADATA_PARSE, DETECTED_TYPEPAD_DURING_METADATA_PARSE,
    CAUGHT_EXCEPTION_DURING_TUMBLR_POST_PARSE
  }

  public static final String OUTPUT_DIR_NAME = "phase-1";

  private static final Log LOG = LogFactory.getLog(ScanDatabaseStep.class);
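  // URL shapes recognized as blog permalinks: date-based posts at the host
  // root (http://host/YYYY/MM/...), nested blogs one path level down
  // (http://host/blog/YYYY/MM/...), the corresponding month-index pages
  // (trailing "/" or "index.html"), and tumblr-style post URLs
  // (http://host/post/<id>/...).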
  static Pattern topLevelBlogPattern = Pattern.compile("http://([^/?]*)/([0-9]{4})/([0-9]{2})/.*$");

  static Pattern nestedBlogPattern = Pattern.compile("http://([^/?]*)/([^/?]*)/([0-9]{4})/([0-9]{2})/.*$");

  static Pattern indexHTMLBlogPattern = Pattern.compile("http://([^/?]*)/([0-9]{4})/([0-9]{2})/index.html$");

  static Pattern indexBlogPattern = Pattern.compile("http://([^/?]*)/([0-9]{4})/([0-9]{2})/$");

  static Pattern nestedIndexHTMLBlogPattern = Pattern
      .compile("http://([^/?]*)/([^/?]*)/([0-9]{4})/([0-9]{2})/index.html$");

  static Pattern nestedIndexBlogPattern = Pattern.compile("http://([^/?]*)/([^/?]*)/([0-9]{4})/([0-9]{2})/$");

  static Pattern tumblrStyleBlogPattern = Pattern.compile("http://([^/?]*)/post/([0-9]{5,})/[^/]*[/]*$");

  JsonParser parser = new JsonParser();

  /** default constructor (for mapper) **/
  public ScanDatabaseStep() {
    super(null, null, null);
  }

  /** step constructor **/
  public ScanDatabaseStep(CrawlPipelineTask task) throws IOException {
    super(task, task.getDescription() + " - Phase 1", OUTPUT_DIR_NAME);
  }

  @Override
  public void close() throws IOException {
  }

  @Override
  public void configure(JobConf job) {
  }

  @Override
  public Log getLogger() {
    return LOG;
  }
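  /**
   * Classify a single URL. Month-index pages contribute only a flag keyed by
   * the blog's root URL; dated post URLs contribute a single hit for the
   * year/month extracted from the path; tumblr post URLs get an approximate
   * month derived from the numeric post id.
   */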
  @Override
  public void map(TextBytes key, TextBytes jsonMetadata, OutputCollector<TextBytes, HitsByMonth> collector,
      Reporter reporter) throws IOException {

    String url = key.toString();

    Matcher topLevelMatcher = topLevelBlogPattern.matcher(url);
    Matcher nestedBlogMatcher = nestedBlogPattern.matcher(url);
    Matcher indexHTMLBlogMatcher = indexHTMLBlogPattern.matcher(url);
    Matcher indexBlogMatcher = indexBlogPattern.matcher(url);
    Matcher nestedIndexHTMLBlogMatcher = nestedIndexHTMLBlogPattern.matcher(url);
    Matcher nestedIndexBlogMatcher = nestedIndexBlogPattern.matcher(url);
    Matcher tumblrPostMatcher = tumblrStyleBlogPattern.matcher(url);

    if (indexHTMLBlogMatcher.matches() && indexHTMLBlogMatcher.groupCount() >= 1) {
      reporter.incrCounter(Counters.MATCHED_INDEX_HTML_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_INDEX_HTML_AFTER_DATE);
      collector.collect(new TextBytes("http://" + indexHTMLBlogMatcher.group(1) + "/"), hits);
    } else if (indexBlogMatcher.matches() && indexBlogMatcher.groupCount() >= 1) {
      reporter.incrCounter(Counters.MATCHED_INDEX_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_YEAR_MONTH_SLASH_INDEX);
      collector.collect(new TextBytes("http://" + indexBlogMatcher.group(1) + "/"), hits);
    } else if (nestedIndexHTMLBlogMatcher.matches() && nestedIndexHTMLBlogMatcher.groupCount() >= 2) {
      reporter.incrCounter(Counters.MATCHED_NESTED_INDEX_HTML_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_INDEX_HTML_AFTER_DATE);
      collector.collect(new TextBytes("http://" + nestedIndexHTMLBlogMatcher.group(1) + "/"
          + nestedIndexHTMLBlogMatcher.group(2) + "/"), hits);
    } else if (nestedIndexBlogMatcher.matches() && nestedIndexBlogMatcher.groupCount() >= 2) {
      reporter.incrCounter(Counters.MATCHED_NESTED_INDEX_PATTERN, 1);
      HitsByMonth hits = new HitsByMonth();
      hits.setFlags(PostFrequencyInfo.Flags.HAS_YEAR_MONTH_SLASH_INDEX);
      collector.collect(new TextBytes("http://" + nestedIndexBlogMatcher.group(1) + "/"
          + nestedIndexBlogMatcher.group(2) + "/"), hits);
    } else if (tumblrPostMatcher.matches() && tumblrPostMatcher.groupCount() >= 2) {
      reporter.incrCounter(Counters.MATCHED_TUMBLR_BLOG_POST_PATTERN, 1);
      String uniqueURL = "http://" + tumblrPostMatcher.group(1) + "/";
      try {
        // HACK: approximate the post date from the tumblr post id by treating
        // every 1,000,000,000 ids as one 30-day month elapsed since 2010-07-01
        // (the epoch encoded by new Date(110, 6, 1)).
        long postId = Long.parseLong(tumblrPostMatcher.group(2));
        long relativeMonth = postId / 1000000000L;
        Date dateStart = new Date(110, 6, 1);
        Date dateOfPost = new Date(dateStart.getTime() + (relativeMonth * 30 * 24 * 60 * 60 * 1000));

        HitsByMonth hits = new HitsByMonth();
        hits.setHitCount(1);
        hits.setYear(dateOfPost.getYear() + 1900);
        hits.setMonth(dateOfPost.getMonth() + 1);
        collector.collect(new TextBytes(uniqueURL), hits);
      } catch (Exception e) {
        reporter.incrCounter(Counters.CAUGHT_EXCEPTION_DURING_TUMBLR_POST_PARSE, 1);
        LOG.error("Exception parsing url:" + url + " Exception:" + StringUtils.stringifyException(e));
      }
    } else if (topLevelMatcher.matches() && topLevelMatcher.groupCount() >= 3) {
      reporter.incrCounter(Counters.MATCHED_TOP_LEVEL_POST_PATTERN, 1);
      String uniqueURL = "http://" + topLevelMatcher.group(1) + "/";
      int year = Integer.parseInt(topLevelMatcher.group(2));
      int month = Integer.parseInt(topLevelMatcher.group(3));

      HitsByMonth hits = new HitsByMonth();
      hits.setHitCount(1);
      hits.setYear(year);
      hits.setMonth(month);
      hits.setFlags(scanForGenerator(key, jsonMetadata, reporter));
      collector.collect(new TextBytes(uniqueURL), hits);
    } else if (nestedBlogMatcher.matches() && nestedBlogMatcher.groupCount() >= 4) {
      reporter.incrCounter(Counters.MATCHED_NESTED_POST_PATTERN, 1);
      // don't treat paths on tumblr.com hosts as nested blogs
      if (!nestedBlogMatcher.group(1).endsWith("tumblr.com")) {
        String uniqueURL = "http://" + nestedBlogMatcher.group(1) + "/" + nestedBlogMatcher.group(2) + "/";
        int year = Integer.parseInt(nestedBlogMatcher.group(3));
        int month = Integer.parseInt(nestedBlogMatcher.group(4));

        HitsByMonth hits = new HitsByMonth();
        hits.setHitCount(1);
        hits.setYear(year);
        hits.setMonth(month);
        hits.setFlags(scanForGenerator(key, jsonMetadata, reporter));
        collector.collect(new TextBytes(uniqueURL), hits);
      }
    }
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {

    LOG.info("Task Identity Path is:" + getTaskIdentityPath());
    LOG.info("Temp Path is:" + outputPathLocation);

    ImmutableList<Path> paths = new ImmutableList.Builder<Path>().addAll(
        ((DomainMetadataTask) getTask().getTask()).getMergeDBDataPaths()).build();

    JobConf job = new JobBuilder(getDescription() + " - Phase 1", getConf()).inputIsSeqFile().inputs(paths).mapper(
        ScanDatabaseStep.class).keyValue(TextBytes.class, HitsByMonth.class).numReducers(
        CrawlEnvironment.NUM_DB_SHARDS / 2).output(outputPathLocation).outputIsSeqFile().build();

    JobClient.runJob(job);
  }
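  /**
   * Look for a generator meta tag in the page's crawl metadata JSON and return
   * the corresponding PostFrequencyInfo generator flag (WordPress, Blogger, or
   * TypePad), or 0 if none was detected or the metadata could not be parsed.
   */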
  int scanForGenerator(TextBytes key, TextBytes value, Reporter reporter) {
    try {
      JsonObject containerObj = parser.parse(value.toString()).getAsJsonObject();
      GoogleURL urlObject = new GoogleURL(key.toString());
      if (urlObject.isValid()) {
        String sourceRootDomain = URLUtils.extractRootDomainName(urlObject.getHost());
        if (sourceRootDomain != null) {
          URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
          JsonObject objectOut = new JsonObject();
          if (fp != null) {
            objectOut.addProperty("dh", fp.getDomainHash());
          }
          JsonObject crawlStatus = containerObj.getAsJsonObject("crawl_status");
          if (crawlStatus != null) {
            if (crawlStatus.has("http_result")) {
              int httpResult = crawlStatus.get("http_result").getAsInt();
              if (httpResult == 200) {
                JsonArray crawlStatsArray = crawlStatus.getAsJsonArray("crawl_stats");
                if (crawlStatsArray != null && crawlStatsArray.size() != 0) {
                  JsonObject crawlStats = crawlStatsArray.get(0).getAsJsonObject();
                  if (crawlStats != null) {
                    JsonArray metaTags = crawlStats.getAsJsonArray("meta_tags");
                    if (metaTags != null) {
                      for (JsonElement metaObject : metaTags) {
                        String metaValue = metaObject.getAsJsonObject().get("value").getAsString();
                        if (metaValue.contains("Wordpress")) {
                          reporter.incrCounter(Counters.DETECTED_WORDPRESS_DURING_METADATA_PARSE, 1);
                          return PostFrequencyInfo.Flags.FLAG_GENERATOR_IS_WORDPRESS;
                        } else if (metaValue.contains("blogger")) {
                          reporter.incrCounter(Counters.DETECTED_BLOGGER_DURING_METADATA_PARSE, 1);
                          return PostFrequencyInfo.Flags.FLAG_GENERATOR_IS_BLOGGER;
                        } else if (metaValue.contains("http://www.typepad.com/")) {
                          reporter.incrCounter(Counters.DETECTED_TYPEPAD_DURING_METADATA_PARSE, 1);
                          return PostFrequencyInfo.Flags.FLAG_GENERATOR_IS_TYPEPAD;
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    } catch (Exception e) {
      reporter.incrCounter(Counters.CAUGHT_EXCEPTION_DURING_METADATA_PARSE, 1);
      LOG.error("Key:" + key.toString() + " Value:" + value.toString() + "\n" + StringUtils.stringifyException(e));
    }

    return 0;
  }
}