/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.pipelineV3.crawllistgen;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.Date;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.mapred.PostFrequencyInfo;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.blogs.postfrequency.GenPostFrequencyStep;
import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.GenSuperDomainListStep;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.time.SerialDate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
*
* @author rana
*
*/
public class GenBlogPlatformUrlsStep extends CrawlPipelineStep implements
Mapper<TextBytes, TextBytes, TextBytes, BooleanWritable> {
enum Counters {
ROOT_DOMAIN_WAS_NULL, ROOT_WAS_WORDPRESS, ROOT_WAS_TUMBLR, ROOT_WAS_BLOGGER, ROOT_WAS_TYPEPAD, EMITTING_WORDPRESS,
EMITTING_BLOGGER, EMITTING_TYPEPAD, EMITTING_TUMBLR, EMITTING_OTHER
}
private static final Log LOG = LogFactory.getLog(GenBlogPlatformUrlsStep.class);
public static final String OUTPUT_DIR_NAME = "blogUrls";
public static final String SUPER_DOMAIN_FILE_PATH = "super-domain-list";
Set<Long> superDomainIdSet;
JsonParser parser = new JsonParser();
PostFrequencyInfo freqInfo = new PostFrequencyInfo();
static long wikipediaRootHash = FPGenerator.std64.fp("wikipedia.org");
static long flickrRootHash = FPGenerator.std64.fp("flickr.com");
static long stumbleUponRootHash = FPGenerator.std64.fp("stumbleupon.com");
static long wordpressRootHash = FPGenerator.std64.fp("wordpress.com");
static long ebayRootHash = FPGenerator.std64.fp("ebay.com");
static long technoratiRootHash = FPGenerator.std64.fp("technorati.com");
static long imdbRootHash = FPGenerator.std64.fp("imdb.com");
static long quoraRootHash = FPGenerator.std64.fp("quora.com");
static long stackOverflowRootHash = FPGenerator.std64.fp("stackoverflow.com");
static long slideShareRootHash = FPGenerator.std64.fp("slideshare.net");
static long youtubeRootHash = FPGenerator.std64.fp("youtube.com");
static long amazonRootHash = FPGenerator.std64.fp("amazon.com");
static long tumblrRootHash = FPGenerator.std64.fp("tumblr.com");
static long typepadRootHash = FPGenerator.std64.fp("typepad.com");
static long blogspotRootHash = FPGenerator.std64.fp("blogspot.com");
static long facebookRootHash = FPGenerator.std64.fp("facebook.com");
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
private static final NumberFormat SHORT_MONTH_FORMAT = NumberFormat.getInstance();
static {
SHORT_MONTH_FORMAT.setMinimumIntegerDigits(2);
SHORT_MONTH_FORMAT.setGroupingUsed(false);
}
private static final NumberFormat YEAR_FORMAT = NumberFormat.getInstance();
static {
YEAR_FORMAT.setMinimumIntegerDigits(4);
YEAR_FORMAT.setGroupingUsed(false);
}
static ImmutableList<Pair<Integer, Integer>> getProbeDates(int maxMonthsToProbe) {
Pair<Integer, Integer> startYearMonth = getStartYearMonth();
ImmutableList.Builder<Pair<Integer, Integer>> builder = new ImmutableList.Builder<Pair<Integer, Integer>>();
int months = 0;
outer: for (int year = startYearMonth.e0; year >= 2000; --year) {
int maxMonth = (year == startYearMonth.e0) ? startYearMonth.e1 : 12;
for (int month = maxMonth; month >= 1; --month) {
builder.add(new Pair<Integer, Integer>(year, month));
if (++months == maxMonthsToProbe)
break outer;
}
}
return builder.build();
}
static Pair<Integer, Integer> getStartYearMonth() {
// more detailed fetch ...
SerialDate today = SerialDate.createInstance(new Date(System.currentTimeMillis()));
return new Pair<Integer, Integer>(today.getYYYY(), today.getMonth());
}
TextBytes keyBuffer = new TextBytes();
TextBytes urlBuffer = new TextBytes();
BooleanWritable skipData = new BooleanWritable();
public GenBlogPlatformUrlsStep() {
super(null, null, null);
}
public GenBlogPlatformUrlsStep(CrawlPipelineTask parentTask) throws IOException {
super(parentTask, "Blog URL Injector", OUTPUT_DIR_NAME);
}
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
}
@Override
public void configure(JobConf job) {
Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));
try {
superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
} catch (IOException e) {
LOG.error(StringUtils.stringifyException(e));
throw new RuntimeException(e);
}
}
private void emitBlogspotDomain(String blogspotURL, PostFrequencyInfo postFrequencyInfo,
OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter) throws IOException {
// emit home page
emitItem(blogspotURL, output, reporter, false);
if (postFrequencyInfo != null && postFrequencyInfo.getLastYearWithPosts() >= 2011) {
// more detailed fetch ...
ImmutableList<Pair<Integer, Integer>> probeDates = getProbeDates(24);
int probeItemIndex = 0;
for (Pair<Integer, Integer> probeDate : probeDates) {
emitItem(blogspotURL + YEAR_FORMAT.format(probeDate.e0) + "_" + SHORT_MONTH_FORMAT.format(probeDate.e1)
+ "_01_archive.html", output, reporter, (probeItemIndex != 0));
++probeItemIndex;
}
}
}
private void emitItem(String url, OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter,
boolean skipIfDupe) throws IOException {
skipData.set(skipIfDupe);
urlBuffer.set(url);
if (PartitionUtils.generatePartitionKeyGivenURL(superDomainIdSet, urlBuffer,
CrawlListGeneratorTask.KEY_TYPE_BLOGPROBE_URL, keyBuffer)) {
output.collect(keyBuffer, skipData);
}
}
private void emitOtherBlogPlatformDomain(String siteURL, PostFrequencyInfo postFrequencyInfo,
OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter) throws IOException {
emitItem(siteURL, output, reporter, false);
}
private void emitTumblrDomain(String siteURL, PostFrequencyInfo postFrequencyInfo,
OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter) throws IOException {
GoogleURL urlObject = new GoogleURL(siteURL);
if (urlObject.isValid()) {
// emit home page
emitItem(siteURL, output, reporter, false);
// emit archive url
emitItem("http://" + urlObject.getHost() + "/archive", output, reporter, false);
}
}
private void emitTypepadDomain(String siteURL, PostFrequencyInfo postFrequencyInfo,
OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter) throws IOException {
siteURL = postFrequencyInfo.getBlogPath();
if (!siteURL.endsWith("/")) {
siteURL += "/";
}
// emit home page
emitItem(siteURL, output, reporter, false);
if (postFrequencyInfo != null && postFrequencyInfo.getLastYearWithPosts() >= 2011) {
ImmutableList<Pair<Integer, Integer>> probeDates = getProbeDates(24);
int probeItemIndex = 0;
for (Pair<Integer, Integer> probeDate : probeDates) {
emitItem(siteURL + YEAR_FORMAT.format(probeDate.e0) + "/" + SHORT_MONTH_FORMAT.format(probeDate.e1)
+ "/index.html", output, reporter, (probeItemIndex != 0));
probeItemIndex++;
}
}
}
private void emitWordPressDomain(String wordpressURL, PostFrequencyInfo postFrequencyInfo,
OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter) throws IOException {
emitItem(wordpressURL, output, reporter, false);
if (postFrequencyInfo != null && postFrequencyInfo.getLastYearWithPosts() >= 2011) {
ImmutableList<Pair<Integer, Integer>> probeDates = getProbeDates(24);
// number of paginations is based on avg posts per month
int paginations = (int) Math.ceil((float) postFrequencyInfo.getAvgPostsPerMonth() / (float) 10);
int probeItemIndex = 0;
for (Pair<Integer, Integer> probeDate : probeDates) {
for (int page = 1; page < paginations; ++page) {
if (page == 1) {
emitItem(wordpressURL + YEAR_FORMAT.format(probeDate.e0) + "/" + SHORT_MONTH_FORMAT.format(probeDate.e1)
+ "/", output, reporter, (probeItemIndex != 0));
} else {
emitItem(wordpressURL + YEAR_FORMAT.format(probeDate.e0) + "/" + SHORT_MONTH_FORMAT.format(probeDate.e1)
+ "/page/" + (page + 1) + "/", output, reporter, (probeItemIndex != 0));
}
}
++probeItemIndex;
}
}
}
@Override
public Log getLogger() {
return LOG;
}
@Override
public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, BooleanWritable> output, Reporter reporter)
throws IOException {
JsonObject postFreqJSON = parser.parse(value.toString()).getAsJsonObject();
freqInfo.setBlogPath(postFreqJSON.get("url").getAsString());
freqInfo.setLastYearWithPosts(postFreqJSON.get("lastYearWithPost").getAsInt());
freqInfo.setAvgPostsPerMonth((int) Math.ceil(postFreqJSON.get("avg").getAsDouble()));
boolean isBlogger = postFreqJSON.has("blogger");
boolean isWordpress = postFreqJSON.has("wordpress");
boolean isTypepad = postFreqJSON.has("typepad");
boolean isTumblr = postFreqJSON.has("tumblr");
if (!isBlogger && !isWordpress && !isTypepad && !isTumblr) {
String rootDomain = URLUtils.extractRootDomainName(key.toString());
if (rootDomain == null) {
reporter.incrCounter(Counters.ROOT_DOMAIN_WAS_NULL, 1);
return;
} else {
long rootFP = SuperDomainList.domainFingerprintGivenName(rootDomain);
if (rootFP == wordpressRootHash) {
reporter.incrCounter(Counters.ROOT_WAS_WORDPRESS, 1);
isWordpress = true;
} else if (rootFP == tumblrRootHash) {
reporter.incrCounter(Counters.ROOT_WAS_TUMBLR, 1);
isTumblr = true;
} else if (rootFP == blogspotRootHash) {
reporter.incrCounter(Counters.ROOT_WAS_BLOGGER, 1);
isBlogger = true;
} else if (rootFP == typepadRootHash) {
reporter.incrCounter(Counters.ROOT_WAS_TYPEPAD, 1);
isTypepad = true;
}
}
}
if (isWordpress) {
reporter.incrCounter(Counters.EMITTING_WORDPRESS, 1);
emitWordPressDomain(freqInfo.getBlogPath(), freqInfo, output, reporter);
} else if (isBlogger) {
reporter.incrCounter(Counters.EMITTING_BLOGGER, 1);
emitBlogspotDomain(freqInfo.getBlogPath(), freqInfo, output, reporter);
} else if (isTypepad) {
reporter.incrCounter(Counters.EMITTING_TYPEPAD, 1);
emitTypepadDomain(freqInfo.getBlogPath(), freqInfo, output, reporter);
} else if (isTumblr) {
reporter.incrCounter(Counters.EMITTING_TUMBLR, 1);
emitTumblrDomain(freqInfo.getBlogPath(), freqInfo, output, reporter);
} else {
reporter.incrCounter(Counters.EMITTING_OTHER, 1);
emitOtherBlogPlatformDomain(freqInfo.getBlogPath(), freqInfo, output, reporter);
}
}
@Override
public void runStep(Path outputPathLocation) throws IOException {
Path superDomainListPath = new Path(getOutputDirForStep(GenSuperDomainListStep.class), "part-00000");
JobConf job = new JobBuilder(getDescription(), getConf())
.input(getOutputDirForStep(GenPostFrequencyStep.class)).inputIsSeqFile().keyValue(TextBytes.class,
BooleanWritable.class).mapper(GenBlogPlatformUrlsStep.class).partition(
PartitionUtils.PartitionKeyPartitioner.class).numReducers(CrawlListGeneratorTask.NUM_SHARDS).output(
outputPathLocation).outputIsSeqFile().setAffinityNoBalancing(getOutputDirForStep(PartitionCrawlDBStep.class),
ImmutableSet.of("ccd001.commoncrawl.org", "ccd006.commoncrawl.org")).set(SUPER_DOMAIN_FILE_PATH,
superDomainListPath.toString())
.build();
JobClient.runJob(job);
}
}