package org.commoncrawl.mapred.pipelineV3.crawllistgen;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.S3NFileSystem;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.Tuples.Pair;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
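/**
 * "Partition Wikipedia" pipeline step: extracts Wikipedia URLs from the DBpedia 3.8 N-Triples
 * dumps, joins them against the sharded root-domain classification metadata, and partitions the
 * result into crawl-list shards keyed by CrawlListKey.
 */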
public class PartitionWikipediaUrlsStep extends CrawlPipelineStep {
public static final String ROOTDOMAIN_METADATA_PATH = "root.meta.path";
public static final String SUBDOMAIN_METADATA_PATH = "subdomain.meta.path";
public static final String OUTPUT_DIR_NAME = "wikipedaURLS";
public PartitionWikipediaUrlsStep(CrawlPipelineTask task) {
super(task, "Partition Wikipedia", OUTPUT_DIR_NAME);
}
private static final Log LOG = LogFactory.getLog(PartitionWikipediaUrlsStep.class);
@Override
public Log getLogger() {
return LOG;
}
@Override
public void runStep(Path outputPathLocation) throws IOException {
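// This step is structured as a sequence of jobs:
//   1. parse the raw DBpedia .nt files and emit one CrawlDBKey-ed JSON record per Wikipedia URL
//   2. join those records against the root-domain classification output and re-key them as CrawlListKeys
//   3. partition/sort by CrawlListKey into the final list shards
// Note that, as written, only the first job is actually submitted; the runJob calls for the later
// stages (and the older single merge-join job at the bottom of this method) are commented out.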
// grab the root metadata task - its sharded root-domain classification output is joined in below
DomainMetadataTask rootTask = (DomainMetadataTask)getRootTask();
Configuration conf = new Configuration();
List<Path> inputPaths = Lists.newArrayList();
Path dbpediaDataPath = new Path("s3n://aws-publicdatasets/common-crawl/wikipedia/dbpedia/3.8");
FileSystem fs = FileSystem.get(dbpediaDataPath.toUri(),conf);
for (FileStatus file : fs.globStatus(new Path(dbpediaDataPath,"*.nt"))) {
inputPaths.add(file.getPath());
}
Path tempPath = new Path("s3n://aws-publicdatasets/common-crawl/wikipedia/dbpedia/3.8/partitioned");
JobConf job = new JobBuilder("Pre-Partition Wikipedia URLS", new Configuration())
.inputFormat(TextInputFormat.class)
.inputs(inputPaths)
.mapper(DBPediaEntryParser.class)
.keyValue(TextBytes.class, TextBytes.class)
.sort(CrawlDBKey.LinkKeyComparator.class)
.numReducers(100)
.compressor(CompressionType.BLOCK, SnappyCodec.class)
.output(tempPath)
.outputIsSeqFile()
.build();
//
JobClient.runJob(job);
Path tempPath2 = new Path("s3n://aws-publicdatasets/common-crawl/wikipedia/dbpedia/3.8/joined");
// join root domain metadata and wikipedia data
job = new JobBuilder("Join to Root Domain Metadata", new Configuration())
.input(tempPath)
.input(rootTask.getOutputDirForStep(ShardRootDomainClassificationStep.class))
.inputIsSeqFile()
.mapperKeyValue(TextBytes.class, TextBytes.class)
.outputKeyValue(CrawlListKey.class, TextBytes.class)
.sort(CrawlDBKey.CrawlDBKeyGroupByRootDomainComparator.class)
.partition(CrawlDBKey.PartitionBySuperDomainPartitioner.class)
.reducer(JoinRootDomainMetadataEmitLinkKeyReducer.class, false)
.outputIsSeqFile()
.output(tempPath2)
.build();
// JobClient.runJob(job);
// partition and sort by list key
job = new JobBuilder("Sort by ListKey", new Configuration())
.input(tempPath2)
.inputIsSeqFile()
.keyValue(CrawlListKey.class, TextBytes.class)
.sort(CrawlListKey.CrawListKeyComparator.class)
.partition(CrawlListKey.CrawlListKeyPartitioner.class)
.outputIsSeqFile()
.compressor(CompressionType.BLOCK, SnappyCodec.class)
.output(outputPathLocation)
.jarByClass(PartitionWikipediaUrlsStep.class)
.numReducers(CrawlListGenCommon.NUM_LIST_PARTITIONS)
.build();
// JobClient.runJob(job);
/*
// collect input paths from first stage
List<Path> secondStageInputs = Lists.newArrayList();
for (FileStatus file : fs.globStatus(new Path(tempPath,"part-*"))) {
secondStageInputs.add(file.getPath());
}
// build the basic job config ...
job = new JobBuilder("Parition Wikipedia URLS", new Configuration())
.inputFormat(PartitionJoinInputFormat.class)
.mapper(WikipediaURLPartitioner.class)
.keyValue(CrawlListKey.class, TextBytes.class)
.sort(CrawlListKey.CrawListKeyComparator.class)
.partition(CrawlListKey.CrawlListKeyPartitioner.class)
.numReducers(CrawlListGenCommon.NUM_LIST_PARTITIONS)
.compressor(CompressionType.BLOCK, SnappyCodec.class)
.output(outputPathLocation)
.outputIsSeqFile()
.build();
job.setInt("mapred.task.timeout",4*(60*(60*1000)));
// write partition paths ...
PartitionJoinInputFormat.writeSinglePathPerPartition(secondStageInputs, job);
// ok, figure out locations of dependent metadata ...
job.set(ROOTDOMAIN_METADATA_PATH, rootTask.getOutputDirForStep(ShardRootDomainClassificationStep.class).toString());
// run it ...
JobClient.runJob(job);
*/
}
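// Extracts the last angle-bracketed URI from a DBpedia N-Triples line - the text between the
// final '<' and the final '>'. For an illustrative triple such as
//   <http://dbpedia.org/resource/Example> <http://xmlns.com/foaf/0.1/isPrimaryTopicOf> <http://en.wikipedia.org/wiki/Example> .
// this returns "http://en.wikipedia.org/wiki/Example"; returns null when the line has no
// trailing <...> token.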
static String parseDBPediaLine(String str) {
int lastIndexOfGT = str.lastIndexOf('>');
if (lastIndexOfGT >= 0) {
int lastIndexOfLT = str.lastIndexOf('<',lastIndexOfGT);
if (lastIndexOfLT < lastIndexOfGT) {
return str.substring(lastIndexOfLT + 1,lastIndexOfGT);
}
}
return null;
}
/**
*
* @author rana
*
*/
public static class DBPediaEntryParser implements Mapper<LongWritable,Text,TextBytes,TextBytes> {
@Override
public void configure(JobConf job) {
}
@Override
public void close() throws IOException {
}
enum Counters {
FAILED_TO_PARSE_ENTRY, INVALID_URL, NULL_FP
}
@Override
public void map(LongWritable key, Text value,OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException {
String url = parseDBPediaLine(value.toString());
if (url == null) {
reporter.incrCounter(Counters.FAILED_TO_PARSE_ENTRY, 1);
}
else {
GoogleURL urlObject = new GoogleURL(url);
if (!urlObject.isValid()) {
reporter.incrCounter(Counters.INVALID_URL, 1);
}
else {
// generate a fingerprint
URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
if (fp == null) {
reporter.incrCounter(Counters.NULL_FP, 1);
}
else {
JsonObject outputJSON = new JsonObject();
// append it to output json
outputJSON.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, urlObject.getCanonicalURL());
// emit a CrawlDBKey
TextBytes outputKey = CrawlDBKey.generateCrawlStatusKey(fp, 0);
// write out
output.collect(outputKey, new TextBytes(outputJSON.toString()));
}
}
}
}
}
private static FileSystem getFileSystemForMergePath(Path path,Configuration conf)throws IOException {
// for s3n paths, use commoncrawl's own S3NFileSystem implementation instead of the configured default
if (path.toUri().getScheme().equalsIgnoreCase("s3n")) {
FileSystem fs = new S3NFileSystem();
fs.initialize(path.toUri(), conf);
return fs;
}
// conf.setClass("fs.s3n.impl", S3NFileSystem.class,FileSystem.class);
return FileSystem.get(path.toUri(),conf);
}
static void addPartFileGivenPath(List<Path> paths,FileSystem fs,Path path) throws IOException {
FileStatus files[] = fs.globStatus(new Path(path,"part-*"));
for (FileStatus file : files) {
paths.add(file.getPath());
}
}
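/**
 * Reducer for the "Join to Root Domain Metadata" job: each key group carries the Wikipedia URL
 * records produced by DBPediaEntryParser plus (ideally) one root-domain classification record.
 * The reducer uses the classification to choose a partition domain, re-emits every URL under a
 * CrawlListKey, and emits a synthetic home-page URL once per domain (tracked via a bloom filter).
 */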
public static class JoinRootDomainMetadataEmitLinkKeyReducer implements Reducer<TextBytes,TextBytes,CrawlListKey,TextBytes> {
@Override
public void configure(JobConf job) {
}
@Override
public void close() throws IOException {
}
JsonParser parser = new JsonParser();
enum Counters {
FOUND_ROOT_DOMAIN_RECORD,
DID_NOT_FIND_ROOT_DOMAIN_RECORD,
BAD_URL,
BAD_FP,
EMITTED_HOMEPAGE_URL,
JOINED_ROOT_DOMAIN_AND_WIKI_URL,
BAD_JOIN_MORE_THAN_ONE_ROOT_DOMAIN,
PARTITIONING_URL_WITH_SUBDOMAIN
}
static final int NUM_HASH_FUNCTIONS = 10;
static final int NUM_BITS = 11;
static final int NUM_ELEMENTS = 1 << 28;
static final int FLUSH_THRESHOLD = 1 << 23;
URLFPBloomFilter emittedTuplesFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
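// Rebuilds a bare home-page URL of the form scheme://[user[:password]@]host/ from an already
// parsed URL, stripping any trailing dot on the host. Illustratively (assuming GoogleURL yields
// the bare scheme and host), "http://www.example.com/wiki/Foo" maps to "http://www.example.com/".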
static String makeHomePageURLFromUrlObject(GoogleURL urlObject) {
String urlOut = urlObject.getScheme();
urlOut += ("://");
if (urlObject.getUserName() != GoogleURL.emptyString) {
urlOut += (urlObject.getUserName());
if (urlObject.getPassword() != GoogleURL.emptyString) {
urlOut += (":");
urlOut += (urlObject.getPassword());
}
urlOut += ("@");
}
String host = urlObject.getHost();
if (host.endsWith(".")) {
host = host.substring(0, host.length() - 1);
}
urlOut += (host);
urlOut += "/";
return urlOut;
}
@Override
public void reduce(TextBytes key, Iterator<TextBytes> values,OutputCollector<CrawlListKey, TextBytes> output, Reporter reporter)throws IOException {
ArrayList<String> urls = new ArrayList<String>();
boolean isSuperDomain = false;
int rootDomainRecordCount = 0;
while (values.hasNext()) {
TextBytes nextValue = values.next();
JsonObject object = parser.parse(nextValue.toString()).getAsJsonObject();
if (object.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
urls.add(object.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString());
}
else {
if (object.has(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN)) {
rootDomainRecordCount++;
reporter.incrCounter(Counters.FOUND_ROOT_DOMAIN_RECORD, 1);
isSuperDomain = object.get(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN).getAsBoolean();
}
}
}
if (urls.size() != 0 && rootDomainRecordCount != 0) {
reporter.incrCounter(Counters.JOINED_ROOT_DOMAIN_AND_WIKI_URL, 1);
if (rootDomainRecordCount > 1) {
reporter.incrCounter(Counters.BAD_JOIN_MORE_THAN_ONE_ROOT_DOMAIN,1);
}
}
JsonObject objectOut = new JsonObject();
CrawlListKey keyOut = new CrawlListKey();
TextBytes valueOut = new TextBytes();
URLFPV2 testKey = new URLFPV2();
for (String url : urls) {
GoogleURL urlObject = new GoogleURL(url);
if (!urlObject.isValid()) {
reporter.incrCounter(Counters.BAD_URL, 1);
}
else {
URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
if (fp == null) {
reporter.incrCounter(Counters.BAD_FP, 1);
}
else {
// by default, partition on the root domain hash ...
long partitionDomain = fp.getRootDomainHash();
// ... but for super domains, partition on the subdomain (host) hash instead
if (isSuperDomain) {
partitionDomain = fp.getDomainHash();
reporter.incrCounter(Counters.PARTITIONING_URL_WITH_SUBDOMAIN, 1);
}
// populate json ...
objectOut.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, url);
// set into text output
valueOut.set(objectOut.toString());
// construct output key ...
CrawlListKey.generateKey(keyOut, partitionDomain, fp.getDomainHash(), CrawlListKey.KEY_TYPE_URL, 100000, 0);
// output
output.collect(keyOut, valueOut);
// check whether we already emitted a home page for this domain - the domain hash is used for
// both fields so the bloom filter key is per-domain, not per-url
testKey.setDomainHash(fp.getDomainHash());
testKey.setUrlHash(fp.getDomainHash());
if (!emittedTuplesFilter.isPresent(testKey)) {
// add to bloom
emittedTuplesFilter.add(testKey);
// emit home page entry
String homePageURL = makeHomePageURLFromUrlObject(urlObject);
// construct output key ...
CrawlListKey.generateKey(keyOut, partitionDomain, fp.getDomainHash(), CrawlListKey.KEY_TYPE_HOMEPAGE_URL, 1, 0);
// populate json ...
objectOut.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, homePageURL);
// set into text output
valueOut.set(objectOut.toString());
output.collect(keyOut,valueOut);
reporter.incrCounter(Counters.EMITTED_HOMEPAGE_URL, 1);
}
}
}
}
}
}
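/**
 * Mapper used by the (currently commented-out) PartitionJoinInputFormat stage at the end of
 * runStep: each map() call receives a shard index plus a single input path, merge-joins that
 * partition against the sharded root-domain metadata via MultiFileInputReader, and emits
 * CrawlListKey records - one per URL plus a once-per-domain home-page record.
 */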
public static class WikipediaURLPartitioner implements Mapper<IntWritable,Text,CrawlListKey,TextBytes> {
Path rootDomainMetaPath;
JobConf _conf;
OutputCollector<CrawlListKey, TextBytes> _collector;
@Override
public void configure(JobConf job) {
rootDomainMetaPath = new Path(job.get(ROOTDOMAIN_METADATA_PATH));
_conf = job;
}
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
}
enum Counters {
SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA,
NO_SOURCE_URL,
FILTERED_OUT_URL,
ROOT_DOMAIN_RECORD,
CRAWL_STATUS_RECORD,
INVALID_URL,
EMITTED_HOMEPAGE_URL,
SKIPPED_ALREADY_EMITTED_HOMEPAGE_URL
}
static final int NUM_HASH_FUNCTIONS = 10;
static final int NUM_BITS = 11;
static final int NUM_ELEMENTS = 1 << 28;
static final int FLUSH_THRESHOLD = 1 << 23;
URLFPBloomFilter emittedTuplesFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
static String makeHomePageURLFromUrlObject(GoogleURL urlObject) {
String urlOut = urlObject.getScheme();
urlOut += ("://");
if (urlObject.getUserName() != GoogleURL.emptyString) {
urlOut += (urlObject.getUserName());
if (urlObject.getPassword() != GoogleURL.emptyString) {
urlOut += (":");
urlOut += (urlObject.getPassword());
}
urlOut += ("@");
}
String host = urlObject.getHost();
if (host.endsWith(".")) {
host = host.substring(0, host.length() - 1);
}
urlOut += (host);
urlOut += "/";
return urlOut;
}
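// The merge reader streams records in CrawlDBKey order, so a domain's root-domain metadata
// record is expected to arrive before its crawl-status (URL) records. The loop below caches the
// most recent root-domain classification (_rootDomainId / _isSuperDomain) and applies it to the
// crawl-status records that follow; a mismatch is counted and treated as "not a super domain".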
@Override
public void map(IntWritable key, Text value,OutputCollector<CrawlListKey, TextBytes> output, Reporter reporter) throws IOException {
// set up merge attributes
Configuration localMergeConfig = new Configuration(_conf);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS, CrawlDBKey.CrawlDBKeyComparator.class,
Comparator.class);
localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS, CrawlDBKey.class, WritableComparable.class);
// get the single input path...
Path inputPath = new Path(value.toString());
// get fs based on path ...
FileSystem fs = FileSystem.get(inputPath.toUri(),_conf);
ArrayList<Path> paths = Lists.newArrayList();
// add join paths
addPartFileGivenPath(paths, fs, rootDomainMetaPath);
paths.add(inputPath);
LOG.info("Input Paths for Shard:" + key.get() + " Are:" + paths);
// replace emr s3n for inputs ...
FileSystem mergefs = getFileSystemForMergePath(paths.get(0),localMergeConfig);
// ok now spawn merger
MultiFileInputReader<TextBytes> multiFileInputReader
= new MultiFileInputReader<TextBytes>(mergefs, paths, localMergeConfig);
try {
Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
TextBytes valueText = new TextBytes();
DataInputBuffer valueStream = new DataInputBuffer();
JsonParser parser = new JsonParser();
_collector = output;
long _rootDomainId = -1L;
JsonObject _rootDomainMetadata = null;
boolean _isSuperDomain = false;
CrawlListKey keyOut = new CrawlListKey();
TextBytes valueOut = new TextBytes();
URLFPV2 testKey = new URLFPV2();
JsonObject jsonObjOut = new JsonObject();
while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
//LOG.info("Key:"+ nextItem.e0._keyObject.toString());
long recordType = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject, CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);
if (recordType == CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD.ordinal()) {
RawRecordValue rawValue = Iterables.getFirst(nextItem.e1,null);
valueStream.reset(rawValue.data.getData(),0,rawValue.data.getLength());
valueText.setFromRawTextBytes(valueStream);
_rootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
//LOG.info("Got Root Domain Record:"+ _rootDomainId);
_rootDomainMetadata = parser.parse(valueText.toString()).getAsJsonObject();
_isSuperDomain = JSONUtils.safeGetBoolean(_rootDomainMetadata,CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN);
reporter.incrCounter(Counters.ROOT_DOMAIN_RECORD,1);
}
else if (recordType == CrawlDBKey.Type.KEY_TYPE_CRAWL_STATUS.ordinal()) {
reporter.incrCounter(Counters.CRAWL_STATUS_RECORD,1);
long currentRootDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject, CrawlDBKey.ComponentId.ROOT_DOMAIN_HASH_COMPONENT_ID);
long currentDomainId = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject, CrawlDBKey.ComponentId.DOMAIN_HASH_COMPONENT_ID);
// get first record, which will be merge record ...
RawRecordValue firstRawValue = Iterables.getFirst(nextItem.e1, null);
// convert to json object ...
valueStream.reset(firstRawValue.data.getData(),0,firstRawValue.data.getLength());
valueText.setFromRawTextBytes(valueStream);
JsonObject jsonObject = parser.parse(valueText.toString()).getAsJsonObject();
// extract url ...
if (jsonObject.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
String url = jsonObject.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();
if (currentRootDomainId != _rootDomainId) {
reporter.incrCounter(Counters.SUBDOMAIN_METADATA_WITHOUT_MATCHING_ROOT_DOMAIN_METADATA, 1);
_isSuperDomain = false;
_rootDomainId = currentRootDomainId;
LOG.error("No Root Domain Info for URL:" + url);
}
// figure out the partition domain: by default partition on the root domain id ...
long partitionDomain = currentRootDomainId;
// ... but for super domains, partition on the subdomain (host) hash instead
if (_isSuperDomain)
partitionDomain = currentDomainId;
// construct output key ...
CrawlListKey.generateKey(keyOut, partitionDomain, currentDomainId, CrawlListKey.KEY_TYPE_URL, 100000, 0);
// set
output.collect(keyOut, valueText);
// generate home page url
GoogleURL urlObject = new GoogleURL(url);
if (urlObject.isValid()) {
// check whether we already emitted a home page for this domain - the domain id is used for
// both fields so the bloom filter key is per-domain, not per-url
testKey.setDomainHash(currentDomainId);
testKey.setUrlHash(currentDomainId);
if (!emittedTuplesFilter.isPresent(testKey)) {
// add to bloom
emittedTuplesFilter.add(testKey);
// emit home page entry
String homePageURL = makeHomePageURLFromUrlObject(urlObject);
// construct output key ...
CrawlListKey.generateKey(keyOut, partitionDomain, currentDomainId, CrawlListKey.KEY_TYPE_HOMEPAGE_URL, 1, 0);
// and proper JSON
jsonObjOut.addProperty(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY, homePageURL);
valueOut.set(jsonObjOut.toString());
output.collect(keyOut,valueOut);
reporter.incrCounter(Counters.EMITTED_HOMEPAGE_URL, 1);
}
else {
reporter.incrCounter(Counters.SKIPPED_ALREADY_EMITTED_HOMEPAGE_URL, 1);
}
}
else {
reporter.incrCounter(Counters.INVALID_URL, 1);
}
}
else {
reporter.incrCounter(Counters.NO_SOURCE_URL, 1);
}
}
}
}
finally {
multiFileInputReader.close();
}
}
}
}