package org.commoncrawl.mapred.ec2.postprocess.crawldb;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.CrawlDBKeyPartitioner;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.LinkKeyComparator;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey.Type;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;
import org.junit.Assert;
import org.junit.Test;
import com.google.gson.JsonObject;
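/**
 * Two-step MapReduce tool that folds Blekko URL metadata into the CrawlDB.
 * The "shard" step converts raw (URL, metadata) text pairs into JSON records
 * keyed, sorted, and partitioned by CrawlDBKey; the "import" step merge-sorts
 * those shards against the existing CrawlDB shards.
 *
 * Example invocations (jar name and HDFS paths are hypothetical, shown for
 * illustration only):
 *
 *   hadoop jar commoncrawl.jar org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBBlekkoMerge \
 *     -op shard -input /blekko/raw -output /blekko/sharded -timestamp 1346025600000
 *
 *   hadoop jar commoncrawl.jar org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBBlekkoMerge \
 *     -op import -input /blekko/sharded -output /crawldb/merged -crawldb /crawldb/current
 */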
@SuppressWarnings("static-access")
public class CrawlDBBlekkoMerge {
static final Log LOG = LogFactory.getLog(CrawlDBBlekkoMerge.class);
static final String BLEKKO_TIMESTAMP_PROPERTY = "blekko.timestamp";
static Options options = new Options();
static {
options.addOption(OptionBuilder.withArgName("op").hasArg(true).isRequired().withDescription("Operation (shard/import)").create("op"));
options.addOption(OptionBuilder.withArgName("input").hasArg(true).isRequired().withDescription("Input Path").create("input"));
options.addOption(OptionBuilder.withArgName("output").hasArg(true).isRequired().withDescription("Output Path").create("output"));
options.addOption(OptionBuilder.withArgName("crawldb").hasArg(true).withDescription("CrawlDB Path").create("crawldb"));
options.addOption(OptionBuilder.withArgName("timestamp").hasArg(true).withDescription("Metadata Timestamp").create("timestamp"));
options.addOption(OptionBuilder.withArgName("shards").hasArg(true).withDescription("Shard Count (test only)").create("shards"));
}
public static void main(String[] args) throws Exception {
CommandLineParser parser = new GnuParser();
try {
// parse the command line arguments
CommandLine cmdLine = parser.parse(options, args);
if (cmdLine.getOptionValue("op").equalsIgnoreCase("shard")) {
runShardStep(cmdLine);
}
else if (cmdLine.getOptionValue("op").equalsIgnoreCase("import")) {
runImportStep(cmdLine);
}
else {
throw new IOException("Unsupported operation:" + cmdLine.getOptionValue("op"));
}
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp( "CrawlDBBlekkoMerge", options );
throw e;
}
}
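// expected Blekko metadata line format (exercised by testPattern below), e.g.:
//   rank=2.0 rank10=3.01 redir
// group(1) = rank, group(2) = rank10, group(3) = crawl status ("crawled"/"redir"),
// with optional trailing whitespace (such as a newline) tolerated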
static Pattern METADATA_PATTERN = Pattern.compile("^rank=([0-9.]*)\\s*rank10=([0-9.]*)[ ]*([^\\s]*)\\s*");
static String BLEKKO_CRAWL_STATUS_CRAWLED = "crawled";
static String BLEKKO_CRAWL_STATUS_REDIRECT = "redir";
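/**
 * Maps a raw (source URL, Blekko metadata) Text pair to a TextBytes record
 * keyed by CrawlDBKey, whose value is a top-level JSON object carrying the
 * Blekko rank/status metadata plus the source URL, ready to be merge-sorted
 * into the CrawlDB.
 */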
static class BlekkoURLMetadataToJSONMapper implements Mapper<Text, Text, TextBytes, TextBytes> {
enum Counters {
NULL_FP, BAD_METADATA, EXCEPTION_DURING_PARSE, RANK10_LT_1, RANK10_LT_2, RANK10_LT_3, RANK10_LT_4, RANK10_GT_4, BLEKKO_CRAWLED
}
long metadataTimestamp = -1L;
@Override
public void configure(JobConf job) {
metadataTimestamp = job.getLong(BLEKKO_TIMESTAMP_PROPERTY, -1);
}
@Override
public void close() throws IOException {
}
@Override
public void map(Text key, Text value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {
// map to fingerprint ...
URLFPV2 fp = URLUtils.getURLFPV2FromURL(key.toString());
if (fp != null) {
// parse
Matcher m = METADATA_PATTERN.matcher(value.toString().trim());
if (m.matches()) {
try {
float rank = Float.parseFloat(m.group(1));
float rank10 = Float.parseFloat(m.group(2));
boolean crawled = m.group(3).equalsIgnoreCase(BLEKKO_CRAWL_STATUS_CRAWLED);
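// bucket rank10 into coarse histogram counters so the job reports a rank distribution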
if (rank10 >= 0.0 && rank10 < 1)
reporter.incrCounter(Counters.RANK10_LT_1, 1);
else if (rank10 >= 1.0 && rank10 < 2)
reporter.incrCounter(Counters.RANK10_LT_2, 1);
else if (rank10 >= 2.0 && rank10 < 3)
reporter.incrCounter(Counters.RANK10_LT_3, 1);
else if (rank10 >= 3.0 && rank10 < 4)
reporter.incrCounter(Counters.RANK10_LT_4, 1);
else if (rank10 >= 4.0)
reporter.incrCounter(Counters.RANK10_GT_4, 1);
if (crawled)
reporter.incrCounter(Counters.BLEKKO_CRAWLED, 1);
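// build the Blekko metadata record and wrap it, along with the source URL, in a top-level JSON object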
JsonObject jsonMetadata = new JsonObject();
jsonMetadata.addProperty(CrawlDBMergingReducer.BLEKKO_METADATA_TIMESTAMP_PROPERTY, metadataTimestamp);
jsonMetadata.addProperty(CrawlDBMergingReducer.BLEKKO_METADATA_RANK, rank);
jsonMetadata.addProperty(CrawlDBMergingReducer.BLEKKO_METADATA_RANK_10, rank10);
jsonMetadata.addProperty(CrawlDBMergingReducer.BLEKKO_METADATA_STATUS, m.group(3));
JsonObject topLevelObject = new JsonObject();
topLevelObject.add(CrawlDBMergingReducer.TOPLEVEL_BLEKKO_METADATA_PROPERTY, jsonMetadata);
topLevelObject.addProperty(CrawlDBMergingReducer.TOPLEVEL_SOURCE_URL_PROPRETY, key.toString());
// get crawl db key format key
TextBytes keyOut = CrawlDBKey.generateKey(fp, Type.KEY_TYPE_MERGED_RECORD, metadataTimestamp);
// emit
output.collect(keyOut, new TextBytes(topLevelObject.toString()));
}
catch (Exception e) {
LOG.error(CCStringUtils.stringifyException(e));
reporter.incrCounter(Counters.EXCEPTION_DURING_PARSE, 1);
}
}
else {
reporter.incrCounter(Counters.BAD_METADATA, 1);
LOG.info("Bad Metadata:" + value.toString() + " Len:" + value.getLength());
}
}
else {
reporter.incrCounter(Counters.NULL_FP, 1);
LOG.info("NULLFP:" + key.toString());
}
}
}
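/** Sanity-checks METADATA_PATTERN against representative metadata lines, including one with a trailing newline. */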
@Test
public void testPattern() {
String testDatum1 = "rank=0.0 rank10=1.00 crawled";
String testDatum2 = "rank=2.0 rank10=3.01 redir";
String testDatum3 = "rank=0.0 rank10=0.00 crawled\n";
{
Matcher m = METADATA_PATTERN.matcher(testDatum1);
Assert.assertTrue(m.matches());
Assert.assertEquals("0.0", m.group(1));
Assert.assertEquals("1.00", m.group(2));
Assert.assertEquals("crawled", m.group(3));
}
{
Matcher m = METADATA_PATTERN.matcher(testDatum2);
Assert.assertTrue(m.matches());
Assert.assertEquals("2.0", m.group(1));
Assert.assertEquals("3.01", m.group(2));
Assert.assertEquals("redir", m.group(3));
}
{
Matcher m = METADATA_PATTERN.matcher(testDatum3);
Assert.assertTrue(m.matches());
Assert.assertEquals("0.0", m.group(1));
Assert.assertEquals("0.00", m.group(2));
Assert.assertEquals("crawled", m.group(3));
Assert.assertEquals(3, m.groupCount());
}
}
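/**
 * Step 1: shards raw Blekko URL metadata into CrawlDBCommon.NUM_SHARDS sequence
 * files, keyed by CrawlDBKey, partitioned with CrawlDBKeyPartitioner, and sorted
 * with LinkKeyComparator, so the import step can merge them against the
 * identically sharded CrawlDB.
 */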
static void runShardStep(CommandLine commandLine) throws IOException {
if (!commandLine.hasOption("timestamp")) {
throw new IOException("Required timestamp parameter missing!");
}
Path inputPath = new Path(commandLine.getOptionValue("input"));
Path outputPath = new Path(commandLine.getOptionValue("output"));
Configuration conf = new Configuration();
// set the timestamp property in the config ...
conf.setLong(BLEKKO_TIMESTAMP_PROPERTY, Long.parseLong(commandLine.getOptionValue("timestamp")));
JobConf jobConf = new JobBuilder("Shard Belkko URL Metadata", conf)
.input(inputPath)
.inputFormat(SequenceFileInputFormat.class)
.mapper(BlekkoURLMetadataToJSONMapper.class)
.mapperKeyValue(TextBytes.class, TextBytes.class)
.outputKeyValue(TextBytes.class, TextBytes.class)
.outputFormat(SequenceFileOutputFormat.class)
.partition(CrawlDBKeyPartitioner.class)
.sort(LinkKeyComparator.class)
.numReducers(CrawlDBCommon.NUM_SHARDS)
.speculativeExecution(true)
.output(outputPath)
.compressMapOutput(true)
.maxMapAttempts(4)
.maxReduceAttempts(3)
.maxMapTaskFailures(1)
.compressor(CompressionType.BLOCK, SnappyCodec.class)
.build();
LOG.info("Starting JOB:" + jobConf);
try {
JobClient.runJob(jobConf);
LOG.info("Finished JOB:" + jobConf);
}
catch (Exception e) {
LOG.info("JOB Exec Failed for:" + jobConf);
LOG.error(CCStringUtils.stringifyException(e));
}
}
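/**
 * Step 2: merges the sharded Blekko metadata with the existing CrawlDB.
 * MultiFileMergeInputFormat/MultiFileMergePartitioner route each shard index
 * to a single reducer, where CrawlDBMergeSortReducer performs the key-ordered
 * merge of the Blekko records with the corresponding CrawlDB shard.
 */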
static void runImportStep(CommandLine commandLine) throws IOException {
if (!commandLine.hasOption("crawldb")) {
throw new IOException("CrawlDB required parameter missing!");
}
Path inputPath = new Path(commandLine.getOptionValue("input"));
Path outputPath = new Path(commandLine.getOptionValue("output"));
Path crawldbPath = new Path(commandLine.getOptionValue("crawldb"));
int shardCount = CrawlDBCommon.NUM_SHARDS;
if (commandLine.hasOption("shards")) {
shardCount = Integer.parseInt(commandLine.getOptionValue("shards"));
}
Configuration conf = new Configuration();
// construct input paths ...
ArrayList<Path> inputPaths = new ArrayList<Path>();
inputPaths.add(inputPath);
inputPaths.add(crawldbPath);
JobConf jobConf = new JobBuilder("Merge Blekko Data", conf)
.inputs(inputPaths)
.inputFormat(MultiFileMergeUtils.MultiFileMergeInputFormat.class)
.mapperKeyValue(IntWritable.class, Text.class)
.outputKeyValue(TextBytes.class, TextBytes.class)
.outputFormat(SequenceFileOutputFormat.class)
.reducer(CrawlDBMergeSortReducer.class, false)
.partition(MultiFileMergeUtils.MultiFileMergePartitioner.class)
.numReducers(shardCount)
.speculativeExecution(true)
.output(outputPath)
.compressMapOutput(true)
.compressor(CompressionType.BLOCK, GzipCodec.class)
.maxMapAttempts(10)
.maxReduceAttempts(4)
.maxMapTaskFailures(1)
.reuseJVM(1)
.build();
LOG.info("Starting JOB:" + jobConf);
try {
JobClient.runJob(jobConf);
LOG.info("Finished JOB:" + jobConf);
}
catch (Exception e) {
LOG.info("JOB Exec Failed for:" + jobConf);
LOG.error(CCStringUtils.stringifyException(e));
}
}
}