package org.commoncrawl.mapred.ec2.postprocess.crawldb;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;
import com.google.common.collect.ImmutableList;
import com.google.gson.JsonObject;
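
/**
 * One-shot Hadoop job that imports raw Blekko domain metadata (SequenceFiles
 * of domain name -> attribute string pairs) into the crawl database, emitting
 * one JSON metadata record per normalized (www-stripped) domain name.
 *
 * A hypothetical invocation (jar name and input path are illustrative only):
 *
 *   hadoop jar commoncrawl.jar \
 *     org.commoncrawl.mapred.ec2.postprocess.crawldb.PrepareBlekkoDomainMetadata \
 *     -input /path/to/blekko/metadata
 */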
public class PrepareBlekkoDomainMetadata {
  static final Log LOG = LogFactory.getLog(PrepareBlekkoDomainMetadata.class);

  static Options options = new Options();

  static {
    // the single required argument: the directory containing the raw Blekko metadata sequence files
    options.addOption(OptionBuilder.withArgName("input").hasArg(true).isRequired().withDescription("Input Path").create("input"));
  }
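
  /**
   * Parses the command line and kicks off the import job; on failure the
   * usage text is printed and the exception is rethrown.
   */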
  public static void main(String[] args) throws Exception {
    CommandLineParser parser = new GnuParser();
    try {
      // parse the command line arguments
      CommandLine cmdLine = parser.parse(options, args);
      // run the import job
      importRawDomainMetadata(cmdLine);
    }
    catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
      HelpFormatter formatter = new HelpFormatter();
      formatter.printHelp("PrepareBlekkoDomainMetadata", options);
      throw e;
    }
  }
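
  /**
   * Configures and runs the import: globs all *.seq files under the input
   * path, runs them through MapBlekkoMetadata, and writes block-compressed
   * (Snappy) SequenceFile output sharded across half the crawl database's
   * shard count.
   */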
  static void importRawDomainMetadata(CommandLine commandLine) throws IOException {
    Path inputPath = new Path(commandLine.getOptionValue("input"));
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(inputPath.toUri(), conf);

    // collect all sequence files under the input path
    ArrayList<Path> paths = new ArrayList<Path>();
    FileStatus[] files = fs.globStatus(new Path(inputPath, "*.seq"));
    for (FileStatus file : files) {
      paths.add(file.getPath());
    }

    JobConf job = new JobBuilder("Import Blekko Metadata", conf)
      .keyValue(TextBytes.class, TextBytes.class)
      .mapper(MapBlekkoMetadata.class)
      .numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2)
      .inputs(paths)
      .output(new Path(CrawlDBCommon.BLEKKO_DOMAIN_METADATA_PATH))
      .inputIsSeqFile()
      .outputIsSeqFile()
      .compressor(CompressionType.BLOCK, SnappyCodec.class)
      .build();

    JobClient.runJob(job);
  }
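
  /**
   * Mapper that converts one raw Blekko metadata record into a JSON object
   * keyed by the www-stripped domain name.
   */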
  public static class MapBlekkoMetadata implements Mapper<Text, Text, TextBytes, TextBytes> {

    @Override
    public void configure(JobConf job) {
      // no per-task configuration required
    }

    @Override
    public void close() throws IOException {
      // nothing to clean up
    }
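
    // counters recording which attributes were seen, plus parse failures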
    enum Counters {
      HAD_WWW_PREFIX, HIT_EXCEPTION, HAD_RANK10, HAD_IP, HAD_ISPORN, HAD_ISSPAM
    }
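
    // Each input value is a flat attribute string; every field is pulled out
    // with its own regex, so the overall delimiter format doesn't matter here.
    // A hypothetical example value (inferred from the patterns below, not from
    // any Blekko spec): "rank10=5.3 ip=192.0.2.1 bool_porn=0 bool_spam=1"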
    static Pattern rank10Extractor = Pattern.compile("rank10=([0-9.]*)");
    static Pattern ipExtractor = Pattern.compile("ip=([0-9.]*)");
    static Pattern isPorn = Pattern.compile("bool_porn=1");
    static Pattern isSpam = Pattern.compile("bool_spam=1");
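
    /**
     * Maps a raw (domain, attribute string) pair to a (normalized domain, JSON
     * metadata) pair: strips any leading "www." from the key, extracts rank10,
     * ip, and the porn/spam flags from the value, and emits the result as JSON.
     * Records that throw are counted and skipped rather than failing the task.
     */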
    @Override
    public void map(Text key, Text value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {
      try {
        String domainName = key.toString();
        // normalize the key so www and non-www variants collapse to the same
        // domain, but remember the prefix in the output metadata
        boolean hadWWWPrefix = false;
        if (domainName.startsWith("www.")) {
          domainName = domainName.substring("www.".length());
          hadWWWPrefix = true;
          reporter.incrCounter(Counters.HAD_WWW_PREFIX, 1);
        }

        JsonObject metadataOut = new JsonObject();
        if (hadWWWPrefix) {
          metadataOut.addProperty(CrawlDBCommon.BLEKKO_METADATA_WWW_PREFIX, true);
        }

        String metadata = value.toString();

        Matcher rank10Matcher = rank10Extractor.matcher(metadata);
        if (rank10Matcher.find()) {
          metadataOut.addProperty(CrawlDBCommon.BLEKKO_METADATA_RANK_10, Double.parseDouble(rank10Matcher.group(1)));
          reporter.incrCounter(Counters.HAD_RANK10, 1);
        }

        Matcher ipExtractorMatcher = ipExtractor.matcher(metadata);
        if (ipExtractorMatcher.find()) {
          metadataOut.addProperty(CrawlDBCommon.BLEKKO_METADATA_IP, ipExtractorMatcher.group(1));
          reporter.incrCounter(Counters.HAD_IP, 1);
        }

        if (isPorn.matcher(metadata).find()) {
          metadataOut.addProperty(CrawlDBCommon.BLEKKO_METADATA_ISPORN, 1);
          reporter.incrCounter(Counters.HAD_ISPORN, 1);
        }

        if (isSpam.matcher(metadata).find()) {
          metadataOut.addProperty(CrawlDBCommon.BLEKKO_METADATA_ISSPAM, 1);
          reporter.incrCounter(Counters.HAD_ISSPAM, 1);
        }

        output.collect(new TextBytes(domainName), new TextBytes(metadataOut.toString()));
      }
      catch (Exception e) {
        // log and count the bad record, but keep the task running
        LOG.error(CCStringUtils.stringifyException(e));
        reporter.incrCounter(Counters.HIT_EXCEPTION, 1);
      }
    }
  }
}