package org.commoncrawl.mapred.pipelineV3.crawllistgen; import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask; import org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.ClassifyDomains; import org.commoncrawl.protocol.URLFPV2; import org.commoncrawl.util.JobBuilder; import org.commoncrawl.util.TextBytes; import org.commoncrawl.util.URLUtils; public class ShardRootDomainClassificationStep extends CrawlPipelineStep{ public static final String OUTPUT_DIR_NAME = "rootClassification"; private static final Log LOG = LogFactory.getLog(ShardRootDomainClassificationStep.class); public ShardRootDomainClassificationStep(CrawlPipelineTask task) { super(task, "Shard Root Classification", OUTPUT_DIR_NAME); } @Override public Log getLogger() { return LOG; } @Override public void runStep(Path outputPathLocation) throws IOException { JobConf job = new JobBuilder("Shard Classification Data", new Configuration()) .input(getRootTask().getOutputDirForStep(ClassifyDomains.class)) .inputIsSeqFile() .mapper(KeyTransformer.class) .keyValue(TextBytes.class, TextBytes.class) .numReducers(8) .output(outputPathLocation) .sort(CrawlDBKey.LinkKeyComparator.class) .outputIsSeqFile() .build(); JobClient.runJob(job); } public static class KeyTransformer implements Mapper<TextBytes,TextBytes,TextBytes,TextBytes> { @Override public void configure(JobConf job) { } @Override public void close() throws IOException { // TODO Auto-generated method stub } enum Counters { BAD_HOSTNAME } @Override public void map(TextBytes key, TextBytes value,OutputCollector<TextBytes, TextBytes> output, Reporter reporter)throws IOException { URLFPV2 fp = URLUtils.getURLFPV2FromHost(key.toString()); if (fp == null) { reporter.incrCounter(Counters.BAD_HOSTNAME, 1); } else { fp.setDomainHash(Long.MIN_VALUE); fp.setUrlHash(Long.MIN_VALUE); TextBytes outputKey = CrawlDBKey.generateKey(fp, CrawlDBKey.Type.KEY_TYPE_ROOTDOMAIN_METADATA_RECORD, 0L); output.collect(outputKey,value); } } } }