package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats; import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile.CompressionType; import org.apache.hadoop.io.compress.SnappyCodec; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep; import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask; import org.commoncrawl.util.JSONUtils; import org.commoncrawl.util.JobBuilder; import org.commoncrawl.util.TextBytes; import com.google.gson.JsonObject; import com.google.gson.JsonParser; public class ClassifyDomains extends CrawlPipelineStep { public static final String OUTPUT_DIR_NAME = "classifyDomains"; private static final Log LOG = LogFactory.getLog(ClassifyDomains.class); public ClassifyDomains(CrawlPipelineTask task) { super(task, "Classify Root Domains", OUTPUT_DIR_NAME); } @Override public Log getLogger() { return LOG; } @Override public void runStep(Path outputPathLocation) throws IOException { JobConf job = new JobBuilder("Classify Domains", new Configuration()) .input(getOutputDirForStep(CollectSubDomainStatsStep.class)) .inputIsSeqFile() .keyValue(TextBytes.class, TextBytes.class) .output(outputPathLocation) .outputIsSeqFile() .numReducers(8) .mapper(SimpleClassifier.class) .compressor(CompressionType.BLOCK, SnappyCodec.class) .build(); JobClient.runJob(job); } public static class SimpleClassifier implements Mapper<TextBytes,TextBytes,TextBytes,TextBytes> { @Override public void configure(JobConf job) { } @Override public void close() throws IOException { } JsonParser parser = new JsonParser(); @Override public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException { JsonObject domainMetadata = parser.parse(value.toString()).getAsJsonObject(); boolean isSuperDomain = JSONUtils.safeGetBoolean(domainMetadata,CrawlStatsCommon.ROOTDOMAIN_STATS_IS_SUPERDOMAIN); int totalURLS = JSONUtils.safeGetInteger(domainMetadata,CrawlStatsCommon.ROOTDOMAIN_STATS_TOTAL_URLS); int subDomains = JSONUtils.safeGetInteger(domainMetadata,CrawlStatsCommon.ROOTDOMAIN_STATS_SUBDOMAIN_COUNT); double blekkoURLS = JSONUtils.safeGetDouble(domainMetadata,CrawlStatsCommon.ROOTDOMAIN_STATS_BLEKKO_TOTAL_URLS); double blekkoCrawledURLS = JSONUtils.safeGetDouble(domainMetadata,CrawlStatsCommon.ROOTDOMAIN_STATS_BLEKKO_TOTAL_CRAWL_COUNT); double domainRankTotal = JSONUtils.safeGetDouble(domainMetadata, CrawlStatsCommon.ROOTDOMAIN_STATS_DOMAIN_RANK_TOTAL); double blekkoRankTotal = JSONUtils.safeGetDouble(domainMetadata, CrawlStatsCommon.ROOTDOMAIN_STATS_BLEKKO_RANK_TOTAL); double isSpam = JSONUtils.safeGetDouble(domainMetadata, CrawlStatsCommon.ROOTDOMAIN_STATS_BLEKKO_IS_SPAM); double isPorn = JSONUtils.safeGetDouble(domainMetadata, CrawlStatsCommon.ROOTDOMAIN_STATS_BLEKKO_IS_PORN); boolean limitedCrawl = false; boolean blackList = false; double relativeDomainRank = 0.0; if (blekkoURLS != 0.0) { relativeDomainRank = blekkoCrawledURLS / blekkoURLS; } if (isSuperDomain) { double dRAvg = domainRankTotal / (double) subDomains; double bRAvg = blekkoRankTotal / (double)subDomains; if (dRAvg < .001 || bRAvg < .001) { isSuperDomain = false; } } if (!isSuperDomain) { if (isPorn > 0.0 || isSpam > 0.0) { blackList = true; } else { if (blekkoCrawledURLS == 0.0) { blackList = true; } else if (blekkoCrawledURLS == 1.0) { limitedCrawl = true; } else { } } } JsonObject objectOut = new JsonObject(); objectOut.addProperty(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_SUPERDOMAIN,isSuperDomain); objectOut.addProperty(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_BLACKLISTED,blackList); if (!blackList) { objectOut.addProperty(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_LIMITED_CRAWL,limitedCrawl); objectOut.addProperty(CrawlStatsCommon.ROOTDOMAIN_CLASSIFY_RELATIVE_RANK,relativeDomainRank); } output.collect(key, new TextBytes(objectOut.toString())); } } }