package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.GenSuperDomainListStep;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

import static org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon.*;

/**
 * Pipeline step that aggregates per-subdomain statistics (the output of
 * {@link JoinDomainMetadataStep}) up to their root domains, emitting one JSON
 * summary record per root domain.
 */
public class CollectSubDomainStatsStep extends CrawlPipelineStep {

  public static final String OUTPUT_DIR_NAME = "AggegateDomainStats";

  public static final String SUPER_DOMAIN_FILE_PATH = "superDomainFilePath";

  private static final Log LOG = LogFactory.getLog(CollectSubDomainStatsStep.class);

  public CollectSubDomainStatsStep(CrawlPipelineTask task) {
    super(task, "Aggregate SubDomain Stats", OUTPUT_DIR_NAME);
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    Configuration conf = new Configuration();

    // Use the part files produced by the join step as job inputs.
    ArrayList<Path> paths = new ArrayList<Path>();
    FileSystem fs = FileSystem.get(outputPathLocation.toUri(), conf);
    FileStatus files[] = fs.globStatus(new Path(getOutputDirForStep(JoinDomainMetadataStep.class), "part-*"));
    for (FileStatus file : files)
      paths.add(file.getPath());

    Path superDomainPath = new Path(getRootTask().getOutputDirForStep(GenSuperDomainListStep.class), "part-00000");
    LOG.info("Super Domain File Path is:" + superDomainPath);

    JobConf job = new JobBuilder("Collect SubDomain Stats", new Configuration())
        .inputIsSeqFile()
        .inputs(paths)
        .keyValue(TextBytes.class, TextBytes.class)
        .mapper(SubDomainStatsMapper.class)
        .reducer(SubDomainStatsReducer.class, false)
        .outputIsSeqFile()
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2)
        .output(outputPathLocation)
        .set(SUPER_DOMAIN_FILE_PATH, superDomainPath.toString())
        .build();

    JobClient.runJob(job);
  }

  /**
   * Mapper: re-keys each subdomain record by its root domain so that all
   * subdomains of a given root domain arrive at the same reducer.
   */
  public static class SubDomainStatsMapper implements Mapper<TextBytes, TextBytes, TextBytes, TextBytes> {

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
        throws IOException {
      String domainName = key.toString();
      String rootDomain = URLUtils.extractRootDomainName(domainName);
      if (rootDomain != null) {
        output.collect(new TextBytes(rootDomain), value);
      }
    }
  }
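
  /**
   * Reducer: for each root domain, loads the super-domain id list (path passed
   * via the SUPER_DOMAIN_FILE_PATH job property), parses each subdomain's JSON
   * record, accumulates Blekko, crawl-stats and rank metrics in
   * DescriptiveStatistics instances, and emits a single JSON summary record
   * keyed by the root domain.
   */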
  public static class SubDomainStatsReducer implements Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

    Set<Long> superDomainIdSet = null;

    JsonParser parser = new JsonParser();

    @Override
    public void configure(JobConf job) {
      Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));
      try {
        superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }

    @Override
    public void close() throws IOException {
    }

    @Override
    public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
        Reporter reporter) throws IOException {
      // get root domain fingerprint and check whether it is a super domain
      long rootDomainFP = SuperDomainList.domainFingerprintGivenName(key.toString());
      boolean isSuperDomain = superDomainIdSet.contains(rootDomainFP);

      DescriptiveStatistics ccURLSPerDomain = new DescriptiveStatistics();
      DescriptiveStatistics blekkoURLSPerDomain = new DescriptiveStatistics();
      DescriptiveStatistics blekkoRank = new DescriptiveStatistics();
      DescriptiveStatistics domainRank = new DescriptiveStatistics();
      DescriptiveStatistics blekkoisPorn = new DescriptiveStatistics();
      DescriptiveStatistics blekkoisSpam = new DescriptiveStatistics();
      DescriptiveStatistics blekkoCrawlCount = new DescriptiveStatistics();
      DescriptiveStatistics blekkoPRCount = new DescriptiveStatistics();
      DescriptiveStatistics externallyLinkedCount = new DescriptiveStatistics();

      int subDomainCount = 0;
      int hadBlekkoMetadata = 0;
      int totalURLS = 0;

      while (values.hasNext()) {
        JsonObject subDomainStats = parser.parse(values.next().toString()).getAsJsonObject();

        if (subDomainStats.has(CrawlStatsCommon.JOINEDMETDATA_PROPERTY_BELKKO)) {
          hadBlekkoMetadata++;
          JsonObject blekkoMetadata = subDomainStats.getAsJsonObject(CrawlStatsCommon.JOINEDMETDATA_PROPERTY_BELKKO);
          double rank = JSONUtils.safeGetDouble(blekkoMetadata, CrawlDBCommon.BLEKKO_METADATA_RANK_10);
          int isPorn = JSONUtils.safeGetInteger(blekkoMetadata, CrawlDBCommon.BLEKKO_METADATA_ISPORN);
          int isSpam = JSONUtils.safeGetInteger(blekkoMetadata, CrawlDBCommon.BLEKKO_METADATA_ISSPAM);
          // NaN never compares equal to anything, so use Double.isNaN rather than !=
          if (!Double.isNaN(rank)) {
            blekkoRank.addValue(rank);
          }
          if (isPorn == 1)
            blekkoisPorn.addValue(1.0);
          if (isSpam == 1)
            blekkoisSpam.addValue(1.0);
        }
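
        // Crawl-stats metadata: per-subdomain URL counts, Blekko crawl/PR counts
        // and externally linked URL counts. A value of -1 from safeGetInteger is
        // treated as missing and counted as zero.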
        if (subDomainStats.has(CrawlStatsCommon.JOINEDMETDATA_PROPERTY_CRAWLSTATS)) {
          JsonObject crawlStatsMetadata = subDomainStats.getAsJsonObject(CrawlStatsCommon.JOINEDMETDATA_PROPERTY_CRAWLSTATS);

          int blekkoCrawlCountRaw = JSONUtils.safeGetInteger(crawlStatsMetadata, CrawlStatsCommon.CRAWLSTATS_BLEKKO_CRAWLED_COUNT);
          blekkoCrawlCount.addValue((blekkoCrawlCountRaw == -1) ? 0.0 : (double) blekkoCrawlCountRaw);

          int blekkoPageRankCount = JSONUtils.safeGetInteger(crawlStatsMetadata, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL_HAD_GT_1_RANK);
          blekkoPRCount.addValue((blekkoPageRankCount == -1) ? 0.0 : (double) blekkoPageRankCount);

          int externallyLinkedURLS = JSONUtils.safeGetInteger(crawlStatsMetadata, CrawlStatsCommon.CRAWLSTATS_EXTERNALLY_LINKED_URLS);
          externallyLinkedCount.addValue((externallyLinkedURLS == -1) ? 0.0 : (double) externallyLinkedURLS);

          int urlCount = JSONUtils.safeGetInteger(crawlStatsMetadata, CrawlStatsCommon.CRAWLSTATS_URL_COUNT);
          totalURLS += Math.max(0, urlCount);
          if (urlCount != 0) {
            ccURLSPerDomain.addValue((double) urlCount);
          }

          int blekkoURLCount = JSONUtils.safeGetInteger(crawlStatsMetadata, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL);
          if (blekkoURLCount != -1) {
            blekkoURLSPerDomain.addValue((double) blekkoURLCount);
          }
        }

        if (subDomainStats.has(CrawlStatsCommon.JOINEDMETDATA_PROPERTY_RANK)) {
          domainRank.addValue(subDomainStats.get(CrawlStatsCommon.JOINEDMETDATA_PROPERTY_RANK).getAsDouble());
        }

        subDomainCount++;
      }

      JsonObject summaryRecordOut = new JsonObject();

      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_IS_SUPERDOMAIN, isSuperDomain);
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_TOTAL_URLS, totalURLS);
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_SUBDOMAIN_COUNT, subDomainCount);
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_MEAN_URLS_PER_DOMAIN, ccURLSPerDomain.getMean());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_MAX_URLS_IN_A_DOMAIN, ccURLSPerDomain.getMax());
      // summaryRecordOut.addProperty(ROOTDOMAIN_STATS_URLS_PER_DOMAIN_90TH, ccURLSPerDomain.getPercentile(90));
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_URLS_PER_DOMAIN_STDDEV, ccURLSPerDomain.getStandardDeviation());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_TOTAL_URLS, blekkoURLSPerDomain.getSum());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_MEAN_URLS_PER_DOMAIN, blekkoURLSPerDomain.getMean());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_RANK_MEAN, blekkoRank.getMean());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_RANK_MAX, blekkoRank.getMax());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_RANK_TOTAL, blekkoRank.getSum());
      // summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_RANK_90TH, blekkoRank.getPercentile(90));
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_DOMAIN_RANK_MEAN, domainRank.getMean());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_DOMAIN_RANK_MAX, domainRank.getMax());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_DOMAIN_RANK_TOTAL, domainRank.getSum());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_DOMAIN_RANK_STDDEV, domainRank.getStandardDeviation());
      // summaryRecordOut.addProperty(ROOTDOMAIN_STATS_DOMAIN_RANK_90TH, domainRank.getPercentile(90));
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_IS_PORN, blekkoisPorn.getN());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_IS_SPAM, blekkoisSpam.getN());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_MEAN_CRAWL_COUNT, blekkoCrawlCount.getMean());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_TOTAL_CRAWL_COUNT, blekkoCrawlCount.getSum());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_MEAN_PR_COUNT, blekkoPRCount.getMean());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_MAX_PR_COUNT, blekkoPRCount.getMax());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_BLEKKO_TOTAL_PR_COUNT, blekkoPRCount.getSum());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_MEAN_EXT_LINKED_URLS, externallyLinkedCount.getMean());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_MAX_EXT_LINKED_URLS, externallyLinkedCount.getMax());
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_TOTAL_EXT_LINKED_URLS, externallyLinkedCount.getSum());
      // summaryRecordOut.addProperty(ROOTDOMAIN_STATS_EXT_LINKED_URLS_90TH, externallyLinkedCount.getPercentile(90));
      summaryRecordOut.addProperty(ROOTDOMAIN_STATS_EXT_LINKED_URLS_STDDEV, externallyLinkedCount.getStandardDeviation());
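
      // Emit one summary record per root domain, keyed by the root domain name.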
      output.collect(key, new TextBytes(summaryRecordOut.toString()));
    }
  }
}