package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;

import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBCommon;
import org.commoncrawl.mapred.ec2.postprocess.crawldb.CrawlDBKey;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.GenSuperDomainListStep;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.KeyAndValueData;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileInputReader.RawRecordValue;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergeInputFormat;
import org.commoncrawl.util.MultiFileMergeUtils.MultiFileMergePartitioner;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.Tuples.Pair;
import org.commoncrawl.util.URLUtils;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Pipeline step that scans the merged crawl database and emits one JSON
 * stats record per host: URL/attempt/crawl counts, redirect behavior,
 * server IPs, externally-linked URL counts, and blekko overlap.
 */
public class NewCrawlStatsCollectorStep extends CrawlPipelineStep {

  private static final Log LOG = LogFactory.getLog(NewCrawlStatsCollectorStep.class);

  public static final String OUTPUT_DIR_NAME = "crawlDBStatsV2";
  public static final String SUPER_DOMAIN_FILE_PATH = "super-domain-list";

  public NewCrawlStatsCollectorStep(CrawlPipelineTask task) {
    super(task, "Crawl DB Stats Collector", OUTPUT_DIR_NAME);
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    LOG.info("Task Identity Path is:" + getTaskIdentityPath());

    DomainMetadataTask rootTask = (DomainMetadataTask) getRootTask();

    ImmutableList<Path> paths = new ImmutableList.Builder<Path>()
        .addAll(rootTask.getRestrictedMergeDBDataPaths())
        .build();

    Path superDomainListPath = new Path(getOutputDirForStep(GenSuperDomainListStep.class), "part-00000");
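    /*
     * Wiring note (a sketch of the data flow, inferred from the reducer
     * below): MultiFileMergeInputFormat does not stream crawl records to the
     * reducers. Each reduce key is a partition number (IntWritable) and each
     * value is the string form of a merge-DB file path assigned to that
     * partition, e.g. (hypothetical path):
     *
     *   key:   57
     *   value: "hdfs://namenode/crawl/mergedDB/part-00057"
     *
     * The reducer re-opens those files itself and merge-sorts them via
     * MultiFileInputReader, which is why the job is configured with
     * MultiFileMergePartitioner and a large reducer count but no mapper logic.
     */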
    JobConf jobConf = new JobBuilder("New Domain Stats Collector", getConf())
        .inputs(paths)
        .inputFormat(MultiFileMergeInputFormat.class)
        .mapperKeyValue(IntWritable.class, Text.class)
        .outputKeyValue(TextBytes.class, TextBytes.class)
        .outputFormat(SequenceFileOutputFormat.class)
        .reducer(CrawlDBStatsCollectingReducer.class, false)
        .partition(MultiFileMergePartitioner.class)
        .numReducers(1000)
        .maxReduceAttempts(4)
        .maxReduceTaskFailures(10)
        .speculativeExecution(true)
        .output(outputPathLocation)
        .compressMapOutput(false)
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .set(SUPER_DOMAIN_FILE_PATH, superDomainListPath.toString())
        .build();

    LOG.info("Starting JOB");
    JobClient.runJob(jobConf);
    LOG.info("Finished JOB");
  }

  public static class CrawlDBStatsCollectingReducer implements Reducer<IntWritable, Text, TextBytes, TextBytes> {

    // id set of known super-domains, loaded from the GenSuperDomainListStep output
    Set<Long> superDomainIdSet;
    JobConf _jobConf;

    @Override
    public void configure(JobConf job) {
      _jobConf = job;
      Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));
      try {
        superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
    }

    @Override
    public void close() throws IOException {
    }

    // per-host aggregation state: stats accumulate in activeRecord until the
    // incoming host name changes, at which point the record is flushed
    String activeDomain = "";
    JsonObject activeRecord = new JsonObject();
    OutputCollector<TextBytes, TextBytes> _collector;
    TextBytes activeDomainKeyBytes = new TextBytes();
    TextBytes activeDomainValueBytes = new TextBytes();
    HashSet<String> activeDomainIPs = new HashSet<String>();

    private void setActiveDomain(String hostName) throws IOException {
      if (!activeDomain.equalsIgnoreCase(hostName)) {
        // host changed - emit the accumulated record for the previous host
        if (activeDomain.length() != 0) {
          if (activeDomainIPs.size() != 0) {
            JSONUtils.stringCollectionToJsonArray(activeRecord, CrawlStatsCommon.CRAWLSTATS_IPS, activeDomainIPs);
          }
          activeDomainKeyBytes.set(activeDomain);
          activeDomainValueBytes.set(activeRecord.toString());
          _collector.collect(activeDomainKeyBytes, activeDomainValueBytes);
        }
        activeDomain = hostName;
        activeRecord = new JsonObject();
        activeDomainIPs.clear();
      }
    }

    static final String stripWWW(String host) {
      if (host.startsWith("www.")) {
        return host.substring("www.".length());
      }
      return host;
    }

    enum Counters {
      HIT_EXCEPTION_PROCESSING_RECORD, NO_SOURCE_URL_PROPERTY, INVALID_URL_DETECTED, HIT_SUMMARY_RECORD,
      HAD_SUMMARY_DETAILS_ARRAY, HAD_ATTEMPT_COUNT, HAD_CRAWL_COUNT, HAD_REDIRECT_URL_PROPERTY,
      REDIRECT_WENT_OUT_OF_DOMAIN, WWW_TO_NON_WWW_DETECTED, NON_WWW_TO_WWW_DETECTED, GOT_LINK_STATUS_RECORD,
      GOT_EXTRADOMAIN_SOURCES_COUNT, GOT_BLEKKO_RECORD, GOT_BLEKKO_GT1_RECORD, GOT_MERGED_RECORD
    }

    @Override
    public void reduce(IntWritable key, Iterator<Text> values,
        OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {

      // collect all incoming paths first
      Vector<Path> incomingPaths = new Vector<Path>();

      while (values.hasNext()) {
        String path = values.next().toString();
        LOG.info("Found Incoming Path:" + path);
        incomingPaths.add(new Path(path));
      }

      // set up merge attributes
      Configuration localMergeConfig = new Configuration(_jobConf);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_COMPARATOR_CLASS,
          CrawlDBKey.LinkKeyComparator.class, RawComparator.class);
      localMergeConfig.setClass(MultiFileInputReader.MULTIFILE_KEY_CLASS,
          TextBytes.class, WritableComparable.class);

      FileSystem fs = FileSystem.get(incomingPaths.get(0).toUri(), _jobConf);

      // ok now spawn merger
      MultiFileInputReader<TextBytes> multiFileInputReader =
          new MultiFileInputReader<TextBytes>(fs, incomingPaths, localMergeConfig);

      try {
        Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
        TextBytes valueText = new TextBytes();
        DataInputBuffer valueStream = new DataInputBuffer();
        JsonParser parser = new JsonParser();

        _collector = output;
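        /*
         * Merge-iteration note: the MultiFileInputReader is configured with
         * CrawlDBKey.LinkKeyComparator, so records come back in CrawlDB key
         * order and all records belonging to one host arrive contiguously
         * (assuming that comparator groups by domain, which the flush logic
         * in setActiveDomain relies on). Sketch of the stream:
         *
         *   example.com/a -> accumulate into activeRecord
         *   example.com/b -> accumulate into activeRecord
         *   example.org/x -> setActiveDomain flushes "example.com", resets state
         */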
        while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
          long recordType = CrawlDBKey.getLongComponentFromKey(nextItem.e0._keyObject,
              CrawlDBKey.ComponentId.TYPE_COMPONENT_ID);

          if (recordType == CrawlDBKey.Type.KEY_TYPE_MERGED_RECORD.ordinal()) {
            reporter.incrCounter(Counters.GOT_MERGED_RECORD, 1);
            // walk records
            RawRecordValue rawValue = Iterators.getNext(nextItem.e1.iterator(), null);
            if (rawValue != null) {
              valueStream.reset(rawValue.data.getData(), 0, rawValue.data.getLength());
              valueText.setFromRawTextBytes(valueStream);
              try {
                JsonObject mergeRecord = parser.parse(valueText.toString()).getAsJsonObject();
                if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY)) {
                  String sourceURL = mergeRecord.get(CrawlDBCommon.TOPLEVEL_SOURCE_URL_PROPRETY).getAsString();
                  GoogleURL urlObject = new GoogleURL(sourceURL);
                  if (urlObject.isValid()) {
                    String hostName = stripWWW(urlObject.getHost());
                    setActiveDomain(hostName);
                    JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_URL_COUNT);

                    boolean crawled = false;
                    boolean inCrawlDB = false;

                    if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY)) {
                      reporter.incrCounter(Counters.HIT_SUMMARY_RECORD, 1);
                      inCrawlDB = true;
                      JsonObject crawlStatus = mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_SUMMARYRECORD_PROPRETY);

                      // collect the server IPs seen across individual crawl attempts
                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_SUMMARY_DETAILS_ARRAY, 1);
                        JsonArray crawlDetails = crawlStatus.getAsJsonArray(CrawlDBCommon.SUMMARYRECORD_CRAWLDETAILS_ARRAY_PROPERTY);
                        for (JsonElement arrayElement : crawlDetails) {
                          JsonObject crawlDetail = arrayElement.getAsJsonObject();
                          if (crawlDetail.has(CrawlDBCommon.CRAWLDETAIL_SERVERIP_PROPERTY)) {
                            activeDomainIPs.add(crawlDetail.get(CrawlDBCommon.CRAWLDETAIL_SERVERIP_PROPERTY).getAsString());
                          }
                        }
                      }

                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_ATTEMPT_COUNT_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_ATTEMPT_COUNT, 1);
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_ATTEMPTED_COUNT);
                      }

                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_CRAWLCOUNT_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_CRAWL_COUNT, 1);
                        if (crawlStatus.get(CrawlDBCommon.SUMMARYRECORD_CRAWLCOUNT_PROPERTY).getAsInt() > 0) {
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_CRAWLED_COUNT);
                          crawled = true;
                        }
                      }
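                      /*
                       * Redirect classification sketch: a redirect whose target
                       * root domain differs from the source root domain counts
                       * as redirected-out; within the same root domain, only
                       * www <-> non-www flips are tallied. For example (hosts
                       * are illustrative):
                       *
                       *   www.example.com -> example.com      WWW_TO_NON_WWW_DETECTED
                       *   example.com     -> www.example.com  NON_WWW_TO_WWW_DETECTED
                       *   example.com     -> other.net        REDIRECT_WENT_OUT_OF_DOMAIN
                       */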
                      if (crawlStatus.has(CrawlDBCommon.SUMMARYRECORD_REDIRECT_URL_PROPERTY)) {
                        reporter.incrCounter(Counters.HAD_REDIRECT_URL_PROPERTY, 1);
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_REDIRECTED_COUNT);
                        String redirectURL = crawlStatus.get(CrawlDBCommon.SUMMARYRECORD_REDIRECT_URL_PROPERTY).getAsString();
                        GoogleURL redirectURLObj = new GoogleURL(redirectURL);
                        if (redirectURLObj.isValid()) {
                          String originalRootHost = URLUtils.extractRootDomainName(urlObject.getHost());
                          String redirectRootHost = URLUtils.extractRootDomainName(redirectURLObj.getHost());
                          if (originalRootHost != null && redirectRootHost != null) {
                            if (!originalRootHost.equalsIgnoreCase(redirectRootHost)) {
                              reporter.incrCounter(Counters.REDIRECT_WENT_OUT_OF_DOMAIN, 1);
                              JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_REDIRECTED_OUT_COUNT);
                            } else {
                              if (!redirectURLObj.getHost().startsWith("www.") && urlObject.getHost().startsWith("www.")) {
                                reporter.incrCounter(Counters.WWW_TO_NON_WWW_DETECTED, 1);
                                JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_WWW_TO_NON_WWW_REDIRECT);
                              } else if (redirectURLObj.getHost().startsWith("www.") && !urlObject.getHost().startsWith("www.")) {
                                reporter.incrCounter(Counters.NON_WWW_TO_WWW_DETECTED, 1);
                                JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_NON_WWW_TO_WWW_REDIRECT);
                              }
                            }
                          }
                        }
                      }
                    } // end summary-record handling

                    if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY)) {
                      reporter.incrCounter(Counters.GOT_LINK_STATUS_RECORD, 1);
                      inCrawlDB = true;
                      JsonObject linkStatus = mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_LINKSTATUS_PROPERTY);
                      if (linkStatus.has(CrawlDBCommon.LINKSTATUS_EXTRADOMAIN_SOURCES_COUNT_PROPERTY)) {
                        reporter.incrCounter(Counters.GOT_EXTRADOMAIN_SOURCES_COUNT, 1);
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_EXTERNALLY_LINKED_URLS);
                        if (!crawled) {
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_EXTERNALLY_LINKED_NOT_CRAWLED_URLS);
                        }
                      }
                    }

                    if (mergeRecord.has(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY)) {
                      reporter.incrCounter(Counters.GOT_BLEKKO_RECORD, 1);
                      JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL);
                      if (!inCrawlDB) {
                        JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL_NOT_IN_CC);
                      }
                      JsonObject blekkoStatus = mergeRecord.getAsJsonObject(CrawlDBCommon.TOPLEVEL_BLEKKO_METADATA_PROPERTY);
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_STATUS)) {
                        String statusStr = blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_STATUS).getAsString();
                        if (statusStr.equalsIgnoreCase("crawled")) {
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_CRAWLED_COUNT);
                          if (crawled) {
                            JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_AND_CC_CRAWLED_COUNT);
                          }
                        }
                      }
                      if (blekkoStatus.has(CrawlDBCommon.BLEKKO_METADATA_RANK_10)) {
                        if (blekkoStatus.get(CrawlDBCommon.BLEKKO_METADATA_RANK_10).getAsDouble() >= 1.0) {
                          reporter.incrCounter(Counters.GOT_BLEKKO_GT1_RECORD, 1);
                          JSONUtils.safeIncrementJSONCounter(activeRecord, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL_HAD_GT_1_RANK);
                        }
                      }
                    }
                  } else {
                    reporter.incrCounter(Counters.INVALID_URL_DETECTED, 1);
                  }
                } else {
                  reporter.incrCounter(Counters.NO_SOURCE_URL_PROPERTY, 1);
                }
              } catch (Exception e) {
                reporter.incrCounter(Counters.HIT_EXCEPTION_PROCESSING_RECORD, 1);
                LOG.error(CCStringUtils.stringifyException(e));
              }
            }
          }
        }
        // flush the accumulated record for the last host seen
        setActiveDomain("");
      } finally {
        multiFileInputReader.close();
      }
    }
  }
}
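/*
 * Illustrative output (hypothetical values; the real JSON property names come
 * from the CrawlStatsCommon constants referenced above): for each host the
 * reducer emits one SequenceFile pair such as
 *
 *   key:   "example.com"
 *   value: { "<urlCount>": 1523, "<attemptedCount>": 1301,
 *            "<crawledCount>": 1288, "<redirectedCount>": 42,
 *            "<ips>": ["203.0.113.7"] }
 */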