package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.util.JSONUtils;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

public class MergeNewDomainStatsStep extends CrawlPipelineStep {

  private static final Log LOG = LogFactory.getLog(MergeNewDomainStatsStep.class);

  public static final String OUTPUT_DIR_NAME = "domainStatsMerged";
  public static final String SUPER_DOMAIN_FILE_PATH = "super-domain-list";

  public MergeNewDomainStatsStep(CrawlPipelineTask task) {
    super(task, "Merge Domain Stats", OUTPUT_DIR_NAME);
  }

  @Override
  public Log getLogger() {
    return LOG;
  }

  private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();

  static {
    NUMBER_FORMAT.setMinimumIntegerDigits(5);
    NUMBER_FORMAT.setGroupingUsed(false);
  }

  @Override
  public void runStep(Path outputPathLocation) throws IOException {

    ImmutableList<Path> inputs = new ImmutableList.Builder<Path>()
        .add(getOutputDirForStep(NewCrawlStatsCollectorStep.class))
        .build();

    // Reduce-only job: fold the per-segment domain stats records emitted by
    // NewCrawlStatsCollectorStep into a single record per domain key.
    JobConf job = new JobBuilder(getDescription(), getConf())
        .inputs(inputs)
        .inputIsSeqFile()
        .keyValue(TextBytes.class, TextBytes.class)
        .reducer(MergingReducer.class, false)
        .numReducers(CrawlEnvironment.NUM_DB_SHARDS)
        .outputIsSeqFile()
        .output(outputPathLocation)
        .compressor(CompressionType.BLOCK, SnappyCodec.class)
        .maxMapTaskFailures(5)
        .build();

    JobClient.runJob(job);
  }

  public static class MergingReducer implements Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

    @Override
    public void configure(JobConf job) {
    }

    @Override
    public void close() throws IOException {
    }

    JsonParser parser = new JsonParser();
    HashSet<String> ips = new HashSet<String>();
    TextBytes valueText = new TextBytes();

    @Override
    public void reduce(TextBytes key, Iterator<TextBytes> values,
        OutputCollector<TextBytes, TextBytes> output, Reporter reporter) throws IOException {

      // Seed the merge with the first JSON stats record for this domain.
      TextBytes firstValue = Iterators.getNext(values, null);
      JsonObject firstObject = parser.parse(firstValue.toString()).getAsJsonObject();

      // Accumulate the union of IP addresses seen across all records for this key.
      ips.clear();
      JSONUtils.safeJsonArrayToStringCollection(firstObject, CrawlStatsCommon.CRAWLSTATS_IPS, ips);

      int mergedObjectCount = 0;

      while (values.hasNext()) {
        JsonObject nextObject = parser.parse(values.next().toString()).getAsJsonObject();

        // Sum each counter from the incoming record into the first record.
        // public static final String CRAWLSTATS_IPS = "ips";
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_URL_COUNT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_ATTEMPTED_COUNT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_CRAWLED_COUNT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_REDIRECTED_COUNT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_REDIRECTED_OUT_COUNT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_WWW_TO_NON_WWW_REDIRECT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_NON_WWW_TO_WWW_REDIRECT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_EXTERNALLY_LINKED_URLS);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_EXTERNALLY_LINKED_NOT_CRAWLED_URLS);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_BLEKKO_CRAWLED_COUNT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_BLEKKO_AND_CC_CRAWLED_COUNT);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL_NOT_IN_CC);
        JSONUtils.mergeCounters(firstObject, nextObject, CrawlStatsCommon.CRAWLSTATS_BLEKKO_URL_HAD_GT_1_RANK);

        JSONUtils.safeJsonArrayToStringCollection(nextObject, CrawlStatsCommon.CRAWLSTATS_IPS, ips);

        mergedObjectCount++;
      }

      if (mergedObjectCount != 0 && ips.size() != 0) {
        // Write the merged IP set back into the combined record.
        JSONUtils.stringCollectionToJsonArray(firstObject, CrawlStatsCommon.CRAWLSTATS_IPS, ips);
      }

      if (mergedObjectCount != 0) {
        valueText.set(firstObject.toString());
        output.collect(key, valueText);
      } else {
        // Only one record for this key - emit it unchanged, skipping re-serialization.
        output.collect(key, firstValue);
      }
    }
  }
}
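/*
 * Illustrative sketch only (not part of the original source): assuming
 * JSONUtils.mergeCounters sums the named numeric property of the second
 * object into the first, and CrawlStatsCommon.CRAWLSTATS_IPS maps to the
 * "ips" array (per the inline comment above), merging two per-segment
 * records for the same domain key would look roughly like this; the
 * counter property name is hypothetical.
 *
 *   {"urlCount": 10, "ips": ["1.2.3.4"]}
 * + {"urlCount": 5,  "ips": ["5.6.7.8"]}
 * = {"urlCount": 15, "ips": ["1.2.3.4", "5.6.7.8"]}
 */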