package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;
import static org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon.ROOTDOMAIN_STATS_IS_SUPERDOMAIN;
import static org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon.ROOTDOMAIN_STATS_SUBDOMAIN_COUNT;
import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.TextBytes;
import com.google.common.collect.Iterators;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import static org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats.CrawlStatsCommon.*;
public class WriteAggregatedDomainStatsFileStep extends CrawlPipelineStep {
public static final String OUTPUT_DIR_NAME = "AggegateDomainStatsFile";
private static final Log LOG = LogFactory.getLog(WriteAggregatedDomainStatsFileStep.class);
public WriteAggregatedDomainStatsFileStep(CrawlPipelineTask task) {
super(task, "Write Aggregated SubDomain Stats File", OUTPUT_DIR_NAME);
}
@Override
public Log getLogger() {
return LOG;
}
@Override
public void runStep(Path outputPathLocation) throws IOException {
JobConf job = new JobBuilder("Write Stats File", new Configuration())
.input(getOutputDirForStep(CollectSubDomainStatsStep.class))
.inputIsSeqFile()
.keyValue(TextBytes.class, TextBytes.class)
.output(outputPathLocation)
.outputFormat(TextOutputFormat.class)
.numReducers(8)
.reducer(JSONtoTabReducer.class, false)
.compressType(CompressionType.NONE)
.build();
JobClient.runJob(job);
}
public static class JSONtoTabReducer implements Reducer<TextBytes,TextBytes,TextBytes,TextBytes> {
@Override
public void configure(JobConf job) {
// TODO Auto-generated method stub
}
@Override
public void close() throws IOException {
// TODO Auto-generated method stub
}
JsonParser parser = new JsonParser();
@Override
public void reduce(TextBytes key, Iterator<TextBytes> values,
OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
throws IOException {
JsonObject stats = parser.parse(Iterators.getNext(values, null).toString()).getAsJsonObject();
StringBuffer sb = new StringBuffer();
sb.append(stats.get(ROOTDOMAIN_STATS_IS_SUPERDOMAIN).getAsBoolean()+"\t"); // 2
sb.append(stats.get(ROOTDOMAIN_STATS_TOTAL_URLS).getAsInt()+"\t"); // 3
sb.append(stats.get(ROOTDOMAIN_STATS_SUBDOMAIN_COUNT).getAsInt()+"\t"); // 4
sb.append(stats.get(ROOTDOMAIN_STATS_MEAN_URLS_PER_DOMAIN).getAsDouble()+"\t"); // 5
sb.append(stats.get(ROOTDOMAIN_STATS_MAX_URLS_IN_A_DOMAIN).getAsDouble()+"\t"); // 6
sb.append(stats.get(ROOTDOMAIN_STATS_URLS_PER_DOMAIN_STDDEV).getAsDouble()+"\t"); // 7
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_TOTAL_URLS).getAsDouble()+"\t"); // 8
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_MEAN_URLS_PER_DOMAIN).getAsDouble()+"\t"); // 9
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_RANK_MEAN).getAsDouble()+"\t"); // 10
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_RANK_MAX).getAsDouble()+"\t"); // 11
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_RANK_TOTAL).getAsDouble()+"\t"); // 12
sb.append(stats.get(ROOTDOMAIN_STATS_DOMAIN_RANK_MEAN).getAsDouble()+"\t"); // 13
sb.append(stats.get(ROOTDOMAIN_STATS_DOMAIN_RANK_MAX).getAsDouble()+"\t");// 14
sb.append(stats.get(ROOTDOMAIN_STATS_DOMAIN_RANK_TOTAL).getAsDouble()+"\t"); // 15
sb.append(stats.get(ROOTDOMAIN_STATS_DOMAIN_RANK_STDDEV).getAsDouble()+"\t"); // 16
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_IS_PORN).getAsDouble()+"\t");
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_IS_SPAM).getAsDouble()+"\t");
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_MEAN_CRAWL_COUNT).getAsDouble()+"\t");
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_TOTAL_CRAWL_COUNT).getAsDouble()+"\t");//20
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_MEAN_PR_COUNT).getAsDouble()+"\t");// 21
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_MAX_PR_COUNT).getAsDouble()+"\t");//22
sb.append(stats.get(ROOTDOMAIN_STATS_BLEKKO_TOTAL_PR_COUNT).getAsDouble()+"\t");//23
sb.append(stats.get(ROOTDOMAIN_STATS_MEAN_EXT_LINKED_URLS).getAsDouble()+"\t");//24
sb.append(stats.get(ROOTDOMAIN_STATS_MAX_EXT_LINKED_URLS).getAsDouble()+"\t");//25
sb.append(stats.get(ROOTDOMAIN_STATS_TOTAL_EXT_LINKED_URLS).getAsDouble()+"\t");//26
sb.append(stats.get(ROOTDOMAIN_STATS_EXT_LINKED_URLS_STDDEV).getAsDouble()+"\t");//27
output.collect(key, new TextBytes(sb.toString()));
}
}
}