/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLUtils;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Mapper that rolls up per-URL crawl status JSON into per-root-domain stat
 * records, keyed by root domain (or by full host for super domains).
 *
 * @author rana
 *
 */
public class StatsAggregationMapper implements Mapper<TextBytes, TextBytes, TextBytes, TextBytes> {

  enum Counters {
    GOT_CRAWL_STATUS, GOT_HTTP_RESULT, RESULT_WAS_HTTP_200, GOT_CRAWL_STATS_ARRAY, GOT_CRAWL_STATS_OBJECT,
    GOT_EXCEPTION_IN_MAPPER, HIT_TUMBLR_SUB_DOMAIN, HIT_TUMBLR_ROOT_DOMAIN, HIT_TUMBLR_DOMAIN,
    TUMBLR_DOMAIN_DID_NOT_PASS_SUPERDOMAIN_TEST, BAD_ROOT_DOMAIN,
  }

  JsonParser parser = new JsonParser();

  private static final Log LOG = LogFactory.getLog(StatsAggregationMapper.class);

  // use a long literal so the multiplication does not overflow int arithmetic
  static final long DURATION_MS_ONE_MONTH = 1000L * 60 * 60 * 24 * 30;

  Set<Long> superDomainIdSet;

  @Override
  public void close() throws IOException {
  }

  @Override
  public void configure(JobConf job) {
    Path superDomainIdFile = new Path(job.get(CrawlStatsCollectorTask.SUPER_DOMAIN_FILE_PATH));

    try {
      superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
    } catch (IOException e) {
      LOG.error(StringUtils.stringifyException(e));
      throw new RuntimeException(e);
    }
  }

  @Override
  public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
      throws IOException {
    try {
      JsonObject objectOut = new JsonObject();
      JsonObject containerObj = parser.parse(value.toString()).getAsJsonObject();

      GoogleURL urlObject = new GoogleURL(key.toString());

      if (urlObject.isValid()) {
        URLFPV2 fp = URLUtils.getURLFPV2FromURLObject(urlObject);
        if (fp != null) {
          objectOut.addProperty("dh", fp.getDomainHash());

          String rootDomain = URLUtils.extractRootDomainName(urlObject.getHost());
          if (rootDomain == null) {
            reporter.incrCounter(Counters.BAD_ROOT_DOMAIN, 1);
            return;
          }

          // for super domains (shared hosts), aggregate by the full host name
          // instead of the root domain
          long rootDomainFP = SuperDomainList.domainFingerprintGivenName(rootDomain);
          if (superDomainIdSet.contains(rootDomainFP)) {
            rootDomain = urlObject.getHost();
          }

          JsonObject crawlStatus = containerObj.getAsJsonObject("crawl_status");
          if (crawlStatus != null) {
            reporter.incrCounter(Counters.GOT_CRAWL_STATUS, 1);

            boolean crawled = crawlStatus.has("http_result");
            objectOut.addProperty("crawled", crawled);

            if (crawled) {
              int httpResult = crawlStatus.get("http_result").getAsInt();
              reporter.incrCounter(Counters.GOT_HTTP_RESULT, 1);

              if (httpResult == 200) {
                objectOut.addProperty("200", 1);
                reporter.incrCounter(Counters.RESULT_WAS_HTTP_200, 1);

                JsonArray crawlStatsArray = crawlStatus.getAsJsonArray("crawl_stats");
                if (crawlStatsArray != null && crawlStatsArray.size() != 0) {
                  reporter.incrCounter(Counters.GOT_CRAWL_STATS_ARRAY, 1);
                  JsonObject crawlStats = crawlStatsArray.get(0).getAsJsonObject();
                  if (crawlStats != null) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATS_OBJECT, 1);
                    objectOut.addProperty("server_ip", crawlStats.get("server_ip").getAsString());
                  }
                }
              } else if (httpResult == 403) {
                objectOut.addProperty("403", 1);
              } else if (httpResult == 404) {
                objectOut.addProperty("404", 1);
              }
            }
          }

          // flag domains with link activity seen within the last three months
          JsonObject linkStatus = containerObj.getAsJsonObject("link_status");
          if (linkStatus != null && linkStatus.has("latest_date")) {
            long delta = System.currentTimeMillis() - linkStatus.get("latest_date").getAsLong();
            if (delta < (DURATION_MS_ONE_MONTH * 3)) {
              objectOut.addProperty("recentlyDiscovered", 1);
            }
          }

          output.collect(new TextBytes(rootDomain), new TextBytes(objectOut.toString()));
        }
      }
    } catch (Exception e) {
      reporter.incrCounter(Counters.GOT_EXCEPTION_IN_MAPPER, 1);
      LOG.error(StringUtils.stringifyException(e));
    }
  }
}