/**
 * Copyright 2012 - CommonCrawl Foundation
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 **/
package org.commoncrawl.mapred.pipelineV3.domainmeta.iptohost;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;

import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;

/**
 * Emits unique (server IP -> root domain) tuples from crawl status JSON
 * records, using a Bloom filter over a fingerprint of the "ip->domain"
 * string to suppress duplicate emissions in both the map and reduce phases.
 *
 * @author rana
 *
 */
public class CrawlStatsIPToHostMapperReducer implements Mapper<TextBytes, TextBytes, TextBytes, TextBytes>,
    Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

  enum Counters {
    GOT_CRAWL_STATUS, GOT_HTTP_RESULT, RESULT_WAS_HTTP_200, GOT_CRAWL_STATS_ARRAY, GOT_CRAWL_STATS_OBJECT,
    SKIPPED_ALREADY_EMITTED_TUPLE, GOT_EXCEPTION_DURING_PARSE
  }

  JsonParser parser = new JsonParser();

  public static final int NUM_HASH_FUNCTIONS = 10;
  public static final int NUM_BITS = 11;
  public static final int NUM_ELEMENTS = 1 << 17;

  static URLFPBloomFilter bloomFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);

  URLFPV2 fp = new URLFPV2();

  @Override
  public void close() throws IOException {
  }

  @Override
  public void configure(JobConf job) {
  }

  @Override
  public void map(TextBytes unused, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
      throws IOException {
    try {
      JsonObject containerObj = parser.parse(value.toString()).getAsJsonObject();
      if (containerObj.has("source_url")) {
        GoogleURL urlObject = new GoogleURL(containerObj.get("source_url").getAsString());
        if (urlObject.isValid()) {
          String sourceRootDomain = URLUtils.extractRootDomainName(urlObject.getHost());
          if (sourceRootDomain != null) {
            JsonObject crawlStatus = containerObj.getAsJsonObject("crawl_status");
            if (crawlStatus != null) {
              reporter.incrCounter(Counters.GOT_CRAWL_STATUS, 1);
              if (crawlStatus.has("http_result")) {
                int httpResult = crawlStatus.get("http_result").getAsInt();
                reporter.incrCounter(Counters.GOT_HTTP_RESULT, 1);
                if (httpResult == 200) {
                  reporter.incrCounter(Counters.RESULT_WAS_HTTP_200, 1);
                  JsonArray crawlStatsArray = crawlStatus.getAsJsonArray("crawl_stats");
                  if (crawlStatsArray != null && crawlStatsArray.size() != 0) {
                    reporter.incrCounter(Counters.GOT_CRAWL_STATS_ARRAY, 1);
                    JsonObject crawlStats = crawlStatsArray.get(0).getAsJsonObject();
                    if (crawlStats != null) {
                      reporter.incrCounter(Counters.GOT_CRAWL_STATS_OBJECT, 1);
                      String serverIP = crawlStats.get("server_ip").getAsString();
                      // create a unique string from root domain to ip mapping
                      String uniqueStr = serverIP + "->" + sourceRootDomain;
                      // hack a fingerprint of the string ...
                      fp.setDomainHash(FPGenerator.std64.fp(uniqueStr));
                      fp.setUrlHash(fp.getDomainHash());
                      // and use the bloom filter to skip if already emitted ...
                      if (!bloomFilter.isPresent(fp)) {
                        // output tuple
                        output.collect(new TextBytes(serverIP), new TextBytes(sourceRootDomain));
                        // add hacked fp into BF
                        bloomFilter.add(fp);
                      } else {
                        reporter.incrCounter(Counters.SKIPPED_ALREADY_EMITTED_TUPLE, 1);
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    } catch (Exception e) {
      reporter.incrCounter(Counters.GOT_EXCEPTION_DURING_PARSE, 1);
    }
  }

  @Override
  public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {
    String serverIP = key.toString();
    while (values.hasNext()) {
      TextBytes value = values.next();
      String hostName = value.toString();
      // create a unique string from root domain to ip mapping
      String uniqueStr = serverIP + "->" + hostName;
      // hack a fingerprint of the string ...
      fp.setDomainHash(FPGenerator.std64.fp(uniqueStr));
      fp.setUrlHash(fp.getDomainHash());
      // and use the bloom filter to skip if already emitted ...
      if (!bloomFilter.isPresent(fp)) {
        // output tuple
        output.collect(key, value);
        // add hacked fp into BF
        bloomFilter.add(fp);
      }
    }
  }
}
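
// A minimal sketch of how this combined mapper/reducer might be wired into a
// classic (org.apache.hadoop.mapred) job. This driver is not part of the
// original pipeline: the job name, SequenceFile input/output formats, and the
// input/output paths below are assumptions for illustration only.
//
//   JobConf conf = new JobConf(CrawlStatsIPToHostMapperReducer.class);
//   conf.setJobName("ip-to-host-mapping");                          // hypothetical job name
//   conf.setMapperClass(CrawlStatsIPToHostMapperReducer.class);
//   conf.setReducerClass(CrawlStatsIPToHostMapperReducer.class);
//   conf.setMapOutputKeyClass(TextBytes.class);
//   conf.setMapOutputValueClass(TextBytes.class);
//   conf.setOutputKeyClass(TextBytes.class);
//   conf.setOutputValueClass(TextBytes.class);
//   conf.setInputFormat(SequenceFileInputFormat.class);             // assumed input format
//   conf.setOutputFormat(SequenceFileOutputFormat.class);           // assumed output format
//   FileInputFormat.setInputPaths(conf, new Path("/crawl/status"));     // hypothetical path
//   FileOutputFormat.setOutputPath(conf, new Path("/crawl/ipToHost"));  // hypothetical path
//   JobClient.runJob(conf);
//
// Because the static Bloom filter only dedupes within a single JVM, the reduce
// phase repeats the same membership check before emitting, as seen above.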