/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.JoinByTextSortByTagMapper;
import org.commoncrawl.util.JoinValue;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
/**
 * Reducer that decides, per IP address, whether blacklisting should apply to
 * the whole IP or only to the individual hosts that were flagged.
 *
 * <p>
 * Incoming keys are composite (join key + tag) keys produced via
 * {@link JoinByTextSortByTagMapper}; the join key is the IP address. The
 * reducer keeps state across consecutive {@code reduce} calls that share the
 * same IP address and resets it when the IP address changes. Three kinds of
 * records are distinguished by tag:
 * <ul>
 * <li>{@code HostBlacklistByDupesStep.TAG_BAD_IP_MAPPING}: JSON objects
 * describing flagged hosts (a {@code "host"} member and optionally a
 * {@code "samples"} array),</li>
 * <li>{@code HostBlacklistByDupesStep.TAG_QUANTCAST_MAPPING}: marks the IP as
 * a quantcast host,</li>
 * <li>any other tag: the list of hosts mapped to the IP.</li>
 * </ul>
 *
 * <p>
 * NOTE(review): the logic only produces output if the bad-IP records for an
 * IP reach the reducer before that IP's host list - presumably guaranteed by
 * the tag sort order configured on the job; confirm against the job setup.
 *
 * <p>
 * Instances hold mutable per-IP state and are not safe for concurrent use.
 *
 * @author rana
 *
 */
public class HostBlacklistByIPReducer implements Reducer<TextBytes, JoinValue, TextBytes, TextBytes> {

  enum Counters {
    HOST_WAS_FLAGGED_AS_QUANTCAST_HOST, HOST_WAS_FLAGGED_AS_BAD_IP_HOST, HOST_WAS_FLAGGED_AS_BLACKLISTED_AND_QUANTCAST,
    GOT_BLACKLIST_TAG, GOT_QUANTCAST_TAG, GOT_IP_TO_HOST_MAPPING, ENTIRE_HOST_FLAGGED_BAD, INDIVIDUAL_DOMAINS_BAD
  }

  private static final Log LOG = LogFactory.getLog(HostBlacklistByIPReducer.class);

  // scratch objects reused across reduce calls to split the composite key
  TextBytes realKey = new TextBytes();
  TextBytes tagKey = new TextBytes();

  // ---- state tracked for the IP address currently being processed ----
  String activeIPAddress = null;
  String blackListIPHost;
  String blackListReason;
  boolean flaggedAsQuantcastHost;
  int badHostCount = 0;
  int totalHostCount = 0;
  int dupeHitsCount = 0;

  // risk .. but expedient ... (flagged host records are buffered in memory
  // until the host list for the same IP address shows up)
  ArrayList<JsonObject> blackListedItems = new ArrayList<JsonObject>();

  public static final int NUM_HASH_FUNCTIONS = 10;
  public static final int NUM_BITS = 11;
  public static final int NUM_ELEMENTS = 1 << 25;

  // remembers which hosts were flagged; shared across IP addresses and only
  // emptied once BLOOM_FILTER_FLUSH_THRESHOLD keys have been inserted
  private URLFPBloomFilter badHostFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);

  JsonParser parser = new JsonParser();

  // reusable fingerprint used as the bloom filter key for a host name
  URLFPV2 fp = new URLFPV2();

  // fraction of an IP's hosts that must be flagged before the whole IP is
  // reported at level "ip" instead of level "host"
  double FAILURE_THRESHOLD = .75;

  // number of keys inserted into badHostFilter since it was last cleared
  long bloomKeyCount = 0;
  static final long BLOOM_FILTER_FLUSH_THRESHOLD = 1 << 21;

  @Override
  public void close() throws IOException {
  }

  @Override
  public void configure(JobConf job) {
  }

  /**
   * Processes one (IP address, tag) group.
   *
   * @param key
   *          composite key holding the IP address and the record tag
   * @param values
   *          the records for this IP address / tag combination
   * @param output
   *          receives at most one (IP address, JSON summary) pair per call
   * @param reporter
   *          used to update the {@link Counters}
   * @throws IOException
   *           if the output collector fails
   */
  @Override
  public void reduce(TextBytes key, Iterator<JoinValue> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {

    JoinByTextSortByTagMapper.getKeyFromCompositeKey(key, realKey);
    JoinByTextSortByTagMapper.getTagFromCompositeKey(key, tagKey);

    // get ip address and tag ...
    String ipAddress = realKey.toString();
    String tag = tagKey.toString();

    LOG.info("Got Key:" + ipAddress + ":" + tag);

    // ok if ip-address transition ... start over with fresh per-IP state
    if (activeIPAddress == null || !activeIPAddress.equals(ipAddress)) {
      resetStateForNewIP(ipAddress);
    }

    if (tag.equals(HostBlacklistByDupesStep.TAG_BAD_IP_MAPPING)) {
      collectBlacklistedHosts(values, reporter);
    } else if (tag.equals(HostBlacklistByDupesStep.TAG_QUANTCAST_MAPPING)) {
      reporter.incrCounter(Counters.GOT_QUANTCAST_TAG, 1);
      flaggedAsQuantcastHost = true;
    } else {
      reporter.incrCounter(Counters.GOT_IP_TO_HOST_MAPPING, 1);
      // only IPs with at least one flagged-host record produce output
      if (dupeHitsCount != 0) {
        emitBlacklistRecord(ipAddress, values, output);
      }
      // Flush the filter only after the host list has been evaluated so the
      // current IP's verdict is not affected. The counter must be reset as
      // well; otherwise the filter would be cleared again on every
      // subsequent call once the threshold has been crossed.
      if (bloomKeyCount >= BLOOM_FILTER_FLUSH_THRESHOLD) {
        badHostFilter.clear();
        bloomKeyCount = 0;
      }
    }
  }

  /** Clears all per-IP state and makes {@code ipAddress} the active IP. */
  private void resetStateForNewIP(String ipAddress) {
    blackListedItems.clear();
    flaggedAsQuantcastHost = false;
    badHostCount = 0;
    totalHostCount = 0;
    dupeHitsCount = 0;
    // set active ip address ...
    activeIPAddress = ipAddress;
  }

  /** Loads the 'fake' fingerprint (domain hash == url hash) of {@code host} into {@link #fp}. */
  private void setHostFingerprint(String host) {
    fp.setDomainHash(FPGenerator.std64.fp(host));
    fp.setUrlHash(fp.getDomainHash());
  }

  /**
   * Registers every flagged-host record of the current IP in the bloom filter
   * and buffers it for the summary emitted later. All values are consumed;
   * reading only the first one would silently drop additional flagged hosts
   * delivered in the same group.
   */
  private void collectBlacklistedHosts(Iterator<JoinValue> values, Reporter reporter) {
    dupeHitsCount++;
    while (values.hasNext()) {
      String value = values.next().getTextValue().toString();
      // read the object
      JsonObject blackListInfo = parser.parse(value).getAsJsonObject();
      setHostFingerprint(blackListInfo.get("host").getAsString());
      // hosts already present in the filter are skipped (not buffered again)
      if (!badHostFilter.isPresent(fp)) {
        badHostFilter.add(fp);
        bloomKeyCount++;
        // queue up the object for later ...
        blackListedItems.add(blackListInfo);
        reporter.incrCounter(Counters.GOT_BLACKLIST_TAG, 1);
      }
    }
  }

  /**
   * Walks the hosts mapped to the current IP, counts how many of them are in
   * the bad host filter and emits a JSON summary keyed by the IP address.
   * The summary's {@code "level"} is {@code "ip"} when the flagged fraction
   * reaches {@link #FAILURE_THRESHOLD}, {@code "host"} otherwise.
   */
  private void emitBlacklistRecord(String ipAddress, Iterator<JoinValue> values,
      OutputCollector<TextBytes, TextBytes> output) throws IOException {

    // ok walk hosts ...
    while (values.hasNext()) {
      setHostFingerprint(values.next().getTextValue().toString());
      totalHostCount++;
      if (badHostFilter.isPresent(fp)) {
        badHostCount++;
      }
    }

    // guard the (not expected) empty host list instead of relying on NaN
    double pctBad = (totalHostCount == 0) ? 0.0 : (double) badHostCount / (double) totalHostCount;

    JsonObject objectOut = new JsonObject();
    // the level is set exactly once; it must not be overwritten afterwards
    objectOut.addProperty("level", (pctBad >= FAILURE_THRESHOLD) ? "ip" : "host");
    objectOut.addProperty("badHostCount", badHostCount);
    objectOut.addProperty("totalHostCount", totalHostCount);

    JsonArray domains = new JsonArray();
    objectOut.add("domains", domains);

    // the first non-empty "samples" array found among the buffered records
    JsonArray sampleArray = null;
    for (JsonObject badHost : blackListedItems) {
      // add host to array
      domains.add(badHost.get("host"));
      if (sampleArray == null) {
        JsonArray candidate = badHost.getAsJsonArray("samples");
        if (candidate != null && candidate.size() != 0) {
          sampleArray = candidate;
        }
      }
    }
    if (sampleArray != null) {
      objectOut.add("samples", sampleArray);
    }

    output.collect(new TextBytes(ipAddress), new TextBytes(objectOut.toString()));
  }
}