/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.pipelineV3.domainmeta.fuzzydedupe;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Pattern;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.StringUtils;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
/**
 * Reducer that summarizes, per input key, how many of the duplicate-content
 * hits recorded for that key belong to a different root domain.
 *
 * <p>
 * Each incoming value is parsed as a JSON array of hit objects. The fields read
 * from every hit are {@code dh} (a long that is compared against the root
 * domain hash derived from the key), {@code ip} (an int) and, for hits whose
 * {@code dh} differs from the key's root domain hash, {@code length} and
 * {@code url}. When the aggregated counts pass the thresholds in
 * {@link #reduce}, a single JSON summary object is emitted for the key.
 *
 * <p>
 * NOTE(review): one bloom filter instance is used to track both distinct
 * {@code dh} values and distinct {@code ip} values, so "unique" counts are
 * approximate (bloom filter false positives) and a {@code dh} value that is
 * numerically equal to an {@code ip} value would be conflated.
 *
 * @author rana
 *
 */
public class CrossDomainDupesReducer implements Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

  enum Counters {
  }

  public static final int NUM_HASH_FUNCTIONS = 10;
  public static final int NUM_BITS = 11;
  public static final int NUM_ELEMENTS = 1 << 18;

  /** Maximum number of distinct IP addresses written to the "ipList" output array. */
  private static final int MAX_IP_ADDRESSES_OUT = 1000;
  /** Maximum number of leading samples that take part in the pairwise path edit distance. */
  private static final int MAX_EDIT_DISTANCE_SAMPLES = 5;
  /** Paths of this length or longer are excluded from the edit distance computation. */
  private static final int MAX_EDIT_DISTANCE_PATH_LENGTH = 100;

  JsonParser parser = new JsonParser();

  /** Cleared at the start of every {@link #reduce} call; tracks dh and ip values already seen. */
  private URLFPBloomFilter filter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);

  /** Scratch buffer of sample urls; only the first {@code sampleCount} entries of a reduce call are meaningful. */
  String samples[] = new String[20];

  /** Matches canonical urls containing "/wp-includes" or "/xmlrpc.php" (the dot is matched literally). */
  Pattern knownValidDupesPatterns = Pattern.compile("/(wp-includes|xmlrpc\\.php)");

  @Override
  public void close() throws IOException {
    // nothing to release
  }

  @Override
  public void configure(JobConf job) {
    // no job configuration is consumed
  }

  /**
   * Computes {@code log(otherDomainHits) / log(totalHits)}.
   *
   * <p>
   * No clamping is applied: {@code totalHits == 1} makes the denominator zero
   * and non-positive arguments produce NaN or infinite results. The only caller
   * in this class invokes it with {@code totalHits > 15} and
   * {@code otherDomainHits >= 2}.
   *
   * @param totalHits
   *          total number of hits seen for the key
   * @param otherDomainHits
   *          number of those hits that belong to another root domain
   * @return the log-scaled ratio of cross-domain hits to total hits
   */
  public double otherDomainToLocalDomainScore(double totalHits, double otherDomainHits) {
    return Math.log(otherDomainHits) / Math.log(totalHits);
  }

  /**
   * Aggregates all hits for {@code key} and, when thresholds are met, emits one
   * JSON summary record.
   *
   * <p>
   * A record is emitted only if more than 15 hits were seen, at least 2 of them
   * were cross-domain, and either the other-domain-to-local score is
   * {@code >= 0.50} or the spam IP score is {@code > 0.50}.
   *
   * @param key
   *          host name; converted to a fingerprint via
   *          {@code URLUtils.getURLFPV2FromHost}
   * @param values
   *          JSON arrays (as text) of hit objects
   * @param output
   *          receives at most one (key, JSON summary) pair
   * @param reporter
   *          unused
   * @throws IOException
   *           if the output collector fails
   */
  @Override
  public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {

    filter.clear();

    // counts are kept as doubles because they feed directly into log-based scores
    double crossDomainDupesCount = 0;
    double totalHitsCount = 0;
    double uniqueRootDomainsCount = 0;
    double uniqueIPs = 0;
    double validDupePatternMatches = 0;

    URLFPV2 rootFP = URLUtils.getURLFPV2FromHost(key.toString());
    // scratch fingerprint reused as the bloom filter key for both dh and ip values
    URLFPV2 fp = new URLFPV2();
    int sampleCount = 0;
    ArrayList<Integer> ipAddresses = new ArrayList<Integer>();
    JsonArray thisHostsDupes = new JsonArray();
    DescriptiveStatistics lengthStats = new DescriptiveStatistics();

    while (values.hasNext()) {
      JsonArray jsonArray = parser.parse(values.next().toString()).getAsJsonArray();
      for (JsonElement elem : jsonArray) {
        JsonObject hit = elem.getAsJsonObject();
        totalHitsCount++;

        fp.setRootDomainHash(hit.get("dh").getAsLong());
        if (fp.getRootDomainHash() != rootFP.getRootDomainHash()) {
          crossDomainDupesCount++;
          fp.setDomainHash(fp.getRootDomainHash());
          fp.setUrlHash(fp.getRootDomainHash());
          // track length average ....
          lengthStats.addValue(hit.get("length").getAsInt());

          if (!filter.isPresent(fp)) {
            uniqueRootDomainsCount++;
            filter.add(fp);
            // keep one sample url per newly seen dh value, up to the buffer size
            if (sampleCount < samples.length) {
              String url = hit.get("url").getAsString();
              String canonicalURL = new GoogleURL(url).getCanonicalURL();
              // NOTE(review): null guard is defensive; whether GoogleURL can
              // return a null canonical url is not visible from this file.
              if (canonicalURL != null && knownValidDupesPatterns.matcher(canonicalURL).find()) {
                validDupePatternMatches++;
              }
              samples[sampleCount++] = url;
            }
          }
        } else {
          thisHostsDupes.add(elem);
        }

        // track distinct ip addresses through the same filter
        int ipAddress = hit.get("ip").getAsInt();
        fp.setRootDomainHash(ipAddress);
        fp.setDomainHash(ipAddress);
        fp.setUrlHash(ipAddress);
        if (!filter.isPresent(fp)) {
          uniqueIPs++;
          filter.add(fp);
          // only the first MAX_IP_ADDRESSES_OUT addresses are ever emitted, so
          // there is no point retaining more than that in memory
          if (ipAddresses.size() < MAX_IP_ADDRESSES_OUT) {
            ipAddresses.add(ipAddress);
          }
        }
      }
    }

    if (totalHitsCount > 15 && crossDomainDupesCount >= 2) {
      double otherDomainToLocalScore = otherDomainToLocalDomainScore(totalHitsCount, crossDomainDupesCount);
      double spamIPScore = spamHostScore(totalHitsCount, crossDomainDupesCount, uniqueIPs);

      if (otherDomainToLocalScore >= .50 || spamIPScore > .50) {
        JsonObject objectOut = new JsonObject();

        objectOut.addProperty("ratio", (crossDomainDupesCount / totalHitsCount));
        objectOut.addProperty("totalHits", totalHitsCount);
        objectOut.addProperty("crossDomainDupes", crossDomainDupesCount);
        objectOut.addProperty("uniqueRootDomains", uniqueRootDomainsCount);
        objectOut.addProperty("otherDomainToLocalScore", otherDomainToLocalScore);
        objectOut.addProperty("spamIPScore", spamIPScore);
        objectOut.addProperty("validDupeMatches", validDupePatternMatches);
        objectOut.addProperty("content-len-mean", lengthStats.getMean());
        objectOut.addProperty("content-len-geo-mean", lengthStats.getGeometricMean());

        for (int i = 0; i < sampleCount; ++i) {
          objectOut.addProperty("sample-" + i, samples[i]);
        }

        // compute path edit distance ...
        if (sampleCount > 1) {
          DescriptiveStatistics stats = samplePathEditDistances(sampleCount);
          // An empty stats object reports a NaN mean, which would pass a bare
          // "!= 0.0" test and be serialized as the non-JSON token NaN.
          if (stats.getN() > 0 && stats.getMean() != 0.0) {
            objectOut.addProperty("lev-distance-mean", stats.getMean());
            objectOut.addProperty("lev-distance-geomean", stats.getGeometricMean());
          }
        }

        if (ipAddresses.size() != 0) {
          JsonArray ipAddressArray = new JsonArray();
          for (int j = 0; j < ipAddresses.size(); ++j) {
            ipAddressArray.add(new JsonPrimitive(ipAddresses.get(j)));
          }
          objectOut.add("ipList", ipAddressArray);
        }

        objectOut.add("thisHostDupes", thisHostsDupes);

        output.collect(key, new TextBytes(objectOut.toString()));
      }
    }
  }

  /**
   * Collects the Levenshtein distance between the url paths of every ordered
   * pair of distinct samples among the first
   * {@code min(sampleCount, MAX_EDIT_DISTANCE_SAMPLES)} entries of
   * {@link #samples}. Pairs in which either path is unavailable or at least
   * {@code MAX_EDIT_DISTANCE_PATH_LENGTH} characters long are skipped, so the
   * returned statistics may be empty.
   */
  private DescriptiveStatistics samplePathEditDistances(int sampleCount) {
    int pathCount = Math.min(sampleCount, MAX_EDIT_DISTANCE_SAMPLES);

    // parse every sample once instead of once per pair
    String[] paths = new String[pathCount];
    for (int i = 0; i < pathCount; ++i) {
      paths[i] = new GoogleURL(samples[i]).getPath();
    }

    DescriptiveStatistics stats = new DescriptiveStatistics();
    for (int j = 0; j < pathCount; ++j) {
      for (int k = 0; k < pathCount; ++k) {
        if (k != j && isComparablePath(paths[j]) && isComparablePath(paths[k])) {
          stats.addValue(StringUtils.getLevenshteinDistance(paths[j], paths[k]));
        }
      }
    }
    return stats;
  }

  /** Returns true when {@code path} is non-null and short enough for the edit distance computation. */
  private static boolean isComparablePath(String path) {
    return path != null && path.length() < MAX_EDIT_DISTANCE_PATH_LENGTH;
  }

  /**
   * Computes
   * {@code (log(otherDomainHits) / log(totalHits)) * (1 - log(uniqueIPAddresses) / log(totalHits))}.
   *
   * <p>
   * To keep the result finite, {@code otherDomainHits} and {@code totalHits}
   * are clamped to a minimum of 2 and {@code uniqueIPAddresses} to a minimum of
   * 1 before taking logarithms. For {@code totalHits >= 2} and
   * {@code uniqueIPAddresses >= 1} the clamps on those two arguments have no
   * effect.
   *
   * @param totalHits
   *          total number of hits seen for the key
   * @param otherDomainHits
   *          number of those hits that belong to another root domain
   * @param uniqueIPAddresses
   *          number of distinct ip addresses observed across the hits
   * @return the product of the cross-domain ratio and the ip-concentration
   *         factor
   */
  public double spamHostScore(double totalHits, double otherDomainHits, double uniqueIPAddresses) {
    double logTotalHits = Math.log(Math.max(totalHits, 2));
    double uniqueIPScore = 1 - (Math.log(Math.max(uniqueIPAddresses, 1)) / logTotalHits);
    return (Math.log(Math.max(otherDomainHits, 2)) / logTotalHits) * uniqueIPScore;
  }
}