/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.pipelineV3.domainmeta.crawlstats;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;

import org.commoncrawl.crawl.common.internal.CrawlEnvironment;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineStep;
import org.commoncrawl.mapred.pipelineV3.CrawlPipelineTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.DomainMetadataTask;
import org.commoncrawl.mapred.pipelineV3.domainmeta.rank.GenSuperDomainListStep;
import org.commoncrawl.protocol.URLFPV2;
import org.commoncrawl.util.GoogleURL;
import org.commoncrawl.util.JobBuilder;
import org.commoncrawl.util.SuperDomainList;
import org.commoncrawl.util.TextBytes;
import org.commoncrawl.util.URLFPBloomFilter;
import org.commoncrawl.util.URLUtils;

import com.google.gson.JsonArray;
import com.google.gson.JsonPrimitive;
/**
 * Collects, for every root domain that is NOT on the super-domain list, the
 * set of distinct subdomains observed in the crawl. The map side drops hosts
 * that reduce to the root domain itself (including "www." variants) and
 * deduplicates subdomains via a Bloom filter; the reduce side emits each root
 * domain with a JSON array of up to MAX_SUBDOMAINS_ALLOWED subdomains.
 *
 * @author rana
 */
public class NonSuperSubdomainCollectorStep extends CrawlPipelineStep implements
    Mapper<TextBytes, TextBytes, TextBytes, TextBytes>, Reducer<TextBytes, TextBytes, TextBytes, TextBytes> {

  enum Counters {
    HIT_MAXSUBDOMAIN_LIMIT, SKIPPED_SUBDOMAIN_SAME_AS_ROOT_VIA_ID, SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PREFIX,
    SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PATTERN_MATCH
  }
  private static final Log LOG = LogFactory.getLog(NonSuperSubdomainCollectorStep.class);

  static final int NUM_HASH_FUNCTIONS = 10;
  static final int NUM_BITS = 11;
  static final int NUM_ELEMENTS = 1 << 29;
  static final int FLUSH_THRESHOLD = 1 << 23;
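
  // Rough sizing estimate (not from the original source): assuming the
  // URLFPBloomFilter constructor used below takes (numElements,
  // numHashFunctions, bitsPerElement), the filter occupies about
  // 2^29 * 11 bits ~= 738 MB of heap, and with 10 hash functions the false
  // positive rate at full load is on the order of half a percent.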
  public static final String SUPER_DOMAIN_FILE_PATH = "super-domain-list";
  public static final String OUTPUT_DIR_NAME = "nonsuper-subdomains";

  static final int MAX_SUBDOMAINS_ALLOWED = 100;

  URLFPBloomFilter subDomainFilter;
  URLFPV2 bloomKey = new URLFPV2();
  TextBytes emptyTextBytes = new TextBytes();
  // matches full "www"-style prefixes such as "www.", "www2." or "www-1."
  Pattern wwwMatchPattern = Pattern.compile("www[\\-0-9]*\\.");
  Set<Long> superDomainIdSet;
  HashSet<String> domains = new HashSet<String>();
  // Hadoop instantiates mapper/reducer classes reflectively, which requires
  // this no-arg constructor
  public NonSuperSubdomainCollectorStep() {
    super(null, null, null);
  }

  public NonSuperSubdomainCollectorStep(CrawlPipelineTask task) {
    super(task, "SubDomain Collector", OUTPUT_DIR_NAME);
  }
  @Override
  public void close() throws IOException {
  }
  @Override
  public void configure(JobConf job) {
    // the super-domain id list and the Bloom filter are only needed map side
    if (job.getBoolean("mapred.task.is.map", false)) {
      Path superDomainIdFile = new Path(job.get(SUPER_DOMAIN_FILE_PATH));
      try {
        superDomainIdSet = SuperDomainList.loadSuperDomainIdList(job, superDomainIdFile);
      } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw new RuntimeException(e);
      }
      subDomainFilter = new URLFPBloomFilter(NUM_ELEMENTS, NUM_HASH_FUNCTIONS, NUM_BITS);
    }
  }
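
  // Note: each map task builds its own Bloom filter, so map-side deduplication
  // is per-task only; hosts emitted by different mappers are deduplicated again
  // by the HashSet in reduce().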
  @Override
  public Log getLogger() {
    return LOG;
  }
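
  /**
   * Map phase: for each valid URL whose root domain is not a super domain,
   * emit (rootDomain, host) once per distinct host. For example (hypothetical
   * input), the key "http://blog.example.com/post" yields the pair
   * ("example.com", "blog.example.com"), while "http://www.example.com/" is
   * skipped because its host is just a "www." variant of the root domain.
   */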
  @Override
  public void map(TextBytes key, TextBytes value, OutputCollector<TextBytes, TextBytes> output, Reporter reporter)
      throws IOException {
    String url = key.toString();
    GoogleURL urlObject = new GoogleURL(url);
    if (urlObject.isValid()) {
      String rootDomain = URLUtils.extractRootDomainName(urlObject.getHost());
      if (rootDomain != null) {
        long rootDomainId = SuperDomainList.domainFingerprintGivenName(rootDomain);
        if (!superDomainIdSet.contains(rootDomainId)) {
          long subDomainId = SuperDomainList.domainFingerprintGivenName(urlObject.getHost());
          if (subDomainId == rootDomainId) {
            reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_VIA_ID, 1);
            return;
          }
          // extract the subdomain prefix (everything before the root domain) ...
          String prefix = urlObject.getHost().substring(0, urlObject.getHost().length() - rootDomain.length());
          // straight "www." match ...
          if (prefix.equals("www.")) {
            reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PREFIX, 1);
            return; // skip
          } else if (prefix.startsWith("www") && wwwMatchPattern.matcher(prefix).matches()) {
            reporter.incrCounter(Counters.SKIPPED_SUBDOMAIN_SAME_AS_ROOT_BUT_WWW_PATTERN_MATCH, 1);
            return;
          }
          // key the Bloom filter on the subdomain fingerprint; reusing it for
          // both the domain and url hash is a shortcut, but suffices for dedup
          bloomKey.setDomainHash(subDomainId);
          bloomKey.setUrlHash(subDomainId);
          if (subDomainFilter.isPresent(bloomKey)) {
            // already emitted this subdomain (modulo Bloom filter false positives)
            return;
          }
          // mark the subdomain as seen before emitting
          subDomainFilter.add(bloomKey);
          // emit (root domain, subdomain)
          output.collect(new TextBytes(rootDomain), new TextBytes(urlObject.getHost()));
        }
      }
    }
  }
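
  /**
   * Reduce phase: gather up to MAX_SUBDOMAINS_ALLOWED distinct subdomains per
   * root domain and emit them as a JSON array, e.g. (hypothetical values) key
   * "example.com", value ["blog.example.com","shop.example.com"]. Roots that
   * hit the cap are treated as unbounded subdomain generators and emit
   * nothing.
   */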
  @Override
  public void reduce(TextBytes key, Iterator<TextBytes> values, OutputCollector<TextBytes, TextBytes> output,
      Reporter reporter) throws IOException {
    while (values.hasNext()) {
      domains.add(values.next().toString());
      if (domains.size() >= MAX_SUBDOMAINS_ALLOWED) {
        reporter.incrCounter(Counters.HIT_MAXSUBDOMAIN_LIMIT, 1);
        break;
      }
    }
    // emit only roots with at least one subdomain that stayed under the cap
    if (domains.size() != 0 && domains.size() < MAX_SUBDOMAINS_ALLOWED) {
      JsonArray array = new JsonArray();
      for (String domain : domains) {
        array.add(new JsonPrimitive(domain));
      }
      output.collect(key, new TextBytes(array.toString()));
    }
    domains.clear();
  }
  @Override
  public void runStep(Path outputPathLocation) throws IOException {
    DomainMetadataTask rootTask = findTaskOfType(DomainMetadataTask.class);
    Path superDomainListPath = new Path(getOutputDirForStep(GenSuperDomainListStep.class), "part-00000");

    JobConf job = new JobBuilder(getDescription(), getConf())
        .inputs(rootTask.getRestrictedMergeDBDataPaths())
        .inputIsSeqFile()
        .mapper(NonSuperSubdomainCollectorStep.class)
        .reducer(NonSuperSubdomainCollectorStep.class, false)
        .numReducers(CrawlEnvironment.NUM_DB_SHARDS / 2)
        .keyValue(TextBytes.class, TextBytes.class)
        .output(outputPathLocation)
        .outputIsSeqFile()
        .set(SUPER_DOMAIN_FILE_PATH, superDomainListPath.toString())
        .reuseJVM(1000)
        .build();

    JobClient.runJob(job);
  }
}