package org.commoncrawl.service.crawler.filters;
import static org.junit.Assert.*;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.io.NIODNSCache;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.rpc.base.shared.RPCStruct;
import org.commoncrawl.service.crawler.filters.DomainFilterData;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.service.crawler.filters.Filter.FilterResult;
import org.commoncrawl.util.URLUtils;
import org.junit.Test;
import com.google.common.collect.SortedSetMultimap;
import com.google.common.collect.TreeMultimap;
public class CrawlRateOverrideFilter extends Filter {
private static final Log LOG = LogFactory.getLog(CrawlRateOverrideFilter.class);
public CrawlRateOverrideFilter() {
}
public CrawlRateOverrideFilter(String filterPath,boolean hasMasterFile) {
super(filterPath,hasMasterFile);
}
static class CrawlRateBoostItem implements Comparable<CrawlRateBoostItem> {
public CrawlRateBoostItem(String subDomainRegEx,String pathRegEx,int crawlRateValue) {
if (subDomainRegEx.length() != 0 && !subDomainRegEx.equals("*")) {
this.subDomainRegEx = Pattern.compile(subDomainRegEx);
}
if (!pathRegEx.equals("*")) {
this.pathRegEx = Pattern.compile(pathRegEx);
}
this.crawlRateValue = crawlRateValue;
}
Pattern subDomainRegEx = null;
Pattern pathRegEx = null;
int crawlRateValue;
@Override
public int compareTo(CrawlRateBoostItem o) {
if (crawlRateValue < o.crawlRateValue)
return -1;
else if (crawlRateValue > o.crawlRateValue)
return 1;
return 0;
}
};
private SortedSetMultimap<String,CrawlRateBoostItem> rootDomainToBoostMap = TreeMultimap.create();
// private Map<String,Map<String,CrawlRateBoostItem> > rootDomainToBoostMap = new HashMap<String,Map<String,CrawlRateBoostItem>>();
@Override
public void loadFilterItem(String filterItemLine) throws IOException {
String items[] = filterItemLine.split(",");
if (items.length == 4) {
String rootDomain = items[0];
String subDomainRegExp = items[1];
String pathRegExp = items[2];
int desiredCrawlRate = Integer.parseInt(items[3]);
LOG.info("Processing Valid Line. "
+ "RootDomain:" + rootDomain
+ " SubDomain:" + subDomainRegExp
+ " Path:" + pathRegExp
+ " CrawRate:" + desiredCrawlRate);
CrawlRateBoostItem boostItem = new CrawlRateBoostItem(subDomainRegExp,pathRegExp, desiredCrawlRate);
rootDomainToBoostMap.put(rootDomain, boostItem);
}
else {
LOG.error("Skipping invalid line:" + filterItemLine);
}
}
private static String[] testInputs = new String[] {
"amazon.de,*,*,50",
"amazon.com,*,*,50",
"amazon.ca,*,*,50",
"amazon.fr,*,*,50",
"amazon.co.jp,*,*,50",
"amazon.co.uk,*,*,50",
"barnesandnoble.com,*,*,1000",
"borders.com,*,*,1000",
"allbookstores.com,*,*,15000",
"booksamillion.com,*,*,5000",
"ebay.com,*,*,50",
"yelp.com,*,*,2500",
"tripadvisor.com,*,*,2500",
"yahoo.com,shopping.yahoo.com,*,50",
"books-by-isbn.com,*,*,1500",
"blogspot.com,*,*,10",
"wordpress.com,*,*,50"
};
@Override
public FilterResult filterItem(String rootDomain,String domainName, String urlPath,CrawlURLMetadata metadataIn,FilterResults resultsOut) {
SortedSet<CrawlRateBoostItem> items = rootDomainToBoostMap.get(rootDomain);
for (CrawlRateBoostItem item : items) {
if (item.subDomainRegEx == null || item.subDomainRegEx.matcher(domainName).matches()) {
if (item.pathRegEx == null || item.pathRegEx.matcher(urlPath).matches()) {
resultsOut.setCrawlRateOverride(item.crawlRateValue);
return FilterResult.Filter_Modified;
}
}
}
return FilterResult.Filter_NoAction;
}
@Override
public void clear() {
rootDomainToBoostMap.clear();
}
public static int checkForCrawlRateOverride(CrawlRateOverrideFilter filter,URL url) {
FilterResults resultsOut = new FilterResults();
String rootDomain = URLUtils.extractRootDomainName(url.getHost());
if (rootDomain != null) {
if (filter.filterItem(rootDomain,url.getHost(), url.getPath(), null, resultsOut) == FilterResult.Filter_Modified) {
return resultsOut.getCrawlRateOverride();
}
}
return -1;
}
public static void main(String[] args) {
CrawlRateOverrideFilter filter = new CrawlRateOverrideFilter();
for (String inputLine : testInputs) {
try {
filter.loadFilterItem(inputLine);
} catch (IOException e) {
e.printStackTrace();
}
}
try {
URL targetURL = new URL("http://chicagoconnie.blogspot.com/");
LOG.info("CrawlRate:" + checkForCrawlRateOverride(filter,targetURL));
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
for (String inputLine : testInputs) {
String parts[] = inputLine.split(",");
String rootDomain = parts[0];
String subDomain = parts[1];
String path = parts[2];
FilterResults filterResults = new FilterResults();
if (subDomain.equals("*")) {
//assertTrue(filter.filterItem(rootDomain,"www." + rootDomain, "/foobar", null, filterResults) == FilterResult.Filter_Modified);
//assertTrue(filter.filterItem(rootDomain+"Other","www." + rootDomain+"Other", "/foobar", null, filterResults) == FilterResult.Filter_NoAction);
}
else {
//assertTrue(filter.filterItem(rootDomain,subDomain, "/foobar", null, filterResults) == FilterResult.Filter_Modified);
//assertTrue(filter.filterItem(rootDomain,"prefix" + subDomain, "/foobar", null, filterResults) == FilterResult.Filter_NoAction);
}
}
}
}