package org.commoncrawl.service.crawler.filters;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Pattern;
import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.io.NIODNSCache;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.rpc.base.shared.RPCStruct;
import org.commoncrawl.service.crawler.filters.DomainFilterData;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.util.URLUtils;
import org.junit.Test;
import com.google.common.collect.Multimap;
import com.google.common.collect.TreeMultimap;
public class ReCrawlTimeModifierFilter extends Filter {
public ReCrawlTimeModifierFilter(String itemPath,boolean hasMasterFile) {
super(itemPath,hasMasterFile);
}
static class RecrawlTimeModifierItem implements Comparable<RecrawlTimeModifierItem> {
public RecrawlTimeModifierItem(String domainRegEx,String pathRegEx,long recrawlTimeValue) {
domainPatterStr = domainRegEx;
pathPatternStr = pathRegEx;
if (domainRegEx.length() != 0) {
this.domainPattern = Pattern.compile(domainRegEx);
}
this.pathPattern = Pattern.compile(pathRegEx);
this.recrawlTimeValue = recrawlTimeValue;
}
String domainPatterStr;
String pathPatternStr;
Pattern domainPattern = null;
Pattern pathPattern = null;
long recrawlTimeValue;
@Override
public int compareTo(RecrawlTimeModifierItem o) {
int result = domainPatterStr.compareTo(o.domainPatterStr);
if (result == 0) {
result = pathPatternStr.compareTo(o.pathPatternStr);
}
return result;
}
};
private Vector<RecrawlTimeModifierItem> globalBoostItems = new Vector<RecrawlTimeModifierItem>();
private TreeMultimap<String,RecrawlTimeModifierItem> domainToModifierListMap = TreeMultimap.create();
@Override
public void loadFilterItem(String filterItemLine) throws IOException {
String items[] = filterItemLine.split(",");
if (items.length == 4) {
String rootDomain = items[0];
String domainPattern = items[1];
String urlPattern = items[2];
long modifiedTimeValue = Long.parseLong(items[3]);
Map<String,RecrawlTimeModifierItem> modifierItemMap = null;
if (!rootDomain.equals("*") && !rootDomain.equals(".*")) {
RecrawlTimeModifierItem newItem = new RecrawlTimeModifierItem(domainPattern,urlPattern,modifiedTimeValue);
domainToModifierListMap.put(rootDomain, newItem);
}
else {
RecrawlTimeModifierItem boostItem = new RecrawlTimeModifierItem(domainPattern,urlPattern,modifiedTimeValue);
globalBoostItems.add(boostItem);
}
}
else {
throw new IOException("Invalid Boost Fileter Line:" + filterItemLine);
}
}
@Test
public void testFilter() throws Exception {
loadFilterItem("*,(^/$|(^/index\\.[^/]*$)),0");
loadFilterItem("twitter.com,/[^/]*,0");
CrawlURLMetadata metadata = new CrawlURLMetadata();
FilterResults resultsOut = new FilterResults();
filterItem("google.com","","/",metadata,resultsOut);
filterItem("twitter.com","","/foobar",metadata,resultsOut);
filterItem("kotay.com","","/index.html",metadata,resultsOut);
filterItem("kotay.com","","/index.php",metadata,resultsOut);
}
@Override
public FilterResult filterItem(String rootDomainName,String fullyQualifiedDomainName, String urlPath,CrawlURLMetadata metadataIn,FilterResults resultsOut) {
for (RecrawlTimeModifierItem globalBoostItem : globalBoostItems) {
if (globalBoostItem.pathPattern.matcher(urlPath).matches()) {
if (resultsOut.isFieldDirty(FilterResults.Field_MODIFIEDRECRAWLTIME))
resultsOut.setModifiedRecrawlTime(Math.min(resultsOut.getModifiedRecrawlTime(),globalBoostItem.recrawlTimeValue));
else
resultsOut.setModifiedRecrawlTime(globalBoostItem.recrawlTimeValue);
}
}
Set<RecrawlTimeModifierItem> items = domainToModifierListMap.get(rootDomainName);
for (RecrawlTimeModifierItem boostItem : items) {
if (boostItem.domainPattern == null || boostItem.domainPattern.matcher(fullyQualifiedDomainName).matches()) {
if (boostItem.pathPattern.matcher(urlPath).matches()) {
if (resultsOut.isFieldDirty(FilterResults.Field_MODIFIEDRECRAWLTIME))
resultsOut.setModifiedRecrawlTime(Math.min(resultsOut.getModifiedRecrawlTime(),boostItem.recrawlTimeValue));
else
resultsOut.setModifiedRecrawlTime(boostItem.recrawlTimeValue);
}
}
}
return (resultsOut.isFieldDirty(FilterResults.Field_MODIFIEDRECRAWLTIME)) ? FilterResult.Filter_Modified : FilterResult.Filter_NoAction;
}
@Override
public void clear() {
globalBoostItems.clear();
domainToModifierListMap.clear();
}
}