package org.commoncrawl.service.crawler.filters;

import static org.junit.Assert.*;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Pattern;

import junit.framework.TestFailure;

import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.io.NIODNSCache;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.rpc.base.shared.RPCStruct;
import org.commoncrawl.service.crawler.filters.DomainFilterData;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.util.URLUtils;
import org.junit.Test;

import com.google.common.collect.TreeMultimap;

/**
 * Filter that rejects URLs whose host name and path match a configured pair of regular
 * expressions.
 *
 * <p>Each filter line has the form {@code rootDomain,hostRegEx,pathRegEx}:
 * <ul>
 *   <li>{@code rootDomain} - the root domain the rule is keyed under, or {@code *} / {@code .*}
 *       to make the rule apply regardless of root domain;</li>
 *   <li>{@code hostRegEx} - regular expression the fully qualified host name must match in full;
 *       an empty value means "any host";</li>
 *   <li>{@code pathRegEx} - regular expression the URL path must match in full. Only the first two
 *       commas are treated as separators, so this expression may itself contain commas.</li>
 * </ul>
 */
public class URLPatternBlockFilter extends Filter {

  /** Creates an empty filter with no rules loaded. */
  public URLPatternBlockFilter() {
  }

  /**
   * Creates a filter, forwarding both arguments unchanged to the {@link Filter} constructor.
   *
   * @param filterPath passed through to the superclass
   * @param hasMasterFile passed through to the superclass
   */
  public URLPatternBlockFilter(String filterPath, boolean hasMasterFile) {
    super(filterPath, hasMasterFile);
  }

  /**
   * One block rule: an optional host-name pattern plus a mandatory path pattern.
   *
   * <p>Instances are ordered by the host expression text, then by the path expression text. The
   * fields are final because instances are stored in a sorted multimap, whose ordering would be
   * corrupted if a stored element changed after insertion.
   */
  static class DomainURLPatternItem implements Comparable<DomainURLPatternItem> {

    /** Host-name regular expression as written in the filter line; may be empty. */
    final String domainRegEx;
    /** Compiled form of {@link #domainRegEx}, or {@code null} when that expression is empty. */
    final Pattern domainPatternObj;
    /** Path regular expression as written in the filter line. */
    final String pathRegEx;
    /** Compiled form of {@link #pathRegEx}. */
    final Pattern pathPatternObj;

    /**
     * @param domainRegEx host-name regular expression; an empty string disables host matching
     * @param pathRegEx path regular expression
     * @throws java.util.regex.PatternSyntaxException if either expression is not valid
     */
    public DomainURLPatternItem(String domainRegEx, String pathRegEx) {
      this.domainRegEx = domainRegEx;
      // An empty host expression is a wildcard, represented by a null pattern.
      this.domainPatternObj = (domainRegEx.length() != 0) ? Pattern.compile(domainRegEx) : null;
      this.pathRegEx = pathRegEx;
      this.pathPatternObj = Pattern.compile(pathRegEx);
    }

    @Override
    public int compareTo(DomainURLPatternItem o) {
      int result = domainRegEx.compareTo(o.domainRegEx);
      if (result == 0) {
        result = pathRegEx.compareTo(o.pathRegEx);
      }
      return result;
    }

    /** Equality is defined on the two expression strings, consistent with {@link #compareTo}. */
    @Override
    public boolean equals(Object other) {
      if (this == other) {
        return true;
      }
      if (!(other instanceof DomainURLPatternItem)) {
        return false;
      }
      DomainURLPatternItem that = (DomainURLPatternItem) other;
      return domainRegEx.equals(that.domainRegEx) && pathRegEx.equals(that.pathRegEx);
    }

    @Override
    public int hashCode() {
      return 31 * domainRegEx.hashCode() + pathRegEx.hashCode();
    }
  }

  /** Rules declared with a root domain of {@code *} or {@code .*}; checked for every URL. */
  private Vector<DomainURLPatternItem> globalPatternList = new Vector<DomainURLPatternItem>();
  /** Rules keyed by the root domain given in the filter line. */
  private TreeMultimap<String, DomainURLPatternItem> domainToPatternList = TreeMultimap.create();

  /**
   * Parses one {@code rootDomain,hostRegEx,pathRegEx} line and registers the resulting rule.
   *
   * @param filterItemLine the raw filter line
   * @throws IOException if the line does not contain at least two commas
   * @throws java.util.regex.PatternSyntaxException if either regular expression is not valid
   */
  @Override
  public void loadFilterItem(String filterItemLine) throws IOException {
    int firstComma = filterItemLine.indexOf(',');
    int secondComma = (firstComma == -1) ? -1 : filterItemLine.indexOf(',', firstComma + 1);

    // Both separators are required. A line with only one comma used to be dropped silently,
    // which made a malformed block rule disappear without any diagnostic.
    if (secondComma == -1) {
      throw new IOException("Invalid URL Pattern Block Filter Line:" + filterItemLine);
    }

    String rootDomain = filterItemLine.substring(0, firstComma);
    // Yields "" when the two commas are adjacent, which means "match any host".
    String subDomainRegExp = filterItemLine.substring(firstComma + 1, secondComma);
    // Everything after the second comma is the path expression, further commas included.
    String urlPattern = filterItemLine.substring(secondComma + 1);

    DomainURLPatternItem item = new DomainURLPatternItem(subDomainRegExp, urlPattern);
    boolean isGlobal = rootDomain.equals("*") || rootDomain.equals(".*");
    if (isGlobal) {
      globalPatternList.add(item);
    } else {
      domainToPatternList.put(rootDomain, item);
    }
  }

  /** Smoke test covering a host-specific rule, a domain-wide rule and a global rule. */
  @Test
  public void testFilter() throws Exception {
    loadFilterItem("google.com,photos.google.com,.*");
    loadFilterItem("biblio.com,,/review.php.*");
    loadFilterItem("*,,.*\\.gif");

    CrawlURLMetadata metadata = new CrawlURLMetadata();
    FilterResults resultsOut = new FilterResults();

    assertTrue(filterItem("biblio.com", "", "/review.php", metadata, resultsOut)
        == FilterResult.Filter_Reject);
    assertTrue(filterItem("google.com", "photos.google.com", "/foobar/zzzz", metadata, resultsOut)
        == FilterResult.Filter_Reject);
    assertFalse(filterItem("google.com", "", "/foobar/zzzz", metadata, resultsOut)
        == FilterResult.Filter_Reject);
    assertFalse(filterItem("google.com", "feeds.google.com", "/foobar/zzzz", metadata, resultsOut)
        == FilterResult.Filter_Reject);
    assertTrue(filterItem("twitter.com", "", "/foobar.gif", metadata, resultsOut)
        == FilterResult.Filter_Reject);
  }

  /**
   * Decides whether a URL is blocked. Global rules are consulted first, then the rules registered
   * under {@code rootDomainName}.
   *
   * @param rootDomainName key used to look up the domain-specific rules
   * @param fullyQualifiedDomainName host name tested against each rule's host expression
   * @param urlPath path tested against each rule's path expression
   * @param metadataIn not used by this filter
   * @param resultsOut not used by this filter
   * @return {@code Filter_Reject} if any applicable rule matches, otherwise {@code Filter_NoAction}
   */
  @Override
  public FilterResult filterItem(String rootDomainName, String fullyQualifiedDomainName,
      String urlPath, CrawlURLMetadata metadataIn, FilterResults resultsOut) {
    // Short-circuit keeps the original order: the per-domain lookup only happens when no
    // global rule matched.
    if (matchesAny(globalPatternList, fullyQualifiedDomainName, urlPath)
        || matchesAny(domainToPatternList.get(rootDomainName), fullyQualifiedDomainName, urlPath)) {
      return FilterResult.Filter_Reject;
    }
    return FilterResult.Filter_NoAction;
  }

  /**
   * Returns true if at least one rule accepts both the host name and the path. A rule without a
   * host pattern accepts every host; both patterns must match their entire input.
   */
  private static boolean matchesAny(Iterable<DomainURLPatternItem> items,
      String fullyQualifiedDomainName, String urlPath) {
    for (DomainURLPatternItem item : items) {
      boolean hostMatches = item.domainPatternObj == null
          || item.domainPatternObj.matcher(fullyQualifiedDomainName).matches();
      if (hostMatches && item.pathPatternObj.matcher(urlPath).matches()) {
        return true;
      }
    }
    return false;
  }

  /** Removes every loaded rule, global and domain-specific. */
  @Override
  public void clear() {
    globalPatternList.clear();
    domainToPatternList.clear();
  }

  /** Runs {@link #testFilter()} on a fresh instance and prints any failure to stderr. */
  public static void main(String[] args) {
    URLPatternBlockFilter filter = new URLPatternBlockFilter();
    try {
      filter.testFilter();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}