package org.commoncrawl.service.crawler.filters; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.Vector; import java.util.regex.Pattern; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.mapred.JobConf; import org.commoncrawl.io.NIODNSCache; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.rpc.base.shared.RPCStruct; import org.commoncrawl.service.crawler.filters.DomainFilterData; import org.commoncrawl.service.crawler.filters.FilterResults; import org.commoncrawl.util.URLUtils; import org.junit.Test; import com.google.common.collect.TreeMultimap; public class PageRankBoostFilter extends Filter { private static final Log LOG = LogFactory.getLog(PageRankBoostFilter.class); public PageRankBoostFilter(String filterPath,boolean hasMasterFile) { super(filterPath,hasMasterFile); } static class PageRankBoostItem implements Comparable<PageRankBoostItem>{ public PageRankBoostItem(String domainRegEx,String pathRegEx,float urlBoostValue) { this.domainRegEx = domainRegEx; this.pathRegEx = pathRegEx; if (domainRegEx.length() != 0) { domainPatternObj = Pattern.compile(domainRegEx); } this.pathPatternObj = Pattern.compile(pathRegEx); this.urlBoostValue = urlBoostValue; } String domainRegEx; String pathRegEx; Pattern domainPatternObj = null; Pattern pathPatternObj; float urlBoostValue; @Override public int compareTo(PageRankBoostItem o) { int result = domainRegEx.compareTo(o.domainRegEx); if (result == 0) { result = pathRegEx.compareTo(o.pathRegEx); } return result; } }; private Vector<PageRankBoostItem> globalBoostItems = new Vector<PageRankBoostItem>(); private TreeMultimap<String,PageRankBoostItem> domainToBoostMap = TreeMultimap.create(); @Override public void loadFilterItem(String filterItemLine) throws IOException { String items[] = filterItemLine.split(","); if (items.length == 4) { String rootDomain = items[0]; String fullyQualifiedDomain = items[1]; String urlPattern = items[2]; float boostValue = Float.parseFloat(items[3]); Map<String,PageRankBoostItem> boostItemMap = null; if (!rootDomain.equals("*") && !rootDomain.equals(".*")) { domainToBoostMap.put(rootDomain,new PageRankBoostItem(fullyQualifiedDomain, urlPattern, boostValue)); } else { PageRankBoostItem boostItem = new PageRankBoostItem(fullyQualifiedDomain,urlPattern,boostValue); globalBoostItems.add(boostItem); } } else { LOG.error("Invalid Boost Fileter Line:" + filterItemLine); } } @Test public void testFilter() throws Exception { loadFilterItem("*,,.*,2.00"); loadFilterItem("google.com,,/foobar/.*,1.00"); loadFilterItem("twitter.com,,/[^/]*,1.00"); CrawlURLMetadata metadata = new CrawlURLMetadata(); FilterResults resultsOut = new FilterResults(); filterItem("google.com","","/foobar/zzzz",metadata,resultsOut); filterItem("twitter.com","","/foobar",metadata,resultsOut); } @Override public FilterResult filterItem(String rootDomainName,String fullyQualifiedDomainName,String urlPath,CrawlURLMetadata metadataIn,FilterResults resultsOut) { for (PageRankBoostItem globalBoostItem : globalBoostItems) { if (globalBoostItem.pathPatternObj.matcher(urlPath).matches()) { resultsOut.setPageRankBoostValue(resultsOut.getPageRankBoostValue() + globalBoostItem.urlBoostValue); } } Set<PageRankBoostItem> boostItemSet = domainToBoostMap.get(rootDomainName); for (PageRankBoostItem boostItem : boostItemSet) { if (boostItem.domainPatternObj == null || boostItem.domainPatternObj.matcher(fullyQualifiedDomainName).matches()) { if (boostItem.pathPatternObj.matcher(urlPath).matches()) { resultsOut.setPageRankBoostValue(resultsOut.getPageRankBoostValue() + boostItem.urlBoostValue); } } } return (resultsOut.isFieldDirty(FilterResults.Field_PAGERANKBOOSTVALUE)) ? FilterResult.Filter_Modified : FilterResult.Filter_NoAction; } @Override public void clear() { globalBoostItems.clear(); domainToBoostMap.clear(); } }