package org.commoncrawl.service.crawler.filters; import java.io.IOException; import java.util.Map; import java.util.TreeMap; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.service.crawler.filters.FilterResults; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; public class DomainHashFilter extends Filter { private TreeMap<Integer,Integer> _mapDomainHashToPos = new TreeMap<Integer,Integer>(); private int lastPosition = 0; public DomainHashFilter() { super(); } public DomainHashFilter(String filterPath,boolean hasMasterFile) { super(filterPath, hasMasterFile); } @Override public void loadFilterItem(String filterItemLine) throws IOException { if (filterItemLine.length() == 0) { // LOG.error("filterItemLine is zero length"); } else { // LOG.info("Processing Filter Line:" + filterItemLine); if (filterItemLine.charAt(0) == '.') filterItemLine = filterItemLine.substring(1); String items[] = filterItemLine.split(","); if (items.length >= 1) { _mapDomainHashToPos.put(URLFingerprint.generate32BitHostFP(items[0].toLowerCase()),++lastPosition); } } } public int getHashValueCount() { return _mapDomainHashToPos.size(); } @Override public void clear() { _mapDomainHashToPos.clear(); } @Override public FilterResult filterItem(String rootDomainName,String fullyQualifiedDomainName, String urlPath,CrawlURLMetadata metadata, FilterResults results) { int fingerprint = URLFingerprint.generate32BitHostFP(rootDomainName.toLowerCase()); Integer position = _mapDomainHashToPos.get(fingerprint); if (position != null) { results.setPosition(position); return FilterResult.Filter_Accept; } return FilterResult.Filter_Reject; } }