package org.commoncrawl.service.crawler.filters;

import java.io.IOException;
import java.net.InetAddress;

import org.apache.hadoop.mapred.JobConf;
import org.commoncrawl.protocol.CrawlURLMetadata;

/**
 * Base class for crawler URL filters. A filter is backed by a file at
 * {@code _filterPath}, which can be published to and loaded from a job's
 * distributed cache, or loaded directly from a directory server.
 */
public abstract class Filter {

  /** Outcome of applying a filter to a single item. */
  public enum FilterResult {
    Filter_NoAction,
    Filter_Reject,
    Filter_Accept,
    Filter_Modified
  }

  protected String _filterPath = null;
  protected boolean _hasMasterFile = false;

  public Filter() {
  }

  public Filter(String filterPath, boolean hasMasterFile) {
    _filterPath = filterPath;
    _hasMasterFile = hasMasterFile;
  }

  /** Publishes the filter's backing file to the job's distributed cache. */
  public void publishFilter(JobConf job) throws IOException {
    if (_filterPath != null) {
      Utils.publishFilterToCache(job, this, _filterPath, _hasMasterFile);
    }
  }

  /** Loads the filter from the previously configured path via the directory server. */
  public void load(InetAddress directoryServerAddress) throws IOException {
    Utils.loadFilterFromPath(directoryServerAddress, this, _filterPath, _hasMasterFile);
  }

  /** Loads the filter from the given path, remembering the path for later reloads. */
  public void loadFromPath(InetAddress directoryServerAddress, String filterPath, boolean hasMasterFile)
      throws IOException {
    _filterPath = filterPath;
    _hasMasterFile = hasMasterFile;
    Utils.loadFilterFromPath(directoryServerAddress, this, filterPath, hasMasterFile);
  }

  /** Loads the filter's backing file from the job's distributed cache. */
  public void loadFromCache(JobConf job) throws IOException {
    if (_filterPath != null) {
      Utils.loadFilterFromCache(job, _filterPath, this);
    }
  }

  /** Parses a single line of the filter file. Subclasses override this to populate state. */
  public void loadFilterItem(String filterItemLine) throws IOException {
  }

  /** Applies the filter to a URL, optionally recording details in {@code results}. */
  public abstract FilterResult filterItem(String rootDomain, String domainName, String urlPath,
      CrawlURLMetadata metadata, FilterResults results);

  /** Resets the filter's internal state. */
  public abstract void clear();
}
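
// Illustrative sketch, not part of the original source: a minimal concrete
// subclass showing how the abstract hooks fit together. The class name
// (DomainBlockListFilter) and the one-domain-per-line file format are
// assumptions for illustration only. Package-private so the file still
// compiles with a single public class.
class DomainBlockListFilter extends Filter {

  private final java.util.Set<String> _blockedDomains = new java.util.HashSet<String>();

  DomainBlockListFilter(String filterPath, boolean hasMasterFile) {
    super(filterPath, hasMasterFile);
  }

  @Override
  public void loadFilterItem(String filterItemLine) throws IOException {
    // Assumed format: one root domain per line; blank lines and '#' comments skipped.
    String line = filterItemLine.trim();
    if (line.length() != 0 && !line.startsWith("#")) {
      _blockedDomains.add(line.toLowerCase());
    }
  }

  @Override
  public FilterResult filterItem(String rootDomain, String domainName, String urlPath,
      CrawlURLMetadata metadata, FilterResults results) {
    // Reject any URL whose root domain appears in the block list.
    if (rootDomain != null && _blockedDomains.contains(rootDomain.toLowerCase())) {
      return FilterResult.Filter_Reject;
    }
    return FilterResult.Filter_NoAction;
  }

  @Override
  public void clear() {
    _blockedDomains.clear();
  }
}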