package org.commoncrawl.service.crawler.filters;
import java.io.IOException;
import org.commoncrawl.io.NIODNSCache;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.util.URLUtils;
/**
 * Filter backed by an in-memory {@link NIODNSCache} of domain names.
 *
 * <p>Every valid domain name read from the filter file is inserted into the
 * cache and flagged as a "super node". At filter time a URL is accepted when
 * the cache lookup for its root domain yields a node carrying that flag;
 * otherwise the filter takes no action.
 */
public class BigDomainListFilter extends Filter {

  /** Lookup structure holding the loaded domains; swapped out wholesale by {@link #clear()}. */
  NIODNSCache _cache = new NIODNSCache();

  /**
   * Creates the filter; both arguments are passed straight to the {@code Filter}
   * base-class constructor.
   *
   * @param filterFile    forwarded to the superclass
   * @param hasMasterFile forwarded to the superclass
   */
  public BigDomainListFilter(String filterFile, boolean hasMasterFile) {
    super(filterFile, hasMasterFile);
  }

  /** Discards all previously loaded domains by replacing the cache with an empty one. */
  @Override
  public void clear() {
    _cache = new NIODNSCache();
  }

  /**
   * Registers one line of the filter file. Lines that are not valid domain
   * names (per {@link URLUtils#isValidDomainName}) are silently skipped.
   *
   * @param filterItemLine candidate domain name
   * @throws IOException declared by the override; presumably propagated from the
   *         cache insert - confirm against NIODNSCache
   */
  @Override
  public void loadFilterItem(String filterItemLine) throws IOException {
    if (!URLUtils.isValidDomainName(filterItemLine)) {
      return;
    }
    // NOTE(review): the 1 / Long.MAX_VALUE / null arguments look like a placeholder
    // address, a never-expiring TTL and "no canonical name" - confirm against
    // NIODNSCache.cacheIPAddressForHost. Only the super-node flag is read back later.
    _cache
        .cacheIPAddressForHost(filterItemLine, 1, Long.MAX_VALUE, null)
        .markAsSuperNode();
  }

  /**
   * Accepts the item when the root domain resolves to a super node in the cache.
   * Only {@code rootDomain} is consulted; the remaining arguments are unused here.
   *
   * @param rootDomain key used for the cache lookup
   * @param domainName unused
   * @param urlPath    unused
   * @param metadata   unused
   * @param results    unused
   * @return {@code Filter_Accept} on a super-node match, {@code Filter_NoAction} otherwise
   */
  @Override
  public FilterResult filterItem(String rootDomain, String domainName, String urlPath,
      CrawlURLMetadata metadata, FilterResults results) {
    final NIODNSCache.Node match = _cache.findNode(rootDomain);
    // no entry, or an entry that is not flagged as a super node: leave the item alone
    if (match == null || !match.isSuperNode()) {
      return FilterResult.Filter_NoAction;
    }
    return FilterResult.Filter_Accept;
  }
}