package org.commoncrawl.service.crawler.filters;
import java.io.IOException;
import java.util.Set;
import java.util.TreeSet;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.util.FPGenerator;
import org.commoncrawl.util.URLFingerprint;
import org.commoncrawl.util.URLUtils;
/**
 * Filter that accepts items belonging to a loaded set of domains.
 *
 * <p>Each filter line is normalized to a host name and indexed three ways so
 * callers can probe by whichever key they hold: the normalized name, a 32-bit
 * host fingerprint, or a 64-bit fingerprint. A match yields
 * {@code FilterResult.Filter_Accept}; anything else yields
 * {@code FilterResult.Filter_NoAction}.
 */
public class SuperDomainFilter extends Filter {

  /** 32-bit host fingerprints of every loaded domain. */
  Set<Integer> _validFingerprints = new TreeSet<Integer>();
  /** Normalized host names of every loaded domain. */
  Set<String> _validNames = new TreeSet<String>();
  /** 64-bit fingerprints of every loaded domain. */
  Set<Long> _validV2Fingerprints = new TreeSet<Long>();

  /**
   * Creates an empty filter; entries are added through
   * {@link #loadFilterItem(String)}.
   */
  public SuperDomainFilter() {
  }

  /**
   * Creates a filter bound to the given filter file.
   *
   * @param filterFile filter file location, passed to the superclass constructor
   */
  public SuperDomainFilter(String filterFile) {
    // NOTE(review): the meaning of the boolean flag and whether the superclass
    // loads the file during construction are not visible here -- confirm
    // against Filter.
    super(filterFile, false);
  }

  /**
   * Removes every loaded domain from all three indexes.
   */
  @Override
  public void clear() {
    // The three sets are populated together in loadFilterItem, so they must be
    // emptied together; otherwise filterItem would keep accepting names that
    // the fingerprint lookups no longer know about.
    _validNames.clear();
    _validFingerprints.clear();
    _validV2Fingerprints.clear();
  }

  /**
   * Registers one domain from a filter line. Lines for which host-name
   * normalization returns {@code null} are ignored.
   *
   * @param filterItemLine raw line containing a domain name
   * @throws IOException declared by the overridden method; not thrown directly here
   */
  @Override
  public void loadFilterItem(String filterItemLine) throws IOException {
    String domainName = URLUtils.normalizeHostName(filterItemLine, false);
    if (domainName == null) {
      return;
    }
    _validNames.add(domainName);
    _validFingerprints.add(URLFingerprint.generate32BitHostFP(domainName));
    _validV2Fingerprints.add(FPGenerator.std64.fp(domainName));
  }

  /**
   * Looks up a domain by its 32-bit host fingerprint.
   *
   * @param hashId 32-bit host fingerprint to test
   * @return {@code Filter_Accept} if the fingerprint was loaded, otherwise {@code Filter_NoAction}
   */
  public FilterResult filterItemByHashId(int hashId) {
    return _validFingerprints.contains(hashId) ? FilterResult.Filter_Accept : FilterResult.Filter_NoAction;
  }

  /**
   * Looks up a domain by its 64-bit fingerprint.
   *
   * @param hashId 64-bit fingerprint to test
   * @return {@code Filter_Accept} if the fingerprint was loaded, otherwise {@code Filter_NoAction}
   */
  public FilterResult filterItemByHashIdV2(long hashId) {
    return _validV2Fingerprints.contains(hashId) ? FilterResult.Filter_Accept : FilterResult.Filter_NoAction;
  }

  /**
   * Looks up a domain by name. Only {@code rootDomain} is consulted; the
   * remaining arguments are unused by this filter.
   *
   * @param rootDomain root domain to test; {@code null} never matches
   * @param domainName unused
   * @param urlPath unused
   * @param metadata unused
   * @param results unused
   * @return {@code Filter_Accept} if {@code rootDomain} was loaded, otherwise {@code Filter_NoAction}
   */
  @Override
  public FilterResult filterItem(String rootDomain, String domainName, String urlPath, CrawlURLMetadata metadata, FilterResults results) {
    // A naturally ordered TreeSet throws NullPointerException on contains(null),
    // so treat a missing root domain as "no match".
    if (rootDomain != null && _validNames.contains(rootDomain)) {
      return FilterResult.Filter_Accept;
    }
    return FilterResult.Filter_NoAction;
  }
}