package org.commoncrawl.service.crawler.filters; import static org.junit.Assert.*; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.ByteArrayOutputStream; import java.io.DataOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.StringReader; import java.net.InetAddress; import java.nio.charset.Charset; import java.nio.charset.CharsetEncoder; import java.util.HashSet; import java.util.Set; import java.util.TreeSet; import java.io.InputStreamReader; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.mapred.JobConf; import org.commoncrawl.crawl.common.internal.CrawlEnvironment; import org.commoncrawl.io.NIODNSCache; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.rpc.base.shared.RPCStruct; import org.commoncrawl.service.crawler.filters.DomainFilterData; import org.commoncrawl.service.crawler.filters.FilterResults; import org.commoncrawl.service.directory.BlockingClient; import org.commoncrawl.util.URLFingerprint; import org.commoncrawl.util.URLUtils; import org.junit.Test; import com.google.common.collect.ImmutableSet; public class DomainFilter extends URLPatternBlockFilter { private static final Log LOG = LogFactory.getLog(DomainFilter.class); private DomainFilterData filterDataObject = new DomainFilterData(); public DomainFilter(int filterType) { filterDataObject.setFilterType((byte)filterType); } public DomainFilter(int filterType,String filterPath,boolean hasMasterFile) { super(filterPath,hasMasterFile); filterDataObject.setFilterType((byte)filterType); } @Override public FilterResult filterItem(String rootDomain,String fullyQualifiedDomain,String urlText, CrawlURLMetadata metadata,FilterResults results) { if (super.filterItem(rootDomain, fullyQualifiedDomain, "*", metadata, results) == FilterResult.Filter_Reject) { if (filterDataObject.getFilterType() == DomainFilterData.Type.Type_ExlusionFilter) return FilterResult.Filter_Reject; else return FilterResult.Filter_Accept; } return FilterResult.Filter_NoAction; } @Override public void loadFilterItem(String filterItemLine) throws IOException { // LOG.info("Processing Filter Line:" + filterItemLine); int indexOfFirstComma = filterItemLine.indexOf(','); if (indexOfFirstComma != -1) { int indexOfNextComma = filterItemLine.indexOf(',',indexOfFirstComma + 1); if (indexOfNextComma != -1) { String rootDomain = filterItemLine.substring(0,indexOfFirstComma); String subDomainRegExp = ""; if (indexOfNextComma - indexOfFirstComma > 1) { subDomainRegExp = filterItemLine.substring(indexOfFirstComma + 1,indexOfNextComma); } super.loadFilterItem(rootDomain + "," + subDomainRegExp + ",.*"); } } } }