package org.commoncrawl.service.crawler.filters;
import static org.junit.Assert.*;
import java.io.IOException;
import java.net.InetAddress;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.commoncrawl.io.NIODNSCache;
import org.commoncrawl.protocol.CrawlURLMetadata;
import org.commoncrawl.service.crawler.filters.DomainFilterData;
import org.commoncrawl.service.crawler.filters.FilterResults;
import org.commoncrawl.service.crawler.filters.Filter.FilterResult;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.IPAddressUtils;
import org.commoncrawl.util.URLUtils;
import org.junit.Test;
/**
 * Filter that attaches an IP address hint to hosts matching a configured domain.
 *
 * <p>Filter items are loaded one line at a time via {@link #loadFilterItem(String)}; each
 * valid line registers a host name / IPv4 address pair in an internal {@link NIODNSCache}
 * and marks the resulting node as a super node. {@link #filterItem} then looks up the
 * normalized host name in that cache and, if a super node is found, copies the node's IP
 * address into the supplied {@link FilterResults}.
 *
 * <p>NOTE(review): the embedded test expects a lookup of a sub-domain (e.g.
 * {@code www.cnn.com}) to resolve to the node registered for its parent domain
 * ({@code cnn.com}); that matching logic lives in {@link NIODNSCache}, not in this class.
 */
public class IPAddressHintFilter extends Filter {

  private static final Log LOG = LogFactory.getLog(IPAddressHintFilter.class);

  /** Host name to IP address lookup structure; replaced wholesale by {@link #clear()}. */
  private NIODNSCache cache = new NIODNSCache();

  /** Creates a filter with an empty cache. */
  public IPAddressHintFilter() {
  }

  /**
   * Creates a filter, forwarding both arguments unchanged to the {@link Filter} constructor.
   *
   * @param filterPath    passed through to the superclass
   * @param hasMasterFile passed through to the superclass
   */
  public IPAddressHintFilter(String filterPath, boolean hasMasterFile) {
    super(filterPath, hasMasterFile);
  }

  /**
   * Looks up the given host in the cache and, on a super node match, records the node's
   * IP address as a hint in {@code results}.
   *
   * @param rootDomainName           not used by this filter
   * @param fullyQualifiedDomainName host name to look up; normalized before the lookup
   * @param urlPath                  not used by this filter
   * @param metadata                 not used by this filter
   * @param results                  receives the IP address hint when a match is found
   * @return {@link FilterResult#Filter_Modified} if a hint was set,
   *         {@link FilterResult#Filter_NoAction} otherwise
   */
  @Override
  public FilterResult filterItem(String rootDomainName, String fullyQualifiedDomainName, String urlPath,
      CrawlURLMetadata metadata, FilterResults results) {
    String normalizedName = URLUtils.normalizeHostName(fullyQualifiedDomainName, true);
    if (normalizedName == null) {
      return FilterResult.Filter_NoAction;
    }

    NIODNSCache.Node node = cache.findNode(normalizedName);
    // only nodes explicitly registered by this filter (super nodes) carry an address hint
    if (node != null && node.isSuperNode()) {
      results.setIpAddressHint(node.getIPAddress());
      return FilterResult.Filter_Modified;
    }
    return FilterResult.Filter_NoAction;
  }

  /**
   * Parses one filter line of the form {@code <domain>\t<ip address>} and registers the
   * domain in the cache as a super node.
   *
   * <p>A single leading {@code '.'} on the domain is stripped and the domain is normalized
   * with {@link URLUtils#normalizeHostName} before it is cached, so that the stored key
   * has the same form as the names looked up in {@link #filterItem}. Lines that do not
   * split into exactly two tab separated fields are ignored. Lines with an empty or
   * non-normalizable domain, or whose address cannot be resolved or stored, are logged
   * and skipped.
   *
   * @param filterItemLine the raw line to parse
   * @throws IOException declared by the overridden method; not thrown by this implementation
   */
  @Override
  public void loadFilterItem(String filterItemLine) throws IOException {
    if (filterItemLine.length() == 0) {
      LOG.error("filterItemLine is zero length");
      return;
    }

    String[] items = filterItemLine.split("\t");
    if (items.length != 2) {
      // not a <domain>\t<ip> pair; ignored, as before
      return;
    }

    String domainName = items[0];
    if (domainName.startsWith(".")) {
      domainName = domainName.substring(1);
    }
    if (domainName.length() == 0) {
      // e.g. a line that starts with a tab, or whose domain is just "."
      LOG.error("filterItemLine has an empty domain name:" + filterItemLine);
      return;
    }

    domainName = URLUtils.normalizeHostName(domainName, true);
    if (domainName == null) {
      LOG.error("unable to normalize domain name in filterItemLine:" + filterItemLine);
      return;
    }

    try {
      int ipAddress = IPAddressUtils.IPV4AddressToInteger(InetAddress.getByName(items[1]).getAddress());
      // key the entry by the normalized domain name (not the raw line) so that
      // filterItem's lookup of a normalized host name can actually find it
      cache.cacheIPAddressForHost(domainName, ipAddress, Long.MAX_VALUE, null).markAsSuperNode();
    } catch (Exception e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }

  /** Discards all loaded filter items by replacing the cache with a new, empty one. */
  @Override
  public void clear() {
    cache = new NIODNSCache();
  }

  /**
   * Checks that lookups resolve to the expected super node: an exact match wins,
   * a sub-domain resolves to its registered parent, and a host that merely shares a
   * suffix string ({@code ccnn.com} vs. {@code cnn.com}) does not match.
   */
  @Test
  public void testname() throws Exception {
    cache.cacheIPAddressForHost("cnn.com", 1, Long.MAX_VALUE, null).markAsSuperNode();
    cache.cacheIPAddressForHost("netscape.cnn.com", 2, Long.MAX_VALUE, null).markAsSuperNode();

    NIODNSCache.Node node1 = cache.findNode("netscape.cnn.com");
    NIODNSCache.Node node2 = cache.findNode("www.cnn.com");
    NIODNSCache.Node node3 = cache.findNode("cnn.com");
    NIODNSCache.Node node4 = cache.findNode("ccnn.com");

    assertTrue(node1 != null);
    assertTrue(node2 != null);
    assertTrue(node3 != null);
    assertTrue(node4 == null);

    assertTrue(node1.isSuperNode());
    assertTrue(node2.isSuperNode());
    assertTrue(node3.isSuperNode());

    assertTrue(node1.getIPAddress() == 2);
    assertTrue(node2.getIPAddress() == 1);
    assertTrue(node3.getIPAddress() == 1);
  }
}