/** * Copyright 2008 - CommonCrawl Foundation * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. * **/ package org.commoncrawl.service.dns; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.util.Vector; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.commoncrawl.protocol.CrawlURLMetadata; import org.commoncrawl.service.crawler.filters.FilterResults; import org.commoncrawl.service.crawler.filters.Filter; import org.commoncrawl.util.CCStringUtils; import org.junit.Test; /** * * @author rana * */ public class DNSRewriteFilter extends Filter { private static final Log LOG = LogFactory.getLog(DNSRewriteFilter.class); static class DNSRewriteItem { public enum TestType { Inclusion, Exclusion }; public TestType testType = TestType.Inclusion; public String tldName; public Pattern pattern; public String rewriteRule; } private Vector<DNSRewriteItem> rewriteItems = new Vector<DNSRewriteItem>(); public DNSRewriteFilter() { } public DNSRewriteFilter(String filterPath) { super(filterPath,false); } @Override public void clear() { rewriteItems.clear(); } @Override public void loadFilterItem(String filterItemLine) throws java.io.IOException { String parts[] = filterItemLine.split(","); if (parts.length == 3) { DNSRewriteItem rewriteItem = new DNSRewriteItem(); rewriteItem.tldName = parts[0]; String patternStr = parts[1]; if (patternStr.charAt(0) == '!') { rewriteItem.testType = DNSRewriteItem.TestType.Exclusion; patternStr = patternStr.substring(1); } try { rewriteItem.pattern = Pattern.compile(patternStr); } catch (PatternSyntaxException e) { throw new IOException("Pattern syntax exception parsing line:" + filterItemLine + "\nException:" + CCStringUtils.stringifyException(e)); } rewriteItem.rewriteRule = parts[2]; rewriteItems.add(rewriteItem); } else { throw new IOException("Invalid Filter Line:" + filterItemLine); } }; @Override public FilterResult filterItem(String rootDomainName,String fullyQualifiedDomainName, String urlPath,CrawlURLMetadata metadata, FilterResults results) { for (DNSRewriteItem item : rewriteItems) { if (rootDomainName.equals(item.tldName)) { Matcher matcher = item.pattern.matcher(fullyQualifiedDomainName); boolean matches = matcher.matches(); if (matches && item.testType == DNSRewriteItem.TestType.Inclusion) { StringBuffer finalString = new StringBuffer(); int searchIndexStart = 0; while (searchIndexStart != item.rewriteRule.length()) { int indexOfNextSlash = item.rewriteRule.indexOf('\\',searchIndexStart); if (indexOfNextSlash == -1) { finalString.append(item.rewriteRule.substring(searchIndexStart)); searchIndexStart = item.rewriteRule.length(); } else { if (indexOfNextSlash - searchIndexStart != 0) { finalString.append(item.rewriteRule.substring(searchIndexStart,indexOfNextSlash)); } searchIndexStart = indexOfNextSlash + 1; if (indexOfNextSlash + 1 != item.rewriteRule.length() && (item.rewriteRule.charAt(indexOfNextSlash+1) >= '1' && item.rewriteRule.charAt(indexOfNextSlash+1) <= '9')) { searchIndexStart++; int index = Integer.parseInt(item.rewriteRule.substring(indexOfNextSlash+1,indexOfNextSlash+2)); if (index < matcher.groupCount()) { finalString.append(matcher.group(index)); } else if (index == matcher.groupCount()) { finalString.append(rootDomainName); } else { LOG.error("Invalid group index specified in rewrite rule:" + index); return FilterResult.Filter_NoAction; } } else { finalString.append('\\'); } } } results.setRewrittenDomainName(finalString.toString()); return FilterResult.Filter_Modified; } else if (!matches && item.testType == DNSRewriteItem.TestType.Exclusion) { results.setRewrittenDomainName(item.rewriteRule); return FilterResult.Filter_Modified; } } } return FilterResult.Filter_NoAction; } @Test public void testFilter() throws Exception { FilterResults filterResults = new FilterResults(); loadFilterItem("blogspot.com,!(www\\.)(blogspot\\.com),blogspot.l.google.com"); loadFilterItem("deviantart.com,(.*)(deviantart\\.com),www.deviantart.com"); loadFilterItem("wordpress.com,!((www\\.)|(.*files\\.))(wordpress\\.com),lb.wordpress.com"); loadFilterItem("alibaba.com,(.*)(cn\\.alibaba\\.com),cn.alibaba.com"); loadFilterItem("alibaba.com,(.*)(blog\\.china\\.alibaba\\.com),blog.china.alibaba.com"); loadFilterItem("alibaba.com,(.*)(en\\.alibaba\\.com),minisite.alibaba.com"); loadFilterItem("43people.com,(.*)(43people\\.com),lb1.43people.com"); loadFilterItem("blog.co.uk,(.*)(blog\\.co\\.uk),blog.co.uk"); loadFilterItem("ning.com,!(www\\.)(ning\\.com),ning.com"); loadFilterItem("dtdns.net,(.*)(vernos\\.dtdns\\.net),vernos.dtdns.net"); loadFilterItem("typepad.com,!(www\\.)(typepad\\.com),members.typepad.com"); loadFilterItem("blog4ever.com,!(www\\.)(blog4ever\\.com),blog4ever.com"); loadFilterItem("yahoo.com,(.*)(zhan\\.cn\\.yahoo\\.com),cn.yahoo.com"); loadFilterItem("hi5.com,!(www\\.)(hi5\\.com),hi5.com"); loadFilterItem("nireblog.com,!(www\\.)(nireblog\\.com),members.nireblog.com"); loadFilterItem("blog.com,!(www\\.)(blog\\.com),members.blog.com"); loadFilterItem("4t.com,!(www\\.)(4t\\.com),members.4t.com"); loadFilterItem("wordpress.com,(.*)(files\\.wordpress\\.com),lb.files.wordpress.com"); loadFilterItem("somedomain.com,([^\\.]*)\\.(somedomain\\.com),\\2"); assertTrue(filterItem("blogspot.com","foobar.blogspot.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("blogspot.l.google.com")); filterResults.clear(); assertTrue(filterItem("blogspot.com","www.blogspot.com",null,null,filterResults) == FilterResult.Filter_NoAction); assertTrue(filterItem("somedomain.com","joe.somedomain.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("somedomain.com")); filterResults.clear(); assertTrue(filterItem("deviantart.com","animefangirl11.deviantart.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("www.deviantart.com")); filterResults.clear(); assertTrue(filterItem("wordpress.com","brianmschoedel.wordpress.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("lb.wordpress.com")); filterResults.clear(); assertTrue(filterItem("alibaba.com","wxjiugongge.cn.alibaba.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("cn.alibaba.com")); filterResults.clear(); assertTrue(filterItem("alibaba.com","petertoy.blog.china.alibaba.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("blog.china.alibaba.com")); filterResults.clear(); assertTrue(filterItem("alibaba.com","tomacamera.en.alibaba.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("minisite.alibaba.com")); filterResults.clear(); assertTrue(filterItem("blog.co.uk","jaspalsingh.blog.co.uk",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("blog.co.uk")); filterResults.clear(); assertTrue(filterItem("ning.com","alumnosieslaasuncion.ning.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("ning.com")); filterResults.clear(); assertTrue(filterItem("dtdns.net","ashleeandserena-com.vernos.dtdns.net",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("vernos.dtdns.net")); filterResults.clear(); assertTrue(filterItem("typepad.com","test.typepad.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("members.typepad.com")); filterResults.clear(); assertTrue(filterItem("blog4ever.com","member.blog4ever.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("blog4ever.com")); filterResults.clear(); assertTrue(filterItem("yahoo.com","renpinwangzi.zhan.cn.yahoo.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("cn.yahoo.com")); filterResults.clear(); assertTrue(filterItem("hi5.com","foobar.hi5.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("hi5.com")); filterResults.clear(); assertTrue(filterItem("blog.com","foobar.blog.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("members.blog.com")); filterResults.clear(); assertTrue(filterItem("4t.com","foobar.4t.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("members.4t.com")); filterResults.clear(); assertTrue(filterItem("wordpress.com","foobar.files.wordpress.com",null,null,filterResults) == FilterResult.Filter_Modified); assertTrue(filterResults.getRewrittenDomainName().equals("lb.files.wordpress.com")); } public static void main(String[] args) { DNSRewriteFilter filter = new DNSRewriteFilter(); try { filter.testFilter(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } }