/*
 * Copyright 2011 Internet Archive
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.archive.bacon.url;

import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;

/**
 * <p>
 * Helper class for handling (international) domain names which
 * determines which part of a fully-qualified hostname is the
 * domain, or "site".
 * </p>
 * <p>
 * It's designed to use the rules maintained by Mozilla and the Public
 * Suffix List project:
 * <ul>
 *   <li>http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/effective_tld_names.dat?raw=1</li>
 *   <li>http://publicsuffix.org/index.html</li>
 * </ul>
 * </p>
 * <p>
 * Typically, it is instantiated with the rules from the
 * <code>effective_tld_names.dat</code> file, but the rules can be
 * augmented for custom domain name determination.
 * For example, if a project wanted to treat each subdomain
 * under <code>blogger.com</code> as a separate domain, then
 * the rule could be added:
 * <pre>blogger.com</pre>
 * which would yield:
 * <pre>foo.blogger.com
 * bar.blogger.com
 * baz.blogger.com</pre>
 * as separate domains.  Without this rule, they would all be
 * collapsed into just <code>blogger.com</code>.
 * </p>
 * <p>
 * Host names are compared case-insensitively: hosts and all
 * non-regex rules are converted to their ASCII form and lower-cased
 * before they are stored or looked up.
 * </p>
 */
public class IDNHelper {

    /** Plain rules, stored in lower-cased ASCII form. */
    public Set<String> exact = new HashSet<String>();

    /** Rules that were written with a leading <code>!</code>, stored without it. */
    public Set<String> exclude = new HashSet<String>();

    /**
     * Compiled wildcard (<code>*.</code>) and regex (<code>~</code>) rules.
     * Kept in insertion order so that, when several patterns match the
     * same host, the rule that was added first always wins.
     */
    public Set<Pattern> wild = new LinkedHashSet<Pattern>();

    /**
     * Adds a single rule.  Four forms are recognized:
     * <ul>
     *   <li><code>*.suffix</code> - wildcard rule; matches hosts made of
     *       exactly two labels followed by <code>suffix</code>.  A bare
     *       <code>*.</code> is ignored.</li>
     *   <li><code>~regex</code> - the remainder is compiled as a regular
     *       expression that is matched against the whole (lower-cased,
     *       ASCII) host.</li>
     *   <li><code>!name</code> - exclude rule.  A bare <code>!</code> is
     *       ignored.</li>
     *   <li>anything else - exact rule.</li>
     * </ul>
     *
     * @param rule the rule text, without surrounding whitespace
     * @throws IllegalArgumentException if the rule cannot be converted by
     *         {@link IDN#toASCII(String)}
     * @throws PatternSyntaxException if a rule does not compile as a regex
     */
    public void addRule(String rule) {
        // Simple wildcard rules.
        if (rule.startsWith("*.")) {
            if (rule.length() < 3) {
                return;
            }
            String suffix = toRuleForm(rule.substring(2));

            // Transform the rule string into regex syntax: two arbitrary
            // labels, then the suffix with its dots taken literally.
            String regex = "[^.]+[.][^.]+[.]" + suffix.replace(".", "[.]");
            wild.add(Pattern.compile(regex));
            return;
        }

        // Full-blown regex rules: used verbatim, never case-folded.
        if (rule.startsWith("~")) {
            wild.add(Pattern.compile(rule.substring(1)));
            return;
        }

        // Exclude rules.
        if (rule.startsWith("!")) {
            if (rule.length() == 1) {
                return;
            }
            exclude.add(toRuleForm(rule.substring(1)));
            return;
        }

        // Exact rules.
        exact.add(toRuleForm(rule));
    }

    /**
     * Converts a rule's name part to the form used for lookups: ASCII
     * (punycode) encoding, lower-cased.  IDN.toASCII leaves the case of
     * all-ASCII labels untouched, hence the explicit lower-casing.
     */
    private static String toRuleForm(String name) {
        return IDN.toASCII(name).toLowerCase(Locale.ROOT);
    }

    /**
     * Adds rules from the given Reader.  Rules are expected to conform
     * to syntax in Mozilla's effective_tld_names.txt document.
     * Each line is trimmed; empty lines and lines starting with
     * <code>//</code> are skipped.  The reader is not closed.
     *
     * @param r source of rule lines
     * @throws IOException if reading from <code>r</code> fails
     */
    public void addRules(Reader r) throws IOException {
        BufferedReader reader = new BufferedReader(r);

        String line;
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.length() == 0 || line.startsWith("//")) {
                continue;
            }
            this.addRule(line);
        }
    }

    /**
     * Return the domain of the given url, according to the rules added
     * to the IDNHelper.
     *
     * @param u the URL whose host is examined
     * @return the domain, or <code>null</code> if <code>u</code> is
     *         <code>null</code> or the domain cannot be determined
     */
    public String getDomain(URL u) {
        if (u == null) {
            return null;
        }
        return getDomain(u.getHost());
    }

    /**
     * Return the domain of the given host string, according to the
     * rules added to the IDNHelper.  The input host string is expected
     * to be a valid fully-qualified hostname, such as those returned by
     * URL.getHost().  The host is matched case-insensitively and the
     * result is in lower-cased ASCII form.
     *
     * Returns <code>null</code> if domain cannot be determined.
     *
     * @param host the host name; may be <code>null</code>
     * @return the domain, or <code>null</code>
     */
    public String getDomain(String host) {
        if (host == null) {
            return null;
        }
        try {
            host = IDN.toASCII(host, IDN.ALLOW_UNASSIGNED).toLowerCase(Locale.ROOT);
        } catch (Exception e) {
            // Deliberately best-effort: a host that cannot be converted
            // simply has no determinable domain.
            return null;
        }

        // Strip one leading label per iteration, so the longest matching
        // suffix is found first.
        int dot;
        while ((dot = host.indexOf('.')) != -1) {
            String parent = host.substring(dot + 1);

            if (exact.contains(parent)) {
                return host;
            }
            if (exclude.contains(parent)) {
                return parent;
            }
            if (exclude.contains(host)) {
                return host;
            }

            for (Pattern p : wild) {
                Matcher m = p.matcher(host);
                if (m.matches()) {
                    // A regex rule may designate the domain with its
                    // first capturing group.
                    if (m.groupCount() > 0) {
                        return m.group(1);
                    }
                    return host;
                }
            }

            host = parent;
        }

        return null;
    }

    /**
     * Constructs a new IDNHelper object, populating it with rules from
     * given Reader.  Rules are expected to be in the same form as
     * Mozilla's effective_tld_names.dat file.
     *
     * @param reader source of rule lines; not closed by this method
     * @return the populated helper
     * @throws IOException if reading from <code>reader</code> fails
     */
    public static IDNHelper build(Reader reader) throws IOException {
        IDNHelper helper = new IDNHelper();
        helper.addRules(reader);
        return helper;
    }

    /**
     * Command-line test driver.  The first argument is the rules file
     * (read as UTF-8); each following argument is a URL, or
     * <code>-</code> to read URLs from stdin, one per line.
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 2 || args[0].equals("-h") || args[0].equals("--help")) {
            usage();
            System.exit(0);
        }

        IDNHelper helper;
        Reader rules = new InputStreamReader(new FileInputStream(args[0]), "utf-8");
        try {
            helper = build(rules);
        } finally {
            // Release the rules file once it has been loaded.
            rules.close();
        }

        for (int i = 1; i < args.length; i++) {
            if (args[i].equals("-")) {
                BufferedReader stdin =
                    new BufferedReader(new InputStreamReader(System.in, "utf-8"));

                String line;
                while ((line = stdin.readLine()) != null) {
                    line = line.trim();
                    if (line.length() == 0 || line.startsWith("#") || line.startsWith("//")) {
                        continue;
                    }
                    printDomain(helper, line);
                }
            } else {
                printDomain(helper, args[i]);
            }
        }
    }

    /** Prints the domain of the given URL text, a tab, then the URL text itself. */
    private static void printDomain(IDNHelper helper, String url) throws MalformedURLException {
        URL u = new URL(url);
        System.out.println(helper.getDomain(u.getHost()) + "\t" + url);
    }

    /** Prints command-line usage to standard output. */
    public static void usage() {
        System.out.println("IDNHelper <rules> <url>...");
        System.out.println("  Load rules and emit domain for given URLs");
        System.out.println("  If <url> is '-' then URLs will be read from stdin.");
    }
}