/* * Copyright 2011 Internet Archive * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You * may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.archive.bacon.url; import java.io.*; import java.net.*; import org.apache.pig.EvalFunc; import org.apache.pig.data.Tuple; import org.apache.pig.impl.util.WrappedIOException; import org.archive.wayback.util.url.AggressiveUrlCanonicalizer; /** * Pig EvalFunc which calls the Wayback aggressive URL canonicalizer. * * The Wayback aggressive URL canonicalizer does <em>not</em> strip * 'www.' from the front of many URLs (JIRA ACC-109), so we do that * ourselves, after the Wayback canonicalizer is finished. */ public class Canonicalize extends EvalFunc<String> { AggressiveUrlCanonicalizer canonicalizer; public Canonicalize( ) throws IOException { this.canonicalizer = new AggressiveUrlCanonicalizer(); } public String exec( Tuple input ) throws IOException { if ( input == null || input.size() == 0 ) return null; try { String c = this.canonicalizer.canonicalize( (String) input.get(0) ); // In some rare cases the Wayback canonicalizer can return null. if ( c == null ) return null; // See JIRA ACC-109 if ( c.length() > 10 ) { if ( c.startsWith("http://www.") ) { c = "http://" + c.substring(11); } else if ( c.startsWith("https://www.") ) { c = "https://" + c.substring(12); } } // Ensure i18n domains are canonicalized into PunyCode. try { URL u = new URL(c); String host = u.getHost(); if ( host != null ) { String ahost = IDN.toASCII( host, java.net.IDN.ALLOW_UNASSIGNED ); if ( ! host.equals( ahost ) ) { u = new URL( u.getProtocol(), ahost, u.getPort(), u.getFile() ); c = u.toString(); } } // Ensure http://example.org has trailing '/' String path = u.getPath() == "" ? "/" : u.getPath(); // Run the path through a decoder, then re-encode and normalize using the URI class. path = path.replaceAll( "[+]", "%2b" ); // Don't decode '+' into ' '. path = URLDecoder.decode( path, "utf-8" ); URI uriPath = new URI( null, null, path, null ); try { uriPath = uriPath.normalize(); } catch ( InternalError ie ) { System.err.println( "A-ha, triggers URI's InternalError: " + path ); } path = uriPath.getRawPath(); // Hacks for the query String query = u.getQuery(); if ( query != null ) { // Strip multiple & and any trailing & query = "?" + query; query = query.replaceAll( "[&][&]+", "&" ); query = query.replaceAll( "[&]$", "" ); } else { query = ""; } // Hack for "#!/foo" stuff, a la twitter, etc. String ref = u.getRef() != null && u.getRef().startsWith( "!" ) ? u.getRef() : null; if ( ref != null ) { ref = "#" + ref; } else { ref = ""; } // Now, rebuild the URL path + query + ref using the modified values. u = new URL( u, path + query + ref ); // If the protocol is http and the port is explicitly set // as "80", strip out the explicit port. if ( ("http" .equals(u.getProtocol()) && 80 == u.getPort()) || ("https".equals(u.getProtocol()) && 443 == u.getPort()) ) { u = new URL( u.getProtocol(), u.getHost(), -1, u.getFile() ); } c = u.toString(); } catch ( Throwable t ) { // Do nothing, leave the canonicalized URI as it is. } if ( c == null ) return null; // Last, but not least, ensure no whitespace in the URL // Change ' ' to %20 but remove all other whitespace. c = c.trim(); c = c.replaceAll( "[ ]", "%20" ); c = c.replaceAll( "\\s", "" ); return c; } catch ( Exception e ) { throw WrappedIOException.wrap("Caught exception processing input row ", e); } } }