// Canonicalize.java example

/*
 * Copyright 2011 Internet Archive
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package org.archive.bacon.url;

import java.io.*;
import java.net.*;

import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.WrappedIOException;

import org.archive.wayback.util.url.AggressiveUrlCanonicalizer;

/**
 * Pig EvalFunc which calls the Wayback aggressive URL canonicalizer.
 * 
 * The Wayback aggressive URL canonicalizer does <em>not</em> strip
 * 'www.' from the front of many URLs (JIRA ACC-109), so we do that
 * ourselves, after the Wayback canonicalizer is finished.
 */ 
public class Canonicalize extends EvalFunc<String>
{
  // Compiled once instead of on every call to exec().
  private static final java.util.regex.Pattern REPEATED_AMPERSANDS = java.util.regex.Pattern.compile( "[&][&]+" );
  private static final java.util.regex.Pattern TRAILING_AMPERSAND  = java.util.regex.Pattern.compile( "[&]$" );
  private static final java.util.regex.Pattern WHITESPACE          = java.util.regex.Pattern.compile( "\\s" );

  /** Wayback canonicalizer that every input URL is passed through first. */
  AggressiveUrlCanonicalizer canonicalizer;

  /**
   * Creates the function with a fresh {@link AggressiveUrlCanonicalizer}.
   *
   * @throws IOException declared for callers; nothing in this constructor body throws it
   */
  public Canonicalize( )
    throws IOException
  {
    this.canonicalizer = new AggressiveUrlCanonicalizer();
  }

  /**
   * Canonicalizes the URL held in the first field of the tuple.
   *
   * The URL is run through the Wayback canonicalizer, a leading
   * 'www.' is removed from the host, the URL parts are normalized
   * (see {@link #normalizeUrl}) and finally whitespace is escaped or
   * removed.
   *
   * @param input tuple whose first field is the URL string
   * @return the canonicalized URL, or <code>null</code> if the tuple is
   *         null or empty, its first field is null, or the Wayback
   *         canonicalizer returns null
   * @throws IOException wrapping any exception raised while reading
   *         the field or running the Wayback canonicalizer
   */
  public String exec( Tuple input )
    throws IOException 
  {
    if ( input == null || input.size() == 0 ) return null;

    try
      {
        // A null field yields a null result rather than being handed
        // to the canonicalizer.
        Object field = input.get(0);
        if ( field == null ) return null;

        String c = this.canonicalizer.canonicalize( (String) field );

        // In some rare cases the Wayback canonicalizer can return null.
        if ( c == null ) return null;

        c = stripLeadingWww( c );
        c = normalizeUrl( c );

        return escapeWhitespace( c );
      }
    catch ( Exception e )
      {
        throw WrappedIOException.wrap("Caught exception processing input row ", e);
      }
  }

  /**
   * Removes a leading 'www.' from the host of an http or https URL
   * (JIRA ACC-109); any other string is returned unchanged.
   */
  private static String stripLeadingWww( String url )
  {
    if ( url.startsWith( "http://www." ) )
      {
        return "http://" + url.substring( "http://www.".length() );
      }
    if ( url.startsWith( "https://www." ) )
      {
        return "https://" + url.substring( "https://www.".length() );
      }
    return url;
  }

  /**
   * Best-effort normalization of host, path, query, fragment and port.
   *
   * If any step fails, the URL is returned as it stood after the last
   * step that completed: either the input itself, or the input with
   * its host converted to Punycode.
   */
  private static String normalizeUrl( String url )
  {
    String result = url;

    try
      {
        URL u = new URL( result );

        // Capture the fragment up front: the rebuilds below go through
        // getFile(), which does not include it.
        String fragment = u.getRef();

        // Ensure i18n domains are canonicalized into PunyCode.
        String host = u.getHost();
        if ( host != null )
          {
            String ahost = IDN.toASCII( host, IDN.ALLOW_UNASSIGNED );

            if ( ! host.equals( ahost ) )
              {
                u = new URL( u.getProtocol(),
                             ahost,
                             u.getPort(),
                             u.getFile() );

                // Fallback value should a later step fail; re-attach
                // the fragment that getFile() left out.
                result = ( fragment == null ) ? u.toString() : u.toString() + "#" + fragment;
              }
          }

        // Ensure http://example.org has trailing '/'
        String path = u.getPath();
        if ( path == null || path.isEmpty() ) path = "/";

        // Run the path through a decoder, then re-encode and normalize using the URI class.
        // NOTE(review): a path beginning with "//" may be re-parsed by
        // URI as an authority -- confirm against java.net.URI.
        path = path.replace( "+", "%2b" );  // Don't decode '+' into ' '.
        path = URLDecoder.decode( path, "utf-8" );
        URI uriPath = new URI( null, null, path, null );
        try
          {
            uriPath = uriPath.normalize();
          }
        catch ( InternalError ie )
          {
            System.err.println( "A-ha, triggers URI's InternalError: " + path );
          }
        path = uriPath.getRawPath();

        // Hacks for the query: strip multiple & and any trailing &
        String query = u.getQuery();
        if ( query != null )
          {
            query = "?" + query;
            query = REPEATED_AMPERSANDS.matcher( query ).replaceAll( "&" );
            query = TRAILING_AMPERSAND.matcher( query ).replaceAll( "" );
          }
        else
          {
            query = "";
          }

        // Hack for "#!/foo" stuff, a la twitter, etc.  Only such
        // fragments are kept; any other fragment is dropped.
        String ref = ( fragment != null && fragment.startsWith( "!" ) ) ? "#" + fragment : "";

        // If the port is explicitly set to the protocol's default
        // (80 for http, 443 for https), strip it.  This is done on the
        // base URL, before the fragment is attached, so the fragment
        // survives.
        URL base = u;
        if ( ("http" .equals(u.getProtocol()) && 80  == u.getPort()) ||
             ("https".equals(u.getProtocol()) && 443 == u.getPort()) )
          {
            base = new URL( u.getProtocol(),
                            u.getHost(),
                            -1,
                            u.getFile() );
          }

        // Now, rebuild the URL path + query + ref using the modified values.
        result = new URL( base, path + query + ref ).toString();
      }
    catch ( Throwable ignored )
      {
        // Deliberate best-effort: leave the canonicalized URI as it is.
      }

    return result;
  }

  /**
   * Ensures there is no whitespace in the URL: surrounding whitespace
   * is trimmed, ' ' becomes %20 and all other whitespace is removed.
   */
  private static String escapeWhitespace( String url )
  {
    String escaped = url.trim().replace( " ", "%20" );

    return WHITESPACE.matcher( escaped ).replaceAll( "" );
  }

}