/*
* Copyright 2011 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License"); you
* may not use this file except in compliance with the License. You
* may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*/
package org.archive.bacon.url;
import java.io.*;
import java.net.*;
import java.util.*;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.util.WrappedIOException;
/**
 * Pig EvalFunc which takes the first field of the input tuple as a
 * string, asks the IDNHelper for its domain, and returns that domain
 * converted to Unicode form via java.net.IDN.
 *
 * Results are memoized per input string in a size-bounded,
 * least-recently-used cache so that repeated inputs skip the lookup
 * without letting the cache grow without limit.
 *
 * NOTE(review): earlier documentation described the input as a URL,
 * while the code passes it unchanged to IDNHelper.getDomain() under
 * the name "hostname" -- confirm which form the helper expects.
 */
public class Domain extends EvalFunc<String>
{
  /**
   * Upper bound on the number of memoized results.  The value is an
   * arbitrary cap chosen to keep memory use bounded; it does not
   * affect the values returned by exec().
   */
  private static final int MAX_CACHE_ENTRIES = 100000;

  /**
   * LinkedHashMap kept in access order which drops its
   * least-recently-used entry once it holds more than maxEntries.
   */
  private static final class BoundedCache extends LinkedHashMap<String,String>
  {
    private static final long serialVersionUID = 1L;

    private final int maxEntries;

    BoundedCache( int maxEntries )
    {
      // 16 / 0.75f are the standard defaults; 'true' selects access order.
      super( 16, 0.75f, true );
      this.maxEntries = maxEntries;
    }

    @Override
    protected boolean removeEldestEntry( Map.Entry<String,String> eldest )
    {
      return size( ) > this.maxEntries;
    }
  }

  /** Helper built from the TLD rules resource; performs the domain lookup. */
  IDNHelper helper;

  /** Memoized results of exec(), keyed by the raw input string. */
  Map<String,String> cache = new BoundedCache( MAX_CACHE_ENTRIES );

  /**
   * Loads "effective_tld_names.dat" from the class loader of IDNHelper
   * and builds the helper from it.
   *
   * @throws IOException      if reading or closing the resource fails
   * @throws RuntimeException if the resource cannot be found
   */
  public Domain( )
    throws IOException
  {
    InputStream is = IDNHelper.class.getClassLoader( ).getResourceAsStream( "effective_tld_names.dat" );

    if ( is == null )
      {
        throw new RuntimeException( "Cannot load tld rules: effective_tld_names.dat" );
      }

    try
      {
        Reader r = new InputStreamReader( is, "utf-8" );

        // Assumes IDNHelper.build() consumes the reader before it
        // returns, so the stream can be released right afterwards.
        this.helper = IDNHelper.build( r );
      }
    finally
      {
        // Release the resource stream whether or not the build succeeded.
        is.close( );
      }
  }

  /**
   * Computes the domain for the first field of the given tuple.
   *
   * @param input tuple whose first field is cast to String
   * @return null if the tuple is null or empty; otherwise the domain
   *         in Unicode form, or the empty string if the helper could
   *         not determine a domain
   * @throws IOException wrapping any exception raised while
   *         processing the row
   */
  public String exec( Tuple input )
    throws IOException
  {
    if ( input == null || input.size() == 0 ) return null;

    try
      {
        String hostname = (String) input.get(0);

        // Serve repeated inputs from the cache.
        String cachedValue = this.cache.get( hostname );
        if ( cachedValue != null ) return cachedValue;

        String domain = this.helper.getDomain( hostname );

        // If domain cannot be determined, return empty string.
        if ( domain == null ) domain = "";

        // Ensure i18n domains are in Unicode format.
        domain = java.net.IDN.toUnicode( domain, java.net.IDN.ALLOW_UNASSIGNED );

        this.cache.put( hostname, domain );

        return domain;
      }
    catch ( Exception e )
      {
        throw WrappedIOException.wrap("Caught exception processing input row ", e);
      }
  }
}