/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.tuple.hadoop;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import cascading.CascadingException;
import cascading.tuple.Comparison;
import cascading.tuple.IndexTuple;
import cascading.tuple.Tuple;
import cascading.tuple.TupleException;
import cascading.tuple.TuplePair;
import cascading.util.Util;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.serializer.Deserializer;
import org.apache.hadoop.io.serializer.Serialization;
import org.apache.hadoop.io.serializer.SerializationFactory;
import org.apache.hadoop.io.serializer.Serializer;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
/**
* Class TupleSerialization is an implementation of Hadoop's {@link Serialization} interface.
* <p/>
* Typically developers will not use this implementation directly as it is automatically added
* to any relevant MapReduce jobs via the {@link JobConf}.
* <p/>
* By default, all primitive types are natively handled, and {@link org.apache.hadoop.io.BytesWritable}
* has a pre-configured serialization token since byte arrays are not handled natively by {@link Tuple}.
*/
@SerializationToken(
tokens = {127},
classNames = {"org.apache.hadoop.io.BytesWritable"})
public class TupleSerialization extends Configured implements Serialization
{
/** Field LOG */
private static final Logger LOG = Logger.getLogger( TupleSerialization.class );
/** Field classCache */
private Map<String, Class> classCache = new HashMap<String, Class>();
/** Field serializationFactory */
private SerializationFactory serializationFactory;
/** Field tokenClassesMap */
private HashMap<Integer, String> tokenClassesMap;
/** Field classesTokensMap */
private HashMap<String, Integer> classesTokensMap;
/** Field tokensSize */
private long tokensSize = 0;
/**
* Adds the given token and className pair as a serialization token property. During object serialization and deserialization,
* the given token will be used instead of the className when an instance of the className is encountered.
*
* @param properties of type Map
* @param token of type int
* @param className of type String
*/
public static void addSerializationToken( Map<Object, Object> properties, int token, String className )
{
String tokens = getSerializationTokens( properties );
properties.put( "cascading.serialization.tokens", Util.join( ",", Util.removeNulls( tokens, token + "=" + className ) ) );
}
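/*
 * Illustrative sketch: registering tokens for custom types before the job configuration is
 * built. The property accumulates a comma separated list of "token=className" pairs, which
 * initTokenMaps() parses at runtime. The class names below are placeholders, not real types.
 *
 *   Map<Object, Object> properties = new HashMap<Object, Object>();
 *   TupleSerialization.addSerializationToken( properties, 129, "com.example.CustomWritable" );
 *   TupleSerialization.addSerializationToken( properties, 130, "com.example.OtherWritable" );
 *   // "cascading.serialization.tokens" now holds "129=com.example.CustomWritable,130=com.example.OtherWritable"
 */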
/**
* Returns the serialization tokens property.
*
* @param properties of type Map
* @return returns a String
*/
public static String getSerializationTokens( Map<Object, Object> properties )
{
return (String) properties.get( "cascading.serialization.tokens" );
}
static String getSerializationTokens( JobConf jobConf )
{
return jobConf.get( "cascading.serialization.tokens" );
}
/**
* Adds the given className as a Hadoop IO serialization class.
*
* @param properties of type Map
* @param className of type String
*/
public static void addSerialization( Map<Object, Object> properties, String className )
{
String serializations = (String) properties.get( "io.serializations" );
properties.put( "io.serializations", Util.join( ",", Util.removeNulls( serializations, className ) ) );
}
/**
* Adds this class as a Hadoop Serialization class. This method is safe to call redundantly.
*
* @param jobConf of type JobConf
*/
public static void setSerializations( JobConf jobConf )
{
String serializations = getSerializations( jobConf );
if( serializations.contains( TupleSerialization.class.getName() ) )
return;
jobConf.set( "io.serializations", Util.join( ",", Util.removeNulls( serializations, TupleSerialization.class.getName() ) ) );
}
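/*
 * Illustrative sketch: adding a custom Hadoop Serialization via properties, then ensuring
 * TupleSerialization itself is present on a JobConf's "io.serializations" list.
 * "com.example.CustomSerialization" is a placeholder class name.
 *
 *   Map<Object, Object> properties = new HashMap<Object, Object>();
 *   TupleSerialization.addSerialization( properties, "com.example.CustomSerialization" );
 *
 *   JobConf jobConf = new JobConf();
 *   TupleSerialization.setSerializations( jobConf ); // safe to call more than once
 */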
static String getSerializations( JobConf jobConf )
{
return jobConf.get( "io.serializations", "" );
}
/** Constructor TupleSerialization creates a new TupleSerialization instance. */
public TupleSerialization()
{
}
/**
* Constructor TupleSerialization creates a new TupleSerialization instance.
*
* @param conf of type Configuration
*/
public TupleSerialization( Configuration conf )
{
super( conf );
}
@Override
public Configuration getConf()
{
if( super.getConf() == null )
setConf( new JobConf() );
return super.getConf();
}
SerializationFactory getSerializationFactory()
{
if( serializationFactory == null )
serializationFactory = new SerializationFactory( getConf() );
return serializationFactory;
}
/** Must be called before the {@link #getClassNameFor(int)} and {@link #getTokenFor(String)} methods are used. */
void initTokenMaps()
{
if( tokenClassesMap != null )
return;
tokenClassesMap = new HashMap<Integer, String>();
classesTokensMap = new HashMap<String, Integer>();
String tokenProperty = getSerializationTokens( (JobConf) getConf() );
if( tokenProperty != null )
{
tokenProperty = tokenProperty.replaceAll( "\\s", "" ); // allow for whitespace in token set
for( String pair : tokenProperty.split( "," ) )
{
String[] elements = pair.split( "=" );
addToken( null, Integer.parseInt( elements[ 0 ] ), elements[ 1 ] );
}
}
String serializationsString = getSerializations( (JobConf) getConf() );
if( serializationsString == null || serializationsString.length() == 0 )
{
// still record any property registered tokens before bailing out
tokensSize = tokenClassesMap.size();
return;
}
String[] serializations = serializationsString.split( "," );
for( String serializationName : serializations )
{
try
{
Class type = getConf().getClassByName( serializationName );
SerializationToken tokenAnnotation = (SerializationToken) type.getAnnotation( SerializationToken.class );
if( tokenAnnotation == null )
continue;
if( tokenAnnotation.tokens().length != tokenAnnotation.classNames().length )
throw new CascadingException( "serialization annotation tokens and classNames must be the same length" );
int[] tokens = tokenAnnotation.tokens();
for( int i = 0; i < tokens.length; i++ )
addToken( type, tokens[ i ], tokenAnnotation.classNames()[ i ] );
}
catch( ClassNotFoundException exception )
{
LOG.warn( "unable to load serialization class: " + serializationName, exception );
}
}
tokensSize = tokenClassesMap.size();
}
private void addToken( Class type, int token, String className )
{
if( type != null && !type.getName().startsWith( "cascading." ) && token < 128 )
throw new CascadingException( "serialization annotation tokens may not be less than 128, was: " + token );
if( tokenClassesMap.containsKey( token ) )
{
if( type == null )
throw new IllegalStateException( "duplicate serialization token: " + token + " for class: " + className + " found in properties" );
throw new IllegalStateException( "duplicate serialization token: " + token + " for class: " + className + " on serialization: " + type.getName() );
}
if( classesTokensMap.containsKey( className ) )
{
if( type == null )
throw new IllegalStateException( "duplicate serialization classname: " + className + " for token: " + token + " found in properties " );
throw new IllegalStateException( "duplicate serialization classname: " + className + " for token: " + token + " on serialization: " + type.getName() );
}
tokenClassesMap.put( token, className );
classesTokensMap.put( className, token );
}
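/*
 * Illustrative sketch: a third-party Serialization class may declare tokens with the
 * @SerializationToken annotation, mirroring the declaration on this class. Per the checks
 * above and in initTokenMaps(), tokens declared by serializations outside the "cascading."
 * packages must be 128 or greater, and the tokens and classNames arrays must be the same
 * length. All names below are placeholders.
 *
 *   @SerializationToken(
 *     tokens = {140, 141},
 *     classNames = {"com.example.TypeA", "com.example.TypeB"})
 *   public class CustomSerialization extends Configured implements Serialization
 *     {
 *     ...
 *     }
 */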
/**
* Returns the className for the given token.
*
* @param token of type int
* @return a String
*/
final String getClassNameFor( int token )
{
if( tokensSize == 0 )
return null;
return tokenClassesMap.get( token );
}
/**
* Returns the token for the given className.
*
* @param className of type String
* @return an Integer
*/
final Integer getTokenFor( String className )
{
if( tokensSize == 0 )
return null;
return classesTokensMap.get( className );
}
public Comparator getComparator( Class type )
{
Serialization serialization = getSerialization( type );
if( serialization instanceof Comparison )
return ( (Comparison) serialization ).getComparator( type );
return null;
}
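/*
 * Illustrative sketch: a registered Serialization may also implement cascading.tuple.Comparison
 * to provide the Comparator returned above for its custom types. Names are placeholders.
 *
 *   public class CustomSerialization extends Configured implements Serialization, Comparison
 *     {
 *     public Comparator getComparator( Class type )
 *       {
 *       return new CustomComparator(); // hypothetical Comparator implementation
 *       }
 *     ...
 *     }
 */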
Serialization getSerialization( String className )
{
return getSerialization( getClass( className ) );
}
Serialization getSerialization( Class type )
{
return getSerializationFactory().getSerialization( type );
}
Serializer getNewSerializer( Class type )
{
try
{
return getSerializationFactory().getSerializer( type );
}
catch( NullPointerException exception )
{
// the factory surfaces an NPE when no registered Serialization accepts the type; rethrow with the cause attached
throw new CascadingException( "unable to load serializer for: " + type.getName() + " from: " + getSerializationFactory().getClass().getName(), exception );
}
}
Deserializer getNewDeserializer( String className )
{
try
{
return getSerializationFactory().getDeserializer( getClass( className ) );
}
catch( NullPointerException exception )
{
// the factory surfaces an NPE when no registered Serialization accepts the type; rethrow with the cause attached
throw new CascadingException( "unable to load deserializer for: " + className + " from: " + getSerializationFactory().getClass().getName(), exception );
}
}
TuplePairDeserializer getTuplePairDeserializer()
{
return new TuplePairDeserializer( getElementReader() );
}
/**
* Method getElementReader returns the elementReader of this TupleSerialization object.
*
* @return the elementReader (type SerializationElementReader) of this TupleSerialization object.
*/
public SerializationElementReader getElementReader()
{
return new SerializationElementReader( this );
}
TupleDeserializer getTupleDeserializer()
{
return new TupleDeserializer( getElementReader() );
}
private TuplePairSerializer getTuplePairSerializer()
{
return new TuplePairSerializer( getElementWriter() );
}
IndexTupleDeserializer getIndexTupleDeserializer()
{
return new IndexTupleDeserializer( getElementReader() );
}
/**
* Method getElementWriter returns the elementWriter of this TupleSerialization object.
*
* @return the elementWriter (type SerializationElementWriter) of this TupleSerialization object.
*/
public SerializationElementWriter getElementWriter()
{
return new SerializationElementWriter( this );
}
private TupleSerializer getTupleSerializer()
{
return new TupleSerializer( getElementWriter() );
}
private IndexTupleSerializer getIndexTupleSerializer()
{
return new IndexTupleSerializer( getElementWriter() );
}
/**
* Method accept implements {@link Serialization#accept(Class)}.
*
* @param c of type Class
* @return boolean
*/
public boolean accept( Class c )
{
return Tuple.class == c || TuplePair.class == c || IndexTuple.class == c;
}
/**
* Method getDeserializer implements {@link Serialization#getDeserializer(Class)}.
*
* @param c of type Class
* @return Deserializer
*/
public Deserializer getDeserializer( Class c )
{
if( c == Tuple.class )
return getTupleDeserializer();
else if( c == TuplePair.class )
return getTuplePairDeserializer();
else if( c == IndexTuple.class )
return getIndexTupleDeserializer();
throw new IllegalArgumentException( "unknown class, cannot deserialize: " + c.getName() );
}
/**
* Method getSerializer implements {@link Serialization#getSerializer(Class)}.
*
* @param c of type Class
* @return Serializer
*/
public Serializer getSerializer( Class c )
{
if( c == Tuple.class )
return getTupleSerializer();
else if( c == TuplePair.class )
return getTuplePairSerializer();
else if( c == IndexTuple.class )
return getIndexTupleSerializer();
throw new IllegalArgumentException( "unknown class, cannot serialize: " + c.getName() );
}
public Class getClass( String className )
{
Class type = classCache.get( className );
if( type != null )
return type;
try
{
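// array class names (e.g. "[Ljava.lang.String;") cannot be resolved by ClassLoader#loadClass, so fall back to Class.forName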
if( className.charAt( 0 ) == '[' )
type = Class.forName( className, true, Thread.currentThread().getContextClassLoader() );
else
type = Thread.currentThread().getContextClassLoader().loadClass( className );
}
catch( ClassNotFoundException exception )
{
throw new TupleException( "unable to load class named: " + className, exception );
}
classCache.put( className, type );
return type;
}
}