/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.tap.hadoop;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import cascading.CascadingException;
import cascading.util.Util;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.jets3t.service.S3ServiceException;
/**
 * Class MultiInputFormat accepts multiple InputFormat class declarations allowing a single MR job
 * to read data from incompatible file types.
 * <p/>
 * For each child job, only the configuration key/value pairs that differ from the parent job are
 * captured (see {@link #getConfig(JobConf, JobConf)}); they are serialized into the parent job and
 * re-applied per split when the matching {@link RecordReader} is requested.
 */
public class MultiInputFormat implements InputFormat
  {
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( MultiInputFormat.class );

  /** Property key under which the serialized child configurations are stored in the parent job. */
  private static final String CASCADING_MULTI_INPUT_FORMATS = "cascading.multiinputformats";

  /**
   * Used to set the current JobConf with all sub jobs configurations.
   * <p/>
   * Installs this class as the parent job's InputFormat, accumulates all child input paths onto the
   * parent, and serializes each child's differing configuration values into the parent job.
   *
   * @param toJob    the parent JobConf receiving the merged configuration
   * @param fromJobs the child JobConf instances to fold into the parent
   * @throws CascadingException if the child configurations cannot be serialized
   */
  public static void addInputFormat( JobConf toJob, JobConf... fromJobs )
    {
    toJob.setInputFormat( MultiInputFormat.class );
    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();

    boolean isLocal = false;

    for( JobConf fromJob : fromJobs )
      {
      configs.add( getConfig( toJob, fromJob ) );
      Collections.addAll( allPaths, FileInputFormat.getInputPaths( fromJob ) );

      // null-safe comparison: "mapred.job.tracker" may be unset on a given child conf
      if( !isLocal )
        isLocal = "local".equalsIgnoreCase( fromJob.get( "mapred.job.tracker" ) );
      }

    FileInputFormat.setInputPaths( toJob, allPaths.toArray( new Path[ allPaths.size() ] ) );

    try
      {
      toJob.set( CASCADING_MULTI_INPUT_FORMATS, Util.serializeBase64( configs ) );
      }
    catch( IOException exception )
      {
      throw new CascadingException( "unable to pack input formats", exception );
      }

    // if any child runs locally, force the parent to run locally as well
    if( isLocal )
      toJob.set( "mapred.job.tracker", "local" );
    }

  /**
   * Returns the key/value pairs of {@code fromJob} that differ from {@code toJob}.
   * <p/>
   * Values equal to the parent's are dropped so only the child-specific overrides are retained.
   * The "mapred.working.dir" value is always excluded, so the parent's working directory is never
   * clobbered by a child configuration.
   *
   * @param toJob   the parent JobConf to diff against
   * @param fromJob the child JobConf supplying the values
   * @return a Map holding only the child's differing key/value pairs
   */
  public static Map<String, String> getConfig( JobConf toJob, JobConf fromJob )
    {
    Map<String, String> configs = new HashMap<String, String>();

    for( Map.Entry<String, String> entry : fromJob )
      configs.put( entry.getKey(), entry.getValue() );

    for( Map.Entry<String, String> entry : toJob )
      {
      if( entry.getValue() == null )
        continue;

      String value = configs.get( entry.getKey() );

      // drop values identical to the parent's, keeping only the diff
      if( value != null && value.equals( entry.getValue() ) )
        configs.remove( entry.getKey() );
      }

    // never propagate a working dir into the serialized child config
    configs.remove( "mapred.working.dir" );

    return configs;
    }

  /**
   * Reconstitutes one JobConf per stored child configuration by overlaying each diff onto a copy of
   * the given parent job.
   *
   * @param job     the parent JobConf to clone for each child
   * @param configs the per-child configuration diffs
   * @return JobConf[] one fully merged JobConf per child
   */
  public static JobConf[] getJobConfs( JobConf job, List<Map<String, String>> configs )
    {
    JobConf[] jobConfs = new JobConf[ configs.size() ];

    for( int i = 0; i < jobConfs.length; i++ )
      jobConfs[ i ] = mergeConf( job, configs.get( i ), false );

    return jobConfs;
    }

  /**
   * Applies the given key/value overrides to a JobConf.
   *
   * @param job      the base JobConf
   * @param config   the key/value pairs to apply
   * @param directly when true, mutates {@code job} in place; when false, merges into a copy
   * @return the JobConf holding the merged values
   */
  static JobConf mergeConf( JobConf job, Map<String, String> config, boolean directly )
    {
    JobConf currentConf = directly ? job : new JobConf( job );

    for( String key : config.keySet() )
      {
      if( LOG.isDebugEnabled() )
        LOG.debug( "merging key: " + key + " value: " + config.get( key ) );

      currentConf.set( key, config.get( key ) );
      }

    return currentConf;
    }

  /**
   * Returns the InputFormat instance configured on each of the given JobConf instances.
   *
   * @param jobConfs the per-child JobConf instances
   * @return InputFormat[] parallel to {@code jobConfs}
   */
  static InputFormat[] getInputFormats( JobConf[] jobConfs )
    {
    InputFormat[] inputFormats = new InputFormat[ jobConfs.length ];

    for( int i = 0; i < jobConfs.length; i++ )
      inputFormats[ i ] = jobConfs[ i ].getInputFormat();

    return inputFormats;
    }

  /**
   * Deserializes the child configuration diffs previously stored by
   * {@link #addInputFormat(JobConf, JobConf...)}.
   *
   * @param job the parent JobConf
   * @return the list of per-child configuration diffs
   * @throws IOException if deserialization fails
   */
  private List<Map<String, String>> getConfigs( JobConf job ) throws IOException
    {
    return (List<Map<String, String>>) Util.deserializeBase64( job.get( CASCADING_MULTI_INPUT_FORMATS ) );
    }

  public void validateInput( JobConf job ) throws IOException
    {
    // do nothing, is deprecated
    }

  /**
   * Method getSplits delegates to the appropriate InputFormat.
   * <p/>
   * With a single child, its own suggestion is returned unchanged. With multiple children and a
   * small split budget, each child contributes one split. Otherwise splits are allotted
   * proportionally to each child's reported split count, with every child guaranteed at least one.
   *
   * @param job       of type JobConf
   * @param numSplits of type int
   * @return InputSplit[]
   * @throws IOException when a child InputFormat fails to compute its splits
   */
  public InputSplit[] getSplits( JobConf job, int numSplits ) throws IOException
    {
    numSplits = numSplits == 0 ? 1 : numSplits;

    List<Map<String, String>> configs = getConfigs( job );
    JobConf[] jobConfs = getJobConfs( job, configs );
    InputFormat[] inputFormats = getInputFormats( jobConfs );

    // if only one InputFormat, just return what ever it suggests
    if( inputFormats.length == 1 )
      return collapse( getSplits( inputFormats, jobConfs, new int[]{numSplits} ), configs );

    int[] indexedSplits = new int[ inputFormats.length ];

    // if we need only a few, then return one for each
    if( numSplits <= inputFormats.length )
      {
      Arrays.fill( indexedSplits, 1 );
      return collapse( getSplits( inputFormats, jobConfs, indexedSplits ), configs );
      }

    // attempt to get splits proportionally sized per input format
    long[] inputSplitSizes = getInputSplitSizes( inputFormats, jobConfs, numSplits );
    long totalSplitSize = sum( inputSplitSizes );

    // no child reported any splits; fall back to one split per child
    if( totalSplitSize == 0 )
      {
      Arrays.fill( indexedSplits, 1 );
      return collapse( getSplits( inputFormats, jobConfs, indexedSplits ), configs );
      }

    for( int i = 0; i < inputSplitSizes.length; i++ )
      {
      int useSplits = (int) Math.ceil( (double) numSplits * inputSplitSizes[ i ] / (double) totalSplitSize );
      indexedSplits[ i ] = useSplits == 0 ? 1 : useSplits; // guarantee at least one split per child
      }

    return collapse( getSplits( inputFormats, jobConfs, indexedSplits ), configs );
    }

  /** Returns the sum of the given values. */
  private long sum( long[] inputSizes )
    {
    long size = 0;

    for( long inputSize : inputSizes )
      size += inputSize;

    return size;
    }

  /**
   * Flattens the per-child split arrays into a single array, wrapping each split in a
   * {@link MultiInputSplit} carrying its child's configuration diff.
   *
   * @param splits  per-child split arrays, parallel to {@code configs}
   * @param configs the per-child configuration diffs
   * @return InputSplit[] the flattened, wrapped splits
   */
  private InputSplit[] collapse( InputSplit[][] splits, List<Map<String, String>> configs )
    {
    List<InputSplit> splitsList = new ArrayList<InputSplit>();

    for( int i = 0; i < splits.length; i++ )
      {
      InputSplit[] split = splits[ i ];

      for( int j = 0; j < split.length; j++ )
        splitsList.add( new MultiInputSplit( split[ j ], configs.get( i ) ) );
      }

    return splitsList.toArray( new InputSplit[ splitsList.size() ] );
    }

  /**
   * Asks each child InputFormat for its splits, honoring the per-child split budget.
   *
   * @param inputFormats the child InputFormat instances
   * @param jobConfs     the merged per-child JobConf instances
   * @param numSplits    the requested split count for each child, parallel to {@code inputFormats}
   * @return InputSplit[][] one split array per child
   * @throws IOException when a child InputFormat fails to compute its splits
   */
  private InputSplit[][] getSplits( InputFormat[] inputFormats, JobConf[] jobConfs, int[] numSplits ) throws IOException
    {
    InputSplit[][] inputSplits = new InputSplit[ inputFormats.length ][];

    for( int i = 0; i < inputFormats.length; i++ )
      inputSplits[ i ] = inputFormats[ i ].getSplits( jobConfs[ i ], numSplits[ i ] );

    return inputSplits;
    }

  /**
   * Returns a relative "size" per child, measured as the number of splits each child would produce
   * for the given budget. Used to apportion the overall split budget proportionally.
   *
   * @param inputFormats the child InputFormat instances
   * @param jobConfs     the merged per-child JobConf instances
   * @param numSplits    the overall requested split count
   * @return long[] per-child split counts, parallel to {@code inputFormats}
   * @throws IOException when a child InputFormat fails to compute its splits
   */
  private long[] getInputSplitSizes( InputFormat[] inputFormats, JobConf[] jobConfs, int numSplits ) throws IOException
    {
    long[] inputSizes = new long[ inputFormats.length ];

    for( int i = 0; i < inputFormats.length; i++ )
      {
      InputFormat inputFormat = inputFormats[ i ];
      InputSplit[] splits = inputFormat.getSplits( jobConfs[ i ], numSplits );

      inputSizes[ i ] = splits.length;
      }

    return inputSizes;
    }

  /**
   * Method getRecordReader delegates to the appropriate InputFormat.
   * <p/>
   * The split's stored child configuration is merged directly into the given JobConf before
   * delegating. Transient S3 failures are retried a few times before giving up.
   *
   * @param split    of type InputSplit
   * @param job      of type JobConf
   * @param reporter of type Reporter
   * @return RecordReader
   * @throws IOException when the delegate InputFormat cannot open a reader
   */
  public RecordReader getRecordReader( InputSplit split, JobConf job, final Reporter reporter ) throws IOException
    {
    final MultiInputSplit multiSplit = (MultiInputSplit) split;
    final JobConf currentConf = mergeConf( job, multiSplit.config, true );

    try
      {
      // retry to accommodate transient S3 service failures; all other causes rethrow immediately
      return Util.retry( LOG, 3, 20, "unable to get record reader", new Util.RetryOperator<RecordReader>()
        {
        @Override
        public RecordReader operate() throws Exception
          {
          return currentConf.getInputFormat().getRecordReader( multiSplit.inputSplit, currentConf, reporter );
          }

        @Override
        public boolean rethrow( Exception exception )
          {
          return !( exception.getCause() instanceof S3ServiceException );
          }
        } );
      }
    catch( Exception exception )
      {
      if( exception instanceof RuntimeException )
        throw (RuntimeException) exception;
      else
        throw (IOException) exception;
      }
    }
  }