/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap.hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import cascading.CascadingException;
import cascading.util.Util;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.jets3t.service.S3ServiceException;

/**
 * Class MultiInputFormat accepts multiple InputFormat class declarations allowing a single MR job
 * to read data from incompatible file types.
 * <p/>
 * Each child job contributes the subset of its configuration that differs from the parent job. These
 * per-child maps are stored on the parent job under the {@code cascading.multiinputformats} property and
 * are re-applied when splits and record readers are requested.
 */
public class MultiInputFormat implements InputFormat
{
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( MultiInputFormat.class );

  /**
   * Used to set the current JobConf with all sub jobs configurations.
   * <p/>
   * Sets this class as the input format of {@code toJob}, unions the input paths of every child job
   * onto it, and stores the per-child configuration differences (see {@link #getConfig(JobConf, JobConf)}).
   * If any child job declares {@code mapred.job.tracker} as {@code local}, the parent job is forced to
   * {@code local} as well.
   *
   * @param toJob    the job that will be configured to use MultiInputFormat
   * @param fromJobs the child jobs whose input formats and paths are being combined
   * @throws CascadingException if the child configurations cannot be serialized
   */
  public static void addInputFormat( JobConf toJob, JobConf... fromJobs )
  {
    toJob.setInputFormat( MultiInputFormat.class );

    List<Map<String, String>> configs = new ArrayList<Map<String, String>>();
    List<Path> allPaths = new ArrayList<Path>();
    boolean isLocal = false;

    for( JobConf fromJob : fromJobs )
    {
      configs.add( getConfig( toJob, fromJob ) );
      Collections.addAll( allPaths, FileInputFormat.getInputPaths( fromJob ) );

      // constant-first comparison so an unset property (null) is treated as "not local"
      // instead of raising a NullPointerException
      if( !isLocal )
        isLocal = "local".equalsIgnoreCase( fromJob.get( "mapred.job.tracker" ) );
    }

    FileInputFormat.setInputPaths( toJob, allPaths.toArray( new Path[allPaths.size()] ) );

    try
    {
      toJob.set( "cascading.multiinputformats", Util.serializeBase64( configs ) );
    }
    catch( IOException exception )
    {
      throw new CascadingException( "unable to pack input formats", exception );
    }

    if( isLocal )
      toJob.set( "mapred.job.tracker", "local" );
  }

  /**
   * Method getConfig returns the properties of {@code fromJob} that are not already present with the
   * same value in {@code toJob}.
   * <p/>
   * The {@code mapred.working.dir} property is never included in the result.
   *
   * @param toJob   the parent job used as the baseline
   * @param fromJob the child job whose distinguishing properties are wanted
   * @return a new, mutable Map of the differing property names to the child's values
   */
  public static Map<String, String> getConfig( JobConf toJob, JobConf fromJob )
  {
    Map<String, String> configs = new HashMap<String, String>();

    for( Map.Entry<String, String> entry : fromJob )
      configs.put( entry.getKey(), entry.getValue() );

    for( Map.Entry<String, String> entry : toJob )
    {
      String parentValue = entry.getValue();

      if( parentValue == null )
        continue;

      // drop anything the parent already declares identically, only differences are kept
      if( parentValue.equals( configs.get( entry.getKey() ) ) )
        configs.remove( entry.getKey() );
    }

    // removed once, unconditionally, rather than on every iteration of the loop above
    configs.remove( "mapred.working.dir" );

    return configs;
  }

  /**
   * Method getJobConfs creates one JobConf per child configuration by layering each configuration
   * over a copy of the given job. The given job is not modified.
   *
   * @param job     the parent job
   * @param configs the per-child property maps
   * @return an array of JobConf instances, in the same order as {@code configs}
   */
  public static JobConf[] getJobConfs( JobConf job, List<Map<String, String>> configs )
  {
    JobConf[] jobConfs = new JobConf[configs.size()];

    for( int i = 0; i < jobConfs.length; i++ )
      jobConfs[ i ] = mergeConf( job, configs.get( i ), false );

    return jobConfs;
  }

  /**
   * Method mergeConf sets every property in {@code config} on a JobConf.
   *
   * @param job      the base job
   * @param config   the properties to apply
   * @param directly when true the given {@code job} itself is modified and returned, otherwise a copy
   *                 is created, modified, and returned
   * @return the JobConf the properties were applied to
   */
  static JobConf mergeConf( JobConf job, Map<String, String> config, boolean directly )
  {
    JobConf currentConf = directly ? job : new JobConf( job );
    boolean isDebug = LOG.isDebugEnabled();

    for( Map.Entry<String, String> entry : config.entrySet() )
    {
      if( isDebug )
        LOG.debug( "merging key: " + entry.getKey() + " value: " + entry.getValue() );

      currentConf.set( entry.getKey(), entry.getValue() );
    }

    return currentConf;
  }

  /**
   * Method getInputFormats returns the InputFormat instance declared by each of the given jobs.
   *
   * @param jobConfs the jobs to query
   * @return an array of InputFormat instances, in the same order as {@code jobConfs}
   */
  static InputFormat[] getInputFormats( JobConf[] jobConfs )
  {
    InputFormat[] inputFormats = new InputFormat[jobConfs.length];

    for( int i = 0; i < jobConfs.length; i++ )
      inputFormats[ i ] = jobConfs[ i ].getInputFormat();

    return inputFormats;
  }

  /**
   * Method getConfigs reads back the per-child property maps stored by
   * {@link #addInputFormat(JobConf, JobConf...)}.
   *
   * @param job the job holding the {@code cascading.multiinputformats} property
   * @return the per-child property maps
   * @throws IOException if the stored value cannot be deserialized
   */
  @SuppressWarnings("unchecked") // the stored value is always written by addInputFormat with this type
  private List<Map<String, String>> getConfigs( JobConf job ) throws IOException
  {
    return (List<Map<String, String>>) Util.deserializeBase64( job.get( "cascading.multiinputformats" ) );
  }

  /**
   * Method validateInput does nothing.
   *
   * @param job of type JobConf
   * @throws IOException never thrown by this implementation
   */
  public void validateInput( JobConf job ) throws IOException
  {
    // do nothing, is deprecated
  }

  /**
   * Method getSplits delegates to the appropriate InputFormat.
   * <p/>
   * With a single child format the requested number of splits is passed straight through. With
   * several, each child is asked for a share of {@code numSplits} proportional to the number of
   * splits it returned when asked for {@code numSplits} on its own, and never fewer than one.
   * Every returned split is wrapped in a {@link MultiInputSplit} carrying its child configuration.
   *
   * @param job       of type JobConf
   * @param numSplits of type int, the requested number of splits, zero is treated as one
   * @return InputSplit[]
   * @throws IOException when the stored configurations cannot be read or a child InputFormat fails
   */
  public InputSplit[] getSplits( JobConf job, int numSplits ) throws IOException
  {
    numSplits = numSplits == 0 ? 1 : numSplits;

    List<Map<String, String>> configs = getConfigs( job );
    JobConf[] jobConfs = getJobConfs( job, configs );
    InputFormat[] inputFormats = getInputFormats( jobConfs );

    // if only one InputFormat, just return what ever it suggests
    if( inputFormats.length == 1 )
      return collapse( getSplits( inputFormats, jobConfs, new int[]{numSplits} ), configs );

    int[] indexedSplits = new int[inputFormats.length];
    Arrays.fill( indexedSplits, 1 );

    // if we need only a few, then return one for each
    boolean proportional = numSplits > inputFormats.length;

    if( proportional )
    {
      // attempt to get splits proportionally sized per input format
      long[] inputSplitSizes = getInputSplitSizes( inputFormats, jobConfs, numSplits );
      long totalSplitSize = sum( inputSplitSizes );

      // when nothing reported any splits there is no ratio to compute, keep one per format
      if( totalSplitSize != 0 )
      {
        for( int i = 0; i < inputSplitSizes.length; i++ )
        {
          int useSplits = (int) Math.ceil( (double) numSplits * inputSplitSizes[ i ] / (double) totalSplitSize );

          indexedSplits[ i ] = Math.max( 1, useSplits );
        }
      }
    }

    return collapse( getSplits( inputFormats, jobConfs, indexedSplits ), configs );
  }

  /**
   * Method sum adds up the given values.
   *
   * @param inputSizes the values to total
   * @return the total, zero for an empty array
   */
  private long sum( long[] inputSizes )
  {
    long size = 0;

    for( long inputSize : inputSizes )
      size += inputSize;

    return size;
  }

  /**
   * Method collapse flattens the per-format split arrays into a single array, wrapping each split in
   * a MultiInputSplit together with the configuration of the format that produced it.
   *
   * @param splits  the splits, indexed by input format
   * @param configs the per-child property maps, indexed like {@code splits}
   * @return the flattened array of MultiInputSplit instances
   */
  private InputSplit[] collapse( InputSplit[][] splits, List<Map<String, String>> configs )
  {
    List<InputSplit> splitsList = new ArrayList<InputSplit>();

    for( int i = 0; i < splits.length; i++ )
    {
      Map<String, String> config = configs.get( i );

      for( InputSplit split : splits[ i ] )
        splitsList.add( new MultiInputSplit( split, config ) );
    }

    return splitsList.toArray( new InputSplit[splitsList.size()] );
  }

  /**
   * Method getSplits asks each InputFormat for the number of splits given at the same index.
   *
   * @param inputFormats the child input formats
   * @param jobConfs     the child jobs, indexed like {@code inputFormats}
   * @param numSplits    the requested split count per format, indexed like {@code inputFormats}
   * @return the splits returned by each format, indexed like {@code inputFormats}
   * @throws IOException when a child InputFormat fails
   */
  private InputSplit[][] getSplits( InputFormat[] inputFormats, JobConf[] jobConfs, int[] numSplits ) throws IOException
  {
    InputSplit[][] inputSplits = new InputSplit[inputFormats.length][];

    for( int i = 0; i < inputFormats.length; i++ )
      inputSplits[ i ] = inputFormats[ i ].getSplits( jobConfs[ i ], numSplits[ i ] );

    return inputSplits;
  }

  /**
   * Method getInputSplitSizes returns, for each InputFormat, how many splits it produces when asked
   * for {@code numSplits}. The counts are used as relative weights by
   * {@link #getSplits(JobConf, int)}.
   *
   * @param inputFormats the child input formats
   * @param jobConfs     the child jobs, indexed like {@code inputFormats}
   * @param numSplits    the split count to request from every format
   * @return the number of splits returned by each format
   * @throws IOException when a child InputFormat fails
   */
  private long[] getInputSplitSizes( InputFormat[] inputFormats, JobConf[] jobConfs, int numSplits ) throws IOException
  {
    long[] inputSizes = new long[inputFormats.length];

    for( int i = 0; i < inputFormats.length; i++ )
      inputSizes[ i ] = inputFormats[ i ].getSplits( jobConfs[ i ], numSplits ).length;

    return inputSizes;
  }

  /**
   * Method getRecordReader delegates to the appropriate InputFormat.
   * <p/>
   * The configuration carried by the split is merged directly into the given {@code job} (the job is
   * modified), and the resulting job's InputFormat is asked for the reader. The call is made through
   * {@code Util.retry}.
   * NOTE(review): the arguments {@code 3} and {@code 20} are presumably the retry count and the delay
   * between attempts, and {@code rethrow} presumably stops retrying for failures not caused by an
   * S3ServiceException -- confirm against {@code Util.retry}.
   *
   * @param split    of type InputSplit, must be a MultiInputSplit
   * @param job      of type JobConf
   * @param reporter of type Reporter
   * @return RecordReader
   * @throws IOException when the underlying InputFormat fails, any other checked exception raised
   *                     while retrying is wrapped in an IOException
   */
  public RecordReader getRecordReader( InputSplit split, JobConf job, final Reporter reporter ) throws IOException
  {
    final MultiInputSplit multiSplit = (MultiInputSplit) split;
    final JobConf currentConf = mergeConf( job, multiSplit.config, true );

    try
    {
      return Util.retry( LOG, 3, 20, "unable to get record reader", new Util.RetryOperator<RecordReader>()
      {
        @Override
        public RecordReader operate() throws Exception
        {
          return currentConf.getInputFormat().getRecordReader( multiSplit.inputSplit, currentConf, reporter );
        }

        @Override
        public boolean rethrow( Exception exception )
        {
          return !( exception.getCause() instanceof S3ServiceException );
        }
      } );
    }
    catch( Exception exception )
    {
      if( exception instanceof RuntimeException )
        throw (RuntimeException) exception;

      if( exception instanceof IOException )
        throw (IOException) exception;

      // any other checked exception cannot be cast to IOException, wrap it and keep the cause
      IOException wrapped = new IOException( "unable to get record reader: " + exception.getMessage() );

      wrapped.initCause( exception );

      throw wrapped;
    }
  }
}