/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.flow; import java.beans.ConstructorProperties; import java.io.IOException; import java.util.HashMap; import java.util.Map; import cascading.scheme.Scheme; import cascading.tap.Hfs; import cascading.tap.Tap; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; import org.apache.log4j.Logger; /** * Class MapReduceFlow is a {@link Flow} subclass that supports custom MapReduce jobs pre-configured via the {@link JobConf} * object. * <p/> * Use this class to allow custom MapReduce jobs to participate in the {@link cascading.cascade.Cascade} scheduler. If * other Flow instances in the Cascade share resources with this Flow instance, all participants will be scheduled * according to their dependencies (topologically). * <p/> * Set the parameter {@code deleteSinkOnInit} to {@code true} if the outputPath in the jobConf should be deleted before executing the MapReduce job. */ public class MapReduceFlow extends Flow { /** Field LOG */ private static final Logger LOG = Logger.getLogger( MapReduceFlow.class ); /** Field deleteSinkOnInit */ private boolean deleteSinkOnInit = false; /** * Constructor MapReduceFlow creates a new MapReduceFlow instance. * * @param jobConf of type JobConf */ @ConstructorProperties({"jobConf"}) public MapReduceFlow( JobConf jobConf ) { this( jobConf.getJobName(), jobConf, false ); } /** * Constructor MapReduceFlow creates a new MapReduceFlow instance. * * @param jobConf of type JobConf * @param deleteSinkOnInit of type boolean */ @ConstructorProperties({"jobConf", "deleteSinkOnInit"}) public MapReduceFlow( JobConf jobConf, boolean deleteSinkOnInit ) { this( jobConf.getJobName(), jobConf, deleteSinkOnInit ); } /** * Constructor MapReduceFlow creates a new MapReduceFlow instance. * * @param name of type String * @param jobConf of type JobConf */ @ConstructorProperties({"name", "jobConf"}) public MapReduceFlow( String name, JobConf jobConf ) { this( name, jobConf, false ); } /** * Constructor MapReduceFlow creates a new MapReduceFlow instance. * * @param name of type String * @param jobConf of type JobConf * @param deleteSinkOnInit of type boolean */ @ConstructorProperties({"name", "jobConf", "deleteSinkOnInit"}) public MapReduceFlow( String name, JobConf jobConf, boolean deleteSinkOnInit ) { this( name, jobConf, deleteSinkOnInit, true ); } /** * Constructor MapReduceFlow creates a new MapReduceFlow instance. * * @param name of type String * @param jobConf of type JobConf * @param deleteSinkOnInit of type boolean * @param stopJobsOnExit of type boolean */ @ConstructorProperties({"name", "jobConf", "deleteSinkOnInit", "stopJobsOnExit"}) public MapReduceFlow( String name, JobConf jobConf, boolean deleteSinkOnInit, boolean stopJobsOnExit ) { this.deleteSinkOnInit = deleteSinkOnInit; this.stopJobsOnExit = stopJobsOnExit; setName( name ); setSources( createSources( jobConf ) ); setSinks( createSinks( jobConf ) ); setTraps( createTraps( jobConf ) ); setStepGraph( makeStepGraph( jobConf ) ); } private StepGraph makeStepGraph( JobConf jobConf ) { StepGraph stepGraph = new StepGraph(); Tap sink = getSinksCollection().iterator().next(); FlowStep step = new MapReduceFlowStep( sink.toString(), jobConf, sink ); step.setParentFlowName( getName() ); stepGraph.addVertex( step ); return stepGraph; } private Map<String, Tap> createSources( JobConf jobConf ) { Path[] paths = FileInputFormat.getInputPaths( jobConf ); Map<String, Tap> taps = new HashMap<String, Tap>(); for( Path path : paths ) taps.put( path.toString(), new Hfs( new NullScheme(), path.toString() ) ); return taps; } private Map<String, Tap> createSinks( JobConf jobConf ) { Map<String, Tap> taps = new HashMap<String, Tap>(); String path = FileOutputFormat.getOutputPath( jobConf ).toString(); taps.put( path, new Hfs( new NullScheme(), path, deleteSinkOnInit ) ); return taps; } private Map<String, Tap> createTraps( JobConf jobConf ) { return new HashMap<String, Tap>(); } class NullScheme extends Scheme { public void sourceInit( Tap tap, JobConf conf ) throws IOException { } public void sinkInit( Tap tap, JobConf conf ) throws IOException { } public Tuple source( Object key, Object value ) { if( value instanceof Comparable ) return new Tuple( (Comparable) key, (Comparable) value ); else return new Tuple( (Comparable) key ); } @Override public String toString() { return getClass().getSimpleName(); } public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException { throw new UnsupportedOperationException( "sinking is not supported in the scheme" ); } } }