/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.flow;
import java.beans.ConstructorProperties;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import cascading.scheme.Scheme;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.log4j.Logger;
/**
* Class MapReduceFlow is a {@link Flow} subclass that supports custom MapReduce jobs pre-configured via the {@link JobConf}
* object.
* <p/>
* Use this class to allow custom MapReduce jobs to participate in the {@link cascading.cascade.Cascade} scheduler. If
* other Flow instances in the Cascade share resources with this Flow instance, all participants will be scheduled
* according to their dependencies (topologically).
* <p/>
* Set the parameter {@code deleteSinkOnInit} to {@code true} if the outputPath in the jobConf should be deleted before executing the MapReduce job.
*/
public class MapReduceFlow extends Flow
  {
  /** Field LOG - retained for diagnostics; this class emits no log output itself. */
  private static final Logger LOG = Logger.getLogger( MapReduceFlow.class );

  /** Field deleteSinkOnInit - when true, the configured output path is deleted before the job executes. */
  private boolean deleteSinkOnInit = false;

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance. The flow name is taken
   * from the job name, and the sink is not deleted on initialization.
   *
   * @param jobConf of type JobConf
   */
  @ConstructorProperties({"jobConf"})
  public MapReduceFlow( JobConf jobConf )
    {
    this( jobConf.getJobName(), jobConf, false );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance. The flow name is taken
   * from the job name.
   *
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean, true if the output path should be deleted before the job executes
   */
  @ConstructorProperties({"jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( jobConf.getJobName(), jobConf, deleteSinkOnInit );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance. The sink is not deleted
   * on initialization.
   *
   * @param name    of type String
   * @param jobConf of type JobConf
   */
  @ConstructorProperties({"name", "jobConf"})
  public MapReduceFlow( String name, JobConf jobConf )
    {
    this( name, jobConf, false );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance. Jobs are stopped on exit
   * by default.
   *
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean, true if the output path should be deleted before the job executes
   */
  @ConstructorProperties({"name", "jobConf", "deleteSinkOnInit"})
  public MapReduceFlow( String name, JobConf jobConf, boolean deleteSinkOnInit )
    {
    this( name, jobConf, deleteSinkOnInit, true );
    }

  /**
   * Constructor MapReduceFlow creates a new MapReduceFlow instance. Sources, sinks, and the
   * single-step graph are derived from the input and output paths already configured on the
   * given {@link JobConf}.
   *
   * @param name             of type String
   * @param jobConf          of type JobConf
   * @param deleteSinkOnInit of type boolean, true if the output path should be deleted before the job executes
   * @param stopJobsOnExit   of type boolean, true if the underlying job should be stopped when the JVM exits
   */
  @ConstructorProperties({"name", "jobConf", "deleteSinkOnInit", "stopJobsOnExit"})
  public MapReduceFlow( String name, JobConf jobConf, boolean deleteSinkOnInit, boolean stopJobsOnExit )
    {
    this.deleteSinkOnInit = deleteSinkOnInit;
    this.stopJobsOnExit = stopJobsOnExit;

    setName( name );
    setSources( createSources( jobConf ) );
    setSinks( createSinks( jobConf ) );
    setTraps( createTraps( jobConf ) );
    setStepGraph( makeStepGraph( jobConf ) );
    }

  /**
   * Method makeStepGraph wraps the pre-configured job in a single {@link MapReduceFlowStep}.
   *
   * @param jobConf of type JobConf
   * @return StepGraph containing exactly one step
   */
  private StepGraph makeStepGraph( JobConf jobConf )
    {
    StepGraph stepGraph = new StepGraph();

    // createSinks() guarantees exactly one sink, keyed by the job's output path
    Tap sink = getSinksCollection().iterator().next();
    FlowStep step = new MapReduceFlowStep( sink.toString(), jobConf, sink );

    step.setParentFlowName( getName() );

    stepGraph.addVertex( step );

    return stepGraph;
    }

  /**
   * Method createSources creates one {@link Hfs} source tap per input path configured on the job.
   *
   * @param jobConf of type JobConf
   * @return Map of path string to Tap, possibly empty if no input paths are set
   */
  private Map<String, Tap> createSources( JobConf jobConf )
    {
    Path[] paths = FileInputFormat.getInputPaths( jobConf );

    Map<String, Tap> taps = new HashMap<String, Tap>();

    for( Path path : paths )
      taps.put( path.toString(), new Hfs( new NullScheme(), path.toString() ) );

    return taps;
    }

  /**
   * Method createSinks creates a single {@link Hfs} sink tap for the job's configured output path.
   *
   * @param jobConf of type JobConf
   * @return Map of path string to Tap, containing exactly one entry
   * @throws IllegalArgumentException if the given JobConf has no output path set
   */
  private Map<String, Tap> createSinks( JobConf jobConf )
    {
    Map<String, Tap> taps = new HashMap<String, Tap>();

    Path outputPath = FileOutputFormat.getOutputPath( jobConf );

    // fail fast with a clear message instead of a bare NPE when the job has no output path
    if( outputPath == null )
      throw new IllegalArgumentException( "jobConf does not have an output path set, see FileOutputFormat.setOutputPath()" );

    String path = outputPath.toString();

    taps.put( path, new Hfs( new NullScheme(), path, deleteSinkOnInit ) );

    return taps;
    }

  /**
   * Method createTraps returns no traps; a pre-configured MapReduce job has none.
   *
   * @param jobConf of type JobConf, unused
   * @return an empty, mutable Map
   */
  private Map<String, Tap> createTraps( JobConf jobConf )
    {
    return new HashMap<String, Tap>();
    }

  /**
   * Class NullScheme is a pass-through placeholder so {@link Hfs} taps can represent the paths
   * of the externally managed MapReduce job. It performs no source/sink initialization and
   * cannot sink tuples.
   * <p/>
   * Declared static so instances do not capture a hidden reference to the enclosing
   * MapReduceFlow; Schemes are serialized into the job configuration, and the implicit
   * enclosing-instance reference of an inner class would be dragged along with them.
   */
  static class NullScheme extends Scheme
    {
    public void sourceInit( Tap tap, JobConf conf ) throws IOException
      {
      }

    public void sinkInit( Tap tap, JobConf conf ) throws IOException
      {
      }

    public Tuple source( Object key, Object value )
      {
      // values emitted by some input formats (e.g. raw bytes) may not be Comparable; skip them
      if( value instanceof Comparable )
        return new Tuple( (Comparable) key, (Comparable) value );
      else
        return new Tuple( (Comparable) key );
      }

    @Override
    public String toString()
      {
      return getClass().getSimpleName();
      }

    public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException
      {
      throw new UnsupportedOperationException( "sinking is not supported in the scheme" );
      }
    }
  }