/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.flow;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import cascading.operation.Operation;
import cascading.pipe.Group;
import cascading.pipe.Operator;
import cascading.pipe.Pipe;
import cascading.tap.Tap;
import cascading.tap.TempHfs;
import cascading.tap.hadoop.Hadoop18TapUtil;
import cascading.tap.hadoop.MultiInputFormat;
import cascading.tap.hadoop.TapIterator;
import cascading.tuple.Fields;
import cascading.tuple.IndexTuple;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TuplePair;
import cascading.tuple.hadoop.CoGroupingComparator;
import cascading.tuple.hadoop.CoGroupingPartitioner;
import cascading.tuple.hadoop.GroupingComparator;
import cascading.tuple.hadoop.GroupingPartitioner;
import cascading.tuple.hadoop.GroupingSortingComparator;
import cascading.tuple.hadoop.IndexTupleCoGroupingComparator;
import cascading.tuple.hadoop.ReverseGroupingSortingComparator;
import cascading.tuple.hadoop.ReverseTupleComparator;
import cascading.tuple.hadoop.TupleComparator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.jgrapht.graph.SimpleDirectedGraph;
/**
* Class FlowStep is an internal representation of a given Job to be executed on a remote cluster. During
* planning, pipe assemblies are broken down into "steps" and encapsulated in this class.
* <p/>
* FlowSteps are submitted in order of dependency. If two or more steps do not share the same dependencies and all
* can be scheduled simultaneously, the {@link #getSubmitPriority()} value determines the order in which
* all steps will be submitted for execution. The default submit priority is 5.
* <p/>
* This class is for internal use, there are no stable public methods.
*/
public class FlowStep implements Serializable
  {
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( FlowStep.class );

  /** Optional per-step properties layered onto the parent JobConf; lazily created, may be null. */
  private Map<Object, Object> properties = null;
  /** Name of the parent Flow, used for the job name and log prefixes. */
  private String parentFlowName;
  /** Submit priority: 1 is highest, 10 is lowest, 5 is the default. */
  private int submitPriority = 5;
  /** Step name; also the sole basis for {@link #equals(Object)} and {@link #hashCode()}. */
  String name;
  /** Step id, unique within the parent flow; published to tasks via "cascading.flow.step.id". */
  private int id;
  /** Element graph of this step; vertices are FlowElements, edges are Scopes. */
  final SimpleDirectedGraph<FlowElement, Scope> graph = new SimpleDirectedGraph<FlowElement, Scope>( Scope.class );
  /** All source taps mapped to their logical names. All sources and the sink must have the same scheme. */
  final Map<Tap, String> sources = new HashMap<Tap, String>();
  /** The single sink tap of this step. */
  protected Tap sink;
  /** Map-side trap taps, keyed by pipe name. */
  private final Map<String, Tap> mapperTraps = new HashMap<String, Tap>();
  /** Reduce-side trap taps, keyed by pipe name. */
  private final Map<String, Tap> reducerTraps = new HashMap<String, Tap>();
  /** Temporary sink, used only if the real sink must be bypassed (it is writeDirect). */
  TempHfs tempSink;
  /** The single Group (GroupBy or CoGroup) of this step, or null for a map-only job. */
  private Group group;

  protected FlowStep( String name, int id )
    {
    this.name = name;
    this.id = id;
    }

  /**
   * Method getID returns the id of this FlowStep object.
   *
   * @return the id (type int) of this FlowStep object.
   */
  public int getID()
    {
    return id;
    }

  /**
   * Method getName returns the name of this FlowStep object.
   *
   * @return the name (type String) of this FlowStep object.
   */
  public String getName()
    {
    return name;
    }

  /**
   * Method setName sets the name of this FlowStep object.
   *
   * @param name the name of this FlowStep object, may not be null or empty
   * @throws IllegalArgumentException if name is null or empty
   */
  public void setName( String name )
    {
    if( name == null || name.isEmpty() )
      throw new IllegalArgumentException( "step name may not be null or empty" );

    this.name = name;
    }

  /**
   * Method getParentFlowName returns the parentFlowName of this FlowStep object.
   *
   * @return the parentFlowName (type String) of this FlowStep object.
   */
  public String getParentFlowName()
    {
    return parentFlowName;
    }

  /**
   * Method setParentFlowName sets the parentFlowName of this FlowStep object.
   *
   * @param parentFlowName the parentFlowName of this FlowStep object.
   */
  public void setParentFlowName( String parentFlowName )
    {
    this.parentFlowName = parentFlowName;
    }

  /**
   * Method getStepName returns the stepName of this FlowStep object, of the
   * form "parentFlowName[stepName]"; used as the Hadoop job name.
   *
   * @return the stepName (type String) of this FlowStep object.
   */
  public String getStepName()
    {
    return String.format( "%s[%s]", getParentFlowName(), getName() );
    }

  /**
   * Method getSubmitPriority returns the submitPriority of this FlowStep object.
   * <p/>
   * 10 is lowest, 1 is the highest, 5 is the default.
   *
   * @return the submitPriority (type int) of this FlowStep object.
   */
  public int getSubmitPriority()
    {
    return submitPriority;
    }

  /**
   * Method setSubmitPriority sets the submitPriority of this FlowStep object.
   * <p/>
   * 10 is lowest, 1 is the highest, 5 is the default.
   *
   * @param submitPriority the submitPriority of this FlowStep object.
   */
  public void setSubmitPriority( int submitPriority )
    {
    this.submitPriority = submitPriority;
    }

  /** @return the single Group of this step, or null if this is a map-only step. */
  public Group getGroup()
    {
    return group;
    }

  protected void setGroup( Group group )
    {
    this.group = group;
    }

  /** @return the map-side trap taps, keyed by pipe name; never null, possibly empty. */
  public Map<String, Tap> getMapperTraps()
    {
    return mapperTraps;
    }

  /** @return the reduce-side trap taps, keyed by pipe name; never null, possibly empty. */
  public Map<String, Tap> getReducerTraps()
    {
    return reducerTraps;
    }

  /**
   * Method getProperties returns the properties of this FlowStep object, creating
   * an empty Properties instance on first access.
   *
   * @return the properties (type Map&lt;Object, Object&gt;) of this FlowStep object.
   */
  public Map<Object, Object> getProperties()
    {
    if( properties == null )
      properties = new Properties();

    return properties;
    }

  /**
   * Method setProperties sets the properties of this FlowStep object.
   *
   * @param properties the properties of this FlowStep object.
   */
  public void setProperties( Map<Object, Object> properties )
    {
    this.properties = properties;
    }

  /**
   * Method hasProperties returns {@code true} if there are properties associated with this FlowStep.
   *
   * @return boolean
   */
  public boolean hasProperties()
    {
    return properties != null && !properties.isEmpty();
    }

  protected JobConf getJobConf() throws IOException
    {
    return getJobConf( null );
    }

  /**
   * Method getJobConf builds the fully configured JobConf for this step, layering
   * step properties, source/sink/trap initialization, and grouping/sorting
   * comparators onto the given parent configuration.
   * <p/>
   * Note the configuration calls below are order sensitive: later
   * setOutputKeyComparatorClass calls intentionally override earlier ones.
   *
   * @param parentConf the parent JobConf to clone, may be null
   * @return a new JobConf ready for submission
   * @throws IOException when tap initialization or serialization fails
   */
  protected JobConf getJobConf( JobConf parentConf ) throws IOException
    {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf( parentConf );

    // set values first so they can't break things downstream
    if( hasProperties() )
      {
      for( Map.Entry<Object, Object> entry : getProperties().entrySet() )
        conf.set( entry.getKey().toString(), entry.getValue().toString() );
      }

    // disable warning
    conf.setBoolean( "mapred.used.genericoptionsparser", true );

    conf.setJobName( getStepName() );

    conf.setOutputKeyClass( Tuple.class );
    conf.setOutputValueClass( Tuple.class );

    conf.setMapperClass( FlowMapper.class );
    conf.setReducerClass( FlowReducer.class );

    // set for use by the shuffling phase
    TupleSerialization.setSerializations( conf );

    initFromSources( conf );
    initFromSink( conf );
    initFromTraps( conf );

    if( sink.getScheme().getNumSinkParts() != 0 )
      {
      // with a reducer, parts are controlled by reduce tasks; without one, by map tasks
      if( getGroup() != null )
        conf.setNumReduceTasks( sink.getScheme().getNumSinkParts() );
      else
        conf.setNumMapTasks( sink.getScheme().getNumSinkParts() );
      }

    conf.setOutputKeyComparatorClass( TupleComparator.class );

    if( getGroup() == null )
      {
      conf.setNumReduceTasks( 0 ); // disable reducers
      }
    else
      {
      // must set map output defaults when performing a reduce
      conf.setMapOutputKeyClass( Tuple.class );
      conf.setMapOutputValueClass( Tuple.class );

      // handles the case the groupby sort should be reversed
      if( getGroup().isSortReversed() )
        conf.setOutputKeyComparatorClass( ReverseTupleComparator.class );

      addComparators( conf, "cascading.group.comparator", getGroup().getGroupingSelectors() );

      if( getGroup().isGroupBy() )
        addComparators( conf, "cascading.sort.comparator", getGroup().getSortingSelectors() );

      if( !getGroup().isGroupBy() )
        {
        conf.setPartitionerClass( CoGroupingPartitioner.class );
        conf.setMapOutputKeyClass( IndexTuple.class ); // allows groups to be sorted by index
        conf.setMapOutputValueClass( IndexTuple.class );
        conf.setOutputKeyComparatorClass( IndexTupleCoGroupingComparator.class ); // sorts by group, then by index
        conf.setOutputValueGroupingComparator( CoGroupingComparator.class );
        }

      if( getGroup().isSorted() )
        {
        conf.setPartitionerClass( GroupingPartitioner.class );
        conf.setMapOutputKeyClass( TuplePair.class );

        if( getGroup().isSortReversed() )
          conf.setOutputKeyComparatorClass( ReverseGroupingSortingComparator.class );
        else
          conf.setOutputKeyComparatorClass( GroupingSortingComparator.class );

        // no need to supply a reverse comparator, only equality is checked
        conf.setOutputValueGroupingComparator( GroupingComparator.class );
        }
      }

    // perform last so init above will pass to tasks
    conf.setInt( "cascading.flow.step.id", id );
    conf.set( "cascading.flow.step", Util.serializeBase64( this ) );

    return conf;
    }

  /**
   * Serializes the field comparators under the given property when present;
   * otherwise records the resolved field size so downstream comparators
   * can be sized correctly.
   *
   * @param conf     the JobConf to populate
   * @param property the property key ("cascading.group.comparator" or "cascading.sort.comparator")
   * @param map      the grouping or sorting selectors of the current Group
   * @throws IOException when serialization fails
   */
  private void addComparators( JobConf conf, String property, Map<String, Fields> map ) throws IOException
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    Fields fields = fieldsIterator.next();

    if( fields.hasComparators() )
      {
      conf.set( property, Util.serializeBase64( fields ) );
      return;
      }

    // use resolved fields if there are no comparators.
    Set<Scope> previousScopes = getPreviousScopes( getGroup() );

    fields = previousScopes.iterator().next().getOutValuesFields();

    if( fields.size() != 0 ) // allows fields.UNKNOWN to be used
      conf.setInt( property + ".size", fields.size() );
    }

  private void initFromTraps( JobConf conf ) throws IOException
    {
    initFromTraps( conf, getMapperTraps() );
    initFromTraps( conf, getReducerTraps() );
    }

  private void initFromTraps( JobConf conf, Map<String, Tap> traps ) throws IOException
    {
    if( !traps.isEmpty() )
      {
      // sink-init traps against a copy so they cannot pollute the job conf
      JobConf trapConf = new JobConf( conf );

      for( Tap tap : traps.values() )
        tap.sinkInit( trapConf );
      }
    }

  private void initFromSources( JobConf conf ) throws IOException
    {
    // each source gets its own conf copy so per-tap settings stay isolated,
    // then all are merged via MultiInputFormat
    JobConf[] fromJobs = new JobConf[ sources.size() ];
    int i = 0;

    for( Tap tap : sources.keySet() )
      {
      fromJobs[ i ] = new JobConf( conf );
      tap.sourceInit( fromJobs[ i ] );
      fromJobs[ i ].set( "cascading.step.source", Util.serializeBase64( tap ) );
      i++;
      }

    MultiInputFormat.addInputFormat( conf, fromJobs );
    }

  private void initFromSink( JobConf conf ) throws IOException
    {
    // init sink first so tempSink can take precedence
    if( sink != null )
      sink.sinkInit( conf );

    // tempSink exists because sink is writeDirect
    if( tempSink != null )
      tempSink.sinkInit( conf );
    }

  /**
   * Opens the first source tap of this step for reading.
   *
   * @param conf of type JobConf
   * @return a TapIterator over the source tuples
   * @throws IOException when the tap cannot be opened
   */
  public TapIterator openSourceForRead( JobConf conf ) throws IOException
    {
    return new TapIterator( sources.keySet().iterator().next(), conf );
    }

  /**
   * Opens the sink tap of this step for reading.
   *
   * @param conf of type JobConf
   * @return a TupleEntryIterator over the sink tuples
   * @throws IOException when the tap cannot be opened
   */
  public TupleEntryIterator openSinkForRead( JobConf conf ) throws IOException
    {
    return sink.openForRead( conf );
    }

  /** @return the map-side trap for the named pipe, or null if none. */
  public Tap getMapperTrap( String name )
    {
    return getMapperTraps().get( name );
    }

  /** @return the reduce-side trap for the named pipe, or null if none. */
  public Tap getReducerTrap( String name )
    {
    return getReducerTraps().get( name );
    }

  /**
   * Method getPreviousScopes returns the previous Scope instances. If the flowElement is a Group (specifically a CoGroup),
   * there will be more than one instance.
   *
   * @param flowElement of type FlowElement
   * @return Set&lt;Scope&gt;
   */
  public Set<Scope> getPreviousScopes( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    return graph.incomingEdgesOf( flowElement );
    }

  /**
   * Method getNextScope returns the next Scope instance in the graph. There will always only be one next.
   *
   * @param flowElement of type FlowElement
   * @return Scope
   * @throws IllegalStateException when there is not exactly one outgoing scope
   */
  public Scope getNextScope( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    Set<Scope> set = graph.outgoingEdgesOf( flowElement );

    if( set.size() != 1 )
      throw new IllegalStateException( "should only be one scope after current flow element: " + flowElement + " found: " + set.size() );

    return set.iterator().next();
    }

  /** @return all outgoing Scope instances of the given element. */
  public Set<Scope> getNextScopes( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    return graph.outgoingEdgesOf( flowElement );
    }

  /**
   * Verifies the given element is a vertex of this step's graph, failing with
   * a trace-annotated message otherwise. A missing vertex usually means the
   * element's class broke the hashCode/equals contract.
   */
  private void assertFlowElement( FlowElement flowElement )
    {
    if( !graph.containsVertex( flowElement ) )
      {
      String message = "unable to find %s in plan, class and serializable fields must implement #hashCode() and #equals()";

      if( flowElement instanceof Pipe )
        message = Util.formatTrace( (Pipe) flowElement, String.format( message, "pipe" ) );
      else if( flowElement instanceof Tap )
        message = Util.formatTrace( (Tap) flowElement, String.format( message, "tap" ) );

      throw new IllegalStateException( message );
      }
    }

  /** @return the FlowElement the given Scope edge points to. */
  public FlowElement getNextFlowElement( Scope scope )
    {
    return graph.getEdgeTarget( scope );
    }

  /** @return the logical name of the given source tap, or null if it is not a source of this step. */
  public String getSourceName( Tap source )
    {
    return sources.get( source );
    }

  /**
   * Collects the Operation of every Operator vertex in this step's graph.
   *
   * @return Collection&lt;Operation&gt;, possibly containing equal instances
   */
  public Collection<Operation> getAllOperations()
    {
    Set<FlowElement> vertices = graph.vertexSet();
    List<Operation> operations = new ArrayList<Operation>(); // operations impl equals, so two instances may be the same

    for( FlowElement vertex : vertices )
      {
      if( vertex instanceof Operator )
        operations.add( ( (Operator) vertex ).getOperation() );
      }

    return operations;
    }

  /**
   * Method containsPipeNamed returns {@code true} if this step contains a Pipe with the given name.
   *
   * @param pipeName of type String
   * @return boolean
   */
  public boolean containsPipeNamed( String pipeName )
    {
    Set<FlowElement> vertices = graph.vertexSet();

    for( FlowElement vertex : vertices )
      {
      if( vertex instanceof Pipe && ( (Pipe) vertex ).getName().equals( pipeName ) )
        return true;
      }

    return false;
    }

  /**
   * Method clean removes any temporary files used by this FlowStep instance. It will log any IOExceptions thrown.
   *
   * @param jobConf of type JobConf
   */
  public void clean( JobConf jobConf )
    {
    if( tempSink != null )
      {
      try
        {
        tempSink.deletePath( jobConf );
        }
      catch( Exception exception )
        {
        // sink all exceptions, don't fail app
        logWarn( "unable to remove temporary file: " + tempSink, exception );
        }
      }

    if( sink instanceof TempHfs )
      {
      try
        {
        sink.deletePath( jobConf );
        }
      catch( Exception exception )
        {
        // sink all exceptions, don't fail app
        logWarn( "unable to remove temporary file: " + sink, exception );
        }
      }
    else
      {
      // non-temporary sinks only get their working artifacts cleaned up
      cleanTap( jobConf, sink );
      }

    for( Tap tap : getMapperTraps().values() )
      cleanTap( jobConf, tap );

    for( Tap tap : getReducerTraps().values() )
      cleanTap( jobConf, tap );
    }

  private void cleanTap( JobConf jobConf, Tap tap )
    {
    try
      {
      Hadoop18TapUtil.cleanupTap( jobConf, tap );
      }
    catch( IOException exception )
      {
      // ignore exception
      }
    }

  @Override
  public boolean equals( Object object )
    {
    if( this == object )
      return true;
    if( object == null || getClass() != object.getClass() )
      return false;

    FlowStep flowStep = (FlowStep) object;

    // equality is based on name only, matching hashCode
    return name != null ? name.equals( flowStep.name ) : flowStep.name == null;
    }

  @Override
  public int hashCode()
    {
    return name != null ? name.hashCode() : 0;
    }

  @Override
  public String toString()
    {
    // StringBuilder: local, single-threaded building needs no synchronization
    StringBuilder buffer = new StringBuilder();

    buffer.append( getClass().getSimpleName() );
    buffer.append( "[name: " ).append( getName() ).append( "]" );

    return buffer.toString();
    }

  protected FlowStepJob createFlowStepJob( JobConf parentConf ) throws IOException
    {
    return new FlowStepJob( this, getName(), getJobConf( parentConf ) );
    }

  protected final boolean isInfoEnabled()
    {
    return LOG.isInfoEnabled();
    }

  protected final boolean isDebugEnabled()
    {
    return LOG.isDebugEnabled();
    }

  protected void logDebug( String message )
    {
    LOG.debug( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logInfo( String message )
    {
    LOG.info( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logWarn( String message )
    {
    LOG.warn( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logWarn( String message, Throwable throwable )
    {
    LOG.warn( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message, throwable );
    }

  protected void logError( String message, Throwable throwable )
    {
    LOG.error( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message, throwable );
    }
  }