/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.flow;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import cascading.operation.Operation;
import cascading.pipe.Group;
import cascading.pipe.Operator;
import cascading.pipe.Pipe;
import cascading.tap.Tap;
import cascading.tap.TempHfs;
import cascading.tap.hadoop.Hadoop18TapUtil;
import cascading.tap.hadoop.MultiInputFormat;
import cascading.tap.hadoop.TapIterator;
import cascading.tuple.Fields;
import cascading.tuple.IndexTuple;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.TuplePair;
import cascading.tuple.hadoop.CoGroupingComparator;
import cascading.tuple.hadoop.CoGroupingPartitioner;
import cascading.tuple.hadoop.GroupingComparator;
import cascading.tuple.hadoop.GroupingPartitioner;
import cascading.tuple.hadoop.GroupingSortingComparator;
import cascading.tuple.hadoop.IndexTupleCoGroupingComparator;
import cascading.tuple.hadoop.ReverseGroupingSortingComparator;
import cascading.tuple.hadoop.ReverseTupleComparator;
import cascading.tuple.hadoop.TupleComparator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.jgrapht.graph.SimpleDirectedGraph;

/**
 * Class FlowStep is an internal representation of a given Job to be executed on a remote cluster. During
 * planning, pipe assemblies are broken down into "steps" and encapsulated in this class.
 * <p/>
 * FlowSteps are submitted in order of dependency. If two or more steps do not share the same dependencies and all
 * can be scheduled simultaneously, the {@link #getSubmitPriority()} value determines the order in which
 * all steps will be submitted for execution. The default submit priority is 5.
 * <p/>
 * This class is for internal use; there are no stable public methods.
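 * <p/>
 * A minimal sketch, assuming the parent {@link Flow} exposes its steps via a {@code getSteps()}
 * accessor (an assumption about the surrounding API, not defined in this class):
 * <pre>
 * for( FlowStep step : flow.getSteps() )
 *   step.setSubmitPriority( 1 ); // 1 is the highest priority, 10 the lowest, 5 the default
 * </pre>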
 */
public class FlowStep implements Serializable
  {
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( FlowStep.class );

  /** Field properties */
  private Map<Object, Object> properties = null;
  /** Field parentFlowName */
  private String parentFlowName;
  /** Field submitPriority */
  private int submitPriority = 5;
  /** Field name */
  String name;
  /** Field id */
  private int id;
  /** Field graph */
  final SimpleDirectedGraph<FlowElement, Scope> graph = new SimpleDirectedGraph<FlowElement, Scope>( Scope.class );
  /** Field sources */
  final Map<Tap, String> sources = new HashMap<Tap, String>(); // all sources and all sinks must have same scheme
  /** Field sink */
  protected Tap sink;
  /** Field mapperTraps */
  private final Map<String, Tap> mapperTraps = new HashMap<String, Tap>();
  /** Field reducerTraps */
  private final Map<String, Tap> reducerTraps = new HashMap<String, Tap>();
  /** Field tempSink */
  TempHfs tempSink; // used if we need to bypass
  /** Field group */
  private Group group;

  protected FlowStep( String name, int id )
    {
    this.name = name;
    this.id = id;
    }

  /**
   * Method getID returns the id of this FlowStep object.
   *
   * @return the id (type int) of this FlowStep object.
   */
  public int getID()
    {
    return id;
    }

  /**
   * Method getName returns the name of this FlowStep object.
   *
   * @return the name (type String) of this FlowStep object.
   */
  public String getName()
    {
    return name;
    }

  public void setName( String name )
    {
    if( name == null || name.isEmpty() )
      throw new IllegalArgumentException( "step name may not be null or empty" );

    this.name = name;
    }

  /**
   * Method getParentFlowName returns the parentFlowName of this FlowStep object.
   *
   * @return the parentFlowName (type String) of this FlowStep object.
   */
  public String getParentFlowName()
    {
    return parentFlowName;
    }

  /**
   * Method setParentFlowName sets the parentFlowName of this FlowStep object.
   *
   * @param parentFlowName the parentFlowName of this FlowStep object.
   */
  public void setParentFlowName( String parentFlowName )
    {
    this.parentFlowName = parentFlowName;
    }

  /**
   * Method getStepName returns the stepName of this FlowStep object.
   *
   * @return the stepName (type String) of this FlowStep object.
   */
  public String getStepName()
    {
    return String.format( "%s[%s]", getParentFlowName(), getName() );
    }

  /**
   * Method getSubmitPriority returns the submitPriority of this FlowStep object.
   * <p/>
   * 10 is the lowest, 1 is the highest, 5 is the default.
   *
   * @return the submitPriority (type int) of this FlowStep object.
   */
  public int getSubmitPriority()
    {
    return submitPriority;
    }

  /**
   * Method setSubmitPriority sets the submitPriority of this FlowStep object.
   * <p/>
   * 10 is the lowest, 1 is the highest, 5 is the default.
   *
   * @param submitPriority the submitPriority of this FlowStep object.
   */
  public void setSubmitPriority( int submitPriority )
    {
    this.submitPriority = submitPriority;
    }

  public Group getGroup()
    {
    return group;
    }

  protected void setGroup( Group group )
    {
    this.group = group;
    }

  public Map<String, Tap> getMapperTraps()
    {
    return mapperTraps;
    }

  public Map<String, Tap> getReducerTraps()
    {
    return reducerTraps;
    }

  /**
   * Method getProperties returns the properties of this FlowStep object.
   *
   * @return the properties (type Map<Object, Object>) of this FlowStep object.
   */
  public Map<Object, Object> getProperties()
    {
    if( properties == null )
      properties = new Properties();

    return properties;
    }

  /**
   * Method setProperties sets the properties of this FlowStep object.
   *
   * @param properties the properties of this FlowStep object.
   */
  public void setProperties( Map<Object, Object> properties )
    {
    this.properties = properties;
    }

  /**
   * Method hasProperties returns {@code true} if there are properties associated with this FlowStep.
   *
   * @return boolean
   */
  public boolean hasProperties()
    {
    return properties != null && !properties.isEmpty();
    }
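  // A hedged usage sketch: getJobConf() below applies these properties to the JobConf before
  // anything else, so they act as per-step configuration overrides. The property key shown is
  // a stock Hadoop 0.18-era setting used for illustration, not something this class defines.
  //
  //   Map<Object, Object> stepProps = new Properties();
  //   stepProps.put( "mapred.reduce.tasks.speculative.execution", "false" );
  //   step.setProperties( stepProps );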
  protected JobConf getJobConf() throws IOException
    {
    return getJobConf( null );
    }

  protected JobConf getJobConf( JobConf parentConf ) throws IOException
    {
    JobConf conf = parentConf == null ? new JobConf() : new JobConf( parentConf );

    // set values first so they can't break things downstream
    if( hasProperties() )
      {
      for( Map.Entry entry : getProperties().entrySet() )
        conf.set( entry.getKey().toString(), entry.getValue().toString() );
      }

    // disable warning
    conf.setBoolean( "mapred.used.genericoptionsparser", true );

    conf.setJobName( getStepName() );

    conf.setOutputKeyClass( Tuple.class );
    conf.setOutputValueClass( Tuple.class );

    conf.setMapperClass( FlowMapper.class );
    conf.setReducerClass( FlowReducer.class );

    // set for use by the shuffling phase
    TupleSerialization.setSerializations( conf );

    initFromSources( conf );
    initFromSink( conf );
    initFromTraps( conf );

    if( sink.getScheme().getNumSinkParts() != 0 )
      {
      // if no reducer, set num map tasks to control parts
      if( getGroup() != null )
        conf.setNumReduceTasks( sink.getScheme().getNumSinkParts() );
      else
        conf.setNumMapTasks( sink.getScheme().getNumSinkParts() );
      }

    conf.setOutputKeyComparatorClass( TupleComparator.class );

    if( getGroup() == null )
      {
      conf.setNumReduceTasks( 0 ); // disable reducers
      }
    else
      {
      // must set map output defaults when performing a reduce
      conf.setMapOutputKeyClass( Tuple.class );
      conf.setMapOutputValueClass( Tuple.class );

      // handles the case where the group-by sort should be reversed
      if( getGroup().isSortReversed() )
        conf.setOutputKeyComparatorClass( ReverseTupleComparator.class );

      addComparators( conf, "cascading.group.comparator", getGroup().getGroupingSelectors() );

      if( getGroup().isGroupBy() )
        addComparators( conf, "cascading.sort.comparator", getGroup().getSortingSelectors() );

      if( !getGroup().isGroupBy() )
        {
        conf.setPartitionerClass( CoGroupingPartitioner.class );
        conf.setMapOutputKeyClass( IndexTuple.class ); // allows groups to be sorted by index
        conf.setMapOutputValueClass( IndexTuple.class );
        conf.setOutputKeyComparatorClass( IndexTupleCoGroupingComparator.class ); // sorts by group, then by index
        conf.setOutputValueGroupingComparator( CoGroupingComparator.class );
        }

      if( getGroup().isSorted() )
        {
        conf.setPartitionerClass( GroupingPartitioner.class );
        conf.setMapOutputKeyClass( TuplePair.class );

        if( getGroup().isSortReversed() )
          conf.setOutputKeyComparatorClass( ReverseGroupingSortingComparator.class );
        else
          conf.setOutputKeyComparatorClass( GroupingSortingComparator.class );

        // no need to supply a reverse comparator, only equality is checked
        conf.setOutputValueGroupingComparator( GroupingComparator.class );
        }
      }

    // perform last so init above will pass to tasks
    conf.setInt( "cascading.flow.step.id", id );
    conf.set( "cascading.flow.step", Util.serializeBase64( this ) );

    return conf;
    }

  private void addComparators( JobConf conf, String property, Map<String, Fields> map ) throws IOException
    {
    Iterator<Fields> fieldsIterator = map.values().iterator();

    if( !fieldsIterator.hasNext() )
      return;

    Fields fields = fieldsIterator.next();

    if( fields.hasComparators() )
      {
      conf.set( property, Util.serializeBase64( fields ) );
      return;
      }

    // use resolved fields if there are no comparators.
    Set<Scope> previousScopes = getPreviousScopes( getGroup() );

    fields = previousScopes.iterator().next().getOutValuesFields();

    if( fields.size() != 0 ) // allows fields.UNKNOWN to be used
      conf.setInt( property + ".size", fields.size() );

    return;
    }
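  // A hedged sketch of what feeds the serialized-comparator branch above: a grouping Fields
  // instance carrying a custom java.util.Comparator. The field name "value", the pipe variable,
  // and the reverse-order comparator are illustrative assumptions, not taken from this class.
  //
  //   Fields groupFields = new Fields( "value" );
  //   groupFields.setComparator( "value", java.util.Collections.reverseOrder() );
  //   Pipe assembly = new GroupBy( assembly, groupFields );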
  private void initFromTraps( JobConf conf ) throws IOException
    {
    initFromTraps( conf, getMapperTraps() );
    initFromTraps( conf, getReducerTraps() );
    }

  private void initFromTraps( JobConf conf, Map<String, Tap> traps ) throws IOException
    {
    if( !traps.isEmpty() )
      {
      JobConf trapConf = new JobConf( conf );

      for( Tap tap : traps.values() )
        tap.sinkInit( trapConf );
      }
    }

  private void initFromSources( JobConf conf ) throws IOException
    {
    JobConf[] fromJobs = new JobConf[ sources.size() ];
    int i = 0;

    for( Tap tap : sources.keySet() )
      {
      fromJobs[ i ] = new JobConf( conf );
      tap.sourceInit( fromJobs[ i ] );
      fromJobs[ i ].set( "cascading.step.source", Util.serializeBase64( tap ) );
      i++;
      }

    MultiInputFormat.addInputFormat( conf, fromJobs );
    }

  private void initFromSink( JobConf conf ) throws IOException
    {
    // init sink first so tempSink can take precedence
    if( sink != null )
      sink.sinkInit( conf );

    // tempSink exists because sink is writeDirect
    if( tempSink != null )
      tempSink.sinkInit( conf );
    }

  public TapIterator openSourceForRead( JobConf conf ) throws IOException
    {
    return new TapIterator( sources.keySet().iterator().next(), conf );
    }

  public TupleEntryIterator openSinkForRead( JobConf conf ) throws IOException
    {
    return sink.openForRead( conf );
    }

  public Tap getMapperTrap( String name )
    {
    return getMapperTraps().get( name );
    }

  public Tap getReducerTrap( String name )
    {
    return getReducerTraps().get( name );
    }

  /**
   * Method getPreviousScopes returns the previous Scope instances. If the flowElement is a Group (specifically a CoGroup),
   * there will be more than one instance.
   *
   * @param flowElement of type FlowElement
   * @return Set<Scope>
   */
  public Set<Scope> getPreviousScopes( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    return graph.incomingEdgesOf( flowElement );
    }

  /**
   * Method getNextScope returns the next Scope instance in the graph. There will always be only one next.
   *
   * @param flowElement of type FlowElement
   * @return Scope
   */
  public Scope getNextScope( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    Set<Scope> set = graph.outgoingEdgesOf( flowElement );

    if( set.size() != 1 )
      throw new IllegalStateException( "should only be one scope after current flow element: " + flowElement + " found: " + set.size() );

    return set.iterator().next();
    }

  public Set<Scope> getNextScopes( FlowElement flowElement )
    {
    assertFlowElement( flowElement );

    return graph.outgoingEdgesOf( flowElement );
    }
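  // A hedged traversal sketch: since every element other than a CoGroup has exactly one
  // outgoing Scope edge, a step can be walked element-by-element toward its sink. The loop
  // shape is an illustrative assumption; real traversal is driven by the planner and runtime.
  //
  //   FlowElement current = step.getGroup();
  //
  //   while( current != null && current != step.sink )
  //     current = step.getNextFlowElement( step.getNextScope( current ) );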
  private void assertFlowElement( FlowElement flowElement )
    {
    if( !graph.containsVertex( flowElement ) )
      {
      String message = "unable to find %s in plan, class and serializable fields must implement #hashCode() and #equals()";

      if( flowElement instanceof Pipe )
        message = Util.formatTrace( (Pipe) flowElement, String.format( message, "pipe" ) );
      else if( flowElement instanceof Tap )
        message = Util.formatTrace( (Tap) flowElement, String.format( message, "tap" ) );

      throw new IllegalStateException( message );
      }
    }

  public FlowElement getNextFlowElement( Scope scope )
    {
    return graph.getEdgeTarget( scope );
    }

  public String getSourceName( Tap source )
    {
    return sources.get( source );
    }

  public Collection<Operation> getAllOperations()
    {
    Set<FlowElement> vertices = graph.vertexSet();
    List<Operation> operations = new ArrayList<Operation>();

    // operations implement equals, so two instances may be the same
    for( FlowElement vertex : vertices )
      {
      if( vertex instanceof Operator )
        operations.add( ( (Operator) vertex ).getOperation() );
      }

    return operations;
    }

  public boolean containsPipeNamed( String pipeName )
    {
    Set<FlowElement> vertices = graph.vertexSet();

    for( FlowElement vertex : vertices )
      {
      if( vertex instanceof Pipe && ( (Pipe) vertex ).getName().equals( pipeName ) )
        return true;
      }

    return false;
    }

  /**
   * Method clean removes any temporary files used by this FlowStep instance. It will log any IOExceptions thrown.
   *
   * @param jobConf of type JobConf
   */
  public void clean( JobConf jobConf )
    {
    if( tempSink != null )
      {
      try
        {
        tempSink.deletePath( jobConf );
        }
      catch( Exception exception )
        {
        // swallow all exceptions, don't fail the app
        logWarn( "unable to remove temporary file: " + tempSink, exception );
        }
      }

    if( sink instanceof TempHfs )
      {
      try
        {
        sink.deletePath( jobConf );
        }
      catch( Exception exception )
        {
        // swallow all exceptions, don't fail the app
        logWarn( "unable to remove temporary file: " + sink, exception );
        }
      }
    else
      {
      cleanTap( jobConf, sink );
      }

    for( Tap tap : getMapperTraps().values() )
      cleanTap( jobConf, tap );

    for( Tap tap : getReducerTraps().values() )
      cleanTap( jobConf, tap );
    }

  private void cleanTap( JobConf jobConf, Tap tap )
    {
    try
      {
      Hadoop18TapUtil.cleanupTap( jobConf, tap );
      }
    catch( IOException exception )
      {
      // ignore exception
      }
    }

  @Override
  public boolean equals( Object object )
    {
    if( this == object )
      return true;

    if( object == null || getClass() != object.getClass() )
      return false;

    FlowStep flowStep = (FlowStep) object;

    if( name != null ? !name.equals( flowStep.name ) : flowStep.name != null )
      return false;

    return true;
    }

  @Override
  public int hashCode()
    {
    return name != null ? name.hashCode() : 0;
    }
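  // A hedged cleanup sketch: the parent Flow is what normally drives clean() once all steps
  // have completed; calling it directly from user code, as below, is an assumption, as is the
  // flow.getSteps() accessor on the surrounding API.
  //
  //   for( FlowStep step : flow.getSteps() )
  //     step.clean( jobConf );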
  @Override
  public String toString()
    {
    StringBuffer buffer = new StringBuffer();

    buffer.append( getClass().getSimpleName() );
    buffer.append( "[name: " ).append( getName() ).append( "]" );

    return buffer.toString();
    }

  protected FlowStepJob createFlowStepJob( JobConf parentConf ) throws IOException
    {
    return new FlowStepJob( this, getName(), getJobConf( parentConf ) );
    }

  protected final boolean isInfoEnabled()
    {
    return LOG.isInfoEnabled();
    }

  protected final boolean isDebugEnabled()
    {
    return LOG.isDebugEnabled();
    }

  protected void logDebug( String message )
    {
    LOG.debug( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logInfo( String message )
    {
    LOG.info( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logWarn( String message )
    {
    LOG.warn( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message );
    }

  protected void logWarn( String message, Throwable throwable )
    {
    LOG.warn( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message, throwable );
    }

  protected void logError( String message, Throwable throwable )
    {
    LOG.error( "[" + Util.truncate( getParentFlowName(), 25 ) + "] " + message, throwable );
    }
  }
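// A hedged end-to-end sketch of how a step becomes a running job. createFlowStepJob() is
// protected, so this is normally driven by the parent Flow rather than user code, and the
// surrounding variables (step, parentJobConf) are assumptions used for illustration.
//
//   FlowStepJob stepJob = step.createFlowStepJob( parentJobConf );
//   // the returned FlowStepJob is then submitted in an order governed by step.getSubmitPriority()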