/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.flow; import java.beans.ConstructorProperties; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Properties; import java.util.Set; import cascading.CascadingException; import cascading.operation.Assertion; import cascading.operation.AssertionLevel; import cascading.operation.DebugLevel; import cascading.pipe.Pipe; import cascading.scheme.SequenceFile; import cascading.tap.Tap; import cascading.util.Util; import org.apache.hadoop.mapred.JobConf; /** * Use the FlowConnector to link source and sink {@link Tap} instances with an assembly of {@link Pipe} instances into * an executable {@link Flow}. * <p/> * FlowConnector invokes a planner for the target execution environment. Currently only {@link MultiMapReducePlanner} * is supported (for Hadoop). If you have just one pre-existing custom Hadoop job to execute, see {@link MapReduceFlow}. * <p/> * Note that all {@code connect} methods take a single {@code tail} or an array of {@code tail} Pipe instances. "tail" * refers to the last connected Pipe instances in a pipe-assembly. Pipe-assemblies are graphs of object with "heads" * and "tails". From a given "tail", all connected heads can be found, but not the reverse. So "tails" must be * supplied by the user. * <p/> * The FlowConnector, resulting Flow, and the underlying execution framework (Hadoop) can be configured via a * {@link Map} or {@link Properties} instance given to the constructor. This properties map can be * populated before constructing a FlowConnector instance through static methods on FlowConnector and * MultiMapReducePlanner. These properties are used to influence the current planner and are also passed down to the * execution framework (Hadoop) to override any default values (the number of reducers or mappers, etc. by using * application specific properties). * <p/> * Custom operations (Functions, Filter, etc) may also retrieve these property values at runtime through calls to * {@link FlowProcess#getProperty(String)}. * <p/> * Most applications will need to call {@link #setApplicationJarClass(java.util.Map, Class)} or * {@link #setApplicationJarPath(java.util.Map, String)} so that the correct application jar file is passed through * to all child processes. The Class or path must reference * the custom application jar, not a Cascading library class or jar. The easiest thing to do is give setApplicationJarClass * the Class with your static main function and let Cascading figure out which jar to use. * <p/> * Note that Map<Object,Object> is compatible with the {@link Properties} class, so properties can be loaded at * runtime from a configuration file. * <p/> * By default, all {@link Assertion}s are planned into the resulting Flow instance. This can be * changed by calling {@link #setAssertionLevel(java.util.Map, cascading.operation.AssertionLevel)}. * <p/> * Also by default, all {@link cascading.operation.Debug}s are planned into the resulting Flow instance. This can be * changed by calling {@link #setDebugLevel(java.util.Map, cascading.operation.DebugLevel)}. * <p/> * <strong>Properties</strong><br/> * <ul> * <li>cascading.flowconnector.appjar.class</li> * <li>cascading.flowconnector.appjar.path</li> * <li>cascading.flowconnector.assertionlevel</li> * <li>cascading.flowconnector.debuglevel</li> * <li>cascading.flowconnector.intermediateschemeclass</li> * </ul> * * @see MapReduceFlow */ public class FlowConnector { /** Field properties */ private Map<Object, Object> properties; /** * Method setAssertionLevel sets the target planner {@link cascading.operation.AssertionLevel}. * * @param properties of type Map<Object, Object> * @param assertionLevel of type AssertionLevel */ public static void setAssertionLevel( Map<Object, Object> properties, AssertionLevel assertionLevel ) { if( assertionLevel != null ) properties.put( "cascading.flowconnector.assertionlevel", assertionLevel.toString() ); } /** * Method getAssertionLevel returns the configured target planner {@link cascading.operation.AssertionLevel}. * * @param properties of type Map<Object, Object> * @return AssertionLevel the configured AssertionLevel */ public static AssertionLevel getAssertionLevel( Map<Object, Object> properties ) { String assertionLevel = Util.getProperty( properties, "cascading.flowconnector.assertionlevel", AssertionLevel.STRICT.name() ); return AssertionLevel.valueOf( assertionLevel ); } /** * Method setDebugLevel sets the target planner {@link cascading.operation.DebugLevel}. * * @param properties of type Map<Object, Object> * @param debugLevel of type DebugLevel */ public static void setDebugLevel( Map<Object, Object> properties, DebugLevel debugLevel ) { if( debugLevel != null ) properties.put( "cascading.flowconnector.debuglevel", debugLevel.toString() ); } /** * Method getDebugLevel returns the configured target planner {@link cascading.operation.DebugLevel}. * * @param properties of type Map<Object, Object> * @return DebugLevel the configured DebugLevel */ public static DebugLevel getDebugLevel( Map<Object, Object> properties ) { String debugLevel = Util.getProperty( properties, "cascading.flowconnector.debuglevel", DebugLevel.DEFAULT.name() ); return DebugLevel.valueOf( debugLevel ); } /** * Method setIntermediateSchemeClass is used for debugging. The default Scheme for intermediate files is {@link SequenceFile}. * * @param properties of type Map<Object, Object> * @param intermediateSchemeClass of type Class */ public static void setIntermediateSchemeClass( Map<Object, Object> properties, Class intermediateSchemeClass ) { properties.put( "cascading.flowconnector.intermediateschemeclass", intermediateSchemeClass ); } /** * Method setIntermediateSchemeClass is used for debugging. The default Scheme for intermediate files is {@link SequenceFile}. * * @param properties of type Map<Object, Object> * @param intermediateSchemeClass of type String */ public static void setIntermediateSchemeClass( Map<Object, Object> properties, String intermediateSchemeClass ) { properties.put( "cascading.flowconnector.intermediateschemeclass", intermediateSchemeClass ); } /** * Method getIntermediateSchemeClass is used for debugging. The default Scheme for intermediate files is {@link SequenceFile}. * * @param properties of type Map<Object, Object> * @return Class */ public static Class getIntermediateSchemeClass( Map<Object, Object> properties ) { // supporting stuffed classes to overcome classloading issue Object type = Util.getProperty( properties, "cascading.flowconnector.intermediateschemeclass", (Object) null ); if( type == null ) return SequenceFile.class; if( type instanceof Class ) return (Class) type; try { return FlowConnector.class.getClassLoader().loadClass( type.toString() ); } catch( ClassNotFoundException exception ) { throw new CascadingException( "unable to load class: " + type.toString(), exception ); } } /** * Method setApplicationJarClass is used to set the application jar file. * </p> * All cluster executed Cascading applications * need to call setApplicationJarClass(java.util.Map, Class) or * {@link #setApplicationJarPath(java.util.Map, String)}, otherwise ClassNotFound exceptions are likely. * * @param properties of type Map * @param type of type Class */ public static void setApplicationJarClass( Map<Object, Object> properties, Class type ) { if( type != null ) properties.put( "cascading.flowconnector.appjar.class", type ); } /** * Method getApplicationJarClass returns the Class set by the setApplicationJarClass method. * * @param properties of type Map<Object, Object> * @return Class */ public static Class getApplicationJarClass( Map<Object, Object> properties ) { return Util.getProperty( properties, "cascading.flowconnector.appjar.class", (Class) null ); } /** * Method setApplicationJarPath is used to set the application jar file. * </p> * All cluster executed Cascading applications * need to call {@link #setApplicationJarClass(java.util.Map, Class)} or * setApplicationJarPath(java.util.Map, String), otherwise ClassNotFound exceptions are likely. * * @param properties of type Map * @param path of type String */ public static void setApplicationJarPath( Map<Object, Object> properties, String path ) { if( path != null ) properties.put( "cascading.flowconnector.appjar.path", path ); } /** * Method getApplicationJarPath return the path set by the setApplicationJarPath method. * * @param properties of type Map<Object, Object> * @return String */ public static String getApplicationJarPath( Map<Object, Object> properties ) { return Util.getProperty( properties, "cascading.flowconnector.appjar.path", (String) null ); } /** Constructor FlowConnector creates a new FlowConnector instance. */ public FlowConnector() { } /** * Constructor FlowConnector creates a new FlowConnector instance using the given {@link Properties} instance as * default value for the underlying jobs. All properties are copied to a new {@link JobConf} instance. * * @param properties of type Properties */ @ConstructorProperties({"properties"}) public FlowConnector( Map<Object, Object> properties ) { this.properties = properties; } /** * Method getProperties returns the properties of this FlowConnector object. The returned Map instance * is immutable to prevent changes to the underlying property values in this FlowConnector instance. * * @return the properties (type Map<Object, Object>) of this FlowConnector object. */ public Map<Object, Object> getProperties() { return Collections.unmodifiableMap( properties ); } /** * Method connect links the given source and sink Taps to the given pipe assembly. * * @param source source Tap to bind to the head of the given tail Pipe * @param sink sink Tap to bind to the given tail Pipe * @param tail tail end of a pipe assembly * @return Flow */ public Flow connect( Tap source, Tap sink, Pipe tail ) { return connect( null, source, sink, tail ); } /** * Method connect links the given source and sink Taps to the given pipe assembly. * * @param name name to give the resulting Flow * @param source source Tap to bind to the head of the given tail Pipe * @param sink sink Tap to bind to the given tail Pipe * @param tail tail end of a pipe assembly * @return Flow */ public Flow connect( String name, Tap source, Tap sink, Pipe tail ) { Map<String, Tap> sources = new HashMap<String, Tap>(); sources.put( tail.getHeads()[ 0 ].getName(), source ); return connect( name, sources, sink, tail ); } /** * Method connect links the given source, sink, and trap Taps to the given pipe assembly. The given trap will * be linked to the assembly head along with the source. * * @param name name to give the resulting Flow * @param source source Tap to bind to the head of the given tail Pipe * @param sink sink Tap to bind to the given tail Pipe * @param trap trap Tap to sink all failed Tuples into * @param tail tail end of a pipe assembly * @return Flow */ public Flow connect( String name, Tap source, Tap sink, Tap trap, Pipe tail ) { Map<String, Tap> sources = new HashMap<String, Tap>(); sources.put( tail.getHeads()[ 0 ].getName(), source ); Map<String, Tap> traps = new HashMap<String, Tap>(); traps.put( tail.getHeads()[ 0 ].getName(), trap ); return connect( name, sources, sink, traps, tail ); } /** * Method connect links the named source Taps and sink Tap to the given pipe assembly. * * @param sources all head names and source Taps to bind to the heads of the given tail Pipe * @param sink sink Tap to bind to the given tail Pipe * @param tail tail end of a pipe assembly * @return Flow */ public Flow connect( Map<String, Tap> sources, Tap sink, Pipe tail ) { return connect( null, sources, sink, tail ); } /** * Method connect links the named source Taps and sink Tap to the given pipe assembly. * * @param name name to give the resulting Flow * @param sources all head names and source Taps to bind to the heads of the given tail Pipe * @param sink sink Tap to bind to the given tail Pipe * @param tail tail end of a pipe assembly * @return Flow */ public Flow connect( String name, Map<String, Tap> sources, Tap sink, Pipe tail ) { Map<String, Tap> sinks = new HashMap<String, Tap>(); sinks.put( tail.getName(), sink ); return connect( name, sources, sinks, tail ); } /** * Method connect links the named source and trap Taps and sink Tap to the given pipe assembly. * * @param name name to give the resulting Flow * @param sources all head names and source Taps to bind to the heads of the given tail Pipe * @param sink sink Tap to bind to the given tail Pipe * @param traps all pipe names and trap Taps to sink all failed Tuples into * @param tail tail end of a pipe assembly * @return Flow */ public Flow connect( String name, Map<String, Tap> sources, Tap sink, Map<String, Tap> traps, Pipe tail ) { Map<String, Tap> sinks = new HashMap<String, Tap>(); sinks.put( tail.getName(), sink ); return connect( name, sources, sinks, traps, tail ); } /** * Method connect links the named trap Taps, source and sink Tap to the given pipe assembly. * * @param name name to give the resulting Flow * @param source source Tap to bind to the head of the given tail Pipe * @param sink sink Tap to bind to the given tail Pipe * @param traps all pipe names and trap Taps to sink all failed Tuples into * @param tail tail end of a pipe assembly * @return Flow */ public Flow connect( String name, Tap source, Tap sink, Map<String, Tap> traps, Pipe tail ) { Map<String, Tap> sources = new HashMap<String, Tap>(); sources.put( tail.getHeads()[ 0 ].getName(), source ); Map<String, Tap> sinks = new HashMap<String, Tap>(); sinks.put( tail.getName(), sink ); return connect( name, sources, sinks, traps, tail ); } /** * Method connect links the named source Taps and sink Tap to the given pipe assembly. * <p/> * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe. * So the head pipe does not need to be included as an argument. * * @param source source Tap to bind to the head of the given tail Pipes * @param sinks all tail names and sink Taps to bind to the given tail Pipes * @param tails all tail ends of a pipe assembly * @return Flow */ public Flow connect( Tap source, Map<String, Tap> sinks, Collection<Pipe> tails ) { return connect( null, source, sinks, tails.toArray( new Pipe[tails.size()] ) ); } /** * Method connect links the named source Taps and sink Tap to the given pipe assembly. * <p/> * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe. * So the head pipe does not need to be included as an argument. * * @param name name to give the resulting Flow * @param source source Tap to bind to the head of the given tail Pipes * @param sinks all tail names and sink Taps to bind to the given tail Pipes * @param tails all tail ends of a pipe assembly * @return Flow */ public Flow connect( String name, Tap source, Map<String, Tap> sinks, Collection<Pipe> tails ) { return connect( name, source, sinks, tails.toArray( new Pipe[tails.size()] ) ); } /** * Method connect links the named source Taps and sink Tap to the given pipe assembly. * <p/> * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe. * So the head pipe does not need to be included as an argument. * * @param source source Tap to bind to the head of the given tail Pipes * @param sinks all tail names and sink Taps to bind to the given tail Pipes * @param tails all tail ends of a pipe assembly * @return Flow */ public Flow connect( Tap source, Map<String, Tap> sinks, Pipe... tails ) { return connect( null, source, sinks, tails ); } /** * Method connect links the named source Taps and sink Tap to the given pipe assembly. * <p/> * Since only once source Tap is given, it is assumed to be associated with the 'head' pipe. * So the head pipe does not need to be included as an argument. * * @param name name to give the resulting Flow * @param source source Tap to bind to the head of the given tail Pipes * @param sinks all tail names and sink Taps to bind to the given tail Pipes * @param tails all tail ends of a pipe assembly * @return Flow */ public Flow connect( String name, Tap source, Map<String, Tap> sinks, Pipe... tails ) { Set<Pipe> heads = new HashSet<Pipe>(); for( Pipe pipe : tails ) Collections.addAll( heads, pipe.getHeads() ); if( heads.isEmpty() ) throw new IllegalArgumentException( "no pipe instance found" ); if( heads.size() != 1 ) throw new IllegalArgumentException( "there may be only 1 head pipe instance, found " + heads.size() ); Map<String, Tap> sources = new HashMap<String, Tap>(); for( Pipe pipe : heads ) sources.put( pipe.getName(), source ); return connect( name, sources, sinks, tails ); } /** * Method connect links the named sources and sinks to the given pipe assembly. * * @param sources all head names and source Taps to bind to the heads of the given tail Pipes * @param sinks all tail names and sink Taps to bind to the given tail Pipes * @param tails all tail ends of a pipe assembly * @return Flow */ public Flow connect( Map<String, Tap> sources, Map<String, Tap> sinks, Pipe... tails ) { return connect( null, sources, sinks, tails ); } /** * Method connect links the named sources and sinks to the given pipe assembly. * * @param name name to give the resulting Flow * @param sources all head names and source Taps to bind to the heads of the given tail Pipes * @param sinks all tail names and sink Taps to bind to the given tail Pipes * @param tails all tail ends of a pipe assembly * @return Flow */ public Flow connect( String name, Map<String, Tap> sources, Map<String, Tap> sinks, Pipe... tails ) { return connect( name, sources, sinks, new HashMap<String, Tap>(), tails ); } /** * Method connect links the named sources, sinks and traps to the given pipe assembly. * * @param name name to give the resulting Flow * @param sources all head names and source Taps to bind to the heads of the given tail Pipes * @param sinks all tail names and sink Taps to bind to the given tail Pipes * @param traps all pipe names and trap Taps to sink all failed Tuples into * @param tails all tail ends of a pipe assembly * @return Flow */ public Flow connect( String name, Map<String, Tap> sources, Map<String, Tap> sinks, Map<String, Tap> traps, Pipe... tails ) { name = name == null ? makeName( tails ) : name; // choose appropriate planner (when there is more than one) return new MultiMapReducePlanner( properties ).buildFlow( name, tails, sources, sinks, traps ); } ///////// // UTIL ///////// private String makeName( Pipe[] pipes ) { String[] names = new String[pipes.length]; for( int i = 0; i < pipes.length; i++ ) names[ i ] = pipes[ i ].getName(); String name = Util.join( names, "+" ); if( name.length() > 32 ) name = name.substring( 0, 32 ); return name; } }