/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.cascade; import java.beans.ConstructorProperties; import java.io.IOException; import java.util.Collection; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.Set; import cascading.flow.Flow; import cascading.tap.CompositeTap; import cascading.tap.Hfs; import cascading.tap.Tap; import cascading.tuple.TupleEntryCollector; import cascading.tuple.TupleEntryIterator; import cascading.util.Util; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.log4j.Logger; import org.jgrapht.Graphs; import org.jgrapht.graph.SimpleDirectedGraph; import org.jgrapht.traverse.TopologicalOrderIterator; /** * Class CascadeConnector is used to construct a new {@link Cascade} instance from a collection of {@link Flow} instance. * <p/> * Note order is not significant when adding passing Flow instances to the {@link #connect(String, Flow...)} * method. This connector will order them based on their dependencies, if any. */ public class CascadeConnector { /** Field LOG */ private static final Logger LOG = Logger.getLogger( CascadeConnector.class ); /** Field properties */ private Map<Object, Object> properties; /** Constructor CascadeConnector creates a new CascadeConnector instance. */ public CascadeConnector() { } /** * Constructor CascadeConnector creates a new CascadeConnector instance. * * @param properties of type Map<Object, Object> */ @ConstructorProperties({"properties"}) public CascadeConnector( Map<Object, Object> properties ) { this.properties = properties; } /** * Given any number of {@link Flow} objects, it will connect them and return a new {@link Cascade} instance. The name * of the Cascade is derived from the given Flow instances. * * @param flows of type Collection<Flow> * @return Cascade */ public Cascade connect( Collection<Flow> flows ) { return connect( null, flows.toArray( new Flow[ flows.size() ] ) ); } /** * Given any number of {@link Flow} objects, it will connect them and return a new {@link Cascade} instance. * * @param name of type String * @param flows of type Collection<Flow> * @return Cascade */ public Cascade connect( String name, Collection<Flow> flows ) { return connect( name, flows.toArray( new Flow[ flows.size() ] ) ); } /** * Given any number of {@link Flow} objects, it will connect them and return a new {@link Cascade} instance. The name * of the Cascade is derived from the given Flow instances. * * @param flows of type Flow * @return Cascade */ public Cascade connect( Flow... flows ) { return connect( null, flows ); } /** * Given any number of {@link Flow} objects, it will connect them and return a new {@link Cascade} instance. * * @param name of type String * @param flows of type Flow * @return Cascade */ public Cascade connect( String name, Flow... flows ) { verifyUniqueFlowNames( flows ); name = name == null ? makeName( flows ) : name; SimpleDirectedGraph<String, Flow.FlowHolder> tapGraph = new SimpleDirectedGraph<String, Flow.FlowHolder>( Flow.FlowHolder.class ); SimpleDirectedGraph<Flow, Integer> flowGraph = new SimpleDirectedGraph<Flow, Integer>( Integer.class ); makeTapGraph( tapGraph, flows ); makeFlowGraph( flowGraph, tapGraph ); verifyNoCycles( flowGraph ); return new Cascade( name, properties, flowGraph, tapGraph ); } private void verifyUniqueFlowNames( Flow[] flows ) { Set<String> set = new HashSet<String>(); for( Flow flow : flows ) { if( set.contains( flow.getName() ) ) throw new CascadeException( "all flow names must be unique, found duplicate: " + flow.getName() ); set.add( flow.getName() ); } } private String makeName( Flow[] flows ) { String[] names = new String[ flows.length ]; for( int i = 0; i < flows.length; i++ ) names[ i ] = flows[ i ].getName(); return Util.join( names, "+" ); } private void verifyNoCycles( SimpleDirectedGraph<Flow, Integer> flowGraph ) { Set<Flow> flows = new HashSet<Flow>(); TopologicalOrderIterator<Flow, Integer> topoIterator = new TopologicalOrderIterator<Flow, Integer>( flowGraph ); while( topoIterator.hasNext() ) flows.add( topoIterator.next() ); if( flows.size() != flowGraph.vertexSet().size() ) throw new CascadeException( "there are likely cycles in the set of given flows, topological iterator cannot traverse flows with cycles" ); } private void makeTapGraph( SimpleDirectedGraph<String, Flow.FlowHolder> tapGraph, Flow[] flows ) { for( Flow flow : flows ) { LinkedList<Tap> sources = new LinkedList<Tap>( flow.getSourcesCollection() ); LinkedList<Tap> sinks = new LinkedList<Tap>( flow.getSinksCollection() ); unwrapCompositeTaps( sources ); unwrapCompositeTaps( sinks ); for( Tap source : sources ) tapGraph.addVertex( getFullPath( flow, source ) ); for( Tap sink : sinks ) tapGraph.addVertex( getFullPath( flow, sink ) ); for( Tap source : sources ) { for( Tap sink : sinks ) addEdgeFor( tapGraph, flow, source, sink ); } } } private void addEdgeFor( SimpleDirectedGraph<String, Flow.FlowHolder> tapGraph, Flow flow, Tap source, Tap sink ) { try { tapGraph.addEdge( getFullPath( flow, source ), getFullPath( flow, sink ), flow.getHolder() ); } catch( IllegalArgumentException exception ) { throw new CascadeException( "no loops allowed in cascade, flow: " + flow.getName() + ", source: " + source + ", sink: " + sink ); } } private String getFullPath( Flow flow, Tap tap ) { String identifier = tap.getIdentifier(); if( tap instanceof Hfs ) { try { identifier = ( (Hfs) tap ).getQualifiedPath( flow.getJobConf() ).toString(); } catch( IOException exception ) { throw new CascadeException( "could not get fully qualified path for: " + tap ); } } return identifier; } private void unwrapCompositeTaps( LinkedList<Tap> taps ) { ListIterator<Tap> iterator = taps.listIterator(); while( iterator.hasNext() ) { Tap tap = iterator.next(); if( tap instanceof CompositeTap ) { iterator.remove(); for( Tap childTap : ( (CompositeTap) tap ).getChildTaps() ) { iterator.add( childTap ); iterator.previous(); // force cursor backward } } } } private void makeFlowGraph( SimpleDirectedGraph<Flow, Integer> jobGraph, SimpleDirectedGraph<String, Flow.FlowHolder> tapGraph ) { Set<String> identifiers = tapGraph.vertexSet(); int count = 0; for( String source : identifiers ) { if( LOG.isDebugEnabled() ) LOG.debug( "handling flow source: " + source ); List<String> sinks = Graphs.successorListOf( tapGraph, source ); for( String sink : sinks ) { if( LOG.isDebugEnabled() ) LOG.debug( "handling flow path: " + source + " -> " + sink ); Flow flow = tapGraph.getEdge( source, sink ).flow; jobGraph.addVertex( flow ); Set<Flow.FlowHolder> previous = tapGraph.incomingEdgesOf( source ); for( Flow.FlowHolder previousFlow : previous ) { jobGraph.addVertex( previousFlow.flow ); if( jobGraph.getEdge( previousFlow.flow, flow ) != null ) continue; if( !jobGraph.addEdge( previousFlow.flow, flow, count++ ) ) throw new CascadeException( "unable to add path between: " + previousFlow.flow.getName() + " and: " + flow.getName() ); } } } } /** Specialized type of {@link Tap} that is the root. */ static class RootTap extends Tap { /** Field serialVersionUID */ private static final long serialVersionUID = 1L; /** @see Tap#getPath() */ public Path getPath() { return null; } /** @see Tap#makeDirs(JobConf) */ public boolean makeDirs( JobConf conf ) throws IOException { return false; } /** @see Tap#deletePath(JobConf) */ public boolean deletePath( JobConf conf ) throws IOException { return false; } /** @see Tap#pathExists(JobConf) */ public boolean pathExists( JobConf conf ) throws IOException { return false; } /** @see Tap#getPathModified(JobConf) */ public long getPathModified( JobConf conf ) throws IOException { return 0; } public TupleEntryIterator openForRead( JobConf conf ) throws IOException { return null; } public TupleEntryCollector openForWrite( JobConf conf ) throws IOException { return null; } } }