/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.tap; import java.io.IOException; import java.io.Serializable; import java.util.Set; import cascading.flow.Flow; import cascading.flow.FlowElement; import cascading.flow.FlowException; import cascading.flow.Scope; import cascading.pipe.Pipe; import cascading.scheme.Scheme; import cascading.tuple.Fields; import cascading.tuple.FieldsResolverException; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryCollector; import cascading.tuple.TupleEntryIterator; import cascading.util.Util; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.OutputCollector; /** * A Tap represents the physical data source or sink in a connected {@link Flow}. * </p> * That is, a source Tap is the head end of a connected {@link Pipe} and {@link Tuple} stream, and * a sink Tap is the tail end. Kinds of Tap types are used to manage files from a local disk, * distributed disk, remote storage like Amazon S3, or via FTP. It simply abstracts * out the complexity of connecting to these types of data sources. * <p/> * A Tap takes a {@link Scheme} instance, which is used to identify the type of resource (text file, binary file, etc). * A Tap is responsible for how the resource is reached. * <p/> * A Tap is not given an explicit name by design. This is so a given Tap instance can be * re-used in different {@link Flow}s that may expect a source or sink by a different * logical name, but are the same physical resource. If a tap had a name other than its path, which would be * used for the tap identity? If the name, then two Tap instances with different names but the same path could * interfere with one another. */ public abstract class Tap implements FlowElement, Serializable { /** Field scheme */ private Scheme scheme; /** Field writeDirect */ boolean writeDirect = false; /** Field mode */ SinkMode sinkMode = SinkMode.KEEP; /** Field trace */ private String trace = Util.captureDebugTrace( getClass() ); /** * Convenience function to make an array of Tap instances. * * @param taps of type Tap * @return Tap array */ public static Tap[] taps( Tap... taps ) { return taps; } protected Tap() { } protected Tap( Scheme scheme ) { this.setScheme( scheme ); } protected Tap( Scheme scheme, SinkMode sinkMode ) { this.setScheme( scheme ); this.sinkMode = sinkMode; } protected void setScheme( Scheme scheme ) { this.scheme = scheme; } /** * Method getScheme returns the scheme of this Tap object. * * @return the scheme (type Scheme) of this Tap object. */ public Scheme getScheme() { return scheme; } /** * Method getTrace return the trace of this object. * * @return String */ public String getTrace() { return trace; } /** * Method isWriteDirect returns true if this instances {@link cascading.tuple.TupleEntryCollector} should be used to sink values. * * @return the writeDirect (type boolean) of this Tap object. */ public boolean isWriteDirect() { return writeDirect || getScheme().isWriteDirect(); } /** * Method setWriteDirect should be set to true if this instances {@link cascading.tuple.TupleEntryCollector} should be used to sink values. * * @param writeDirect the writeDirect of this Tap object. */ public void setWriteDirect( boolean writeDirect ) { this.writeDirect = writeDirect; } /** * Method flowInit allows this Tap instance to initalize itself in context of the given {@link Flow} instance. * This method is guaranteed to be called before the Flow is started and the * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)} event is fired. * <p/> * This method will be called once per Flow, and before {@link #sourceInit(org.apache.hadoop.mapred.JobConf)} and * {@link #sinkInit(org.apache.hadoop.mapred.JobConf)} methods. * * @param flow of type Flow */ public void flowInit( Flow flow ) { } /** * Method sourceInit initializes this instance as a source. * <p/> * This method maybe called more than once if this Tap instance is used outside the scope of a {@link Flow} * instance or if it participates in multiple times in a given Flow or across different Flows in * a {@link cascading.cascade.Cascade}. * <p/> * In the context of a Flow, it will be called after * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)} * * @param conf of type JobConf * @throws IOException on resource initialization failure. */ public void sourceInit( JobConf conf ) throws IOException { getScheme().sourceInit( this, conf ); } /** * Method sinkInit initializes this instance as a sink. * <p/> * This method maybe called more than once if this Tap instance is used outside the scope of a {@link Flow} * instance or if it participates in multiple times in a given Flow or across different Flows in * a {@link cascading.cascade.Cascade}. * <p/> * Note this method will be called in context of this Tap being used as a traditional 'sink' and as a 'trap'. * <p/> * In the context of a Flow, it will be called after * {@link cascading.flow.FlowListener#onStarting(cascading.flow.Flow)} * * @param conf of type JobConf * @throws IOException on resource initialization failure. */ public void sinkInit( JobConf conf ) throws IOException { getScheme().sinkInit( this, conf ); } /** * Method getPath returns the Hadoop path to the resource represented by this Tap instance. * * @return Path */ public abstract Path getPath(); /** * Method getIdentifier returns a String representing the resource identifier this Tap instance represents. * <p/> * By default, simply calls {@code getPath().toString()}. * * @return the resource (type String) of this Tap object. */ public String getIdentifier() { if( getPath() == null ) return null; return getPath().toString(); } /** * Method getSourceFields returns the sourceFields of this Tap object. * * @return the sourceFields (type Fields) of this Tap object. */ public Fields getSourceFields() { return getScheme().getSourceFields(); } /** * Method getSinkFields returns the sinkFields of this Tap object. * * @return the sinkFields (type Fields) of this Tap object. */ public Fields getSinkFields() { return getScheme().getSinkFields(); } /** * Method openForRead opens the resource represented by this Tap instance. * * @param conf of type JobConf * @return TupleEntryIterator * @throws java.io.IOException when the resource cannot be opened */ public abstract TupleEntryIterator openForRead( JobConf conf ) throws IOException; /** * Method openForWrite opens the resource represented by this Tap instance. * * @param conf of type JobConf * @return TupleEntryCollector * @throws java.io.IOException when */ public abstract TupleEntryCollector openForWrite( JobConf conf ) throws IOException; /** * Method source returns the source value as an instance of {@link Tuple} * * @param key of type WritableComparable * @param value of type Writable * @return Tuple */ public Tuple source( Object key, Object value ) { return getScheme().source( key, value ); } /** * Method sink emits the sink value(s) to the OutputCollector * * @param tupleEntry of type TupleEntry * @param outputCollector of type OutputCollector * @throws java.io.IOException when the resource cannot be written to */ public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException { getScheme().sink( tupleEntry, outputCollector ); } /** @see FlowElement#outgoingScopeFor(Set) */ public Scope outgoingScopeFor( Set<Scope> incomingScopes ) { // as a source Tap, we emit the scheme defined Fields // as a sink Tap, we declare we emit the incoming Fields // as a temp Tap, this method never gets called, but we emit what we consume int count = 0; for( Scope incomingScope : incomingScopes ) { Fields incomingFields = resolveFields( incomingScope ); if( incomingFields != null ) { try { incomingFields.select( getSinkFields() ); } catch( FieldsResolverException exception ) { throw new TapException( this, exception.getSourceFields(), exception.getSelectorFields(), exception ); } count++; } } if( count > 1 ) throw new FlowException( "Tap may not have more than one incoming Scope" ); if( count == 1 ) return new Scope( getSinkFields() ); return new Scope( getSourceFields() ); } /** @see FlowElement#resolveIncomingOperationFields(Scope) */ public Fields resolveIncomingOperationFields( Scope incomingScope ) { return getFieldsFor( incomingScope ); } /** @see FlowElement#resolveFields(Scope) */ public Fields resolveFields( Scope scope ) { return getFieldsFor( scope ); } private Fields getFieldsFor( Scope incomingScope ) { if( incomingScope.isEvery() ) return incomingScope.getOutGroupingFields(); else return incomingScope.getOutValuesFields(); } /** * Method getQualifiedPath returns a FileSystem fully qualified Hadoop Path. * * @param conf of type JobConf * @return Path * @throws IOException when */ public Path getQualifiedPath( JobConf conf ) throws IOException { return getPath(); } /** * Method makeDirs makes all the directories this Tap instance represents. * * @param conf of type JobConf * @return boolean * @throws IOException when there is an error making directories */ public abstract boolean makeDirs( JobConf conf ) throws IOException; /** * Method deletePath deletes the resource represented by this instance. * * @param conf of type JobConf * @return boolean * @throws IOException when the resource cannot be deleted */ public abstract boolean deletePath( JobConf conf ) throws IOException; /** * Method pathExists return true if the path represented by this instance exists. * * @param conf of type JobConf * @return boolean * @throws IOException when the status cannot be determined */ public abstract boolean pathExists( JobConf conf ) throws IOException; /** * Method getPathModified returns the date this resource was last modified. * * @param conf of type JobConf * @return long * @throws IOException when the modified date cannot be determined */ public abstract long getPathModified( JobConf conf ) throws IOException; /** * Method getSinkMode returns the {@link SinkMode} }of this Tap object. * * @return the sinkMode (type SinkMode) of this Tap object. */ public SinkMode getSinkMode() { return sinkMode; } /** * Method isKeep indicates whether the resource represented by this instance should be kept if it * already exists when the Flow is started. * * @return boolean */ public boolean isKeep() { return sinkMode == SinkMode.KEEP; } /** * Method isReplace indicates whether the resource represented by this instance should be deleted if it * already exists when the Flow is started. * * @return boolean */ public boolean isReplace() { return sinkMode == SinkMode.REPLACE; } /** * Method isAppend indicates whether the resrouce represented by this instance should be appended to if it already * exists. Otherwise a new resource will be created when the Flow is started.. * * @return boolean */ @Deprecated public boolean isAppend() { return sinkMode == SinkMode.APPEND; } /** * Method isUpdate indicates whether the resrouce represented by this instance should be updated if it already * exists. Otherwise a new resource will be created when the Flow is started.. * * @return boolean */ public boolean isUpdate() { return isAppend() || sinkMode == SinkMode.UPDATE; } /** * Method isSink returns true if this Tap instance can be used as a sink. * * @return boolean */ public boolean isSink() { return getScheme().isSink(); } /** * Method isSource returns true if this Tap instance can be used as a source. * * @return boolean */ public boolean isSource() { return getScheme().isSource(); } @Override public boolean isEquivalentTo( FlowElement element ) { if( element == null ) return false; if( this == element ) return true; boolean compare = getClass() == element.getClass(); if( !compare ) return false; return equals( element ); } @Override public boolean equals( Object object ) { if( this == object ) return true; if( object == null || getClass() != object.getClass() ) return false; Tap tap = (Tap) object; if( getScheme() != null ? !getScheme().equals( tap.getScheme() ) : tap.getScheme() != null ) return false; return true; } @Override public int hashCode() { return getScheme() != null ? getScheme().hashCode() : 0; } }