/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.scheme;
import java.io.IOException;
import java.io.Serializable;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
/**
* A Scheme defines what is stored in a {@link Tap} instance by declaring the {@link Tuple}
* field names, and alternately parsing or rendering the incoming or outgoing {@link Tuple}
* stream, respectively.
* <p/>
* A Scheme defines the type of resource that data will be sourced from or sunk to.
* <p/>
* The given sourceFields only label the values in the {@link Tuple}s as they are sourced.
* They do not necessarily filter the output, since a given implementation may choose to
* collapse values and ignore keys, depending on the format.
* <p/>
* Setting the {@code numSinkParts} value to 1 (one) ensures the output resource has only one part.
* In the case of MapReduce, it does this by setting the number of reducers to the given value.
* This may affect performance, so use it with caution.
* <p/>
* Note that setting numSinkParts does not force the planner to insert a final Reduce operation in the job, so
* numSinkParts may be ignored entirely if the final job is Map only. To force the Flow to have a final Reduce,
* add a {@link cascading.pipe.GroupBy} to the assembly before sinking.
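* <p/>
* A minimal usage sketch (illustrative only; it assumes the {@link cascading.scheme.TextLine} Scheme and
* {@link cascading.tap.Hfs} Tap shipped with this project, and hypothetical paths):
* <pre>
* // source plain text as the fields "offset" and "line"
* Scheme sourceScheme = new TextLine( new Fields( "offset", "line" ) );
* Tap source = new Hfs( sourceScheme, "some/input/path" );
*
* // sink only the "line" field and request a single output part file
* Scheme sinkScheme = new TextLine();
* sinkScheme.setSinkFields( new Fields( "line" ) );
* sinkScheme.setNumSinkParts( 1 );
* Tap sink = new Hfs( sinkScheme, "some/output/path", true );
* </pre>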
*/
public abstract class Scheme implements Serializable
{
/** Field sinkFields */
Fields sinkFields = Fields.ALL;
/** Field sourceFields */
Fields sourceFields = Fields.UNKNOWN;
/** Field numSinkParts */
int numSinkParts;
/** Field trace */
private String trace = Util.captureDebugTrace( getClass() );
/** Constructor Scheme creates a new Scheme instance. */
protected Scheme()
{
}
/**
* Constructor Scheme creates a new Scheme instance.
*
* @param sourceFields of type Fields
*/
protected Scheme( Fields sourceFields )
{
setSourceFields( sourceFields );
}
/**
* Constructor Scheme creates a new Scheme instance.
*
* @param sourceFields of type Fields
* @param numSinkParts of type int
*/
protected Scheme( Fields sourceFields, int numSinkParts )
{
setSourceFields( sourceFields );
this.numSinkParts = numSinkParts;
}
/**
* Constructor Scheme creates a new Scheme instance.
*
* @param sourceFields of type Fields
* @param sinkFields of type Fields
*/
protected Scheme( Fields sourceFields, Fields sinkFields )
{
setSourceFields( sourceFields );
setSinkFields( sinkFields );
}
/**
* Constructor Scheme creates a new Scheme instance.
*
* @param sourceFields of type Fields
* @param sinkFields of type Fields
* @param numSinkParts of type int
*/
protected Scheme( Fields sourceFields, Fields sinkFields, int numSinkParts )
{
setSourceFields( sourceFields );
setSinkFields( sinkFields );
this.numSinkParts = numSinkParts;
}
/**
* Method getSinkFields returns the sinkFields of this Scheme object.
*
* @return the sinkFields (type Fields) of this Scheme object.
*/
public Fields getSinkFields()
{
return sinkFields;
}
/**
* Method setSinkFields sets the sinkFields of this Scheme object.
*
* @param sinkFields the sinkFields of this Scheme object.
*/
public void setSinkFields( Fields sinkFields )
{
if( sinkFields.isUnknown() )
this.sinkFields = Fields.ALL;
else
this.sinkFields = sinkFields;
}
/**
* Method getSourceFields returns the sourceFields of this Scheme object.
*
* @return the sourceFields (type Fields) of this Scheme object.
*/
public Fields getSourceFields()
{
return sourceFields;
}
/**
* Method setSourceFields sets the sourceFields of this Scheme object.
*
* @param sourceFields the sourceFields of this Scheme object.
*/
public void setSourceFields( Fields sourceFields )
{
if( sourceFields.isAll() )
this.sourceFields = Fields.UNKNOWN;
else
this.sourceFields = sourceFields;
}
/**
* Method getNumSinkParts returns the numSinkParts of this Scheme object.
*
* @return the numSinkParts (type int) of this Scheme object.
*/
public int getNumSinkParts()
{
return numSinkParts;
}
/**
* Method setNumSinkParts sets the numSinkParts of this Scheme object.
*
* @param numSinkParts the numSinkParts of this Scheme object.
*/
public void setNumSinkParts( int numSinkParts )
{
this.numSinkParts = numSinkParts;
}
/**
* Method getTrace returns a String that pinpoints where this instance was created, for debugging.
*
* @return String
*/
public String getTrace()
{
return trace;
}
/**
* Method isWriteDirect returns true if the parent {@link Tap} instance's {@link cascading.tuple.TupleEntryCollector} should be used to sink values.
*
* @return the writeDirect (type boolean) of this Scheme object.
*/
public boolean isWriteDirect()
{
return false;
}
/**
* Method isSymmetrical returns {@code true} if the sink fields equal the source fields. That is, this
* scheme sources the same fields as it sinks.
*
* @return the symmetrical (type boolean) of this Scheme object.
*/
public boolean isSymmetrical()
{
return getSinkFields().equals( getSourceFields() );
}
/**
* Method isSource returns true if this Scheme instance can be used as a source.
*
* @return boolean
*/
public boolean isSource()
{
return true;
}
/**
* Method isSink returns true if this Scheme instance can be used as a sink.
*
* @return boolean
*/
public boolean isSink()
{
return true;
}
/**
* Method sourceInit initializes this instance as a source.
*
* @param tap of type Tap
* @param conf of type JobConf
* @throws IOException on initialization failure
*/
public abstract void sourceInit( Tap tap, JobConf conf ) throws IOException;
/**
* Method sinkInit initializes this instance as a sink.
*
* @param tap of type Tap
* @param conf of type JobConf
* @throws IOException on initialization failure
*/
public abstract void sinkInit( Tap tap, JobConf conf ) throws IOException;
/**
* Method source takes the given Hadoop key and value and returns a new {@link Tuple} instance.
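* <p/>
* A subclass implementation can be quite small. For example (an illustrative sketch, not the behavior
* of any particular bundled Scheme), the key and value may simply be treated as opaque text:
* <pre>
* public Tuple source( Object key, Object value )
*   {
*   return new Tuple( key.toString(), value.toString() );
*   }
* </pre>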
*
* @param key of type WritableComparable
* @param value of type Writable
* @return Tuple
*/
public abstract Tuple source( Object key, Object value );
/**
* Method sink writes out the given {@link TupleEntry} instance to the outputCollector.
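* <p/>
* A minimal implementation could look like the following (an illustrative sketch; it assumes the
* underlying OutputFormat accepts a null key):
* <pre>
* public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException
*   {
*   // narrow the entry down to the declared sink fields, then emit the result as the value
*   Tuple result = tupleEntry.selectTuple( getSinkFields() );
*   outputCollector.collect( null, result );
*   }
* </pre>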
*
* @param tupleEntry      of type TupleEntry
* @param outputCollector of type OutputCollector
* @throws IOException when the Tuple cannot be written to the outputCollector
*/
public abstract void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException;
@Override
public boolean equals( Object object )
{
if( this == object )
return true;
if( object == null || getClass() != object.getClass() )
return false;
Scheme scheme = (Scheme) object;
if( numSinkParts != scheme.numSinkParts )
return false;
if( sinkFields != null ? !sinkFields.equals( scheme.sinkFields ) : scheme.sinkFields != null )
return false;
if( sourceFields != null ? !sourceFields.equals( scheme.sourceFields ) : scheme.sourceFields != null )
return false;
return true;
}
@Override
public String toString()
{
if( getSinkFields().equals( getSourceFields() ) )
return getClass().getSimpleName() + "[" + getSourceFields().print() + "]";
else
return getClass().getSimpleName() + "[" + getSourceFields().print() + "->" + getSinkFields().print() + "]";
}
@Override
public int hashCode()
{
int result;
result = ( sinkFields != null ? sinkFields.hashCode() : 0 );
result = 31 * result + ( sourceFields != null ? sourceFields.hashCode() : 0 );
result = 31 * result + numSinkParts;
return result;
}
}