/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.scheme;

import java.io.IOException;
import java.io.Serializable;

import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.util.Util;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;

/**
 * A Scheme defines what is stored in a {@link Tap} instance by declaring the {@link Tuple}
 * field names, and alternately parsing or rendering the incoming or outgoing {@link Tuple}
 * stream, respectively.
 * <p/>
 * A Scheme defines the type of resource data will be sourced from or sinked to.
 * <p/>
 * The given sourceFields only label the values in the {@link Tuple}s as they are sourced.
 * They do not necessarily filter the output since a given implementation may choose to
 * collapse values and ignore keys depending on the format.
 * <p/>
 * Setting the {@code numSinkParts} value to 1 (one) ensures the output resource has only one part.
 * In the case of MapReduce, it does this by setting the number of reducers to the given value.
 * This may affect performance, so use it with caution.
 * <p/>
 * Note that setting numSinkParts does not force the planner to insert a final Reduce operation in the job, so
 * numSinkParts may be ignored entirely if the final job is Map only. To force the Flow to have a final Reduce,
 * add a {@link cascading.pipe.GroupBy} to the assembly before sinking.
 */
public abstract class Scheme implements Serializable
  {
  /** Field sinkFields */
  Fields sinkFields = Fields.ALL;
  /** Field sourceFields */
  Fields sourceFields = Fields.UNKNOWN;
  /** Field numSinkParts */
  int numSinkParts;
  /** Field trace */
  private String trace = Util.captureDebugTrace( getClass() );
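
  /*
   * Illustrative sketch only (not part of the original source): as the class comment notes,
   * numSinkParts is only honored when the job ends in a Reduce. Assuming the bundled TextLine
   * scheme, an Hfs sink, and a hypothetical "copy" assembly with a source tap named sourceTap,
   * forcing a single part file might look like:
   *
   *   Scheme scheme = new TextLine( new Fields( "line" ) );
   *   scheme.setNumSinkParts( 1 );                       // one reducer, one part file
   *
   *   Pipe pipe = new Pipe( "copy" );
   *   pipe = new GroupBy( pipe, new Fields( "line" ) );  // forces a final Reduce so numSinkParts applies
   *
   *   Flow flow = new FlowConnector().connect( sourceTap, new Hfs( scheme, "output/path" ), pipe );
   */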

  /** Constructor Scheme creates a new Scheme instance. */
  protected Scheme()
    {
    }

  /**
   * Constructor Scheme creates a new Scheme instance.
   *
   * @param sourceFields of type Fields
   */
  protected Scheme( Fields sourceFields )
    {
    setSourceFields( sourceFields );
    }

  /**
   * Constructor Scheme creates a new Scheme instance.
   *
   * @param sourceFields of type Fields
   * @param numSinkParts of type int
   */
  protected Scheme( Fields sourceFields, int numSinkParts )
    {
    setSourceFields( sourceFields );
    this.numSinkParts = numSinkParts;
    }

  /**
   * Constructor Scheme creates a new Scheme instance.
   *
   * @param sourceFields of type Fields
   * @param sinkFields   of type Fields
   */
  protected Scheme( Fields sourceFields, Fields sinkFields )
    {
    setSourceFields( sourceFields );
    setSinkFields( sinkFields );
    }

  /**
   * Constructor Scheme creates a new Scheme instance.
   *
   * @param sourceFields of type Fields
   * @param sinkFields   of type Fields
   * @param numSinkParts of type int
   */
  protected Scheme( Fields sourceFields, Fields sinkFields, int numSinkParts )
    {
    setSourceFields( sourceFields );
    setSinkFields( sinkFields );
    this.numSinkParts = numSinkParts;
    }

  /**
   * Method getSinkFields returns the sinkFields of this Scheme object.
   *
   * @return the sinkFields (type Fields) of this Scheme object.
   */
  public Fields getSinkFields()
    {
    return sinkFields;
    }

  /**
   * Method setSinkFields sets the sinkFields of this Scheme object.
   *
   * @param sinkFields the sinkFields of this Scheme object.
   */
  public void setSinkFields( Fields sinkFields )
    {
    if( sinkFields.isUnknown() )
      this.sinkFields = Fields.ALL;
    else
      this.sinkFields = sinkFields;
    }

  /**
   * Method getSourceFields returns the sourceFields of this Scheme object.
   *
   * @return the sourceFields (type Fields) of this Scheme object.
   */
  public Fields getSourceFields()
    {
    return sourceFields;
    }

  /**
   * Method setSourceFields sets the sourceFields of this Scheme object.
   *
   * @param sourceFields the sourceFields of this Scheme object.
   */
  public void setSourceFields( Fields sourceFields )
    {
    if( sourceFields.isAll() )
      this.sourceFields = Fields.UNKNOWN;
    else
      this.sourceFields = sourceFields;
    }

  /**
   * Method getNumSinkParts returns the numSinkParts of this Scheme object.
   *
   * @return the numSinkParts (type int) of this Scheme object.
   */
  public int getNumSinkParts()
    {
    return numSinkParts;
    }

  /**
   * Method setNumSinkParts sets the numSinkParts of this Scheme object.
   *
   * @param numSinkParts the numSinkParts of this Scheme object.
   */
  public void setNumSinkParts( int numSinkParts )
    {
    this.numSinkParts = numSinkParts;
    }

  /**
   * Method getTrace returns a String that pinpoints where this instance was created, for debugging.
   *
   * @return String
   */
  public String getTrace()
    {
    return trace;
    }

  /**
   * Method isWriteDirect returns true if the parent {@link Tap} instance's {@link cascading.tuple.TupleEntryCollector} should be used to sink values.
   *
   * @return the writeDirect (type boolean) of this Scheme object.
   */
  public boolean isWriteDirect()
    {
    return false;
    }

  /**
   * Method isSymmetrical returns {@code true} if the sink fields equal the source fields. That is, this
   * scheme sources the same fields as it sinks.
   *
   * @return the symmetrical (type boolean) of this Scheme object.
   */
  public boolean isSymmetrical()
    {
    return getSinkFields().equals( getSourceFields() );
    }

  /**
   * Method isSource returns true if this Scheme instance can be used as a source.
   *
   * @return boolean
   */
  public boolean isSource()
    {
    return true;
    }

  /**
   * Method isSink returns true if this Scheme instance can be used as a sink.
   *
   * @return boolean
   */
  public boolean isSink()
    {
    return true;
    }

  /**
   * Method sourceInit initializes this instance as a source.
   *
   * @param tap  of type Tap
   * @param conf of type JobConf
   * @throws IOException on initialization failure
   */
  public abstract void sourceInit( Tap tap, JobConf conf ) throws IOException;

  /**
   * Method sinkInit initializes this instance as a sink.
   *
   * @param tap  of type Tap
   * @param conf of type JobConf
   * @throws IOException on initialization failure
   */
  public abstract void sinkInit( Tap tap, JobConf conf ) throws IOException;

  /**
   * Method source takes the given Hadoop key and value and returns a new {@link Tuple} instance.
   *
   * @param key   of type WritableComparable
   * @param value of type Writable
   * @return Tuple
   */
  public abstract Tuple source( Object key, Object value );
  /**
   * Method sink writes out the given {@link Tuple} instance to the outputCollector.
   *
   * @param tupleEntry      of type TupleEntry
   * @param outputCollector of type OutputCollector
   * @throws IOException when the tuple cannot be written to the output
   */
  public abstract void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException;

  @Override
  public boolean equals( Object object )
    {
    if( this == object )
      return true;
    if( object == null || getClass() != object.getClass() )
      return false;

    Scheme scheme = (Scheme) object;

    if( numSinkParts != scheme.numSinkParts )
      return false;
    if( sinkFields != null ? !sinkFields.equals( scheme.sinkFields ) : scheme.sinkFields != null )
      return false;
    if( sourceFields != null ? !sourceFields.equals( scheme.sourceFields ) : scheme.sourceFields != null )
      return false;

    return true;
    }

  @Override
  public String toString()
    {
    if( getSinkFields().equals( getSourceFields() ) )
      return getClass().getSimpleName() + "[" + getSourceFields().print() + "]";
    else
      return getClass().getSimpleName() + "[" + getSourceFields().print() + "->" + getSinkFields().print() + "]";
    }

  @Override
  public int hashCode()
    {
    int result;
    result = ( sinkFields != null ? sinkFields.hashCode() : 0 );
    result = 31 * result + ( sourceFields != null ? sourceFields.hashCode() : 0 );
    result = 31 * result + numSinkParts;
    return result;
    }
  }
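
/*
 * A minimal sketch, for illustration only, of what a concrete line-oriented subclass of the
 * abstract methods above might look like. The class name SimpleLineScheme and the single
 * "line" field are hypothetical; in practice the bundled cascading.scheme.TextLine scheme
 * covers this case.
 *
 *   public class SimpleLineScheme extends Scheme
 *     {
 *     public SimpleLineScheme()
 *       {
 *       super( new Fields( "line" ), new Fields( "line" ) ); // symmetrical: sources what it sinks
 *       }
 *
 *     public void sourceInit( Tap tap, JobConf conf ) throws IOException
 *       {
 *       conf.setInputFormat( org.apache.hadoop.mapred.TextInputFormat.class );
 *       }
 *
 *     public void sinkInit( Tap tap, JobConf conf ) throws IOException
 *       {
 *       conf.setOutputKeyClass( org.apache.hadoop.io.Text.class );
 *       conf.setOutputValueClass( org.apache.hadoop.io.Text.class );
 *       conf.setOutputFormat( org.apache.hadoop.mapred.TextOutputFormat.class );
 *       }
 *
 *     public Tuple source( Object key, Object value )
 *       {
 *       // the Hadoop key (the byte offset) is dropped; only the line text becomes a field
 *       return new Tuple( value.toString() );
 *       }
 *
 *     public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException
 *       {
 *       // a null key lets the collector write only the selected sink fields
 *       outputCollector.collect( null, tupleEntry.selectTuple( getSinkFields() ) );
 *       }
 *     }
 */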