/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.tap;
import java.beans.ConstructorProperties;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import cascading.scheme.Scheme;
import cascading.tap.hadoop.TapCollector;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleEntryCollector;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.log4j.Logger;
/**
* Class TemplateTap can be used to write tuple streams out to subdirectories based on the values in the {@link Tuple}
* instance.
* <p/>
* The constructor takes a {@link Hfs} {@link Tap} and a {@link java.util.Formatter} format syntax String. This allows
* Tuple values at given positions to be used as directory names. Note that Hadoop can only sink to directories, and
* all files in those directories are "part-xxxxx" files.
* <p/>
* {@code openTapsThreshold} limits the number of open files to be output to. This value defaults to 300 files.
* Each time the threshold is exceeded, 10% of the least recently used open files will be closed.
*/
public class TemplateTap extends SinkTap
{
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( TemplateTap.class );
  /** Field OPEN_TAPS_THRESHOLD_DEFAULT */
  private static final int OPEN_TAPS_THRESHOLD_DEFAULT = 300;

  /** Field parent */
  private Tap parent;
  /** Field pathTemplate */
  private String pathTemplate;
  /** Field keepParentOnDelete */
  private boolean keepParentOnDelete = false;
  /** Field openTapsThreshold */
  private int openTapsThreshold = OPEN_TAPS_THRESHOLD_DEFAULT;
  /**
   * Field collectors, keyed by formatted template path. The map is created in access order, so iteration
   * starts at the least recently used entry; the eviction in {@link TemplateCollector} relies on that.
   */
  private Map<String, OutputCollector> collectors = new LinkedHashMap<String, OutputCollector>( 1000, .75f, true );

  /**
   * Class TemplateCollector is the collector handed out by {@link TemplateTap#openForWrite(JobConf)}. It lazily
   * opens one child collector per formatted path and caches it in the enclosing tap's {@code collectors} map.
   */
  private class TemplateCollector extends TupleEntryCollector implements OutputCollector
  {
    JobConf conf;

    /**
     * Constructor TemplateCollector creates a new TemplateCollector instance.
     *
     * @param conf of type JobConf, used when opening child collectors
     */
    public TemplateCollector( JobConf conf )
    {
      this.conf = conf;
    }

    /**
     * Always throws; values reach this collector through {@link #collect(Object, Object)} instead.
     *
     * @param tuple of type Tuple
     * @throws UnsupportedOperationException on every call
     */
    protected void collect( Tuple tuple )
    {
      throw new UnsupportedOperationException( "collect should never be called on TemplateCollector" );
    }

    /**
     * Method getCollector returns the cached child collector for the given path, opening and caching a new
     * one on a miss.
     * <p/>
     * The size check runs before the new collector is inserted and uses a strict comparison, so the cache may
     * hold {@code openTapsThreshold + 1} entries before least recently used entries are closed.
     *
     * @param path of type String, the formatted template path relative to the parent
     * @return the OutputCollector writing to the given path
     * @throws TapException if the child collector cannot be opened
     */
    private OutputCollector getCollector( String path )
    {
      OutputCollector collector = collectors.get( path );

      if( collector != null )
        return collector;

      try
      {
        Tap tap = new Hfs( parent.getScheme(), parent.getQualifiedPath( conf ).toString() );

        if( LOG.isDebugEnabled() )
          LOG.debug( "creating collector for path: " + new Path( parent.getQualifiedPath( conf ), path ) );

        collector = (OutputCollector) new TapCollector( tap, path, conf );
      }
      catch( IOException exception )
      {
        throw new TapException( "unable to open template path: " + path, exception );
      }

      if( collectors.size() > openTapsThreshold )
        purgeCollectors();

      collectors.put( path, collector );

      if( LOG.isInfoEnabled() && collectors.size() % 100 == 0 )
        LOG.info( "caching " + collectors.size() + " open Taps" );

      return collector;
    }

    /**
     * Closes and removes 10% of {@code openTapsThreshold} (at least one) entries from the cache, taking them
     * from the front of the access-ordered map, i.e. the least recently used ones.
     */
    private void purgeCollectors()
    {
      int numToClose = Math.max( 1, (int) ( openTapsThreshold * .10 ) );

      if( LOG.isInfoEnabled() )
        LOG.info( "removing " + numToClose + " open Taps from cache of size " + collectors.size() );

      // keys are gathered first because the map cannot be modified while its key set is being iterated
      Set<String> removeKeys = new HashSet<String>();

      for( String key : collectors.keySet() )
      {
        if( numToClose-- == 0 )
          break;

        removeKeys.add( key );
      }

      for( String removeKey : removeKeys )
        closeCollector( collectors.remove( removeKey ) );
    }

    /**
     * Closes this collector and every cached child collector, then empties the cache even if closing
     * was interrupted by an unchecked error.
     */
    @Override
    public void close()
    {
      super.close();

      try
      {
        for( OutputCollector collector : collectors.values() )
          closeCollector( collector );
      }
      finally
      {
        collectors.clear();
      }
    }

    /**
     * Closes the given child collector on a best-effort basis. A failure does not propagate, so the
     * remaining collectors still get closed, but it is logged so that a failed close is not silent.
     *
     * @param collector of type OutputCollector, may be null
     */
    private void closeCollector( OutputCollector collector )
    {
      if( collector == null )
        return;

      try
      {
        ( (TupleEntryCollector) collector ).close();
      }
      catch( Exception exception )
      {
        // deliberately not rethrown; see method comment
        LOG.warn( "exception closing template collector, output may be incomplete", exception );
      }
    }

    /**
     * Formats the given value, which must be a {@link Tuple}, with the enclosing tap's pathTemplate and
     * forwards the key and value to the child collector for the resulting path.
     *
     * @param key   of type Object
     * @param value of type Object, cast to Tuple
     * @throws IOException if the child collector fails
     */
    public void collect( Object key, Object value ) throws IOException
    {
      String path = ( (Tuple) value ).format( pathTemplate );

      getCollector( path ).collect( key, value );
    }
  }

  /**
   * Class TemplateScheme wraps the parent's {@link Scheme} and delegates to it. When constructed with
   * pathFields, {@link #sink(TupleEntry, OutputCollector)} additionally resolves the child collector from
   * the selected field values before delegating.
   */
  public static class TemplateScheme extends Scheme
  {
    private final Scheme scheme;
    private final Fields pathFields;
    private final String pathTemplate;

    /**
     * Constructor TemplateScheme creates a new TemplateScheme instance with no pathFields and no pathTemplate.
     *
     * @param scheme of type Scheme, the delegate
     */
    public TemplateScheme( Scheme scheme )
    {
      this.scheme = scheme;
      this.pathFields = null;
      this.pathTemplate = null;
    }

    /**
     * Constructor TemplateScheme creates a new TemplateScheme instance.
     *
     * @param scheme       of type Scheme, the delegate
     * @param pathTemplate of type String, applied to the values selected by pathFields
     * @param pathFields   of type Fields, may be null
     */
    public TemplateScheme( Scheme scheme, String pathTemplate, Fields pathFields )
    {
      this.scheme = scheme;
      this.pathFields = pathFields;
      this.pathTemplate = pathTemplate;
    }

    /** Delegates to the wrapped scheme. */
    public Fields getSinkFields()
    {
      return scheme.getSinkFields();
    }

    /** Delegates to the wrapped scheme. */
    public void setSinkFields( Fields sinkFields )
    {
      scheme.setSinkFields( sinkFields );
    }

    /** Delegates to the wrapped scheme. */
    public Fields getSourceFields()
    {
      return scheme.getSourceFields();
    }

    /** Delegates to the wrapped scheme. */
    public void setSourceFields( Fields sourceFields )
    {
      scheme.setSourceFields( sourceFields );
    }

    /** Delegates to the wrapped scheme. */
    public int getNumSinkParts()
    {
      return scheme.getNumSinkParts();
    }

    /** Delegates to the wrapped scheme. */
    public void setNumSinkParts( int numSinkParts )
    {
      scheme.setNumSinkParts( numSinkParts );
    }

    /** Delegates to the wrapped scheme. */
    public boolean isWriteDirect()
    {
      return scheme.isWriteDirect();
    }

    /** Delegates to the wrapped scheme. */
    public void sourceInit( Tap tap, JobConf conf ) throws IOException
    {
      scheme.sourceInit( tap, conf );
    }

    /** Delegates to the wrapped scheme. */
    public void sinkInit( Tap tap, JobConf conf ) throws IOException
    {
      scheme.sinkInit( tap, conf );
    }

    /** Delegates to the wrapped scheme. */
    public Tuple source( Object key, Object value )
    {
      return scheme.source( key, value );
    }

    /**
     * Sinks the given entry through the wrapped scheme. If pathFields were given, the values they select are
     * formatted with pathTemplate and the matching child collector replaces the given collector, which must
     * then be the collector returned by {@link TemplateTap#openForWrite(JobConf)}. Otherwise the given
     * collector is passed on unchanged.
     *
     * @param tupleEntry      of type TupleEntry
     * @param outputCollector of type OutputCollector
     * @throws IOException if the wrapped scheme fails
     */
    public void sink( TupleEntry tupleEntry, OutputCollector outputCollector ) throws IOException
    {
      if( pathFields != null )
      {
        Tuple values = tupleEntry.selectTuple( pathFields );

        outputCollector = ( (TemplateCollector) outputCollector ).getCollector( values.format( pathTemplate ) );
      }

      scheme.sink( tupleEntry, outputCollector );
    }
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   */
  @ConstructorProperties({"parent", "pathTemplate"})
  public TemplateTap( Hfs parent, String pathTemplate )
  {
    this( parent, pathTemplate, OPEN_TAPS_THRESHOLD_DEFAULT );
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * <p/>
   * openTapsThreshold limits the number of open files to be output to.
   *
   * @param parent            of type Hfs
   * @param pathTemplate      of type String
   * @param openTapsThreshold of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, int openTapsThreshold )
  {
    super( new TemplateScheme( parent.getScheme() ) );
    this.parent = parent;
    this.pathTemplate = pathTemplate;
    this.openTapsThreshold = openTapsThreshold;
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   * @param sinkMode     of type SinkMode
   */
  @ConstructorProperties({"parent", "pathTemplate", "sinkMode"})
  public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode )
  {
    // same defaults the field initializers provide: parent is deleted, default threshold
    this( parent, pathTemplate, sinkMode, false );
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * <p/>
   * keepParentOnDelete, when set to true, prevents the parent Tap from being deleted when {@link #deletePath(org.apache.hadoop.mapred.JobConf)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   */
  @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete"})
  public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete )
  {
    this( parent, pathTemplate, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * <p/>
   * keepParentOnDelete, when set to true, prevents the parent Tap from being deleted when {@link #deletePath(org.apache.hadoop.mapred.JobConf)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   * <p/>
   * openTapsThreshold limits the number of open files to be output to.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   * @param openTapsThreshold  of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "sinkMode", "keepParentOnDelete", "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, SinkMode sinkMode, boolean keepParentOnDelete, int openTapsThreshold )
  {
    super( new TemplateScheme( parent.getScheme() ), sinkMode );
    this.parent = parent;
    this.pathTemplate = pathTemplate;
    this.keepParentOnDelete = keepParentOnDelete;
    this.openTapsThreshold = openTapsThreshold;
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   * @param pathFields   of type Fields
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields )
  {
    this( parent, pathTemplate, pathFields, OPEN_TAPS_THRESHOLD_DEFAULT );
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   * <p/>
   * openTapsThreshold limits the number of open files to be output to.
   *
   * @param parent            of type Hfs
   * @param pathTemplate      of type String
   * @param pathFields        of type Fields
   * @param openTapsThreshold of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, int openTapsThreshold )
  {
    super( new TemplateScheme( parent.getScheme(), pathTemplate, pathFields ) );
    this.parent = parent;
    this.pathTemplate = pathTemplate;
    this.openTapsThreshold = openTapsThreshold;
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   *
   * @param parent       of type Hfs
   * @param pathTemplate of type String
   * @param pathFields   of type Fields
   * @param sinkMode     of type SinkMode
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode )
  {
    // same defaults the field initializers provide: parent is deleted, default threshold
    this( parent, pathTemplate, pathFields, sinkMode, false );
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   * <p/>
   * keepParentOnDelete, when set to true, prevents the parent Tap from being deleted when {@link #deletePath(org.apache.hadoop.mapred.JobConf)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param pathFields         of type Fields
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode, boolean keepParentOnDelete )
  {
    this( parent, pathTemplate, pathFields, sinkMode, keepParentOnDelete, OPEN_TAPS_THRESHOLD_DEFAULT );
  }

  /**
   * Constructor TemplateTap creates a new TemplateTap instance using the given parent {@link Hfs} Tap as the
   * base path and default {@link cascading.scheme.Scheme}, and the pathTemplate as the {@link java.util.Formatter} format String.
   * The pathFields is a selector that selects and orders the fields to be used in the given pathTemplate.
   * <p/>
   * This constructor also allows the sinkFields of the parent Tap to be independent of the pathFields. Thus allowing
   * data not in the result file to be used in the template path name.
   * <p/>
   * keepParentOnDelete, when set to true, prevents the parent Tap from being deleted when {@link #deletePath(org.apache.hadoop.mapred.JobConf)}
   * is called, typically an issue when used inside a {@link cascading.cascade.Cascade}.
   * <p/>
   * openTapsThreshold limits the number of open files to be output to.
   *
   * @param parent             of type Hfs
   * @param pathTemplate       of type String
   * @param pathFields         of type Fields
   * @param sinkMode           of type SinkMode
   * @param keepParentOnDelete of type boolean
   * @param openTapsThreshold  of type int
   */
  @ConstructorProperties({"parent", "pathTemplate", "pathFields", "sinkMode", "keepParentOnDelete",
                          "openTapsThreshold"})
  public TemplateTap( Hfs parent, String pathTemplate, Fields pathFields, SinkMode sinkMode, boolean keepParentOnDelete, int openTapsThreshold )
  {
    super( new TemplateScheme( parent.getScheme(), pathTemplate, pathFields ), sinkMode );
    this.parent = parent;
    this.pathTemplate = pathTemplate;
    this.keepParentOnDelete = keepParentOnDelete;
    this.openTapsThreshold = openTapsThreshold;
  }

  /**
   * Method getParent returns the parent Tap of this TemplateTap object.
   *
   * @return the parent (type Tap) of this TemplateTap object.
   */
  public Tap getParent()
  {
    return parent;
  }

  /**
   * Method getPathTemplate returns the pathTemplate {@link java.util.Formatter} format String of this TemplateTap object.
   *
   * @return the pathTemplate (type String) of this TemplateTap object.
   */
  public String getPathTemplate()
  {
    return pathTemplate;
  }

  /**
   * Method isWriteDirect always reports true for a TemplateTap.
   *
   * @return true
   */
  @Override
  public boolean isWriteDirect()
  {
    return true;
  }

  /**
   * Returns the path of the parent Tap.
   *
   * @see Tap#getPath()
   */
  public Path getPath()
  {
    return parent.getPath();
  }

  /**
   * Method getOpenTapsThreshold returns the openTapsThreshold of this TemplateTap object.
   *
   * @return the openTapsThreshold (type int) of this TemplateTap object.
   */
  public int getOpenTapsThreshold()
  {
    return openTapsThreshold;
  }

  /**
   * Method openForWrite returns a new collector that routes each value to a child collector chosen by
   * formatting the pathTemplate.
   *
   * @param conf of type JobConf
   * @return a new TupleEntryCollector bound to the given conf
   * @throws IOException declared by the overridden method
   */
  @Override
  public TupleEntryCollector openForWrite( JobConf conf ) throws IOException
  {
    return new TemplateCollector( conf );
  }

  /**
   * Delegates to the parent Tap.
   *
   * @see Tap#makeDirs(JobConf)
   */
  public boolean makeDirs( JobConf conf ) throws IOException
  {
    return parent.makeDirs( conf );
  }

  /**
   * Deletes the parent path unless keepParentOnDelete is set, in which case nothing is deleted and true
   * is returned.
   *
   * @see Tap#deletePath(JobConf)
   */
  public boolean deletePath( JobConf conf ) throws IOException
  {
    return keepParentOnDelete || parent.deletePath( conf );
  }

  /**
   * Delegates to the parent Tap.
   *
   * @see Tap#pathExists(JobConf)
   */
  public boolean pathExists( JobConf conf ) throws IOException
  {
    return parent.pathExists( conf );
  }

  /**
   * Delegates to the parent Tap.
   *
   * @see Tap#getPathModified(JobConf)
   */
  public long getPathModified( JobConf conf ) throws IOException
  {
    return parent.getPathModified( conf );
  }

  /**
   * Two TemplateTap instances are equal when they are of the same class, the superclass considers them
   * equal, and both their parent and pathTemplate are equal.
   */
  @Override
  public boolean equals( Object object )
  {
    if( this == object )
      return true;

    if( object == null || getClass() != object.getClass() )
      return false;

    if( !super.equals( object ) )
      return false;

    TemplateTap that = (TemplateTap) object;

    if( parent != null ? !parent.equals( that.parent ) : that.parent != null )
      return false;

    if( pathTemplate != null ? !pathTemplate.equals( that.pathTemplate ) : that.pathTemplate != null )
      return false;

    return true;
  }

  /** Combines the superclass hash with the hashes of parent and pathTemplate, the fields used by equals. */
  @Override
  public int hashCode()
  {
    int result = super.hashCode();

    result = 31 * result + ( parent != null ? parent.hashCode() : 0 );
    result = 31 * result + ( pathTemplate != null ? pathTemplate.hashCode() : 0 );

    return result;
  }

  /** Returns the simple class name followed by the quoted parent and the quoted pathTemplate. */
  @Override
  public String toString()
  {
    return getClass().getSimpleName() + "[\"" + parent + "\"]" + "[\"" + pathTemplate + "\"]";
  }
}