/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.pipe.cogroup;
import java.util.Iterator;
import cascading.flow.FlowProcess;
import cascading.flow.hadoop.HadoopFlowProcess;
import cascading.tuple.Fields;
import cascading.tuple.IndexTuple;
import cascading.tuple.SpillableTupleList;
import cascading.tuple.Tuple;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
/**
 * Class CoGroupClosure is used internally to represent co-grouping results of multiple tuple streams.
 * <p/>
 * The {@code cascading.cogroup.spill.codecs} property accepts a comma separated list of CompressionCodec
 * class names to try, in order of preference, when compressing spilled tuples. For example:
 * "org.apache.hadoop.io.compress.LzoCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec"
 */
public class CoGroupClosure extends GroupClosure
  {
  /** Property name: max number of tuples held in memory per co-group before spilling to disk. */
  public static final String SPILL_THRESHOLD = "cascading.cogroup.spill.threshold";
  /** Default in-memory tuple count before spilling (10,000). */
  private static final int defaultThreshold = 10 * 1000;
  /** Property name: whether spilled tuples should be compressed on disk (defaults to true). */
  public static final String SPILL_COMPRESS = "cascading.cogroup.spill.compress";
  /** Property name: comma separated CompressionCodec class names to try, in order of preference. */
  public static final String SPILL_CODECS = "cascading.cogroup.spill.codecs";
  /** Fallback codec list used when the {@link #SPILL_CODECS} property is unset or empty. */
  private static final String defaultCodecs = "org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec";

  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( CoGroupClosure.class );

  /** Field groups - one spillable tuple list per co-grouped stream; self-join positions alias position 0 */
  SpillableTupleList[] groups;
  /** Number of additional self-joins against the lhs stream; 0 for a plain co-group. */
  private int numSelfJoins;
  /** Codec used to compress spilled tuples, or null when compression is disabled/unavailable. */
  private CompressionCodec codec;
  /** In-memory tuple count threshold before a group spills to disk. */
  private long threshold;
  /** JobConf pulled from the (Hadoop) FlowProcess, handed to each SpillableTupleList. */
  private JobConf conf;

  /**
   * Constructor CoGroupClosure creates a new instance configured from the given FlowProcess properties.
   *
   * @param flowProcess    of type FlowProcess, must be a HadoopFlowProcess so a JobConf is available
   * @param numSelfJoins   of type int, number of self-joins, 0 for a normal co-group
   * @param groupingFields of type Fields[]
   * @param valueFields    of type Fields[]
   */
  public CoGroupClosure( FlowProcess flowProcess, int numSelfJoins, Fields[] groupingFields, Fields[] valueFields )
    {
    super( flowProcess, groupingFields, valueFields );
    this.numSelfJoins = numSelfJoins;
    this.codec = getCompressionCodec( flowProcess );
    this.threshold = getLong( flowProcess, SPILL_THRESHOLD, defaultThreshold );
    this.conf = ( (HadoopFlowProcess) flowProcess ).getJobConf();

    initLists( flowProcess );
    }

  @Override
  public int size()
    {
    return groups.length;
    }

  @Override
  public Iterator<Tuple> getIterator( int pos )
    {
    if( pos < 0 || pos >= groups.length )
      throw new IllegalArgumentException( "invalid group position: " + pos );

    return makeIterator( pos, groups[ pos ].iterator() );
    }

  /**
   * Method isEmpty returns true if the co-group stream at the given position has no tuples.
   *
   * @param pos of type int
   * @return boolean
   */
  public boolean isEmpty( int pos )
    {
    // validate pos the same way getIterator does, rather than letting the array access
    // throw a raw ArrayIndexOutOfBoundsException
    if( pos < 0 || pos >= groups.length )
      throw new IllegalArgumentException( "invalid group position: " + pos );

    return groups[ pos ].isEmpty();
    }

  @Override
  public void reset( Joiner joiner, Tuple grouping, Iterator values )
    {
    super.reset( joiner, grouping, values );

    build();
    }

  /**
   * Method build drains the incoming values iterator into the per-position group lists,
   * spilling to disk when a list exceeds the configured threshold.
   */
  private void build()
    {
    clearGroups();

    while( values.hasNext() )
      {
      IndexTuple current = (IndexTuple) values.next();
      int pos = current.getIndex();

      // if this is the first (lhs) co-group, just use values iterator
      if( numSelfJoins == 0 && pos == 0 )
        {
        groups[ pos ].setIterator( current, values );
        break;
        }

      boolean spilled = groups[ pos ].add( (Tuple) current.getTuple() ); // get the value tuple for this cogroup

      // log memory stats on the first spill and every 10th spill file thereafter
      if( spilled && ( groups[ pos ].getNumFiles() - 1 ) % 10 == 0 )
        {
        LOG.info( "spilled group: " + groupingFields[ pos ].printVerbose() + ", on grouping: " + getGrouping().print() );

        Runtime runtime = Runtime.getRuntime();
        long freeMem = runtime.freeMemory() / 1024 / 1024;
        long maxMem = runtime.maxMemory() / 1024 / 1024;
        long totalMem = runtime.totalMemory() / 1024 / 1024;

        LOG.info( "mem on spill (mb), free: " + freeMem + ", total: " + totalMem + ", max: " + maxMem );
        }
      }
    }

  /** Method clearGroups empties every group list so the closure can be reused for the next grouping. */
  private void clearGroups()
    {
    for( SpillableTupleList group : groups )
      group.clear();
    }

  /**
   * Method initLists allocates one SpillableTupleList per co-grouped pipe. For self-joins, all
   * positions beyond 0 alias the first list since every side of the join shares the same tuples.
   *
   * @param flowProcess of type FlowProcess
   */
  private void initLists( FlowProcess flowProcess )
    {
    int numPipes = groupingFields.length;

    groups = new SpillableTupleList[ Math.max( numPipes, numSelfJoins + 1 ) ];

    for( int i = 0; i < numPipes; i++ ) // use numPipes not numSelfJoins, see below
      groups[ i ] = new SpillableTupleList( threshold, conf, codec, flowProcess );

    for( int i = 1; i < numSelfJoins + 1; i++ )
      groups[ i ] = groups[ 0 ];
    }

  /**
   * Method getLong reads a long valued property, falling back to the given default when unset or empty.
   *
   * @param flowProcess  of type FlowProcess
   * @param key          of type String
   * @param defaultValue of type long
   * @return long
   */
  private long getLong( FlowProcess flowProcess, String key, long defaultValue )
    {
    String value = (String) flowProcess.getProperty( key );

    if( value == null || value.length() == 0 )
      return defaultValue;

    return Long.parseLong( value );
    }

  /**
   * Method getCompressionCodec resolves the first loadable CompressionCodec named by the
   * {@link #SPILL_CODECS} property (or the default list), unless compression has been
   * explicitly disabled via {@link #SPILL_COMPRESS}.
   *
   * @param flowProcess of type FlowProcess, must be a HadoopFlowProcess so a JobConf is available
   * @return CompressionCodec, or null when compression is disabled or no codec could be loaded
   */
  public CompressionCodec getCompressionCodec( FlowProcess flowProcess )
    {
    String compress = (String) flowProcess.getProperty( SPILL_COMPRESS );

    if( compress != null && !Boolean.parseBoolean( compress ) )
      return null;

    String codecs = (String) flowProcess.getProperty( SPILL_CODECS );

    if( codecs == null || codecs.length() == 0 )
      codecs = defaultCodecs;

    Class<? extends CompressionCodec> codecClass = null;

    for( String codec : codecs.split( "[,\\s]+" ) )
      {
      try
        {
        LOG.info( "attempting to load codec: " + codec );
        // loadClass either returns a non-null Class or throws, so success here means we are done
        codecClass = Thread.currentThread().getContextClassLoader().loadClass( codec ).asSubclass( CompressionCodec.class );

        LOG.info( "found codec: " + codec );
        break;
        }
      catch( ClassNotFoundException exception )
        {
        // expected when a preferred codec (e.g. Lzo) is not on the classpath; try the next one
        }
      }

    if( codecClass == null )
      {
      LOG.warn( "codecs set, but unable to load any: " + codecs );
      return null;
      }

    return ReflectionUtils.newInstance( codecClass, ( (HadoopFlowProcess) flowProcess ).getJobConf() );
    }
  }