/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.tuple;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.Flushable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import cascading.flow.FlowProcess;
import cascading.tuple.hadoop.TupleSerialization;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
/**
* SpillableTupleList is a simple {@link Iterable} object that can store an unlimited number of {@link Tuple} instances by spilling
* the excess to temporary files on disk.
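* <p/>
* A usage sketch (the threshold value, tuple source, and consumer below are illustrative only):
* <pre>
* SpillableTupleList list = new SpillableTupleList( 10000 );
*
* for( Tuple tuple : incomingTuples ) // hypothetical source of Tuple instances
*   list.add( tuple );                // spills to disk once the threshold is reached
*
* for( Tuple tuple : list )           // iterates in-memory and spilled tuples transparently
*   process( tuple );                 // hypothetical consumer
* </pre>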
*/
public class SpillableTupleList implements Iterable<Tuple>
{
/** Field LOG */
private static final Logger LOG = Logger.getLogger( SpillableTupleList.class );
enum Spill
{
Num_Spills_Written, Num_Spills_Read
}
/** Field threshold */
private long threshold = 10000;
/** Field codec */
private CompressionCodec codec = null;
/** Field flowProcess */
private FlowProcess flowProcess;
/** Field files */
private List<File> files = new LinkedList<File>();
/** Field current */
private List<Tuple> current = new LinkedList<Tuple>();
/** Field overrideIterator */
private Iterator<Tuple> overrideIterator;
/** Field size */
private long size = 0;
/** Field fields */
private Fields fields;
/** Field serializationElementWriter */
private TupleSerialization tupleSerialization;
/** Constructor SpillableTupleList creates a new SpillableTupleList instance. */
public SpillableTupleList()
{
}
/**
* Constructor SpillableTupleList creates a new SpillableTupleList instance using the given threshold value.
*
* @param threshold of type long
*/
public SpillableTupleList( long threshold )
{
this.threshold = threshold;
}
/**
* Constructor SpillableTupleList creates a new SpillableTupleList instance using the given threshold value, and
* the given compression codec, if any.
*
* @param threshold of type long
* @param conf of type JobConf
* @param codec of type CompressionCodec
*/
public SpillableTupleList( long threshold, JobConf conf, CompressionCodec codec )
{
this( threshold, conf, codec, null );
}
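/**
* Constructor SpillableTupleList creates a new SpillableTupleList instance using the given threshold value,
* the given compression codec, and a {@link FlowProcess} used to report spill counters.
*
* @param threshold   of type long
* @param conf        of type JobConf
* @param codec       of type CompressionCodec
* @param flowProcess of type FlowProcess
*/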
public SpillableTupleList( long threshold, JobConf conf, CompressionCodec codec, FlowProcess flowProcess )
{
this.threshold = threshold;
this.codec = codec;
this.flowProcess = flowProcess;
if( conf != null )
tupleSerialization = new TupleSerialization( conf );
}
/**
* Method add will add the given {@link Tuple} to this list.
*
* @param tuple of type Tuple
* @return boolean, true if this add caused the list to spill to disk
*/
public boolean add( Tuple tuple )
{
current.add( tuple );
size++;
return doSpill();
}
/**
* Method add will add the given {@link TupleEntry} to this list. All TupleEntry instances added must declare the same {@link Fields}.
*
* @param tupleEntry of type TupleEntry
* @return boolean, true if this add caused the list to spill to disk
*/
public boolean add( TupleEntry tupleEntry )
{
if( fields == null )
fields = tupleEntry.fields;
else if( !fields.equals( tupleEntry.fields ) )
throw new IllegalArgumentException( "all entries must have same fields, have: " + fields.print() + " got: " + tupleEntry.fields.print() );
return add( tupleEntry.getTuple() );
}
/**
* Method size returns the size of this list.
*
* @return long
*/
public long size()
{
return size;
}
/**
* Method isEmpty returns true if this list is empty.
*
* @return the empty (type boolean) of this SpillableTupleList object.
*/
public boolean isEmpty()
{
return overrideIterator == null && files.isEmpty() && current.isEmpty();
}
/**
* Method getNumFiles returns the number of files this list has spilled to.
*
* @return the numFiles (type int) of this SpillableTupleList object.
*/
public int getNumFiles()
{
return files.size();
}
private final boolean doSpill()
{
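// spill to disk only once the in-memory list has grown to exactly the threshold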
if( current.size() != threshold )
return false;
LOG.info( "spilling tuple list to file number " + ( getNumFiles() + 1 ) );
if( flowProcess != null )
flowProcess.increment( Spill.Num_Spills_Written, 1 );
File file = createTempFile();
TupleOutputStream dataOutputStream = createTupleOutputStream( file );
try
{
writeList( dataOutputStream, current );
}
finally
{
flushSilent( dataOutputStream );
closeSilent( dataOutputStream );
}
files.add( file );
current.clear();
return true;
}
private void flushSilent( Flushable flushable )
{
try
{
flushable.flush();
}
catch( IOException exception )
{
// ignore
}
}
private void closeSilent( Closeable closeable )
{
try
{
closeable.close();
}
catch( IOException exception )
{
// ignore
}
}
private void writeList( TupleOutputStream dataOutputStream, List<Tuple> list )
{
try
{
dataOutputStream.writeLong( list.size() );
for( Tuple tuple : list )
dataOutputStream.writeTuple( tuple );
}
catch( IOException exception )
{
throw new TupleException( "unable to write to file output stream", exception );
}
}
private TupleOutputStream createTupleOutputStream( File file )
{
OutputStream outputStream;
try
{
if( codec == null )
outputStream = new FileOutputStream( file );
else
outputStream = codec.createOutputStream( new FileOutputStream( file ) );
if( tupleSerialization == null )
return new TupleOutputStream( outputStream );
else
return new TupleOutputStream( outputStream, tupleSerialization.getElementWriter() );
}
catch( IOException exception )
{
throw new TupleException( "unable to create temporary file input stream", exception );
}
}
private List<Tuple> readList( TupleInputStream tupleInputStream )
{
try
{
long size = tupleInputStream.readLong();
List<Tuple> list = new LinkedList<Tuple>();
for( int i = 0; i < size; i++ )
list.add( tupleInputStream.readTuple() );
return list;
}
catch( IOException exception )
{
throw new TupleException( "unable to read from file output stream", exception );
}
}
private TupleInputStream createTupleInputStream( File file )
{
try
{
InputStream inputStream;
if( codec == null )
inputStream = new FileInputStream( file );
else
inputStream = codec.createInputStream( new FileInputStream( file ) );
if( tupleSerialization == null )
return new TupleInputStream( inputStream );
else
return new TupleInputStream( inputStream, tupleSerialization.getElementReader() );
}
catch( IOException exception )
{
throw new TupleException( "unable to create temporary file output stream", exception );
}
}
private File createTempFile()
{
try
{
File file = File.createTempFile( "cascading-spillover", null );
file.deleteOnExit();
return file;
}
catch( IOException exception )
{
throw new TupleException( "unable to create temporary file", exception );
}
}
/** Method clear empties this container so it may be re-used. */
public void clear()
{
overrideIterator = null;
files.clear();
current.clear();
size = 0;
}
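/**
* Method setIterator installs an override iterator built from the given initial {@link IndexTuple} value and
* the remaining values Iterator. While set, {@link #iterator()} returns this override iterator instead of the
* in-memory or spilled tuples.
*
* @param current of type IndexTuple
* @param values  of type Iterator
*/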
public void setIterator( final IndexTuple current, final Iterator values )
{
overrideIterator = new Iterator<Tuple>()
{
IndexTuple value = current;
@Override
public boolean hasNext()
{
return value != null;
}
@Override
public Tuple next()
{
Tuple result = value.getTuple();
if( values.hasNext() )
value = (IndexTuple) values.next();
else
value = null;
return result;
}
@Override
public void remove()
{
// unsupported; calls are silently ignored
}
};
}
/**
* Method iterator returns a Tuple Iterator of all the values in this collection.
*
* @return Iterator<Tuple>
*/
public Iterator<Tuple> iterator()
{
if( overrideIterator != null )
return overrideIterator;
if( files.isEmpty() )
return current.iterator();
return new SpilledListIterator();
}
/**
* Method entryIterator returns a TupleEntry Iterator of all the values in this collection.
*
* @return Iterator<TupleEntry>
*/
public Iterator<TupleEntry> entryIterator()
{
return new TupleEntryIterator( fields, iterator() );
}
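/** SpilledListIterator iterates over the tuples in each spilled file, in order, and finally the in-memory list. */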
private class SpilledListIterator implements Iterator<Tuple>
{
int fileIndex = 0;
List<Tuple> currentList;
private Iterator<Tuple> iterator;
private SpilledListIterator()
{
getNextList();
}
private void getNextList()
{
if( fileIndex < files.size() )
currentList = getListFor( files.get( fileIndex++ ) );
else
currentList = current;
iterator = currentList.iterator();
}
private List<Tuple> getListFor( File file )
{
if( flowProcess != null )
flowProcess.increment( Spill.Num_Spills_Read, 1 );
TupleInputStream dataInputStream = createTupleInputStream( file );
try
{
return readList( dataInputStream );
}
finally
{
closeSilent( dataInputStream );
}
}
public boolean hasNext()
{
if( currentList == current )
return iterator.hasNext();
if( iterator.hasNext() )
return true;
getNextList();
return hasNext();
}
public Tuple next()
{
if( currentList == current || iterator.hasNext() )
return iterator.next();
getNextList();
return next();
}
public void remove()
{
throw new UnsupportedOperationException( "remove is not supported" );
}
}
}