/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.pipe.assembly;
import java.beans.ConstructorProperties;
import java.util.LinkedHashMap;
import java.util.Map;
import cascading.flow.FlowProcess;
import cascading.operation.BaseOperation;
import cascading.operation.Filter;
import cascading.operation.FilterCall;
import cascading.operation.OperationCall;
import cascading.operation.aggregator.First;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
/**
 * Class Unique {@link SubAssembly} is used to filter all duplicates out of a tuple stream.
 * <p/>
 * Typically finding unique values in a tuple stream relies on a {@link GroupBy} and a {@link First}
 * {@link cascading.operation.Aggregator} operation.
 * <p/>
 * This SubAssembly also uses the {@link FilterPartialDuplicates} {@link cascading.operation.Filter}
 * to remove as many observed duplicates as possible before the GroupBy operator to reduce IO over the network.
 * <p/>
 * This strategy is similar to using {@code combiners}, except no sorting or serialization is invoked and results
 * in a much simpler mechanism.
 * <p/>
 * The {@code threshold} value tells the underlying FilterPartialDuplicates how many values to cache for duplicate
 * comparison before dropping values from the LRU cache. Constructors without a {@code threshold} argument
 * use a value of 10000.
 */
public class Unique extends SubAssembly
{
  /** Cache size used whenever a constructor is not given an explicit threshold. */
  private static final int DEFAULT_THRESHOLD = 10000;

  /**
   * Class FilterPartialDuplicates is a {@link cascading.operation.Filter} that is used to remove observed duplicates from the tuple stream.
   * <p/>
   * Use this class typically in tandem with a {@link cascading.operation.aggregator.First}
   * {@link cascading.operation.Aggregator} in order to improve de-duping performance by removing as many values
   * as possible before the intermediate {@link cascading.pipe.GroupBy} operator.
   * <p/>
   * The {@code threshold} value is used to maintain a LRU of a constant size. If more than threshold unique values
   * are seen, the least recently seen cached values will be removed from the cache. Because values can be
   * dropped from the cache, this filter only removes some duplicates; it does not guarantee uniqueness by itself.
   *
   * @see Unique
   */
  public static class FilterPartialDuplicates extends BaseOperation<LinkedHashMap<Tuple, Object>> implements Filter<LinkedHashMap<Tuple, Object>>
  {
    /** Maximum number of distinct argument tuples retained in the cache. */
    private int threshold = DEFAULT_THRESHOLD;

    /** Constructor FilterPartialDuplicates creates a new FilterPartialDuplicates instance using the default threshold of 10000. */
    public FilterPartialDuplicates()
    {
    }

    /**
     * Constructor FilterPartialDuplicates creates a new FilterPartialDuplicates instance.
     *
     * @param threshold of type int, the maximum number of tuples to keep in the cache
     */
    @ConstructorProperties({"threshold"})
    public FilterPartialDuplicates( int threshold )
    {
      this.threshold = threshold;
    }

    /**
     * Installs a fresh, access-ordered {@link LinkedHashMap} as the operation context. The map evicts its
     * eldest (least recently accessed) entry whenever an insertion makes it larger than {@code threshold}.
     *
     * @param flowProcess   of type FlowProcess (not used)
     * @param operationCall of type OperationCall, receives the new cache as its context
     */
    @Override
    public void prepare( FlowProcess flowProcess, OperationCall<LinkedHashMap<Tuple, Object>> operationCall )
    {
      // captured explicitly so the anonymous map below does not depend on an unqualified field lookup
      final int limit = threshold;

      operationCall.setContext( new LinkedHashMap<Tuple, Object>( limit, 0.75f, true )
      {
        @Override
        protected boolean removeEldestEntry( Map.Entry<Tuple, Object> eldest )
        {
          return size() > limit;
        }
      } );
    }

    /**
     * Removes the current tuple if an equal argument tuple is presently cached. Otherwise a copy of the
     * arguments is added to the cache and the tuple is kept.
     *
     * @param flowProcess of type FlowProcess (not used)
     * @param filterCall  of type FilterCall, supplies the arguments and the cache
     * @return true if the argument tuple was found in the cache, false otherwise
     */
    @Override
    public boolean isRemove( FlowProcess flowProcess, FilterCall<LinkedHashMap<Tuple, Object>> filterCall )
    {
      LinkedHashMap<Tuple, Object> cache = filterCall.getContext();

      // we assume its more painful to create lots of tuple copies vs comparisons,
      // so the lookup uses the live arguments tuple and a copy is only made on a miss
      Tuple args = filterCall.getArguments().getTuple();

      // get() counts as an access on an access-ordered LinkedHashMap, containsKey() does not;
      // using get() moves a hit to the most-recent position so the cache really is a LRU.
      // Cached values are always non-null, so a null result means the key is absent.
      if( cache.get( args ) != null )
        return true;

      cache.put( filterCall.getArguments().getTupleCopy(), Boolean.TRUE );

      return false;
    }

    /**
     * Drops the cache by resetting the operation context to null.
     *
     * @param flowProcess   of type FlowProcess (not used)
     * @param operationCall of type OperationCall, whose context is cleared
     */
    @Override
    public void cleanup( FlowProcess flowProcess, OperationCall<LinkedHashMap<Tuple, Object>> operationCall )
    {
      operationCall.setContext( null );
    }

    /**
     * Two instances are equal when the parent class considers them equal and their thresholds match.
     *
     * @param object of type Object
     * @return true if the given object is an equal FilterPartialDuplicates
     */
    @Override
    public boolean equals( Object object )
    {
      if( this == object )
        return true;

      if( !( object instanceof FilterPartialDuplicates ) )
        return false;

      if( !super.equals( object ) )
        return false;

      FilterPartialDuplicates that = (FilterPartialDuplicates) object;

      return threshold == that.threshold;
    }

    /**
     * Combines the parent hash code with the threshold, consistent with {@link #equals(Object)}.
     *
     * @return int
     */
    @Override
    public int hashCode()
    {
      int result = super.hashCode();

      result = 31 * result + threshold;

      return result;
    }
  }

  /**
   * Constructor Unique creates a new Unique instance with a null name and the default threshold.
   *
   * @param pipe           of type Pipe
   * @param groupingFields of type Fields
   */
  @ConstructorProperties({"pipe", "groupingFields"})
  public Unique( Pipe pipe, Fields groupingFields )
  {
    this( null, pipe, groupingFields );
  }

  /**
   * Constructor Unique creates a new Unique instance with a null name.
   *
   * @param pipe           of type Pipe
   * @param groupingFields of type Fields
   * @param threshold      of type int
   */
  @ConstructorProperties({"pipe", "groupingFields", "threshold"})
  public Unique( Pipe pipe, Fields groupingFields, int threshold )
  {
    this( null, pipe, groupingFields, threshold );
  }

  /**
   * Constructor Unique creates a new Unique instance with the default threshold.
   *
   * @param name           of type String
   * @param pipe           of type Pipe
   * @param groupingFields of type Fields
   */
  @ConstructorProperties({"name", "pipe", "groupingFields"})
  public Unique( String name, Pipe pipe, Fields groupingFields )
  {
    this( name, pipe, groupingFields, DEFAULT_THRESHOLD );
  }

  /**
   * Constructor Unique creates a new Unique instance. The single pipe is wrapped via
   * {@link Pipe#pipes(Pipe...)} and handed to the array based constructor.
   *
   * @param name           of type String
   * @param pipe           of type Pipe
   * @param groupingFields of type Fields
   * @param threshold      of type int
   */
  @ConstructorProperties({"name", "pipe", "groupingFields", "threshold"})
  public Unique( String name, Pipe pipe, Fields groupingFields, int threshold )
  {
    this( name, Pipe.pipes( pipe ), groupingFields, threshold );
  }

  /**
   * Constructor Unique creates a new Unique instance with a null name and the default threshold.
   *
   * @param pipes          of type Pipe[]
   * @param groupingFields of type Fields
   */
  @ConstructorProperties({"pipes", "groupingFields"})
  public Unique( Pipe[] pipes, Fields groupingFields )
  {
    this( null, pipes, groupingFields, DEFAULT_THRESHOLD );
  }

  /**
   * Constructor Unique creates a new Unique instance with a null name.
   *
   * @param pipes          of type Pipe[]
   * @param groupingFields of type Fields
   * @param threshold      of type int
   */
  @ConstructorProperties({"pipes", "groupingFields", "threshold"})
  public Unique( Pipe[] pipes, Fields groupingFields, int threshold )
  {
    this( null, pipes, groupingFields, threshold );
  }

  /**
   * Constructor Unique creates a new Unique instance with the default threshold.
   *
   * @param name           of type String
   * @param pipes          of type Pipe[]
   * @param groupingFields of type Fields
   */
  @ConstructorProperties({"name", "pipes", "groupingFields"})
  public Unique( String name, Pipe[] pipes, Fields groupingFields )
  {
    this( name, pipes, groupingFields, DEFAULT_THRESHOLD );
  }

  /**
   * Constructor Unique creates a new Unique instance.
   * <p/>
   * Every incoming pipe is followed by an {@link Each} applying one shared {@link FilterPartialDuplicates}
   * to the grouping fields. The filtered pipes are merged by a {@link GroupBy} on the same fields, and an
   * {@link Every} with a {@link First} aggregator over {@link Fields#ALL}, emitting {@link Fields#RESULTS},
   * becomes the tail of this assembly.
   *
   * @param name           of type String
   * @param pipes          of type Pipe[]
   * @param groupingFields of type Fields
   * @param threshold      of type int
   */
  @ConstructorProperties({"name", "pipes", "groupingFields", "threshold"})
  public Unique( String name, Pipe[] pipes, Fields groupingFields, int threshold )
  {
    Pipe[] filters = new Pipe[ pipes.length ];
    FilterPartialDuplicates partialDuplicates = new FilterPartialDuplicates( threshold );

    for( int i = 0; i < filters.length; i++ )
      filters[ i ] = new Each( pipes[ i ], groupingFields, partialDuplicates );

    Pipe pipe = new GroupBy( name, filters, groupingFields );

    pipe = new Every( pipe, Fields.ALL, new First(), Fields.RESULTS );

    setTails( pipe );
  }
}