/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.pipe.assembly; import java.io.Serializable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import cascading.flow.FlowProcess; import cascading.operation.Aggregator; import cascading.operation.BaseOperation; import cascading.operation.Function; import cascading.operation.FunctionCall; import cascading.operation.OperationCall; import cascading.pipe.Each; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.pipe.SubAssembly; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryCollector; /** * Class AggregateBy is a {@link SubAssembly} that serves two roles for handling aggregate operations. * <p/> * The first role is as a base class for composable aggregate operations that have a MapReduce Map side optimization for the * Reduce side aggregation. For example 'summing' a value within a grouping can be performed partially Map side and * completed Reduce side. Summing is associative and commutative. * <p/> * AggregateBy also supports operations that are not associative/commutative like 'counting'. Counting * would result in 'counting' value occurrences Map side but summing those counts Reduce side. (Yes, counting can be * transposed to summing Map and Reduce sides by emitting 1's before the first sum, but that's three operations over * two, and a hack) * <p/> * Think of this mechanism as a MapReduce Combiner, but more efficient as no values are serialized, * deserialized, saved to disk, and multi-pass sorted in the process, which consume cpu resources in trade of * memory and a little or no IO. * <p/> * Further, Combiners are limited to only associative/commutative operations. * <p/> * Additionally the Cascading planner can move the Map side optimization * to the previous Reduce operation further increasing IO performance (between the preceding Reduce and Map phase which * is over HDFS). * <p/> * The second role of the AggregateBy class is to allow for composition of AggregateBy * sub-classes. That is, {@link SumBy} and {@link CountBy} AggregateBy sub-classes can be performed * in parallel on the same grouping keys. * </p> * Custom AggregateBy classes can be created by sub-classing this class and implementing a special * {@link Functor} for use on the Map side. Multiple Functor instances are managed by the {@link CompositeFunction} * class allowing them all to share the same LRU value map for more efficiency. * <p/> * To tune the LRU, set the {@code threshold} value to a high enough value to utilize available memory. * <p/> * Note using a AggregateBy instance automatically inserts a {@link GroupBy} into the resulting {@link cascading.flow.Flow}. * And passing multiple AggregateBy instances to a parent AggregateBy instance still results in one GroupBy. * <p/> * Also note that {@link Unique} is not a CompositeAggregator as it makes no sense to combine it with other aggregators, * and so is slightly more optimized internally. * * @see SumBy * @see CountBy * @see Unique */ public class AggregateBy extends SubAssembly { private String name; private int threshold; private Fields[] argumentFields; private Functor[] functors; private Aggregator[] aggregators; /** * Interface Functor provides a means to create a simple function for use with the {@link CompositeFunction} class. * <p/> * Note the {@link FlowProcess} argument provides access to the underlying properties and counter APIs. */ public interface Functor extends Serializable { /** * Method getDeclaredFields returns the declaredFields of this Functor object. * * @return the declaredFields (type Fields) of this Functor object. */ Fields getDeclaredFields(); /** * Method aggregate operates on the given args in tandem (optionally) with the given context values. * <p/> * The context argument is the result of the previous call to this method. Use it to store values between aggregate * calls (the current count, or sum of the args). * <p/> * On the very first invocation of aggregate for a given grouping key, context will be {@code null}. All subsequent * invocations context will be the value returned on the previous invocation. * * @param flowProcess of type FlowProcess * @param args of type TupleEntry * @param context of type Tuple @return Tuple */ Tuple aggregate( FlowProcess flowProcess, TupleEntry args, Tuple context ); /** * Method complete allows the final aggregate computation to be performed before the return value is collected. * <p/> * The number of values in the returned {@link Tuple} instance must match the number of declaredFields. * <p/> * It is safe to return the context object as the result value. * * @param flowProcess of type FlowProcess * @param context of type Tuple @return Tuple */ Tuple complete( FlowProcess flowProcess, Tuple context ); } /** * Class CompositeFunction takes multiple Functor instances and manages them as a single {@link Function}. * * @see Functor */ public static class CompositeFunction extends BaseOperation<LinkedHashMap<Tuple, Tuple[]>> implements Function<LinkedHashMap<Tuple, Tuple[]>> { public static final int DEFAULT_THRESHOLD = 10000; private int threshold = DEFAULT_THRESHOLD; private Fields groupingFields; private Fields[] argumentFields; private Fields[] functorFields; private Functor[] functors; /** * Constructor CompositeFunction creates a new CompositeFunction instance. * * @param groupingFields of type Fields * @param argumentFields of type Fields * @param functor of type Functor * @param threshold of type int */ public CompositeFunction( Fields groupingFields, Fields argumentFields, Functor functor, int threshold ) { this( groupingFields, Fields.fields( argumentFields ), new Functor[]{functor}, threshold ); } /** * Constructor CompositeFunction creates a new CompositeFunction instance. * * @param groupingFields of type Fields * @param argumentFields of type Fields[] * @param functors of type Functor[] * @param threshold of type int */ public CompositeFunction( Fields groupingFields, Fields[] argumentFields, Functor[] functors, int threshold ) { super( getFields( groupingFields, functors ) ); this.groupingFields = groupingFields; this.argumentFields = argumentFields; this.functors = functors; this.threshold = threshold; functorFields = new Fields[functors.length]; for( int i = 0; i < functors.length; i++ ) functorFields[ i ] = functors[ i ].getDeclaredFields(); } private static Fields getFields( Fields groupingFields, Functor[] functors ) { Fields fields = groupingFields; for( int i = 0; i < functors.length; i++ ) fields = fields.append( functors[ i ].getDeclaredFields() ); return fields; } @Override public void prepare( final FlowProcess flowProcess, final OperationCall<LinkedHashMap<Tuple, Tuple[]>> operationCall ) { operationCall.setContext( new LinkedHashMap<Tuple, Tuple[]>( threshold, 0.75f, true ) { @Override protected boolean removeEldestEntry( Map.Entry<Tuple, Tuple[]> eldest ) { boolean doRemove = size() > threshold; if( doRemove ) completeFunctors( flowProcess, ( (FunctionCall) operationCall ).getOutputCollector(), eldest ); return doRemove; } } ); } @Override public void operate( FlowProcess flowProcess, FunctionCall<LinkedHashMap<Tuple, Tuple[]>> functionCall ) { TupleEntry args = functionCall.getArguments(); Tuple key = args.selectTuple( groupingFields ); Tuple[] context = functionCall.getContext().get( key ); if( context == null ) { context = new Tuple[functors.length]; functionCall.getContext().put( key, context ); } for( int i = 0; i < functors.length; i++ ) context[ i ] = functors[ i ].aggregate( flowProcess, args.selectEntry( argumentFields[ i ] ), context[ i ] ); } @Override public void cleanup( FlowProcess flowProcess, OperationCall<LinkedHashMap<Tuple, Tuple[]>> operationCall ) { // need to drain context TupleEntryCollector collector = ( (FunctionCall) operationCall ).getOutputCollector(); for( Map.Entry<Tuple, Tuple[]> entry : operationCall.getContext().entrySet() ) completeFunctors( flowProcess, collector, entry ); operationCall.setContext( null ); } private void completeFunctors( FlowProcess flowProcess, TupleEntryCollector outputCollector, Map.Entry<Tuple, Tuple[]> entry ) { Tuple result = new Tuple( entry.getKey() ); Tuple[] values = entry.getValue(); for( int i = 0; i < functors.length; i++ ) result.addAll( functors[ i ].complete( flowProcess, values[ i ] ) ); outputCollector.add( result ); } @Override public boolean equals( Object object ) { if( this == object ) return true; if( !( object instanceof CompositeFunction ) ) return false; if( !super.equals( object ) ) return false; CompositeFunction that = (CompositeFunction) object; if( threshold != that.threshold ) return false; if( !Arrays.equals( argumentFields, that.argumentFields ) ) return false; if( !Arrays.equals( functorFields, that.functorFields ) ) return false; if( !Arrays.equals( functors, that.functors ) ) return false; if( groupingFields != null ? !groupingFields.equals( that.groupingFields ) : that.groupingFields != null ) return false; return true; } @Override public int hashCode() { int result = super.hashCode(); result = 31 * result + threshold; result = 31 * result + ( groupingFields != null ? groupingFields.hashCode() : 0 ); result = 31 * result + ( argumentFields != null ? Arrays.hashCode( argumentFields ) : 0 ); result = 31 * result + ( functorFields != null ? Arrays.hashCode( functorFields ) : 0 ); result = 31 * result + ( functors != null ? Arrays.hashCode( functors ) : 0 ); return result; } } /** * Constructor CompositeAggregator creates a new CompositeAggregator instance. * * @param name of type String * @param threshold of type int */ protected AggregateBy( String name, int threshold ) { this.name = name; this.threshold = threshold; } /** * Constructor CompositeAggregator creates a new CompositeAggregator instance. * * @param argumentFields of type Fields * @param functor of type Functor * @param aggregator of type Aggregator */ protected AggregateBy( Fields argumentFields, Functor functor, Aggregator aggregator ) { this.argumentFields = Fields.fields( argumentFields ); this.functors = new Functor[]{functor}; this.aggregators = new Aggregator[]{aggregator}; } /** * Constructor CompositeAggregator creates a new CompositeAggregator instance. * * @param pipe of type Pipe * @param groupingFields of type Fields * @param assemblies of type CompositeAggregator... */ public AggregateBy( Pipe pipe, Fields groupingFields, AggregateBy... assemblies ) { this( null, Pipe.pipes( pipe ), groupingFields, CompositeFunction.DEFAULT_THRESHOLD, assemblies ); } /** * Constructor CompositeAggregator creates a new CompositeAggregator instance. * * @param pipe of type Pipe * @param groupingFields of type Fields * @param threshold of type int * @param assemblies of type CompositeAggregator... */ public AggregateBy( Pipe pipe, Fields groupingFields, int threshold, AggregateBy... assemblies ) { this( null, Pipe.pipes( pipe ), groupingFields, threshold, assemblies ); } /** * Constructor CompositeAggregator creates a new CompositeAggregator instance. * * @param name of type String * @param pipes of type Pipe[] * @param groupingFields of type Fields * @param assemblies of type CompositeAggregator... */ public AggregateBy( String name, Pipe[] pipes, Fields groupingFields, AggregateBy... assemblies ) { this( name, pipes, groupingFields, CompositeFunction.DEFAULT_THRESHOLD, assemblies ); } /** * Constructor CompositeAggregator creates a new CompositeAggregator instance. * * @param name of type String * @param pipes of type Pipe[] * @param groupingFields of type Fields * @param threshold of type int * @param assemblies of type CompositeAggregator... */ public AggregateBy( String name, Pipe[] pipes, Fields groupingFields, int threshold, AggregateBy... assemblies ) { this( name, threshold ); List<Fields> arguments = new ArrayList<Fields>(); List<Functor> functors = new ArrayList<Functor>(); List<Aggregator> aggregators = new ArrayList<Aggregator>(); for( int i = 0; i < assemblies.length; i++ ) { AggregateBy assembly = assemblies[ i ]; Collections.addAll( arguments, assembly.getArgumentFields() ); Collections.addAll( functors, assembly.getFunctors() ); Collections.addAll( aggregators, assembly.getAggregators() ); } initialize( groupingFields, pipes, arguments.toArray( new Fields[0] ), functors.toArray( new Functor[0] ), aggregators.toArray( new Aggregator[0] ) ); } protected AggregateBy( String name, Pipe[] pipes, Fields groupingFields, Fields argument, Functor functor, Aggregator aggregator, int threshold ) { this( name, threshold ); initialize( groupingFields, pipes, argument, functor, aggregator ); } protected void initialize( Fields groupingFields, Pipe[] pipes, Fields argument, Functor functor, Aggregator aggregator ) { initialize( groupingFields, pipes, Fields.fields( argument ), new Functor[]{functor}, new Aggregator[]{aggregator} ); } protected void initialize( Fields groupingFields, Pipe[] pipes, Fields[] argumentFields, Functor[] functors, Aggregator[] aggregators ) { this.argumentFields = argumentFields; this.functors = functors; this.aggregators = aggregators; verify(); Fields argumentSelector = Fields.merge( groupingFields, Fields.merge( argumentFields ) ); Pipe[] functions = new Pipe[pipes.length]; CompositeFunction function = new CompositeFunction( groupingFields, argumentFields, functors, threshold ); for( int i = 0; i < functions.length; i++ ) functions[ i ] = new Each( pipes[ i ], argumentSelector, function, Fields.RESULTS ); Pipe pipe = new GroupBy( name, functions, groupingFields ); for( int i = 0; i < aggregators.length; i++ ) pipe = new Every( pipe, functors[ i ].getDeclaredFields(), aggregators[ i ], Fields.ALL ); setTails( pipe ); } /** Method verify should be overridden by sub-classes if any values must be tested before the calling constructor returns. */ protected void verify() { } protected Fields[] getArgumentFields() { return argumentFields; } protected Functor[] getFunctors() { return functors; } protected Aggregator[] getAggregators() { return aggregators; } }