/*
* Copyright (c) 2007-2012 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cascading.pipe.assembly;
import java.beans.ConstructorProperties;
import cascading.flow.FlowProcess;
import cascading.operation.Aggregator;
import cascading.operation.AggregatorCall;
import cascading.operation.BaseOperation;
import cascading.pipe.Pipe;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
/**
* Class AverageBy is used to average values associated with duplicate keys in a tuple stream.
* <p/>
* Typically finding the average value in a tuple stream relies on a {@link cascading.pipe.GroupBy} and a {@link cascading.operation.aggregator.Average}
* {@link cascading.operation.Aggregator} operation.
* <p/>
* This SubAssembly uses the {@link cascading.pipe.assembly.AverageBy.AveragePartials} {@link cascading.pipe.assembly.AggregateBy.Functor}
* and the {@link AverageFinal} Aggregator to count and sum as many field values as possible before the GroupBy operator, reducing IO over the network.
* <p/>
* This strategy is similar to using {@code combiners}, except no sorting or serialization is invoked and results
* in a much simpler mechanism.
* <p/>
* The {@code threshold} value tells the underlying AveragePartials functions how many unique key sums and counts to accumulate
* in the LRU cache, before emitting the least recently used entry.
*
* @see cascading.pipe.assembly.AggregateBy
*/
public class AverageBy extends AggregateBy
{
/**
 * Names of the two intermediate fields declared by {@link AveragePartials}: the partial sum and the partial count.
 * Both names are prefixed with the package name of this class.
 */
private static final Fields BIND_FIELDS = new Fields(
  AverageBy.class.getPackage().getName() + ".sum",
  AverageBy.class.getPackage().getName() + ".count"
);
/**
 * Class AveragePartials is a {@link cascading.pipe.assembly.AggregateBy.Functor} that keeps a running sum and a
 * running count of the values observed for duplicate keys in the tuple stream.
 * <p/>
 * The partial result is a two position {@link Tuple}: position 0 holds the sum, position 1 holds the count.
 *
 * @see cascading.pipe.assembly.AverageBy
 */
public static class AveragePartials implements Functor
{
  /** Constructor AveragePartials creates a new AveragePartials instance. */
  public AveragePartials()
  {
  }

  /**
   * Returns the intermediate sum and count field names this Functor declares.
   *
   * @return the shared sum/count Fields instance
   */
  @Override
  public Fields getDeclaredFields()
  {
    return BIND_FIELDS;
  }

  /**
   * Folds one more value into the partial result.
   *
   * @param flowProcess of type FlowProcess, not used here
   * @param args        of type TupleEntry, position 0 is read as a double and added to the sum
   * @param context     of type Tuple, the partial result so far, or null when no value has been seen yet
   * @return the updated partial result (the given context, or a new two position Tuple if context was null)
   */
  @Override
  public Tuple aggregate( FlowProcess flowProcess, TupleEntry args, Tuple context )
  {
    // first value for this key: start from a fresh two position Tuple
    // NOTE(review): relies on unset positions reading as zero via getDouble/getLong -- confirm against Tuple
    Tuple partials = context != null ? context : Tuple.size( 2 );

    double runningSum = partials.getDouble( 0 ) + args.getDouble( 0 );
    partials.set( 0, runningSum );

    long runningCount = partials.getLong( 1 ) + 1;
    partials.set( 1, runningCount );

    return partials;
  }

  /**
   * Finishes the partial result; the accumulated sum/count Tuple is handed back as is.
   *
   * @param flowProcess of type FlowProcess, not used here
   * @param context     of type Tuple, the accumulated partial result
   * @return the given context, unchanged
   */
  @Override
  public Tuple complete( FlowProcess flowProcess, Tuple context )
  {
    return context;
  }
}
/**
 * Class AverageFinal is used to finalize the average operation on the Reduce side of the process. It must be used
 * in tandem with a {@link AveragePartials} Functor.
 * <p/>
 * Every incoming argument is read as a partial result: position 0 as a double sum, position 1 as a long count.
 * The partials are totalled per grouping and a single value, total sum divided by total count, is emitted.
 */
public static class AverageFinal extends BaseOperation<AverageFinal.Context> implements Aggregator<AverageFinal.Context>
{
  /** Class Context is used to hold the running totals of one grouping. */
  protected static class Context
  {
    double sum = 0.0D;
    long count = 0L;

    /**
     * Clears both totals so the instance can be used again for the next grouping.
     *
     * @return this Context
     */
    public Context reset()
    {
      sum = 0.0D;
      count = 0L;

      return this;
    }
  }

  /**
   * Constructs a new instance that returns the average of the values encountered in the given fieldDeclaration field name.
   *
   * @param fieldDeclaration of type Fields
   * @throws IllegalArgumentException if fieldDeclaration is not a substitution and does not declare exactly 1 field
   */
  public AverageFinal( Fields fieldDeclaration )
  {
    super( 2, fieldDeclaration );

    // a substitution, or exactly one declared field, is acceptable
    if( fieldDeclaration.isSubstitution() || fieldDeclaration.size() == 1 )
      return;

    throw new IllegalArgumentException( "fieldDeclaration may only declare 1 field, got: " + fieldDeclaration.size() );
  }

  /**
   * Prepares the context for a new grouping: an existing Context is reset, otherwise a new one is installed.
   *
   * @param flowProcess    of type FlowProcess, not used here
   * @param aggregatorCall of type AggregatorCall
   */
  public void start( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall )
  {
    Context existing = aggregatorCall.getContext();

    if( existing == null )
      aggregatorCall.setContext( new Context() );
    else
      existing.reset();
  }

  /**
   * Adds one partial sum (argument position 0) and partial count (argument position 1) to the totals.
   *
   * @param flowProcess    of type FlowProcess, not used here
   * @param aggregatorCall of type AggregatorCall
   */
  public void aggregate( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall )
  {
    Context totals = aggregatorCall.getContext();
    TupleEntry partial = aggregatorCall.getArguments();

    totals.sum += partial.getDouble( 0 );
    totals.count += partial.getLong( 1 );
  }

  /**
   * Emits the average for the current grouping as a single value Tuple.
   *
   * @param flowProcess    of type FlowProcess, not used here
   * @param aggregatorCall of type AggregatorCall
   */
  public void complete( FlowProcess flowProcess, AggregatorCall<Context> aggregatorCall )
  {
    Context totals = aggregatorCall.getContext();

    // double divided by long is floating point division
    double average = totals.sum / totals.count;

    aggregatorCall.getOutputCollector().add( new Tuple( average ) );
  }
}
/**
 * Constructor AverageBy creates a new AverageBy instance. Use this constructor when used with a {@link cascading.pipe.assembly.AggregateBy}
 * instance.
 * <p/>
 * No pipe, grouping fields or threshold are given here. Only the value field, a new {@link AveragePartials} Functor
 * and a new {@link AverageFinal} Aggregator created with {@code averageField} are handed to the parent constructor.
 *
 * @param valueField   of type Fields, passed through to the parent constructor unchanged
 * @param averageField of type Fields, the field declaration given to {@link AverageFinal}
 * @throws IllegalArgumentException if averageField is not a substitution and does not declare exactly 1 field
 */
@ConstructorProperties({"valueField", "averageField"})
public AverageBy( Fields valueField, Fields averageField )
{
super( valueField, new AveragePartials(), new AverageFinal( averageField ) );
}
//////////////
/**
 * Constructor AverageBy creates a new AverageBy instance with no name and a threshold of 10000.
 * <p/>
 * Delegates to the constructor taking a name, a single pipe and a threshold.
 *
 * @param pipe           of type Pipe
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields
 */
@ConstructorProperties({"pipe", "groupingFields", "valueField", "averageField"})
public AverageBy( Pipe pipe, Fields groupingFields, Fields valueField, Fields averageField )
{
// null name, threshold of 10000
this( null, pipe, groupingFields, valueField, averageField, 10000 );
}
/**
 * Constructor AverageBy creates a new AverageBy instance with no name.
 * <p/>
 * Delegates to the constructor taking a name, a single pipe and a threshold, passing null as the name.
 *
 * @param pipe           of type Pipe
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields
 * @param threshold      of type int, passed through unchanged
 */
@ConstructorProperties({"pipe", "groupingFields", "valueField", "averageField", "threshold"})
public AverageBy( Pipe pipe, Fields groupingFields, Fields valueField, Fields averageField, int threshold )
{
this( null, pipe, groupingFields, valueField, averageField, threshold );
}
/**
 * Constructor AverageBy creates a new AverageBy instance with a threshold of 10000.
 * <p/>
 * Delegates to the constructor taking a name, a single pipe and a threshold.
 *
 * @param name           of type String
 * @param pipe           of type Pipe
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields
 */
@ConstructorProperties({"name", "pipe", "groupingFields", "valueField", "averageField"})
public AverageBy( String name, Pipe pipe, Fields groupingFields, Fields valueField, Fields averageField )
{
// threshold of 10000
this( name, pipe, groupingFields, valueField, averageField, 10000 );
}
/**
 * Constructor AverageBy creates a new AverageBy instance for a single pipe.
 * <p/>
 * The pipe is converted with {@code Pipe.pipes( pipe )} and handed, together with all other arguments,
 * to the constructor taking a name, a Pipe array and a threshold.
 *
 * @param name           of type String
 * @param pipe           of type Pipe
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields
 * @param threshold      of type int, passed through unchanged
 */
@ConstructorProperties({"name", "pipe", "groupingFields", "valueField", "averageField", "threshold"})
public AverageBy( String name, Pipe pipe, Fields groupingFields, Fields valueField, Fields averageField, int threshold )
{
this( name, Pipe.pipes( pipe ), groupingFields, valueField, averageField, threshold );
}
/**
 * Constructor AverageBy creates a new AverageBy instance with no name and a threshold of 10000.
 * <p/>
 * Delegates to the constructor taking a name, a Pipe array and a threshold.
 *
 * @param pipes          of type Pipe[]
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields
 */
// the property names must line up one to one with the parameters; this constructor has no "name" parameter
@ConstructorProperties({"pipes", "groupingFields", "valueField", "averageField"})
public AverageBy( Pipe[] pipes, Fields groupingFields, Fields valueField, Fields averageField )
{
  // null name, threshold of 10000
  this( null, pipes, groupingFields, valueField, averageField, 10000 );
}
/**
 * Constructor AverageBy creates a new AverageBy instance with no name.
 * <p/>
 * Delegates to the constructor taking a name, a Pipe array and a threshold, passing null as the name.
 *
 * @param pipes          of type Pipe[]
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields
 * @param threshold      of type int, passed through unchanged
 */
// the property names must line up one to one with the parameters; this constructor has no "name" parameter
@ConstructorProperties({"pipes", "groupingFields", "valueField", "averageField", "threshold"})
public AverageBy( Pipe[] pipes, Fields groupingFields, Fields valueField, Fields averageField, int threshold )
{
  this( null, pipes, groupingFields, valueField, averageField, threshold );
}
/**
 * Constructor AverageBy creates a new AverageBy instance with a threshold of 10000.
 * <p/>
 * Delegates to the constructor taking a name, a Pipe array and a threshold.
 *
 * @param name           of type String
 * @param pipes          of type Pipe[]
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields
 */
@ConstructorProperties({"name", "pipes", "groupingFields", "valueField", "averageField"})
public AverageBy( String name, Pipe[] pipes, Fields groupingFields, Fields valueField, Fields averageField )
{
// threshold of 10000
this( name, pipes, groupingFields, valueField, averageField, 10000 );
}
/**
 * Constructor AverageBy creates a new AverageBy instance.
 * <p/>
 * Every constructor of this class that accepts a pipe or a Pipe array ends up here. The arguments are handed to the
 * parent constructor together with a new {@link AveragePartials} Functor and a new {@link AverageFinal} Aggregator
 * created with {@code averageField}.
 *
 * @param name           of type String
 * @param pipes          of type Pipe[]
 * @param groupingFields of type Fields
 * @param valueField     of type Fields
 * @param averageField   of type Fields, the field declaration given to {@link AverageFinal}
 * @param threshold      of type int, passed through to the parent constructor unchanged
 * @throws IllegalArgumentException if averageField is not a substitution and does not declare exactly 1 field
 */
@ConstructorProperties({"name", "pipes", "groupingFields", "valueField", "averageField", "threshold"})
public AverageBy( String name, Pipe[] pipes, Fields groupingFields, Fields valueField, Fields averageField, int threshold )
{
super( name, pipes, groupingFields, valueField, new AveragePartials(), new AverageFinal( averageField ), threshold );
}
}