/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.pipe;
import java.beans.ConstructorProperties;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.Aggregator;
import cascading.operation.Filter;
import cascading.operation.Function;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
/**
* The GroupBy pipe groups the {@link Tuple} stream by the given groupFields.
* </p>
* If more than one {@link Pipe} instance is provided on the constructor, all branches will be merged. It is required
* that all Pipe instances output the same field names, otherwise the {@link FlowConnector} will fail to create a
* {@link Flow} instance. Again, the Pipe instances are merged together as if one Tuple stream and not joined.
* See {@link CoGroup} for joining by common fields.
* </p>
* Typically an {@link Every} follows GroupBy to apply an {@link Aggregator} function to every grouping. The
* {@link Each} operator may also follow GroupBy to apply a {@link Function} or {@link Filter} to the resulting
* stream. But an Each cannot come immediately before an Every.
* <p/>
* Optionally a stream can be further sorted by providing sortFields. This allows an Aggregator to receive
* values in the order of the sortedFields.
* <p/>
* Note that local sorting always happens on the groupFields, sortFields are a secondary sorting on the grouped values within the
* current grouping. sortFields is particularly useful if the Aggregators following the GroupBy would like to see their arguments
* in order.
* <p/>
* For more control over sorting at the group or secondary sort level, use {@link cascading.tuple.Fields}
* containing {@link java.util.Comparator} instances for the appropriate fields when setting the groupFields or
* sortFields values. Fields allows you to set a custom {@link java.util.Comparator} instance for each field name or
* position. It is required that each Comparator class also be {@link java.io.Serializable}.
* <p/>
* It should be noted for MapReduce systems, distributed group sorting is not 'total'. That is groups are sorted
* as seen by each Reducer, but they are not sorted across Reducers. See the MapReduce algorithm for details.
*/
public class GroupBy extends Group
{
/**
* Creates a new GroupBy instance that will group on {@link Fields#ALL} fields.
*
* @param pipe of type Pipe
*/
@ConstructorProperties({"pipe"})
public GroupBy( Pipe pipe )
{
super( pipe );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
*/
@ConstructorProperties({"pipe", "groupFields"})
public GroupBy( Pipe pipe, Fields groupFields )
{
super( pipe, groupFields );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param reverseOrder of type boolean
*/
@ConstructorProperties({"pipe", "groupFields", "reverseOrder"})
public GroupBy( Pipe pipe, Fields groupFields, boolean reverseOrder )
{
super( pipe, groupFields, null, reverseOrder );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
*/
@ConstructorProperties({"groupName", "pipe", "groupFields"})
public GroupBy( String groupName, Pipe pipe, Fields groupFields )
{
super( groupName, pipe, groupFields );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param reverseOrder of type boolean
*/
@ConstructorProperties({"groupName", "pipe", "groupFields", "reverseOrder"})
public GroupBy( String groupName, Pipe pipe, Fields groupFields, boolean reverseOrder )
{
super( groupName, pipe, groupFields, null, reverseOrder );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
@ConstructorProperties({"pipe", "groupFields", "sortFields"})
public GroupBy( Pipe pipe, Fields groupFields, Fields sortFields )
{
super( pipe, groupFields, sortFields );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
@ConstructorProperties({"groupName", "pipe", "groupFields", "sortFields"})
public GroupBy( String groupName, Pipe pipe, Fields groupFields, Fields sortFields )
{
super( groupName, pipe, groupFields, sortFields );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
@ConstructorProperties({"pipe", "groupFields", "sortFields", "reverseOrder"})
public GroupBy( Pipe pipe, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
super( pipe, groupFields, sortFields, reverseOrder );
}
/**
* Creates a new GroupBy instance that will group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
@ConstructorProperties({"groupName", "pipe", "groupFields", "sortFields", "reverseOrder"})
public GroupBy( String groupName, Pipe pipe, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
super( groupName, pipe, groupFields, sortFields, reverseOrder );
}
//////////
// MERGE
//////////
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on Fields.FIRST.
* <p/>
* The assumption is that the first fields in all streams are logically the same field, which should be true
* as merging assumes all incoming streams have the same fields in the same order.
* <p/>
* To get the best performance, choose a field(s) that has many unique values, by using the constructor that takes
* a groupFields argument. If the first field has few unqiue values, data will only be sent to that number of reducers,
* or less, in the cluster, making the reduce phase a larger bottleneck.
*
* @param pipes of type Pipe
*/
@ConstructorProperties({"pipes"})
public GroupBy( Pipe[] pipes )
{
super( pipes, Fields.FIRST );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names.
*
* @param pipes of type Pipe
* @param groupFields of type Fields
*/
@ConstructorProperties({"pipes", "groupFields"})
public GroupBy( Pipe[] pipes, Fields groupFields )
{
super( pipes, groupFields );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names.
*
* @param lhsPipe of type Pipe
* @param rhsPipe of type Pipe
* @param groupFields of type Fields
*/
public GroupBy( Pipe lhsPipe, Pipe rhsPipe, Fields groupFields )
{
super( Pipe.pipes( lhsPipe, rhsPipe ), groupFields );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names.
*
* @param groupName of type String
* @param pipes of type Pipe
* @param groupFields of type Fields
*/
@ConstructorProperties({"groupName", "pipes", "groupFields"})
public GroupBy( String groupName, Pipe[] pipes, Fields groupFields )
{
super( groupName, pipes, groupFields );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names.
*
* @param groupName of type String
* @param lhsPipe of type Pipe
* @param rhsPipe of type Pipe
* @param groupFields of type Fields
*/
public GroupBy( String groupName, Pipe lhsPipe, Pipe rhsPipe, Fields groupFields )
{
super( groupName, Pipe.pipes( lhsPipe, rhsPipe ), groupFields );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param pipes of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
@ConstructorProperties({"pipes", "groupFields", "sortFields"})
public GroupBy( Pipe[] pipes, Fields groupFields, Fields sortFields )
{
super( pipes, groupFields, sortFields );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param groupName of type String
* @param pipes of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
@ConstructorProperties({"groupName", "pipes", "groupFields", "sortFields"})
public GroupBy( String groupName, Pipe[] pipes, Fields groupFields, Fields sortFields )
{
super( groupName, pipes, groupFields, sortFields );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param pipes of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
@ConstructorProperties({"pipes", "groupFields", "sortFields", "reverseOrder"})
public GroupBy( Pipe[] pipes, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
super( pipes, groupFields, sortFields, reverseOrder );
}
/**
* Creates a new GroupBy instance that will first merge the given pipes, then group on the given groupFields field names
* and sorts the grouped values on the given sortFields fields names.
*
* @param groupName of type String
* @param pipes of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
@ConstructorProperties({"groupName", "pipes", "groupFields", "sortFields", "reverseOrder"})
public GroupBy( String groupName, Pipe[] pipes, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
super( groupName, pipes, groupFields, sortFields, reverseOrder );
}
}