/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.pipe;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import cascading.flow.FlowElement;
import cascading.flow.FlowProcess;
import cascading.flow.Scope;
import cascading.pipe.cogroup.CoGroupClosure;
import cascading.pipe.cogroup.GroupClosure;
import cascading.pipe.cogroup.InnerJoin;
import cascading.pipe.cogroup.Joiner;
import cascading.tuple.Fields;
import cascading.tuple.FieldsResolverException;
import cascading.tuple.IndexTuple;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntry;
import cascading.tuple.TupleException;
import cascading.tuple.TuplePair;
import cascading.tuple.Tuples;
import cascading.util.Util;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.log4j.Logger;
/**
* The base class for {@link GroupBy} and {@link CoGroup}. This class should not be used directly.
*
* @see GroupBy
* @see CoGroup
*/
public class Group extends Pipe
{
/** Field LOG is the log4j logger for this class. */
private static final Logger LOG = Logger.getLogger( Group.class );
/** Field pipes holds the incoming pipes in the order they were added, the same pipe may be added more than once. */
private final List<Pipe> pipes = new ArrayList<Pipe>();
/** Field groupFieldsMap maps an incoming pipe name to its grouping fields selector. */
protected final Map<String, Fields> groupFieldsMap = new LinkedHashMap<String, Fields>(); // keep order
/** Field sortFieldsMap maps an incoming pipe name to its sorting fields selector, empty when no sorting was requested. */
protected Map<String, Fields> sortFieldsMap = new LinkedHashMap<String, Fields>(); // keep order
/** Field reverseOrder is the value reported by {@link #isSortReversed()}. */
private boolean reverseOrder = false;
/** Field declaredFields is the value reported by {@link #getDeclaredFields()}, it may be null. */
protected Fields declaredFields;
/** Field resultGroupFields is handed to the outgoing Scope, it may be null. */
protected Fields resultGroupFields;
/** Field numSelfJoins is the number of self joins, zero when this Group is not a self join. */
private int numSelfJoins = 0;
/** Field joiner is the Joiner used to iterate grouped values, {@link InnerJoin} is used when none is given. */
private Joiner joiner;
/** Field groupName is the name of this Group, when null {@link #getName()} derives one from the incoming pipe names and caches it here. */
private String groupName;
/** Field isGroupBy is true when this instance was created through one of the GroupBy constructors. */
private boolean isGroupBy;
/** Field pipePos lazily maps an incoming pipe name to its position in the pipes list. */
private transient Map<String, Integer> pipePos;
/** Field closure is created by initializeReduce and reset for every key by iterateReduceValues. */
private GroupClosure closure;
/**
* Constructor Group creates a new Group instance that co-groups two pipes.
* <p/>
* Delegates to the lhs/rhs constructor accepting resultGroupFields and a joiner, passing
* {@code null} for both.
*
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
*/
protected Group( Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, declaredFields, null, null );
}
/**
* Constructor Group creates a new Group instance that co-groups two pipes.
* <p/>
* Delegates to the lhs/rhs constructor accepting a joiner, passing {@code null} as the joiner.
*
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
*/
protected Group( Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields, Fields resultGroupFields )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, declaredFields, resultGroupFields, null );
}
/**
* Constructor Group creates a new Group instance that co-groups two pipes.
* <p/>
* The pipes and their grouping fields are wrapped into arrays and handed to the array based
* constructor.
*
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
* @param joiner of type Joiner
*/
protected Group( Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields, Joiner joiner )
{
this( Pipe.pipes( lhs, rhs ), Fields.fields( lhsGroupFields, rhsGroupFields ), declaredFields, joiner );
}
/**
* Constructor Group creates a new Group instance that co-groups two pipes.
* <p/>
* The pipes and their grouping fields are wrapped into arrays and handed to the array based
* constructor.
*
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
* @param joiner of type Joiner
*/
protected Group( Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields, Fields resultGroupFields, Joiner joiner )
{
this( Pipe.pipes( lhs, rhs ), Fields.fields( lhsGroupFields, rhsGroupFields ), declaredFields, resultGroupFields, joiner );
}
/**
* Constructor Group creates a new Group instance that co-groups two pipes.
* <p/>
* Delegates with {@code null} declaredFields.
*
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param joiner of type Joiner
*/
protected Group( Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Joiner joiner )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, null, joiner );
}
/**
* Constructor Group creates a new Group instance that co-groups two pipes.
* <p/>
* The pipes and their grouping fields are wrapped into arrays and handed to the array based
* constructor.
*
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
*/
protected Group( Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields )
{
this( Pipe.pipes( lhs, rhs ), Fields.fields( lhsGroupFields, rhsGroupFields ) );
}
/**
* Constructor Group creates a new Group instance that co-groups the given pipes.
* <p/>
* No grouping fields are given, a {@code null} Fields array is passed on to the array based
* constructor.
*
* @param pipes of type Pipe...
*/
protected Group( Pipe... pipes )
{
this( pipes, (Fields[]) null );
}
/**
* Constructor Group creates a new Group instance that co-groups the given pipes.
* <p/>
* Delegates with a {@code null} group name, declaredFields and resultGroupFields.
*
* @param pipes of type Pipe[]
* @param groupFields of type Fields[]
*/
protected Group( Pipe[] pipes, Fields[] groupFields )
{
this( null, pipes, groupFields, null, null );
}
/**
* Constructor Group creates a new named Group instance that co-groups the given pipes.
* <p/>
* Delegates with {@code null} declaredFields and resultGroupFields.
*
* @param groupName of type String
* @param pipes of type Pipe[]
* @param groupFields of type Fields[]
*/
protected Group( String groupName, Pipe[] pipes, Fields[] groupFields )
{
this( groupName, pipes, groupFields, null, null );
}
/**
* Constructor Group creates a new named Group instance that co-groups the given pipes.
* <p/>
* Delegates with {@code null} resultGroupFields.
*
* @param groupName of type String
* @param pipes of type Pipe[]
* @param groupFields of type Fields[]
* @param declaredFields of type Fields
*/
protected Group( String groupName, Pipe[] pipes, Fields[] groupFields, Fields declaredFields )
{
this( groupName, pipes, groupFields, declaredFields, null );
}
/**
* Constructor Group creates a new named Group instance that co-groups the given pipes.
* <p/>
* Delegates with a {@code null} joiner.
*
* @param groupName of type String
* @param pipes of type Pipe[]
* @param groupFields of type Fields[]
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
*/
protected Group( String groupName, Pipe[] pipes, Fields[] groupFields, Fields declaredFields, Fields resultGroupFields )
{
this( groupName, pipes, groupFields, declaredFields, resultGroupFields, null );
}
/**
* Constructor Group creates a new Group instance that co-groups the given pipes.
* <p/>
* Delegates with a {@code null} group name and {@code null} resultGroupFields.
*
* @param pipes of type Pipe[]
* @param groupFields of type Fields[]
* @param declaredFields of type Fields
* @param joiner of type Joiner
*/
protected Group( Pipe[] pipes, Fields[] groupFields, Fields declaredFields, Joiner joiner )
{
this( null, pipes, groupFields, declaredFields, null, joiner );
}
/**
* Constructor Group creates a new Group instance that co-groups the given pipes.
* <p/>
* Delegates with a {@code null} group name.
*
* @param pipes of type Pipe[]
* @param groupFields of type Fields[]
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
* @param joiner of type Joiner
*/
protected Group( Pipe[] pipes, Fields[] groupFields, Fields declaredFields, Fields resultGroupFields, Joiner joiner )
{
this( null, pipes, groupFields, declaredFields, resultGroupFields, joiner );
}
/**
* Constructor Group creates a new Group instance that co-groups the given pipes. The array based
* co-group constructors all delegate to this one.
* <p/>
* When more than one pipe is given and {@code Pipe.resolvePreviousAll( pipes )} yields a single
* distinct pipe, the instance is set up as a self join: only the first pipe is registered and
* the number of self joins becomes {@code pipes.length - 1}. Otherwise every pipe is registered
* with its own grouping fields and must be uniquely named.
* <p/>
* When groupFields is {@code null} or empty, every pipe is grouped on {@link Fields#FIRST}.
*
* @param groupName of type String, may be null
* @param pipes of type Pipe[]
* @param groupFields of type Fields[], may be null or empty
* @param declaredFields of type Fields, may be null
* @param resultGroupFields of type Fields, may be null, otherwise must be the same size as the grouping fields
* @param joiner of type Joiner, may be null
* @throws IllegalArgumentException if a pipe is unnamed or not uniquely named, if the grouping
* fields differ in size (or are not identical for a self join), if fewer groupFields than pipes
* are given, if resultGroupFields is not the same size as the grouping fields, or if the joiner
* does not accept the resulting number of joins
*/
protected Group( String groupName, Pipe[] pipes, Fields[] groupFields, Fields declaredFields, Fields resultGroupFields, Joiner joiner )
{
this.groupName = groupName;
// null or empty groupFields means every pipe is grouped on its first field
boolean defaultGrouping = groupFields == null || groupFields.length == 0;
int uniques = new HashSet<Pipe>( Arrays.asList( Pipe.resolvePreviousAll( pipes ) ) ).size();
if( pipes.length > 1 && uniques == 1 )
{
// self join, a single pipe joined against itself
Fields selfJoinFields = Fields.FIRST;
if( !defaultGrouping )
{
if( new HashSet<Fields>( Arrays.asList( groupFields ) ).size() != 1 )
throw new IllegalArgumentException( "all groupFields must be identical" );
selfJoinFields = groupFields[ 0 ];
}
addPipe( pipes[ 0 ] );
this.numSelfJoins = pipes.length - 1;
this.groupFieldsMap.put( pipes[ 0 ].getName(), selfJoinFields );
if( resultGroupFields != null && selfJoinFields.size() != resultGroupFields.size() )
throw new IllegalArgumentException( "resultGroupFields and cogroup fields must be same size" );
}
else
{
// fail with a meaningful message instead of an ArrayIndexOutOfBoundsException in the loop below
if( !defaultGrouping && groupFields.length < pipes.length )
throw new IllegalArgumentException( "groupFields are required for every pipe, pipes: " + pipes.length + ", groupFields: " + groupFields.length );
int last = -1;
for( int i = 0; i < pipes.length; i++ )
{
addPipe( pipes[ i ] );
Fields fields = defaultGrouping ? Fields.FIRST : groupFields[ i ];
if( last != -1 && last != fields.size() )
throw new IllegalArgumentException( "all cogroup fields must be same size" );
// track the size of defaulted fields as well, so the resultGroupFields check below sees a real size
last = fields.size();
addGroupFields( pipes[ i ], fields );
}
if( resultGroupFields != null && last != resultGroupFields.size() )
throw new IllegalArgumentException( "resultGroupFields and cogroup fields must be same size" );
}
this.declaredFields = declaredFields;
this.resultGroupFields = resultGroupFields;
this.joiner = joiner;
verifyCoGrouper();
}
/**
* Constructor Group creates a new named Group instance that co-groups two pipes.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
*/
protected Group( String groupName, Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, declaredFields );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that co-groups two pipes.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
*/
protected Group( String groupName, Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields, Fields resultGroupFields )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, declaredFields, resultGroupFields );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that co-groups two pipes.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
* @param joiner of type Joiner
*/
protected Group( String groupName, Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields, Joiner joiner )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, declaredFields, joiner );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that co-groups two pipes.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
* @param joiner of type Joiner
*/
protected Group( String groupName, Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Fields declaredFields, Fields resultGroupFields, Joiner joiner )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, declaredFields, resultGroupFields, joiner );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that co-groups two pipes.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
* @param joiner of type Joiner
*/
protected Group( String groupName, Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields, Joiner joiner )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields, joiner );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that co-groups two pipes.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param lhs of type Pipe, the left hand side pipe
* @param lhsGroupFields of type Fields, the grouping fields for lhs
* @param rhs of type Pipe, the right hand side pipe
* @param rhsGroupFields of type Fields, the grouping fields for rhs
*/
protected Group( String groupName, Pipe lhs, Fields lhsGroupFields, Pipe rhs, Fields rhsGroupFields )
{
this( lhs, lhsGroupFields, rhs, rhsGroupFields );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that co-groups the given pipes.
* <p/>
* Delegates to the unnamed varargs constructor, then assigns the group name.
*
* @param groupName of type String
* @param pipes of type Pipe...
*/
protected Group( String groupName, Pipe... pipes )
{
this( pipes );
this.groupName = groupName;
}
/**
* Constructor Group creates a new Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the constructor without declaredFields, then assigns declaredFields.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
*/
protected Group( Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields )
{
this( pipe, groupFields, numSelfJoins );
this.declaredFields = declaredFields;
}
/**
* Constructor Group creates a new Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the constructor without declaredFields, then assigns declaredFields and
* resultGroupFields.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields, may be null
* @throws IllegalArgumentException if resultGroupFields is not null and not the same size as groupFields
*/
protected Group( Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields, Fields resultGroupFields )
{
this( pipe, groupFields, numSelfJoins );
this.declaredFields = declaredFields;
this.resultGroupFields = resultGroupFields;
if( resultGroupFields != null && groupFields.size() != resultGroupFields.size() )
throw new IllegalArgumentException( "resultGroupFields and cogroup fields must be same size" );
}
/**
* Constructor Group creates a new Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the constructor without a joiner, then assigns the given joiner and verifies it.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
* @param joiner of type Joiner, may be null
*/
protected Group( Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields, Joiner joiner )
{
this( pipe, groupFields, numSelfJoins, declaredFields );
this.joiner = joiner;
// the delegate verified with a null joiner, verify again now the real joiner is known
verifyCoGrouper();
}
/**
* Constructor Group creates a new Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the constructor without a joiner, then assigns the given joiner and verifies it.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields, may be null
* @param joiner of type Joiner, may be null
*/
protected Group( Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields, Fields resultGroupFields, Joiner joiner )
{
this( pipe, groupFields, numSelfJoins, declaredFields, resultGroupFields );
this.joiner = joiner;
// the delegate verified with a null joiner, verify again now the real joiner is known
verifyCoGrouper();
}
/**
* Constructor Group creates a new Group instance that joins the given pipe with itself.
* <p/>
* Registers the pipe and its grouping fields, stores the number of self joins and the joiner,
* then verifies the joiner. The other self join constructors delegate here.
*
* @param pipe of type Pipe, must be named
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param joiner of type Joiner, may be null
*/
protected Group( Pipe pipe, Fields groupFields, int numSelfJoins, Joiner joiner )
{
addPipe( pipe );
this.groupFieldsMap.put( pipe.getName(), groupFields );
this.numSelfJoins = numSelfJoins;
this.joiner = joiner;
verifyCoGrouper();
}
/**
* Constructor Group creates a new Group instance that joins the given pipe with itself.
* <p/>
* Delegates with a {@code null} joiner.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
*/
protected Group( Pipe pipe, Fields groupFields, int numSelfJoins )
{
this( pipe, groupFields, numSelfJoins, (Joiner) null );
}
/**
* Constructor Group creates a new named Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields )
{
this( pipe, groupFields, numSelfJoins, declaredFields );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields, Fields resultGroupFields )
{
this( pipe, groupFields, numSelfJoins, declaredFields, resultGroupFields );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
* @param joiner of type Joiner
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields, Joiner joiner )
{
this( pipe, groupFields, numSelfJoins, declaredFields, joiner );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param declaredFields of type Fields
* @param resultGroupFields of type Fields
* @param joiner of type Joiner
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, int numSelfJoins, Fields declaredFields, Fields resultGroupFields, Joiner joiner )
{
this( pipe, groupFields, numSelfJoins, declaredFields, resultGroupFields, joiner );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
* @param joiner of type Joiner
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, int numSelfJoins, Joiner joiner )
{
this( pipe, groupFields, numSelfJoins, joiner );
this.groupName = groupName;
}
/**
* Constructor Group creates a new named Group instance that joins the given pipe with itself.
* <p/>
* Delegates to the unnamed constructor, then assigns the group name.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param numSelfJoins of type int, the number of self joins
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, int numSelfJoins )
{
this( pipe, groupFields, numSelfJoins );
this.groupName = groupName;
}
////////////
// GROUPBY
////////////
/**
* Constructor Group creates a new GroupBy style Group instance where grouping occurs on
* {@link Fields#ALL} fields.
* <p/>
* Delegates with a {@code null} group name, no sort fields and {@code false} for reverseOrder.
*
* @param pipe of type Pipe
*/
protected Group( Pipe pipe )
{
this( null, pipe, Fields.ALL, null, false );
}
/**
* Constructor Group creates a new GroupBy style Group instance.
* <p/>
* Delegates with a {@code null} group name, no sort fields and {@code false} for reverseOrder.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
*/
protected Group( Pipe pipe, Fields groupFields )
{
this( null, pipe, groupFields, null, false );
}
/**
* Constructor Group creates a new named GroupBy style Group instance.
* <p/>
* Delegates with no sort fields and {@code false} for reverseOrder.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
*/
protected Group( String groupName, Pipe pipe, Fields groupFields )
{
this( groupName, pipe, groupFields, null, false );
}
/**
* Constructor Group creates a new GroupBy style Group instance that also sorts.
* <p/>
* Delegates with a {@code null} group name and {@code false} for reverseOrder.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
protected Group( Pipe pipe, Fields groupFields, Fields sortFields )
{
this( null, pipe, groupFields, sortFields, false );
}
/**
* Constructor Group creates a new named GroupBy style Group instance that also sorts.
* <p/>
* Delegates with {@code false} for reverseOrder.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, Fields sortFields )
{
this( groupName, pipe, groupFields, sortFields, false );
}
/**
* Constructor Group creates a new GroupBy style Group instance that also sorts.
* <p/>
* Delegates with a {@code null} group name.
*
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
protected Group( Pipe pipe, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
this( null, pipe, groupFields, sortFields, reverseOrder );
}
/**
* Constructor Group creates a new named GroupBy style Group instance that also sorts.
* <p/>
* The single pipe is wrapped into an array and handed to the array based GroupBy constructor.
*
* @param groupName of type String
* @param pipe of type Pipe
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
protected Group( String groupName, Pipe pipe, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
this( groupName, Pipe.pipes( pipe ), groupFields, sortFields, reverseOrder );
}
/**
* Constructor Group creates a new GroupBy style Group instance over the given pipes, all
* sharing the same grouping fields.
* <p/>
* Delegates with a {@code null} group name, no sort fields and {@code false} for reverseOrder.
*
* @param pipes of type Pipe[]
* @param groupFields of type Fields
*/
protected Group( Pipe[] pipes, Fields groupFields )
{
this( null, pipes, groupFields, null, false );
}
/**
* Constructor Group creates a new named GroupBy style Group instance over the given pipes, all
* sharing the same grouping fields.
* <p/>
* Delegates with no sort fields and {@code false} for reverseOrder.
*
* @param groupName of type String
* @param pipes of type Pipe[]
* @param groupFields of type Fields
*/
protected Group( String groupName, Pipe[] pipes, Fields groupFields )
{
this( groupName, pipes, groupFields, null, false );
}
/**
* Constructor Group creates a new GroupBy style Group instance over the given pipes that also sorts.
* <p/>
* Delegates with a {@code null} group name and {@code false} for reverseOrder.
*
* @param pipes of type Pipe[]
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
protected Group( Pipe[] pipes, Fields groupFields, Fields sortFields )
{
this( null, pipes, groupFields, sortFields, false );
}
/**
* Constructor Group creates a new named GroupBy style Group instance over the given pipes that
* also sorts.
* <p/>
* Delegates with {@code false} for reverseOrder.
*
* @param groupName of type String
* @param pipe of type Pipe[]
* @param groupFields of type Fields
* @param sortFields of type Fields
*/
protected Group( String groupName, Pipe[] pipe, Fields groupFields, Fields sortFields )
{
this( groupName, pipe, groupFields, sortFields, false );
}
/**
* Constructor Group creates a new GroupBy style Group instance over the given pipes that also sorts.
* <p/>
* Delegates with a {@code null} group name.
*
* @param pipes of type Pipe[]
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
protected Group( Pipe[] pipes, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
this( null, pipes, groupFields, sortFields, reverseOrder );
}
/**
* Constructor Group creates a new Group instance.
*
* @param groupName of type String
* @param pipes of type Pipe[]
* @param groupFields of type Fields
* @param sortFields of type Fields
* @param reverseOrder of type boolean
*/
protected Group( String groupName, Pipe[] pipes, Fields groupFields, Fields sortFields, boolean reverseOrder )
{
this.isGroupBy = true;
this.groupName = groupName;
for( Pipe pipe : pipes )
{
addPipe( pipe );
this.groupFieldsMap.put( pipe.getName(), groupFields );
if( sortFields != null )
this.sortFieldsMap.put( pipe.getName(), sortFields );
}
this.reverseOrder = reverseOrder;
this.joiner = new InnerJoin();
}
/**
* Method verifyCoGrouper installs an {@link InnerJoin} when no joiner was given, otherwise checks
* that the joiner accepts the number of joins this Group performs. A joiner reporting -1 joins
* is accepted without further checks.
*
* @throws IllegalArgumentException if the joiner does not accept the number of joins
*/
private void verifyCoGrouper()
{
if( joiner == null )
{
joiner = new InnerJoin();
}
else if( joiner.numJoins() != -1 )
{
// joining two streams is one join
int actualJoins = Math.max( numSelfJoins, groupFieldsMap.size() - 1 );
if( actualJoins != joiner.numJoins() )
throw new IllegalArgumentException( "invalid cogrouper, only accepts " + joiner.numJoins() + " joins, there are: " + actualJoins );
}
}
/**
* Method getDeclaredFields returns the declaredFields of this Group object.
*
* @return the declaredFields (type Fields) of this Group object, null when none were given.
*/
public Fields getDeclaredFields()
{
return declaredFields;
}
/**
* Method addPipe appends the given pipe to the list of incoming pipes.
*
* @param pipe of type Pipe
* @throws IllegalArgumentException if the pipe has no name
*/
private void addPipe( Pipe pipe )
{
String pipeName = pipe.getName();
if( pipeName == null )
throw new IllegalArgumentException( "each input pipe must have a name" );
// no uniqueness check here, the same pipe may be added more than once
pipes.add( pipe );
}
/**
* Method addGroupFields registers the grouping fields for the given pipe under the pipe name.
*
* @param pipe of type Pipe
* @param fields of type Fields
* @throws IllegalArgumentException if grouping fields were already registered under the same pipe name
*/
private void addGroupFields( Pipe pipe, Fields fields )
{
String pipeName = pipe.getName();
boolean alreadyRegistered = groupFieldsMap.containsKey( pipeName );
if( alreadyRegistered )
throw new IllegalArgumentException( "each input pipe branch must be uniquely named" );
groupFieldsMap.put( pipeName, fields );
}
/**
* Method getName returns the name of this Group. When no name was given, one is derived by
* joining the incoming pipe names with "+" for a GroupBy or "*" otherwise, and cached.
*
* @return the name (type String) of this Group object.
*/
@Override
public String getName()
{
if( groupName == null )
{
// "*" is more semantically correct for a join
String separator = isGroupBy ? "+" : "*";
StringBuilder joined = new StringBuilder();
for( Pipe pipe : pipes )
{
if( joined.length() != 0 )
joined.append( separator );
joined.append( pipe.getName() );
}
groupName = joined.toString();
}
return groupName;
}
/**
* Method getPrevious returns the incoming pipes of this Group as a new array, in the order they
* were added.
*
* @return the previous (type Pipe[]) of this Group object.
*/
@Override
public Pipe[] getPrevious()
{
Pipe[] previous = new Pipe[ pipes.size() ];
return pipes.toArray( previous );
}
/**
* Method getGroupingSelectors returns the groupingSelectors of this Group object, keyed by the
* incoming pipe name. The internal map is returned, not a copy.
*
* @return the groupingSelectors (type Map<String, Fields>) of this Group object.
*/
public Map<String, Fields> getGroupingSelectors()
{
return groupFieldsMap;
}
/**
* Method getSortingSelectors returns the sortingSelectors of this Group object, keyed by the
* incoming pipe name. The internal map is returned, not a copy.
*
* @return the sortingSelectors (type Map<String, Fields>) of this Group object.
*/
public Map<String, Fields> getSortingSelectors()
{
return sortFieldsMap;
}
/**
* Method isSorted returns true if this Group instance is sorting values other than the group fields,
* that is, if any sort fields were registered.
*
* @return the sorted (type boolean) of this Group object.
*/
public boolean isSorted()
{
return !sortFieldsMap.isEmpty();
}
/**
* Method isSortReversed returns true if sorting is reversed, as requested through the
* reverseOrder constructor argument.
*
* @return the sortReversed (type boolean) of this Group object.
*/
public boolean isSortReversed()
{
return reverseOrder;
}
/**
* Method getPipePos returns a map from incoming pipe name to its position in the list of
* incoming pipes. The map is built on first use and cached.
*
* @return the pipePos (type Map<String, Integer>) of this Group object.
*/
private Map<String, Integer> getPipePos()
{
if( pipePos == null )
{
pipePos = new HashMap<String, Integer>();
for( int index = 0; index < pipes.size(); index++ )
pipePos.put( pipes.get( index ).getName(), index );
}
return pipePos;
}
/**
* Method collectReduceGrouping extracts the grouping key and the values from the given entry and
* emits them as a key/value pair to the given collector.
* <p/>
* The key is the grouping Tuple, paired with the sort Tuple when sort fields are resolved for
* the incoming scope. For a GroupBy the key and values are emitted as is. Otherwise this is a
* join, and both are wrapped in an {@link IndexTuple} carrying the position of the incoming
* pipe, so the source of a given tuple can be tracked when co-grouping at the reduce stage.
*
* @param incomingScope of type Scope
* @param outgoingScope of type Scope
* @param entry of type TupleEntry
* @param output of type OutputCollector
* @throws IOException thrown by OutputCollector on collect
*/
public void collectReduceGrouping( Scope incomingScope, Scope outgoingScope, TupleEntry entry, OutputCollector output ) throws IOException
{
Fields groupFields = outgoingScope.getGroupingSelectors().get( incomingScope.getName() );
Fields sortFields = null;
if( outgoingScope.getSortingSelectors() != null )
sortFields = outgoingScope.getSortingSelectors().get( incomingScope.getName() );
if( LOG.isDebugEnabled() )
LOG.debug( "cogroup: [" + incomingScope + "] key pos: [" + groupFields + "]" );
// todo: would be nice to delegate this back to the GroupClosure
// we are nulling dupe values here to reduce bandwidth usage
Tuple groupTuple = Tuples.extractTuple( entry, groupFields );
Tuple sortTuple = null;
if( sortFields != null )
sortTuple = entry.selectTuple( sortFields );
Tuple valuesTuple = entry.getTuple();
Tuple groupKey;
if( sortTuple != null )
groupKey = new TuplePair( groupTuple, sortTuple );
else
groupKey = groupTuple;
if( !isGroupBy() )
{
Integer pos = getPipePos().get( incomingScope.getName() );
output.collect( new IndexTuple( pos, groupKey ), new IndexTuple( pos, valuesTuple ) );
return;
}
output.collect( groupKey, valuesTuple );
}
/**
* Method unwrapGrouping returns the plain grouping Tuple from the given grouping key.
* <p/>
* For a join the key is an {@link IndexTuple} and its wrapped Tuple is returned. For a sorted
* GroupBy the key is a {@link TuplePair} and its left hand side is returned. For an unsorted
* GroupBy the key is returned unchanged.
*
* @param tuple of type Tuple
* @return Tuple
*/
public Tuple unwrapGrouping( Tuple tuple )
{
if( isGroupBy )
{
if( isSorted() )
return ( (TuplePair) tuple ).getLhs();
return tuple;
}
return ( (IndexTuple) tuple ).getTuple();
}
/**
* Method iterateReduceValues resets the closure with the given key and values, and returns the
* Tuple iterator the joiner creates over that closure.
* <p/>
* The closure is created by {@link #initializeReduce(FlowProcess, Set, Scope)}, which must have
* been called before this method.
*
* @param key of type Tuple
* @param values of type Iterator
* @return a Tuple Iterator
*/
public Iterator<Tuple> iterateReduceValues( Tuple key, Iterator values )
{
closure.reset( joiner, key, values );
Iterator<Tuple> joined = joiner.getIterator( closure );
return joined;
}
/**
* Method initializeReduce creates the closure later used by
* {@link #iterateReduceValues(Tuple, Iterator)}.
* <p/>
* For a GroupBy only the first incoming scope is consulted and a {@link GroupClosure} is created.
* Otherwise the grouping and values fields of every incoming scope are placed at the position of
* the matching incoming pipe, and a {@link CoGroupClosure} is created.
*
* @param flowProcess of type FlowProcess
* @param incomingScopes of type Set<Scope>
* @param outgoingScope of type Scope
*/
public void initializeReduce( FlowProcess flowProcess, Set<Scope> incomingScopes, Scope outgoingScope )
{
if( isGroupBy() )
{
Scope firstScope = incomingScopes.iterator().next();
Fields[] grouping = Fields.fields( outgoingScope.getGroupingSelectors().get( firstScope.getName() ) );
Fields[] values = Fields.fields( firstScope.getOutValuesFields() );
closure = new GroupClosure( flowProcess, grouping, values );
return;
}
int numPipes = pipes.size();
Fields[] grouping = new Fields[ numPipes ];
Fields[] values = new Fields[ numPipes ];
for( Scope scope : incomingScopes )
{
// the slot of each scope is the position of the pipe it is named after
int slot = getPipePos().get( scope.getName() );
grouping[ slot ] = outgoingScope.getGroupingSelectors().get( scope.getName() );
values[ slot ] = scope.getOutValuesFields();
}
closure = new CoGroupClosure( flowProcess, numSelfJoins, grouping, values );
}
/**
* Method isGroupBy returns true if this Group instance will perform a GroupBy operation, that is,
* if it was created through one of the GroupBy constructors.
*
* @return the groupBy (type boolean) of this Group object.
*/
public boolean isGroupBy()
{
return isGroupBy;
}
/**
* Method isSelfJoin returns true if this Group instance joins a pipe with itself, that is, if the
* number of self joins is not zero.
*
* @return the selfJoin (type boolean) of this Group object.
*/
boolean isSelfJoin()
{
return numSelfJoins != 0;
}
// FIELDS
/**
* Method outgoingScopeFor resolves the grouping selectors, the sorting selectors and the declared
* fields against the given incoming scopes, in that order, and returns them as a new Scope.
*
* @param incomingScopes of type Set<Scope>
* @return Scope
*/
@Override
public Scope outgoingScopeFor( Set<Scope> incomingScopes )
{
Map<String, Fields> resolvedGrouping = resolveGroupingSelectors( incomingScopes );
Map<String, Fields> resolvedSorting = resolveSortingSelectors( incomingScopes );
// a Group emits exactly what it declares, so the same fields are passed twice
Fields outgoingFields = resolveDeclared( incomingScopes );
return new Scope( getName(), outgoingFields, resultGroupFields, resolvedGrouping, resolvedSorting, outgoingFields, isGroupBy() );
}
/**
* Method resolveGroupingSelectors resolves the grouping selectors against the given incoming
* scopes and checks that all resolved grouping fields are the same size.
*
* @param incomingScopes of type Set<Scope>
* @return the resolved grouping fields (type Map<String, Fields>) keyed by incoming scope name
* @throws OperatorException if the selectors cannot be resolved or the resolved fields differ in size
*/
Map<String, Fields> resolveGroupingSelectors( Set<Scope> incomingScopes )
{
try
{
Map<String, Fields> resolved = resolveSelectorsAgainstIncoming( incomingScopes, getGroupingSelectors(), "grouping" );
Iterator<Fields> remaining = resolved.values().iterator();
// the first resolved fields set the size all others must match
final int expectedSize = remaining.next().size();
while( remaining.hasNext() )
{
if( remaining.next().size() != expectedSize )
throw new OperatorException( this, "all grouping fields must be same size:" + toString() );
}
return resolved;
}
catch( FieldsResolverException exception )
{
throw new OperatorException( this, OperatorException.Kind.grouping, exception.getSourceFields(), exception.getSelectorFields(), exception );
}
catch( RuntimeException exception )
{
throw new OperatorException( this, "could not resolve grouping selector in: " + this, exception );
}
}
/**
* Method resolveSelectorsAgainstIncoming resolves, for every incoming scope, the selector
* registered under the scope name into the actual fields of that scope.
* <p/>
* An ALL selector resolves to all fields of the scope, a GROUP selector to its outgoing grouping
* fields, a VALUES selector to its outgoing values fields minus the grouping fields. Any other
* selector is applied to all fields of the scope.
*
* @param incomingScopes of type Set<Scope>
* @param selectors of type Map<String, Fields>, keyed by scope name
* @param type of type String, only used in the error message
* @return the resolved fields (type Map<String, Fields>) keyed by incoming scope name
* @throws OperatorException if no selector is registered for an incoming scope
*/
private Map<String, Fields> resolveSelectorsAgainstIncoming( Set<Scope> incomingScopes, Map<String, Fields> selectors, String type )
{
Map<String, Fields> resolved = new HashMap<String, Fields>();
for( Scope scope : incomingScopes )
{
String scopeName = scope.getName();
Fields selector = selectors.get( scopeName );
if( selector == null )
throw new OperatorException( this, "no " + type + " selector found for: " + scopeName );
Fields fields;
if( selector.isAll() )
{
fields = resolveFields( scope );
}
else if( selector.isGroup() )
{
fields = scope.getOutGroupingFields();
}
else if( selector.isValues() )
{
Fields grouping = scope.getOutGroupingFields();
fields = scope.getOutValuesFields().subtract( grouping );
}
else
{
fields = resolveFields( scope ).select( selector );
}
resolved.put( scopeName, fields );
}
return resolved;
}
/**
* Method resolveSortingSelectors resolves the sorting selectors against the given incoming scopes.
*
* @param incomingScopes of type Set<Scope>
* @return the resolved sorting fields (type Map<String, Fields>) keyed by incoming scope name,
* or null when this Group has no sorting selectors
* @throws OperatorException if the selectors cannot be resolved
*/
Map<String, Fields> resolveSortingSelectors( Set<Scope> incomingScopes )
{
try
{
boolean sorting = !getSortingSelectors().isEmpty();
if( sorting )
return resolveSelectorsAgainstIncoming( incomingScopes, getSortingSelectors(), "sorting" );
return null;
}
catch( FieldsResolverException exception )
{
throw new OperatorException( this, OperatorException.Kind.sorting, exception.getSourceFields(), exception.getSelectorFields(), exception );
}
catch( RuntimeException exception )
{
throw new OperatorException( this, "could not resolve sorting selector in: " + this, exception );
}
}
/**
* Method resolveFields returns the fields of the given scope that are visible to this Group: the
* outgoing grouping fields when the scope reports isEvery, the outgoing values fields otherwise.
*
* @param scope of type Scope
* @return Fields
*/
@Override
public Fields resolveFields( Scope scope )
{
return scope.isEvery() ? scope.getOutGroupingFields() : scope.getOutValuesFields();
}
/**
 * Resolves the fields declared by this Group against the given incoming scopes.
 * <p/>
 * When declared fields have been set they are size-checked against the incoming fields and returned as is.
 * Otherwise, when {@code isGroupBy()} is true, every incoming scope must resolve to equal fields (a merge);
 * in all other cases the fields resolved for each pipe are appended in pipe order.
 *
 * @param incomingScopes the scopes arriving at this Group
 * @return the resolved declared fields
 * @throws OperatorException if validation fails or any other runtime failure occurs while resolving; an
 *                           OperatorException raised during resolution is rethrown unwrapped
 */
Fields resolveDeclared( Set<Scope> incomingScopes )
  {
  try
    {
    Fields declaredFields = getDeclaredFields();

    if( declaredFields != null ) // null for GroupBy
      return verifyDeclaredFields( declaredFields, incomingScopes );

    // support merge or cogrouping here
    if( isGroupBy() )
      return resolveMergedFields( incomingScopes );

    return resolveAppendedFields( incomingScopes );
    }
  catch( OperatorException exception )
    {
    throw exception;
    }
  catch( RuntimeException exception )
    {
    throw new OperatorException( this, "could not resolve declared fields in: " + this, exception );
    }
  }

/** Checks the explicitly declared fields against the sizes of the incoming fields and returns them unchanged. */
private Fields verifyDeclaredFields( Fields declaredFields, Set<Scope> incomingScopes )
  {
  if( incomingScopes.size() != pipes.size() && isSelfJoin() )
    throw new OperatorException( this, "self joins without intermediate operators are not permitted, see 'numSelfJoins' constructor or identity function" );

  int size = 0;
  boolean foundUnknown = false;
  List<Fields> resolvedFields = new ArrayList<Fields>();

  for( Scope incomingScope : incomingScopes )
    {
    Fields fields = resolveFields( incomingScope );

    foundUnknown = foundUnknown || fields.isUnknown();
    size += fields.size();
    resolvedFields.add( fields );
    }

  // every incoming field is expected once per join, a self join repeats them numSelfJoins more times
  int expectedSize = size * ( numSelfJoins + 1 );

  // we must relax field checking in the face of unknown fields
  if( !foundUnknown && declaredFields.size() != expectedSize )
    {
    if( isSelfJoin() )
      throw new OperatorException( this, "declared grouped fields not same size as grouped values, declared: " + declaredFields.printVerbose() + " != size: " + expectedSize );
    else
      throw new OperatorException( this, "declared grouped fields not same size as grouped values, declared: " + declaredFields.printVerbose() + " resolved: " + Util.print( resolvedFields, "" ) );
    }

  return declaredFields;
  }

/** Returns the fields shared by all incoming scopes of a merge, failing if any scope resolves to different fields. */
private Fields resolveMergedFields( Set<Scope> incomingScopes )
  {
  Fields commonFields = null; // stays null when there are no incoming scopes

  for( Scope incomingScope : incomingScopes )
    {
    Fields fields = resolveFields( incomingScope );

    if( commonFields == null )
      commonFields = fields;
    else if( !commonFields.equals( fields ) )
      throw new OperatorException( this, "merged streams must declare the same field names, expected: " + commonFields.printVerbose() + " found: " + fields.print() );
    }

  return commonFields;
  }

/** Appends the fields resolved for each pipe, in pipe order, failing if the append raises a TupleException. */
private Fields resolveAppendedFields( Set<Scope> incomingScopes )
  {
  // index the scopes by name so they can be visited in the order of the pipes
  Map<String, Scope> scopesMap = new HashMap<String, Scope>();

  for( Scope incomingScope : incomingScopes )
    scopesMap.put( incomingScope.getName(), incomingScope );

  List<Fields> appendableFields = new ArrayList<Fields>();

  for( Pipe pipe : pipes )
    appendableFields.add( resolveFields( scopesMap.get( pipe.getName() ) ) );

  Fields appendedFields = new Fields();

  try
    {
    // will throw on name collisions
    for( Fields appendableField : appendableFields )
      appendedFields = appendedFields.append( appendableField );
    }
  catch( TupleException exception )
    {
    StringBuilder fields = new StringBuilder();

    for( Fields appendableField : appendableFields )
      fields.append( appendableField.print() );

    throw new OperatorException( this, "found duplicate field names in cogrouped tuple stream: " + fields, exception );
    }

  return appendedFields;
  }
/**
 * Returns the outgoing selector for this Group, which is the given declared fields unchanged.
 *
 * @param declared the declared fields
 * @return the same {@code declared} instance
 */
Fields resolveOutgoingSelector( Fields declared )
{
return declared;
}
/**
 * Tests whether the given element is equivalent to this Group.
 * <p/>
 * The element is equivalent when the superclass check passes and both the grouping fields map and the pipes
 * of the two groups are equal.
 *
 * @param element the element to compare against
 * @return true if the element is equivalent to this instance
 */
@Override
public boolean isEquivalentTo( FlowElement element )
  {
  if( !super.isEquivalentTo( element ) )
    return false;

  Group that = (Group) element;

  return groupFieldsMap.equals( that.groupFieldsMap ) && pipes.equals( that.pipes );
  }
// OBJECT OVERRIDES
/**
 * Compares this Group with another object.
 * <p/>
 * Two groups are equal when they are of the same class, the superclass considers them equal, and their group
 * name, grouping fields map and pipes are each either both null or equal.
 *
 * @param object the object to compare against
 * @return true if the given object is equal to this instance
 */
@Override
public boolean equals( Object object )
  {
  if( this == object )
    return true;

  if( object == null || getClass() != object.getClass() || !super.equals( object ) )
    return false;

  Group that = (Group) object;

  // each member matches when both sides are null or the values are equal
  return ( groupName == null ? that.groupName == null : groupName.equals( that.groupName ) )
    && ( groupFieldsMap == null ? that.groupFieldsMap == null : groupFieldsMap.equals( that.groupFieldsMap ) )
    && ( pipes == null ? that.pipes == null : pipes.equals( that.pipes ) );
  }
/**
 * Computes a hash code from the superclass hash combined with the pipes, the grouping fields map and the
 * group name, in that order, where a null member contributes zero.
 *
 * @return the hash code of this instance
 */
@Override
public int hashCode()
  {
  int hash = super.hashCode();

  // fold the members in with the 31 multiplier, in this fixed order
  for( Object member : new Object[]{pipes, groupFieldsMap, groupName} )
    hash = 31 * hash + ( member == null ? 0 : member.hashCode() );

  return hash;
  }
/**
 * Returns the superclass string followed by a "[by:...]" section listing the verbose form of each grouping
 * fields entry. Entry names are included only when there is more than one entry, and the number of self joins
 * is appended when this is a self join.
 *
 * @return a string describing this Group
 */
@Override
public String toString()
  {
  StringBuilder text = new StringBuilder( super.toString() ).append( "[by:" );

  // names are only needed to tell entries apart
  boolean showNames = groupFieldsMap.size() > 1;

  for( String key : groupFieldsMap.keySet() )
    {
    if( showNames )
      text.append( key ).append( ":" );

    text.append( groupFieldsMap.get( key ).printVerbose() );
    }

  if( isSelfJoin() )
    text.append( "[numSelfJoins:" ).append( numSelfJoins ).append( "]" );

  return text.append( "]" ).toString();
  }
/**
 * Appends the superclass output and, when the scope has grouping selectors, a "[by:...]" section listing each
 * selector. Selector names are included only when there is more than one selector, and the number of self
 * joins is appended when this is a self join.
 *
 * @param buffer the buffer to append to
 * @param scope  the scope whose grouping selectors are printed
 */
@Override
protected void printInternal( StringBuffer buffer, Scope scope )
  {
  super.printInternal( buffer, scope );

  Map<String, Fields> selectors = scope.getGroupingSelectors();

  if( selectors == null )
    return;

  // names are only needed to tell selectors apart
  boolean showNames = selectors.size() > 1;

  buffer.append( "[by:" );

  for( Map.Entry<String, Fields> entry : selectors.entrySet() )
    {
    if( showNames )
      buffer.append( entry.getKey() ).append( ":" );

    buffer.append( entry.getValue().print() );
    }

  if( isSelfJoin() )
    buffer.append( "[numSelfJoins:" ).append( numSelfJoins ).append( "]" );

  buffer.append( "]" );
  }
}