/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.PlannerException;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.aggregator.Count;
import cascading.operation.aggregator.First;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.LeftJoin;
import cascading.pipe.cogroup.MixedJoin;
import cascading.pipe.cogroup.OuterJoin;
import cascading.pipe.cogroup.RightJoin;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryIterator;
public class CoGroupFieldedPipesTest extends ClusterTestCase
{
// Relative paths of the sample data files the tests read as inputs.
// NOTE(review): several of these are not referenced by the tests visible in this part of the file;
// presumably they are used further down — confirm before removing any.
String inputFileApache = "build/test/data/apache.10.txt";
String inputFileIps = "build/test/data/ips.20.txt";
String inputFileNums20 = "build/test/data/nums.20.txt";
String inputFileNums10 = "build/test/data/nums.10.txt";
String inputFileCritics = "build/test/data/critics.txt";
// "num char" pairs, one per line; the tests split each line on a single space.
String inputFileUpper = "build/test/data/upper.txt";
String inputFileLower = "build/test/data/lower.txt";
String inputFileLowerOffset = "build/test/data/lower-offset.txt";
String inputFileJoined = "build/test/data/lower+upper.txt";
String inputFileLhs = "build/test/data/lhs.txt";
String inputFileRhs = "build/test/data/rhs.txt";
String inputFileCross = "build/test/data/lhs+rhs-cross.txt";
// Inputs for the inner/outer/left/right join tests.
String inputFileLhsSparse = "build/test/data/lhs-sparse.txt";
String inputFileRhsSparse = "build/test/data/rhs-sparse.txt";
// Base directory under which each test creates its own sink sub-directory.
String outputPath = "build/test/output/fields/";
/**
 * Creates the test case, handing the name "fielded pipes" and the cluster settings to the superclass.
 * <p/>
 * NOTE(review): the meaning of the trailing numeric arguments (4, 1) is defined by ClusterTestCase,
 * which is not visible here — presumably task counts; confirm against that class.
 */
public CoGroupFieldedPipesTest()
{
super( "fielded pipes", true, 4, 1 ); // leave cluster testing enabled
}
/**
 * Joins the "lower" and "upper" sample files on their leading number using a CoGroup with its default joiner.
 * <p/>
 * Each input line is split on a single space into "num" and "char"; the joined fields are declared
 * positionally with Fields.size( 4 ). The sink is validated for a length of five and its first two
 * lines are compared against the expected tab-separated values.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroup() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 5, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    // position 1 of each sink entry carries the written line
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tB", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Same join as testCoGroup, but the splitter declares Fields.UNKNOWN, so the grouping fields are
 * selected by position ( new Fields( 0 ) ) instead of by name.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupWithUnknowns() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  // no field names are declared for the split values
  Function splitter = new RegexSplitter( Fields.UNKNOWN, " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // group on the first split value of each side
  Pipe splice = new CoGroup( pipeLower, new Fields( 0 ), pipeUpper, new Fields( 0 ), Fields.size( 4 ) );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 5, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tB", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * This test intentionally filters out all values of the "upper" branch so the intermediate tap is empty.
 * That empty tap is then cogrouped with the "lower" stream using an outer join, so every written line
 * is expected to carry "null" for both right-hand values.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupFilteredBranch() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupfilteredbranch/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  pipeUpper = new Each( pipeUpper, new Fields( "num" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all
  pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 5, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\tnull\tnull", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\tnull\tnull", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Joins the "lower" sample file with itself: two separate taps over the same file feed the "lower" and
 * "upper" head pipes, so each written line pairs a lower-case value with the same lower-case value.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupSelf() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );

  // both taps deliberately read the same file
  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupself/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroupself.dot" );

  countFlow.complete();

  validateLength( countFlow, 5, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\ta", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tb", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Method testCoGroupAfterEvery tests that a tmp tap is inserted after the Every in the cogroup join.
 * <p/>
 * Both branches are grouped on "num" and reduced with First over "char" before being joined.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupAfterEvery() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  pipeLower = new GroupBy( pipeLower, new Fields( "num" ) );
  pipeLower = new Every( pipeLower, new Fields( "char" ), new First(), Fields.ALL );

  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
  pipeUpper = new GroupBy( pipeUpper, new Fields( "num" ) );
  pipeUpper = new Every( pipeUpper, new Fields( "char" ), new First(), Fields.ALL );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

  // A PlannerException raised here simply propagates and fails the test. To inspect a planner
  // failure, catch it temporarily and call exception.writeDOT( "cogroupedevery.dot" ).
  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 5, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tB", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
* 1 a1
* 1 a2
* 1 a3
* 2 b1
* 3 c1
* 4 d1
* 4 d2
* 4 d3
* 5 e1
* 5 e2
* 5 e3
* 7 g1
* 7 g2
* 7 g3
* 7 g4
* 7 g5
* <p/>
* 1 A1
* 1 A2
* 1 A3
* 2 B1
* 2 B2
* 2 B3
* 4 D1
* 6 F1
* 6 F2
* <p/>
* 1 a1 1 A1
* 1 a1 1 A2
* 1 a1 1 A3
* 1 a2 1 A1
* 1 a2 1 A2
* 1 a2 1 A3
* 1 a3 1 A1
* 1 a3 1 A2
* 1 a3 1 A3
* 2 b1 2 B1
* 2 b1 2 B2
* 2 b1 2 B3
* 4 d1 4 D1
* 4 d2 4 D1
* 4 d3 4 D1
*
* @throws Exception
*/
public void testCoGroupInner() throws Exception
{
  // Joins the two sparse sample files on "num" with CoGroup's default joiner and checks that exactly
  // the 15 expected lines are written, in any order.
  if( !new File( inputFileLhsSparse ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLhsSparse );
  copyFromLocal( inputFileRhsSparse );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLhsSparse );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileRhsSparse );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupinner/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 15, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    Set<String> results = new HashSet<String>();

    results.add( "1\ta1\t1\tA1" );
    results.add( "1\ta1\t1\tA2" );
    results.add( "1\ta1\t1\tA3" );
    results.add( "1\ta2\t1\tA1" );
    results.add( "1\ta2\t1\tA2" );
    results.add( "1\ta2\t1\tA3" );
    results.add( "1\ta3\t1\tA1" );
    results.add( "1\ta3\t1\tA2" );
    results.add( "1\ta3\t1\tA3" );
    results.add( "2\tb1\t2\tB1" );
    results.add( "2\tb1\t2\tB2" );
    results.add( "2\tb1\t2\tB3" );
    results.add( "4\td1\t4\tD1" );
    results.add( "4\td2\t4\tD1" );
    results.add( "4\td3\t4\tD1" );

    // every line read must remove one distinct expected value; order is not significant
    int size = results.size();

    for( int i = 0; i < size; i++ )
      assertTrue( "not equal: tuple.get(1)", results.remove( iterator.next().get( 1 ) ) );

    assertEquals( 0, results.size() );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Joins two branches that each keep only a single, differently named field ("num1" and "num2") and
 * counts the joined values per group.
 * <p/>
 * The sink is validated for a length of two, and both lines read must be one of "1\t1" and "5\t2"
 * (presumably the join key followed by its count).
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupInnerSingleField() throws Exception
{
  if( !new File( inputFileLowerOffset ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLowerOffset );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLowerOffset );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupinnersingle/", true );

  // the trailing Fields argument keeps only the number from each split line
  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), new RegexSplitter( new Fields( "num1", "char" ), " " ), new Fields( "num1" ) );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), new RegexSplitter( new Fields( "num2", "char" ), " " ), new Fields( "num2" ) );

  Pipe join = new CoGroup( pipeLower, new Fields( "num1" ), pipeUpper, new Fields( "num2" ) );

  join = new Every( join, new Count() );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, join );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 2, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    Set<String> results = new HashSet<String>();

    results.add( "1\t1" );
    results.add( "5\t2" );

    // order is not significant; each line read must remove one distinct expected value
    assertTrue( "not equal: tuple.get(1)", results.remove( iterator.next().get( 1 ) ) );
    assertTrue( "not equal: tuple.get(1)", results.remove( iterator.next().get( 1 ) ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
* 1 a1
* 1 a2
* 1 a3
* 2 b1
* 3 c1
* 4 d1
* 4 d2
* 4 d3
* 5 e1
* 5 e2
* 5 e3
* 7 g1
* 7 g2
* 7 g3
* 7 g4
* 7 g5
* <p/>
* 1 A1
* 1 A2
* 1 A3
* 2 B1
* 2 B2
* 2 B3
* 4 D1
* 6 F1
* 6 F2
* <p/>
* 1 a1 1 A1
* 1 a1 1 A2
* 1 a1 1 A3
* 1 a2 1 A1
* 1 a2 1 A2
* 1 a2 1 A3
* 1 a3 1 A1
* 1 a3 1 A2
* 1 a3 1 A3
* 2 b1 2 B1
* 2 b1 2 B2
* 2 b1 2 B3
* 3 c1 null null
* 4 d1 4 D1
* 4 d2 4 D1
* 4 d3 4 D1
* 5 e1 null null
* 5 e2 null null
* 5 e3 null null
* null null 6 F1
* null null 6 F2
* 7 g1 null null
* 7 g2 null null
* 7 g3 null null
* 7 g4 null null
* 7 g5 null null
*
* @throws Exception
*/
public void testCoGroupOuter() throws Exception
{
  // Joins the two sparse sample files on "num" with an OuterJoin and checks that exactly the 26
  // expected lines are written, in any order; unmatched sides appear as "null".
  if( !new File( inputFileLhsSparse ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLhsSparse );
  copyFromLocal( inputFileRhsSparse );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLhsSparse );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileRhsSparse );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupouter/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ), new OuterJoin() );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 26, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    Set<String> results = new HashSet<String>();

    results.add( "1\ta1\t1\tA1" );
    results.add( "1\ta1\t1\tA2" );
    results.add( "1\ta1\t1\tA3" );
    results.add( "1\ta2\t1\tA1" );
    results.add( "1\ta2\t1\tA2" );
    results.add( "1\ta2\t1\tA3" );
    results.add( "1\ta3\t1\tA1" );
    results.add( "1\ta3\t1\tA2" );
    results.add( "1\ta3\t1\tA3" );
    results.add( "2\tb1\t2\tB1" );
    results.add( "2\tb1\t2\tB2" );
    results.add( "2\tb1\t2\tB3" );
    results.add( "3\tc1\tnull\tnull" );
    results.add( "4\td1\t4\tD1" );
    results.add( "4\td2\t4\tD1" );
    results.add( "4\td3\t4\tD1" );
    results.add( "5\te1\tnull\tnull" );
    results.add( "5\te2\tnull\tnull" );
    results.add( "5\te3\tnull\tnull" );
    results.add( "null\tnull\t6\tF1" );
    results.add( "null\tnull\t6\tF2" );
    results.add( "7\tg1\tnull\tnull" );
    results.add( "7\tg2\tnull\tnull" );
    results.add( "7\tg3\tnull\tnull" );
    results.add( "7\tg4\tnull\tnull" );
    results.add( "7\tg5\tnull\tnull" );

    // every line read must remove one distinct expected value; order is not significant
    int size = results.size();

    for( int i = 0; i < size; i++ )
      assertTrue( "not equal: tuple.get(1)", results.remove( iterator.next().get( 1 ) ) );

    assertEquals( 0, results.size() );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
* 1 a1
* 1 a2
* 1 a3
* 2 b1
* 3 c1
* 4 d1
* 4 d2
* 4 d3
* 5 e1
* 5 e2
* 5 e3
* 7 g1
* 7 g2
* 7 g3
* 7 g4
* 7 g5
* <p/>
* 1 A1
* 1 A2
* 1 A3
* 2 B1
* 2 B2
* 2 B3
* 4 D1
* 6 F1
* 6 F2
* <p/>
* 1 a1 1 A1
* 1 a1 1 A2
* 1 a1 1 A3
* 1 a2 1 A1
* 1 a2 1 A2
* 1 a2 1 A3
* 1 a3 1 A1
* 1 a3 1 A2
* 1 a3 1 A3
* 2 b1 2 B1
* 2 b1 2 B2
* 2 b1 2 B3
* 3 c1 null null
* 4 d1 4 D1
* 4 d2 4 D1
* 4 d3 4 D1
* 5 e1 null null
* 5 e2 null null
* 5 e3 null null
* 7 g1 null null
* 7 g2 null null
* 7 g3 null null
* 7 g4 null null
* 7 g5 null null
*
* @throws Exception
*/
public void testCoGroupInnerOuter() throws Exception
{
  // Joins the two sparse sample files on "num" with a LeftJoin, passes every group through
  // TestIdentityBuffer, and checks that exactly the 24 expected lines are written, in any order.
  if( !new File( inputFileLhsSparse ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLhsSparse );
  copyFromLocal( inputFileRhsSparse );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLhsSparse );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileRhsSparse );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupinnerouter/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // both sides produce "num" and "char", so the right-hand fields are renamed in the declaration
  Fields declaredFields = new Fields( "num", "char", "num2", "char2" );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), declaredFields, new Fields( "num" ), new LeftJoin() );

  // NOTE(review): TestIdentityBuffer is defined elsewhere; the meaning of its second argument (7)
  // is not visible here — presumably the expected number of groups; confirm against that class.
  splice = new Every( splice, Fields.ALL, new TestIdentityBuffer( new Fields( "num" ), 7 ), Fields.RESULTS );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 24, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    Set<String> results = new HashSet<String>();

    results.add( "1\ta1\t1\tA1" );
    results.add( "1\ta1\t1\tA2" );
    results.add( "1\ta1\t1\tA3" );
    results.add( "1\ta2\t1\tA1" );
    results.add( "1\ta2\t1\tA2" );
    results.add( "1\ta2\t1\tA3" );
    results.add( "1\ta3\t1\tA1" );
    results.add( "1\ta3\t1\tA2" );
    results.add( "1\ta3\t1\tA3" );
    results.add( "2\tb1\t2\tB1" );
    results.add( "2\tb1\t2\tB2" );
    results.add( "2\tb1\t2\tB3" );
    results.add( "3\tc1\tnull\tnull" );
    results.add( "4\td1\t4\tD1" );
    results.add( "4\td2\t4\tD1" );
    results.add( "4\td3\t4\tD1" );
    results.add( "5\te1\tnull\tnull" );
    results.add( "5\te2\tnull\tnull" );
    results.add( "5\te3\tnull\tnull" );
    results.add( "7\tg1\tnull\tnull" );
    results.add( "7\tg2\tnull\tnull" );
    results.add( "7\tg3\tnull\tnull" );
    results.add( "7\tg4\tnull\tnull" );
    results.add( "7\tg5\tnull\tnull" );

    // every line read must remove one distinct expected value; order is not significant
    int size = results.size();

    for( int i = 0; i < size; i++ )
      assertTrue( "not equal: tuple.get(1)", results.remove( iterator.next().get( 1 ) ) );

    assertEquals( 0, results.size() );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
* 1 a1
* 1 a2
* 1 a3
* 2 b1
* 3 c1
* 4 d1
* 4 d2
* 4 d3
* 5 e1
* 5 e2
* 5 e3
* 7 g1
* 7 g2
* 7 g3
* 7 g4
* 7 g5
* <p/>
* 1 A1
* 1 A2
* 1 A3
* 2 B1
* 2 B2
* 2 B3
* 4 D1
* 6 F1
* 6 F2
* <p/>
* 1 a1 1 A1
* 1 a1 1 A2
* 1 a1 1 A3
* 1 a2 1 A1
* 1 a2 1 A2
* 1 a2 1 A3
* 1 a3 1 A1
* 1 a3 1 A2
* 1 a3 1 A3
* 2 b1 2 B1
* 2 b1 2 B2
* 2 b1 2 B3
* 4 d1 4 D1
* 4 d2 4 D1
* 4 d3 4 D1
* null null 6 F1
* null null 6 F2
*
* @throws Exception
*/
public void testCoGroupOuterInner() throws Exception
{
  // Joins the two sparse sample files on "num" with a RightJoin, passes every group through
  // TestIdentityBuffer, and checks that exactly the 17 expected lines are written, in any order.
  if( !new File( inputFileLhsSparse ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLhsSparse );
  copyFromLocal( inputFileRhsSparse );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLhsSparse );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileRhsSparse );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupouterinner/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // both sides produce "num" and "char", so the right-hand fields are renamed in the declaration
  Fields declaredFields = new Fields( "num", "char", "num2", "char2" );

  Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), declaredFields, new Fields( "num" ), new RightJoin() );

  // NOTE(review): TestIdentityBuffer is defined elsewhere; the meaning of its second argument (7)
  // is not visible here — presumably the expected number of groups; confirm against that class.
  splice = new Every( splice, Fields.ALL, new TestIdentityBuffer( new Fields( "num" ), 7 ), Fields.RESULTS );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 17, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    Set<String> results = new HashSet<String>();

    results.add( "1\ta1\t1\tA1" );
    results.add( "1\ta1\t1\tA2" );
    results.add( "1\ta1\t1\tA3" );
    results.add( "1\ta2\t1\tA1" );
    results.add( "1\ta2\t1\tA2" );
    results.add( "1\ta2\t1\tA3" );
    results.add( "1\ta3\t1\tA1" );
    results.add( "1\ta3\t1\tA2" );
    results.add( "1\ta3\t1\tA3" );
    results.add( "2\tb1\t2\tB1" );
    results.add( "2\tb1\t2\tB2" );
    results.add( "2\tb1\t2\tB3" );
    results.add( "4\td1\t4\tD1" );
    results.add( "4\td2\t4\tD1" );
    results.add( "4\td3\t4\tD1" );
    results.add( "null\tnull\t6\tF1" );
    results.add( "null\tnull\t6\tF2" );

    // every line read must remove one distinct expected value; order is not significant
    int size = results.size();

    for( int i = 0; i < size; i++ )
      assertTrue( "not equal: tuple.get(1)", results.remove( iterator.next().get( 1 ) ) );

    assertEquals( 0, results.size() );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
* 1 a
* 5 b
* 6 c
* 5 b
* 5 e
* <p/>
* 1 A
* 2 B
* 3 C
* 4 D
* 5 E
* <p/>
* 1 a
* 2 b
* 3 c
* 4 d
* 5 e
* <p/>
* 1 a 1 A 1 a
* - - 2 B 2 b
* - - 3 C 3 c
* - - 4 D 4 d
* 5 b 5 E 5 e
* 5 e 5 E 5 e
*
* @throws Exception
*/
public void testCoGroupMixed() throws Exception
{
  // Joins three streams (lower-offset, upper, lower) on "num" with a MixedJoin configured as
  // OUTER, INNER, OUTER, and checks that exactly the six expected lines are written, in any order.
  if( !new File( inputFileLowerOffset ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLowerOffset );
  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLowerOffset = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLowerOffset );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );
  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "loweroffset", sourceLowerOffset );
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupmixed/", true );

  Pipe pipeLowerOffset = new Each( new Pipe( "loweroffset" ), new Fields( "line" ), splitter );
  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

  // the pipes, grouping fields, and join flags are given in the same order
  Pipe[] pipes = Pipe.pipes( pipeLowerOffset, pipeUpper, pipeLower );
  Fields[] fields = Fields.fields( new Fields( "num" ), new Fields( "num" ), new Fields( "num" ) );

  MixedJoin join = new MixedJoin( new boolean[]{MixedJoin.OUTER, MixedJoin.INNER, MixedJoin.OUTER} );

  Pipe splice = new CoGroup( pipes, fields, Fields.size( 6 ), join );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

  // debugging aid: countFlow.writeDOT( "cogroup.dot" );

  countFlow.complete();

  validateLength( countFlow, 6, null );

  TupleEntryIterator iterator = countFlow.openSink();

  try
  {
    Set<String> results = new HashSet<String>();

    results.add( "1\ta\t1\tA\t1\ta" );
    results.add( "null\tnull\t2\tB\t2\tb" );
    results.add( "null\tnull\t3\tC\t3\tc" );
    results.add( "null\tnull\t4\tD\t4\td" );
    results.add( "5\tb\t5\tE\t5\te" );
    results.add( "5\te\t5\tE\t5\te" );

    // every line read must remove one distinct expected value; order is not significant
    int size = results.size();

    for( int i = 0; i < size; i++ )
      assertTrue( "not equal: tuple.get(1)", results.remove( iterator.next().get( 1 ) ) );

    assertEquals( 0, results.size() );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Joins the "lower" and "upper" sample files when the two branches use different field names
 * ("numA"/"lower" and "numB"/"upper"), so the CoGroup is created without declared fields.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupDiffFields() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
  Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );

  Pipe cogroup = new CoGroup( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );

  Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, cogroup );

  flow.complete();

  validateLength( flow, 5, null );

  TupleEntryIterator iterator = flow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tB", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Joins the "lower" and "upper" sample files with a CoGroup and then feeds the join directly into a
 * GroupBy on "numA", with no operation in between.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupGroupBy() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  // keyed by the names of the head pipes created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper", sourceUpper );

  Function splitterLower = new RegexSplitter( new Fields( "numA", "lower" ), " " );
  Function splitterUpper = new RegexSplitter( new Fields( "numB", "upper" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupgroupby/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitterLower );
  Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitterUpper );

  Pipe cogroup = new CoGroup( pipeLower, new Fields( "numA" ), pipeUpper, new Fields( "numB" ) );

  //cogroup = new Each( cogroup, new Identity() );

  Pipe groupby = new GroupBy( cogroup, new Fields( "numA" ) );

  Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, groupby );

  flow.complete();

  validateLength( flow, 5, null );

  TupleEntryIterator iterator = flow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tB", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Self-joins a single pipe using the CoGroup constructor that takes the pipe once plus a numeric
 * argument of 1, with the four result fields declared explicitly.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupSamePipe() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );

  Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );

  // keyed by the name of the head pipe created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", source );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/same", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );

  // NOTE(review): the literal 1 is presumably the number of self joins — confirm against CoGroup
  Pipe cogroup = new CoGroup( pipeLower, new Fields( "num" ), 1, new Fields( "num1", "char1", "num2", "char2" ) );

  Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, cogroup );

  flow.complete();

  validateLength( flow, 5, null );

  TupleEntryIterator iterator = flow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\ta", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tb", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Self-joins a single pipe by passing the same Pipe instance as both the left and the right side of
 * the CoGroup, with the four result fields declared explicitly.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupSamePipe2() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );

  Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );

  // keyed by the name of the head pipe created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", source );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/same2", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );

  // the same pipe instance is used on both sides of the join
  Pipe cogroup = new CoGroup( pipeLower, new Fields( "num" ), pipeLower, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );

  Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, cogroup );

  flow.complete();

  validateLength( flow, 5, null );

  TupleEntryIterator iterator = flow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\ta", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tb", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Self-joins a single source by branching the "lower" head pipe into two named pipes ("lhs" and
 * "rhs") and joining those. The source is read with TextDelimited, so no splitter is needed.
 *
 * @throws Exception if staging the input data, planning, or running the flow fails
 */
public void testCoGroupSamePipe3() throws Exception
{
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );

  // the scheme itself yields "num" and "char", using a single space as delimiter
  Tap source = new Hfs( new TextDelimited( new Fields( "num", "char" ), " " ), inputFileLower );

  // keyed by the name of the head pipe created below
  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", source );

  // using null pos so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/same3", true );

  Pipe pipe = new Pipe( "lower" );

  // two named branches over the same head pipe
  Pipe lhs = new Pipe( "lhs", pipe );
  Pipe rhs = new Pipe( "rhs", pipe );

  Pipe cogroup = new CoGroup( lhs, new Fields( "num" ), rhs, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );

  Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, cogroup );

  flow.complete();

  validateLength( flow, 5, null );

  TupleEntryIterator iterator = flow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\ta", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tb", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close(); // release the sink reader even when an assertion above fails
  }
}
/**
 * Chains two CoGroups: "lower" is joined with "upper1" on "num", the result is passed
 * through an {@link Identity} Each, and is then joined with "upper2" by matching
 * "num1" of the first join against "num" of "upper2".
 * <p/>
 * The same upper-case source tap is bound to both the "upper1" and "upper2" heads.
 * The test passes when {@code validateLength( countFlow, 5, null )} succeeds and the
 * first two entries read back from the sink carry "1\ta\t1\tA\t1\tA" and
 * "2\tb\t2\tB\t2\tB" at position 1.
 *
 * @throws Exception if the flow cannot be planned, run, or its sink cannot be opened
 */
public void testCoGroupAroundCoGroup() throws Exception
{
  // both input files are consumed below, so both must be present before starting
  if( !new File( inputFileLower ).exists() )
    fail( "data file not found" );

  if( !new File( inputFileUpper ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileLower );
  copyFromLocal( inputFileUpper );

  Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
  Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "lower", sourceLower );
  sources.put( "upper1", sourceUpper );
  sources.put( "upper2", sourceUpper );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

  // TextLine with no declared fields; per the original note this is so all fields are written
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupacogroup/", true );

  Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
  Pipe pipeUpper1 = new Each( new Pipe( "upper1" ), new Fields( "line" ), splitter );
  Pipe pipeUpper2 = new Each( new Pipe( "upper2" ), new Fields( "line" ), splitter );

  // inner join: lower x upper1
  Pipe splice1 = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper1, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2" ) );

  splice1 = new Each( splice1, new Identity() );

  // outer join: (lower x upper1) x upper2
  Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeUpper2, new Fields( "num" ), new Fields( "num1", "char1", "num2", "char2", "num3", "char3" ) );

  Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice2 );

  countFlow.complete();

  validateLength( countFlow, 5, null );

  TupleEntryIterator iterator = countFlow.openSink();

  // close the sink iterator even when an assertion fails, so the open handle is not leaked
  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA\t1\tA", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tB\t2\tB", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close();
  }
}
/**
 * Runs the chained CoGroup scenario with {@code null} passed as the intermediate
 * scheme class, writing results beneath "/complex/cogroupacogroupopt1/".
 *
 * @throws Exception if the shared scenario fails
 */
public void testCoGroupAroundCoGroupWithout() throws Exception
{
  String sinkPath = outputPath + "/complex/cogroupacogroupopt1/";

  runCoGroupAroundCoGroup( null, sinkPath );
}
/**
 * Runs the chained CoGroup scenario with {@code TestTextLine} passed as the
 * intermediate scheme class, writing results beneath "/complex/cogroupacogroupopt2/".
 *
 * @throws Exception if the shared scenario fails
 */
public void testCoGroupAroundCoGroupWith() throws Exception
{
  String sinkPath = outputPath + "/complex/cogroupacogroupopt2/";

  runCoGroupAroundCoGroup( TestTextLine.class, sinkPath );
}
/**
 * Shared body for the chained CoGroup tests: joins "source20" with "source101" on
 * "num", then joins that result with "source102" by matching "num1" against "num",
 * and finally passes the stream through an {@link Identity} Each.
 * <p/>
 * The same 10-number source tap is bound to both the "source101" and "source102"
 * heads. The given scheme class is handed to
 * {@code FlowConnector.setIntermediateSchemeClass} before planning. The method
 * asserts that the planned flow has exactly 2 steps, that
 * {@code validateLength( flow, 10, null )} succeeds, and that the first two entries
 * read back from the sink carry "1\t1\t1" and "10\t10\t10" at position 1.
 *
 * @param schemeClass the intermediate scheme class to configure, or {@code null}
 * @param stringPath  the path the sink tap writes to
 * @throws IOException if an input cannot be copied or the sink cannot be opened
 */
private void runCoGroupAroundCoGroup( Class schemeClass, String stringPath ) throws IOException
{
  // both input files are consumed below, so both must be present before starting
  if( !new File( inputFileNums10 ).exists() )
    fail( "data file not found" );

  if( !new File( inputFileNums20 ).exists() )
    fail( "data file not found" );

  copyFromLocal( inputFileNums20 );
  copyFromLocal( inputFileNums10 );

  Tap source10 = new Hfs( new TestTextLine( new Fields( "num" ) ), inputFileNums10 );
  Tap source20 = new Hfs( new TestTextLine( new Fields( "num" ) ), inputFileNums20 );

  Map<String, Tap> sources = new HashMap<String, Tap>();
  sources.put( "source20", source20 );
  sources.put( "source101", source10 );
  sources.put( "source102", source10 );

  // TextLine with no declared fields; per the original note this is so all fields are written
  Tap sink = new Hfs( new TextLine(), stringPath, true );

  Pipe pipeNum20 = new Pipe( "source20" );
  Pipe pipeNum101 = new Pipe( "source101" );
  Pipe pipeNum102 = new Pipe( "source102" );

  Pipe splice1 = new CoGroup( pipeNum20, new Fields( "num" ), pipeNum101, new Fields( "num" ), new Fields( "num1", "num2" ) );

  Pipe splice2 = new CoGroup( splice1, new Fields( "num1" ), pipeNum102, new Fields( "num" ), new Fields( "num1", "num2", "num3" ) );

  splice2 = new Each( splice2, new Identity() );

  Map<Object, Object> properties = getProperties();

  FlowConnector.setIntermediateSchemeClass( properties, schemeClass );

  FlowConnector flowConnector = new FlowConnector( properties );

  Flow flow = flowConnector.connect( "cogroupopt", sources, sink, splice2 );

  assertEquals( "wrong number of steps", 2, flow.getSteps().size() );

  flow.complete();

  validateLength( flow, 10, null );

  TupleEntryIterator iterator = flow.openSink();

  // close the sink iterator even when an assertion fails, so the open handle is not leaked
  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\t1\t1", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "10\t10\t10", iterator.next().get( 1 ) );
  }
  finally
  {
    iterator.close();
  }
}
}