/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading.pipe.assembly;
import java.io.File;
import java.io.IOException;
import java.util.Map;
import java.util.regex.Pattern;
import cascading.ClusterTestCase;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.Function;
import cascading.operation.expression.ExpressionFunction;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.Each;
import cascading.pipe.Pipe;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
/**
 * Exercises the helper sub-assemblies in this package -- {@link Coerce}, {@link Shape},
 * {@link Rename}, {@link Unique}, {@link CountBy}, {@link SumBy}, {@link AverageBy} and
 * {@link AggregateBy} -- by running each one in a small {@link Flow} over the test data files
 * and then checking what was written to the sink.
 * <p/>
 * Every test writes to its own directory below {@link #outputPath} so the output of one test is
 * never overwritten by another.
 */
public class AssemblyHelpersTest extends ClusterTestCase
  {
  String inputFileUpper = "build/test/data/upper.txt";
  String inputFileLower = "build/test/data/lower.txt";
  String inputFileLhs = "build/test/data/lhs.txt";
  String inputFileRhs = "build/test/data/rhs.txt";

  String outputPath = "build/test/output/assembly/";

  public AssemblyHelpersTest()
    {
    // NOTE(review): the boolean presumably controls whether the test cluster is enabled -- confirm against ClusterTestCase
    super( "assembly helper tests", false );
    }

  /** Splits each line into "num" and "char", coerces "num" to Integer, and checks the sink lines. */
  public void testCoerce() throws IOException
    {
    copyInputs( inputFileLower );

    Tap source = lineSource( inputFileLower );
    Tap sink = lineSink( "line", new Fields( "num", "char" ), "coerce" );

    Pipe pipe = splitLine( new Pipe( "coerce" ) );

    pipe = new Coerce( pipe, new Fields( "num" ), Integer.class );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 1, Pattern.compile( "^\\d+\\s\\w+$" ) );
    }

  /** Narrows the split tuple stream to the "num" field only via {@link Shape}. */
  public void testShapeNarrow() throws IOException
    {
    copyInputs( inputFileLower );

    Tap source = lineSource( inputFileLower );
    Tap sink = lineSink( "num", new Fields( "num" ), "shapenarrow" );

    Pipe pipe = splitLine( new Pipe( "shape" ) );

    pipe = new Shape( pipe, new Fields( "num" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 1, Pattern.compile( "^\\d+$" ) );
    }

  /** Renames "num" and "char" to "item" and "element" by naming both source fields explicitly. */
  public void testRenameNamed() throws IOException
    {
    copyInputs( inputFileLower );

    Tap source = lineSource( inputFileLower );
    // own output directory; this test previously shared "/renameall" with testRenameAll
    Tap sink = lineSink( "line", new Fields( "item", "element" ), "renamenamed" );

    Pipe pipe = splitLine( new Pipe( "shape" ) );

    pipe = new Rename( pipe, new Fields( "num", "char" ), new Fields( "item", "element" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 1, Pattern.compile( "^\\d+\\s\\w+$" ) );
    }

  /** Renames all incoming fields to "item" and "element" using {@link Fields#ALL} as the selector. */
  public void testRenameAll() throws IOException
    {
    copyInputs( inputFileLower );

    Tap source = lineSource( inputFileLower );
    Tap sink = lineSink( "line", new Fields( "item", "element" ), "renameall" );

    Pipe pipe = splitLine( new Pipe( "shape" ) );

    pipe = new Rename( pipe, Fields.ALL, new Fields( "item", "element" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 1, Pattern.compile( "^\\d+\\s\\w+$" ) );
    }

  /** Renames only "num" to "item"; the sink then writes "char" followed by "item". */
  public void testRenameNarrow() throws IOException
    {
    copyInputs( inputFileLower );

    Tap source = lineSource( inputFileLower );
    Tap sink = lineSink( "item", new Fields( "char", "item" ), "renamenarrow" );

    Pipe pipe = splitLine( new Pipe( "shape" ) );

    pipe = new Rename( pipe, new Fields( "num" ), new Fields( "item" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    // the sink selects "char" before "item", hence word-then-digits
    validateLength( flow, 5, 1, Pattern.compile( "^\\w+\\s\\d+$" ) );
    }

  /** Applies {@link Unique} on the "num" field of a single input. */
  public void testUnique() throws IOException
    {
    copyInputs( inputFileLhs );

    Tap source = lineSource( inputFileLhs );
    Tap sink = lineSink( "item", new Fields( "num", "char" ), "unique" );

    Pipe pipe = splitLine( new Pipe( "shape" ) );

    pipe = new Unique( pipe, new Fields( "num" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 1, Pattern.compile( "^\\d+\\s\\w+$" ) );
    }

  /** Applies {@link Unique} on the "num" field across two inputs merged into one stream. */
  public void testUniqueMerge() throws IOException
    {
    copyInputs( inputFileLhs, inputFileRhs );

    Tap sourceLhs = lineSource( inputFileLhs );
    Tap sourceRhs = lineSource( inputFileRhs );
    // own output directory; this test previously shared "/unique" with testUnique
    Tap sink = lineSink( "item", new Fields( "num", "char" ), "uniquemerge" );

    Pipe lhsPipe = splitLine( new Pipe( "lhs" ) );
    Pipe rhsPipe = splitLine( new Pipe( "rhs" ) );

    Pipe pipe = new Unique( Pipe.pipes( lhsPipe, rhsPipe ), new Fields( "num" ) );

    Map<String, Tap> sources = Cascades.tapsMap( Pipe.pipes( lhsPipe, rhsPipe ), Tap.taps( sourceLhs, sourceRhs ) );

    Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 1, Pattern.compile( "^\\d+\\s\\w+$" ) );
    }

  /** Counts the tuples per "char" value with {@link CountBy} and compares against fixed counts. */
  public void testCount() throws IOException
    {
    copyInputs( inputFileLhs );

    Tap source = delimitedSource( inputFileLhs );
    Tap sink = delimitedSink( new Fields( "char", "count" ), new Class[]{String.class, Integer.TYPE}, "count" );

    Pipe pipe = new Pipe( "count" );

    pipe = new CountBy( pipe, new Fields( "char" ), new Fields( "count" ), 2 );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 2, Pattern.compile( "^\\w+\\s\\d+$" ) );

    assertSinkTuples( flow,
      new Tuple( "a", 2 ),
      new Tuple( "b", 4 ),
      new Tuple( "c", 4 ),
      new Tuple( "d", 2 ),
      new Tuple( "e", 1 ) );
    }

  /** Counts per "char" over two merged inputs; the right hand side is lower-cased before merging. */
  public void testCountMerge() throws IOException
    {
    copyInputs( inputFileLhs, inputFileRhs );

    Tap lhs = delimitedSource( inputFileLhs );
    Tap rhs = delimitedSource( inputFileRhs );
    Tap sink = delimitedSink( new Fields( "char", "count" ), new Class[]{String.class, Integer.TYPE}, "mergecount" );

    Pipe lhsPipe = new Pipe( "count-lhs" );
    Pipe rhsPipe = lowerCaseChar( new Pipe( "count-rhs" ) );

    Pipe countPipe = new CountBy( Pipe.pipes( lhsPipe, rhsPipe ), new Fields( "char" ), new Fields( "count" ), 2 );

    Map<String, Tap> tapMap = Cascades.tapsMap( Pipe.pipes( lhsPipe, rhsPipe ), Tap.taps( lhs, rhs ) );

    Flow flow = new FlowConnector( getProperties() ).connect( tapMap, sink, countPipe );

    flow.complete();

    validateLength( flow, 5, 2, Pattern.compile( "^\\w+\\s\\d+$" ) );

    assertSinkTuples( flow,
      new Tuple( "a", 4 ),
      new Tuple( "b", 8 ),
      new Tuple( "c", 8 ),
      new Tuple( "d", 4 ),
      new Tuple( "e", 2 ) );
    }

  /** Sums "num" per "char" value with {@link SumBy} and compares against fixed sums. */
  public void testSum() throws IOException
    {
    copyInputs( inputFileLhs );

    Tap source = delimitedSource( inputFileLhs );
    Tap sink = delimitedSink( new Fields( "char", "sum" ), new Class[]{String.class, Integer.TYPE}, "sum" );

    Pipe pipe = new Pipe( "sum" );

    pipe = new SumBy( pipe, new Fields( "char" ), new Fields( "num" ), new Fields( "sum" ), long.class, 2 );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 2, Pattern.compile( "^\\w+\\s\\d+$" ) );

    assertSinkTuples( flow,
      new Tuple( "a", 6 ),
      new Tuple( "b", 12 ),
      new Tuple( "c", 10 ),
      new Tuple( "d", 6 ),
      new Tuple( "e", 5 ) );
    }

  /** Sums "num" per "char" over two merged inputs; the right hand side is lower-cased before merging. */
  public void testSumMerge() throws IOException
    {
    copyInputs( inputFileLhs, inputFileRhs );

    Tap lhs = delimitedSource( inputFileLhs );
    Tap rhs = delimitedSource( inputFileRhs );
    Tap sink = delimitedSink( new Fields( "char", "sum" ), new Class[]{String.class, Integer.TYPE}, "mergesum" );

    Pipe lhsPipe = new Pipe( "sum-lhs" );
    Pipe rhsPipe = lowerCaseChar( new Pipe( "sum-rhs" ) );

    Pipe sumPipe = new SumBy( Pipe.pipes( lhsPipe, rhsPipe ), new Fields( "char" ), new Fields( "num" ), new Fields( "sum" ), long.class, 2 );

    Map<String, Tap> tapMap = Cascades.tapsMap( Pipe.pipes( lhsPipe, rhsPipe ), Tap.taps( lhs, rhs ) );

    Flow flow = new FlowConnector( getProperties() ).connect( tapMap, sink, sumPipe );

    flow.complete();

    validateLength( flow, 5, 2, Pattern.compile( "^\\w+\\s\\d+$" ) );

    assertSinkTuples( flow,
      new Tuple( "a", 12 ),
      new Tuple( "b", 24 ),
      new Tuple( "c", 20 ),
      new Tuple( "d", 12 ),
      new Tuple( "e", 10 ) );
    }

  /** Averages "num" per "char" value with {@link AverageBy} and compares against fixed averages. */
  public void testAverage() throws IOException
    {
    copyInputs( inputFileLhs );

    Tap source = delimitedSource( inputFileLhs );
    Tap sink = delimitedSink( new Fields( "char", "average" ), new Class[]{String.class, Double.TYPE}, "average" );

    Pipe pipe = new Pipe( "average" );

    pipe = new AverageBy( pipe, new Fields( "char" ), new Fields( "num" ), new Fields( "average" ), 2 );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 2, Pattern.compile( "^\\w+\\s[\\d.]+$" ) );

    // each expected average is written as sum / count of the matching testSum and testCount rows
    assertSinkTuples( flow,
      new Tuple( "a", (double) 6 / 2 ),
      new Tuple( "b", (double) 12 / 4 ),
      new Tuple( "c", (double) 10 / 4 ),
      new Tuple( "d", (double) 6 / 2 ),
      new Tuple( "e", (double) 5 / 1 ) );
    }

  /** Averages "num" per "char" over two merged inputs; the right hand side is lower-cased before merging. */
  public void testAverageMerge() throws IOException
    {
    copyInputs( inputFileLhs, inputFileRhs );

    Tap lhs = delimitedSource( inputFileLhs );
    Tap rhs = delimitedSource( inputFileRhs );
    Tap sink = delimitedSink( new Fields( "char", "average" ), new Class[]{String.class, Double.TYPE}, "mergeaverage" );

    Pipe lhsPipe = new Pipe( "average-lhs" );
    Pipe rhsPipe = lowerCaseChar( new Pipe( "average-rhs" ) );

    Pipe averagePipe = new AverageBy( Pipe.pipes( lhsPipe, rhsPipe ), new Fields( "char" ), new Fields( "num" ), new Fields( "average" ), 2 );

    Map<String, Tap> tapMap = Cascades.tapsMap( Pipe.pipes( lhsPipe, rhsPipe ), Tap.taps( lhs, rhs ) );

    Flow flow = new FlowConnector( getProperties() ).connect( tapMap, sink, averagePipe );

    flow.complete();

    validateLength( flow, 5, 2, Pattern.compile( "^\\w+\\s[\\d.]+$" ) );

    assertSinkTuples( flow,
      new Tuple( "a", (double) 12 / 4 ),
      new Tuple( "b", (double) 24 / 8 ),
      new Tuple( "c", (double) 20 / 8 ),
      new Tuple( "d", (double) 12 / 4 ),
      new Tuple( "e", (double) 10 / 2 ) );
    }

  /** Runs a sum, a count and an average side by side in one {@link AggregateBy} grouped on "char". */
  public void testParallelAggregates() throws IOException
    {
    copyInputs( inputFileLhs );

    Tap source = delimitedSource( inputFileLhs );
    Tap sink = delimitedSink( new Fields( "char", "sum", "count", "average" ),
      new Class[]{String.class, Integer.TYPE, Integer.TYPE, Double.TYPE}, "multi" );

    Pipe pipe = new Pipe( "multi" );

    SumBy sumPipe = new SumBy( new Fields( "num" ), new Fields( "sum" ), long.class );
    CountBy countPipe = new CountBy( new Fields( "count" ) );
    AverageBy averagePipe = new AverageBy( new Fields( "num" ), new Fields( "average" ) );

    pipe = new AggregateBy( "name", Pipe.pipes( pipe ), new Fields( "char" ), 2, sumPipe, countPipe, averagePipe );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 4, Pattern.compile( "^\\w+\\s\\d+\\s\\d+\\s[\\d.]+$" ) );

    assertSinkTuples( flow,
      new Tuple( "a", 6, 2, (double) 6 / 2 ),
      new Tuple( "b", 12, 4, (double) 12 / 4 ),
      new Tuple( "c", 10, 4, (double) 10 / 4 ),
      new Tuple( "d", 6, 2, (double) 6 / 2 ),
      new Tuple( "e", 5, 1, (double) 5 / 1 ) );
    }

  /** Same as {@link #testParallelAggregates()} but over two merged inputs, the right one lower-cased. */
  public void testParallelAggregatesMerge() throws IOException
    {
    copyInputs( inputFileLhs, inputFileRhs );

    Tap lhs = delimitedSource( inputFileLhs );
    Tap rhs = delimitedSource( inputFileRhs );
    Tap sink = delimitedSink( new Fields( "char", "sum", "count", "average" ),
      new Class[]{String.class, Integer.TYPE, Integer.TYPE, Double.TYPE}, "multimerge" );

    Pipe lhsPipe = new Pipe( "multi-lhs" );
    Pipe rhsPipe = lowerCaseChar( new Pipe( "multi-rhs" ) );

    SumBy sumPipe = new SumBy( new Fields( "num" ), new Fields( "sum" ), long.class );
    CountBy countPipe = new CountBy( new Fields( "count" ) );
    AverageBy averagePipe = new AverageBy( new Fields( "num" ), new Fields( "average" ) );

    Pipe pipe = new AggregateBy( "name", Pipe.pipes( lhsPipe, rhsPipe ), new Fields( "char" ), 2, sumPipe, countPipe, averagePipe );

    Map<String, Tap> tapMap = Cascades.tapsMap( Pipe.pipes( lhsPipe, rhsPipe ), Tap.taps( lhs, rhs ) );

    Flow flow = new FlowConnector( getProperties() ).connect( tapMap, sink, pipe );

    flow.complete();

    validateLength( flow, 5, 4, Pattern.compile( "^\\w+\\s\\d+\\s\\d+\\s[\\d.]+$" ) );

    assertSinkTuples( flow,
      new Tuple( "a", 12, 4, (double) 12 / 4 ),
      new Tuple( "b", 24, 8, (double) 24 / 8 ),
      new Tuple( "c", 20, 8, (double) 20 / 8 ),
      new Tuple( "d", 12, 4, (double) 12 / 4 ),
      new Tuple( "e", 10, 2, (double) 10 / 2 ) );
    }

  /**
   * Fails the test if any of the given local data files is missing, otherwise hands each one to
   * {@code copyFromLocal}. All files are checked before the first one is copied.
   *
   * @param paths local paths of every input file the calling test reads
   * @throws IOException if copying a file fails
   */
  private void copyInputs( String... paths ) throws IOException
    {
    for( String path : paths )
      {
      if( !new File( path ).exists() )
        fail( "data file not found: " + path );
      }

    for( String path : paths )
      copyFromLocal( path );
    }

  /** Returns a text line source over the given path declaring the fields "offset" and "line". */
  private Tap lineSource( String path )
    {
    return new Hfs( new TextLine( new Fields( "offset", "line" ) ), path );
    }

  /** Returns a space delimited source over the given path declaring the fields "num" and "char". */
  private Tap delimitedSource( String path )
    {
    return new Hfs( new TextDelimited( new Fields( "num", "char" ), " " ), path );
    }

  /**
   * Returns a replacing text line sink in the directory {@code name} below {@link #outputPath}.
   *
   * @param lineField  name of the single field the scheme declares as its source field
   * @param sinkFields fields written to each output line
   * @param name       directory name below the output path
   */
  private Tap lineSink( String lineField, Fields sinkFields, String name )
    {
    return new Hfs( new TextLine( new Fields( lineField ), sinkFields ), outputPath + "/" + name, SinkMode.REPLACE );
    }

  /**
   * Returns a replacing tab delimited sink in the directory {@code name} below {@link #outputPath}.
   *
   * @param fields fields written to, and later read back from, the sink
   * @param types  one type per field, handed to the {@link TextDelimited} scheme
   * @param name   directory name below the output path
   */
  private Tap delimitedSink( Fields fields, Class[] types, String name )
    {
    return new Hfs( new TextDelimited( fields, "\t", types ), outputPath + "/" + name, SinkMode.REPLACE );
    }

  /** Appends an {@link Each} that splits the "line" field on a single space into "num" and "char". */
  private Pipe splitLine( Pipe pipe )
    {
    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    return new Each( pipe, new Fields( "line" ), splitter );
    }

  /** Appends an {@link Each} that replaces the "char" field with its lower-cased value. */
  private Pipe lowerCaseChar( Pipe pipe )
    {
    return new Each( pipe, new Fields( "char" ), new ExpressionFunction( Fields.ARGS, "$0.toLowerCase()", String.class ), Fields.REPLACE );
    }

  /**
   * Reads the sink of the given flow and asserts that it holds exactly the expected tuples, in
   * the given order. The sink iterator is closed even if an assertion fails.
   *
   * @param flow     completed flow whose sink is opened
   * @param expected tuples expected in the sink, in iteration order
   * @throws IOException if the sink cannot be opened
   */
  private void assertSinkTuples( Flow flow, Tuple... expected ) throws IOException
    {
    TupleEntryIterator iterator = flow.openSink();

    try
      {
      int count = 0;

      while( iterator.hasNext() )
        {
        Tuple actual = iterator.next().getTuple();

        // guard the index so an extra tuple is reported as a failure, not an ArrayIndexOutOfBoundsException
        assertTrue( "unexpected extra tuple in sink: " + actual, count < expected.length );
        assertEquals( expected[ count++ ], actual );
        }

      // without this a sink with too few tuples would pass unnoticed
      assertEquals( "wrong number of tuples in sink", expected.length, count );
      }
    finally
      {
      iterator.close();
      }
    }
  }