/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading;
import java.io.File;
import java.util.Map;
import java.util.regex.Pattern;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.AssertionLevel;
import cascading.operation.Debug;
import cascading.operation.Filter;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.assertion.AssertSizeMoreThan;
import cascading.operation.filter.And;
import cascading.operation.filter.Not;
import cascading.operation.filter.Or;
import cascading.operation.filter.Xor;
import cascading.operation.function.UnGroup;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextDelimited;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.SinkMode;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
public class RegressionPipesTest extends ClusterTestCase
{
// Relative paths of the test data files. The tests in this class that use one of these
// first check that it exists as a local file and then hand it to copyFromLocal before
// opening it through an Hfs tap.
String inputFileApache = "build/test/data/apache.10.txt";
// The following three paths are not referenced by any test method in this class.
String inputFileIps = "build/test/data/ips.20.txt";
String inputFileNums20 = "build/test/data/nums.20.txt";
String inputFileNums10 = "build/test/data/nums.10.txt";
String inputFileCritics = "build/test/data/critics.txt";
// Not referenced by any test method in this class.
String inputFileUpper = "build/test/data/upper.txt";
String inputFileLower = "build/test/data/lower.txt";
// Not referenced by any test method in this class.
String inputFileLowerOffset = "build/test/data/lower-offset.txt";
String inputFileJoined = "build/test/data/lower+upper.txt";
String inputFileLhs = "build/test/data/lhs.txt";
// The following two paths are not referenced by any test method in this class.
String inputFileRhs = "build/test/data/rhs.txt";
String inputFileCross = "build/test/data/lhs+rhs-cross.txt";
// Base directory for all sink taps created by these tests. Note the trailing slash: the
// tests append suffixes that start with "/", so the resulting paths contain "//".
String outputPath = "build/test/output/regression/";
/**
 * Creates the test case by passing the name "regression pipes" and the flag {@code false}
 * to the {@link ClusterTestCase} constructor.
 */
public RegressionPipesTest()
{
// NOTE(review): the meaning of the boolean is defined by ClusterTestCase, which is not
// visible here -- presumably it controls whether a test cluster is started; confirm.
super( "regression pipes", false );
}
/**
 * Verifies that a selector can address something other than the first position of a tuple
 * whose fields are declared as {@link Fields#UNKNOWN}.
 * <p/>
 * The "line" field is split by a {@link RegexSplitter} declaring UNKNOWN fields, position 2 of
 * the split result is copied into a field named "label", and "label" is finally passed through a
 * {@link RegexFilter}.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testUnknown() throws Exception
{
  final String input = inputFileJoined;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  // assembly: split "line" -> copy position 2 into "label" -> regex filter on "label"
  Pipe assembly = new Each( new Pipe( "test" ), new Fields( "line" ), new RegexSplitter( Fields.UNKNOWN ) );
  assembly = new Each( assembly, new Fields( 2 ), new Identity( new Fields( "label" ) ) );
  assembly = new Each( assembly, new Fields( "label" ), new RegexFilter( "[A-Z]*" ) );

  Tap lines = new Hfs( new TextLine( new Fields( "offset", "line" ) ), input );
  Tap results = new Hfs( new TextLine(), outputPath + "/unknown", true );

  Flow flow = new FlowConnector( getProperties() ).connect( lines, results, assembly );

  flow.complete();

  validateLength( flow, 5, null );
}
/**
 * Connects a source tap directly to a sink tap through a bare {@link Pipe} that carries no
 * operations, then validates the sink with an expected length of 5.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testCopy() throws Exception
{
  final String input = inputFileJoined;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  Tap lines = new Hfs( new TextLine( new Fields( "offset", "line" ) ), input );
  Tap copy = new Hfs( new TextLine(), outputPath + "/copy", true );

  // no Each/Every/Group is attached: the head pipe is also the tail
  Flow flow = new FlowConnector( getProperties() ).connect( lines, copy, new Pipe( "test" ) );

  flow.complete();

  validateLength( flow, 5, null );
}
/**
 * Exercises positional selection, including a negative position, on tuples produced by a
 * {@link RegexSplitter} that declares {@link Fields#UNKNOWN}.
 * <p/>
 * After the split, a STRICT {@link AssertSizeMoreThan} assertion with the value 3 is applied, and
 * positions 0, 1 and -1 are copied into fields named "name", "second" and "last".
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testVarWidth() throws Exception
{
  final String input = inputFileCritics;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  Fields selector = new Fields( 0, 1, -1 );
  Fields declared = new Fields( "name", "second", "last" );

  Pipe assembly = new Each( new Pipe( "test" ), new Fields( "line" ), new RegexSplitter( Fields.UNKNOWN ) );
  assembly = new Each( assembly, AssertionLevel.STRICT, new AssertSizeMoreThan( 3 ) );
  assembly = new Each( assembly, selector, new Identity( declared ) );

  Tap lines = new Hfs( new TextLine( new Fields( "offset", "line" ) ), input );
  Tap results = new Hfs( new TextLine(), outputPath + "/varwidth", true );

  Flow flow = new FlowConnector( getProperties() ).connect( lines, results, assembly );

  flow.complete();

  validateLength( flow, 7 );
}
/**
 * This test allows {@link Fields#UNKNOWN} to propagate from the {@link RegexSplitter} through to
 * the {@link UnGroup} (or any other operation).
 * <p/>
 * This could be dangerous, but it feels very natural and in line with the intention behind having
 * UNKNOWN.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testUnGroupUnknown() throws Exception
{
  if( !new File( inputFileJoined ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( inputFileJoined );

  Tap source = new Hfs( new TextLine(), inputFileJoined );
  Tap sink = new Hfs( new TextLine(), outputPath + "/ungrouped-unknown", true );

  Pipe pipe = new Pipe( "test" );

  // emits Fields.UNKNOWN
  pipe = new Each( pipe, new Fields( 1 ), new RegexSplitter( "\t" ), Fields.ALL );

  // accepts Fields.UNKNOWN
  pipe = new Each( pipe, new UnGroup( Fields.size( 2 ), new Fields( 0 ), Fields.fields( new Fields( 1 ), new Fields( 2 ) ) ) );

  // pass the test-case properties like every other test in this class, so the flow is planned
  // with the same configuration that copyFromLocal staged the input for
  Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

  flow.complete();

  validateLength( flow, 10 );
}
/**
 * Currently disabled: the method returns immediately.
 * <p/>
 * When enabled, it builds two branches whose head pipes are both named "test", merges them with a
 * {@link GroupBy}, and expects {@link FlowConnector#connect} to throw.
 *
 * @throws Exception declared for consistency with the other tests
 */
public void testDupeHeadNames() throws Exception
{
  // todo: re-enable these tests on next major release
  // (written as an if so the compiler accepts the statements that follow)
  if( true )
  {
    return;
  }

  Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileJoined );
  Tap sink = new Hfs( new TextLine(), outputPath + "/unknown", true );

  // both heads deliberately share the name "test"
  Pipe left = new Each( new Pipe( "test" ), new Fields( "line" ), new RegexSplitter( " " ) );
  Pipe right = new Each( new Pipe( "test" ), new Fields( "line" ), new RegexSplitter( " " ) );

  Pipe merged = new GroupBy( Pipe.pipes( left, right ), Fields.size( 3 ) );

  try
  {
    new FlowConnector( getProperties() ).connect( source, sink, merged );
    fail( "did not fail on dupe head names" );
  }
  catch( Exception expected )
  {
    // ignore
  }
}
/**
 * Currently disabled: the method returns immediately.
 * <p/>
 * When enabled, it splits a {@link GroupBy} into two branches whose tail pipes are both named
 * "tail", binds both to the same sink, and expects {@link FlowConnector#connect} to throw.
 *
 * @throws Exception declared for consistency with the other tests
 */
public void testDupeTailNames() throws Exception
{
  // todo: re-enable these tests on next major release
  // (written as an if so the compiler accepts the statements that follow)
  if( true )
  {
    return;
  }

  Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileJoined );
  Tap sink = new Hfs( new TextLine(), outputPath + "/unknown", true );

  Pipe pipe = new Pipe( "test" );

  pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( " " ) );

  Pipe group = new GroupBy( pipe, Fields.size( 3 ) );

  // chain each Each off its named pipe; building it directly on 'group' (as before) discarded
  // the "tail" name and left the first assignment to lhs/rhs dead
  Pipe lhs = new Pipe( "tail", group );
  lhs = new Each( lhs, new Fields( "line" ), new RegexSplitter( " " ) );

  Pipe rhs = new Pipe( "tail", group );
  rhs = new Each( rhs, new Fields( "line" ), new RegexSplitter( " " ) );

  Map<String, Tap> sinks = Cascades.tapsMap( Pipe.pipes( lhs, rhs ), Tap.taps( sink, sink ) );

  try
  {
    new FlowConnector( getProperties() ).connect( source, sinks, Pipe.pipes( lhs, rhs ) );
    fail( "did not fail on dupe tail names" );
  }
  catch( Exception expected )
  {
    // expected: the planner is supposed to reject the assembly.
    // NOTE(review): any Exception satisfies this test, not only a duplicate-name failure --
    // consider narrowing the caught type when the test is re-enabled.
  }
}
/**
 * Runs a flow whose head pipe name is full of punctuation, whitespace and path-like characters,
 * followed by two consecutive {@link GroupBy} pipes, and validates the sink with an expected
 * length of 5.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testIllegalCharsInTempFiles() throws Exception
{
  final String input = inputFileJoined;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  // earlier variant of the name, kept for reference:
  // "bar:bar@foo://blah/\t(*(**^**&%&%^@#@&&() :::: ///\\\\ \t illegal chars in it"
  String awkwardName = "**&%&%bar:bar@foo://blah/\t(*(**^**&%&%^@#@&&() :::: ///\\\\ \t illegal chars in it";

  Pipe assembly = new Each( new Pipe( awkwardName ), new Fields( "line" ), new RegexSplitter( " " ) );

  // two back-to-back groupings on position 0
  assembly = new GroupBy( assembly, new Fields( 0 ) );
  assembly = new GroupBy( assembly, new Fields( 0 ) );

  Tap lines = new Hfs( new TextLine( new Fields( "offset", "line" ) ), input );
  Tap results = new Hfs( new TextLine(), outputPath + "/illegalchars", true );

  Flow flow = new FlowConnector( getProperties() ).connect( lines, results, assembly );

  flow.complete();

  validateLength( flow, 5 );
}
/**
 * Method testCoGroupSplitPipe tests the case where CoGroup on the lhs steps on the tuple as it
 * passes down the rhs. This is rare and expects that one side is all filters.
 * <p/>
 * A first flow splits the input lines into "num" and "char" and writes them to an intermediate
 * sequence file. A second flow reads that file, splits the stream into an untouched lhs and an
 * rhs that copies "num" into "num2", and co-groups both sides on "num" / "num2".
 *
 * @throws Exception propagated from staging the input, running either flow, or reading the sink
 */
public void testCoGroupSplitPipe() throws Exception
{
  if( !new File( inputFileLower ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( inputFileLower );

  // first flow: split each line into "num" and "char" and store the result
  Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileLower );
  Tap splitTap = new Hfs( new SequenceFile( new Fields( "num", "char" ) ), outputPath + "/complex/intermediate", SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
  Pipe split = new Each( "split", splitter );

  Flow splitFlow = new FlowConnector( getProperties() ).connect( source, splitTap, split );

  splitFlow.complete();

  // second flow: co-group the intermediate data with a renamed copy of itself
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroupsplit/", true );

  Pipe lower = new Pipe( "lower" );

  // the lhs carries no operation on purpose: an Identity does not trigger the issue this tests
  Pipe lhs = new Pipe( "lhs", lower );

  Pipe rhs = new Pipe( "rhs", lower );

  rhs = new Each( rhs, new Debug( "rhs-pre", true ) );
  rhs = new Each( rhs, new Fields( "num" ), new Identity( new Fields( "num2" ) ) );

  Pipe cogroup = new CoGroup( lhs, new Fields( "num" ), rhs, new Fields( "num2" ) );

  Flow flow = new FlowConnector( getProperties() ).connect( splitTap, sink, cogroup );

  flow.complete();

  validateLength( flow, 5, null );

  TupleEntryIterator iterator = flow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb\t2", iterator.next().get( 1 ) );
  }
  finally
  {
    // release the sink even when an assertion above fails
    iterator.close();
  }
}
/**
 * Method testGroupBySplitPipe tests the case where GroupBy on the lhs steps on the tuple as it
 * passes down the rhs. This is rare and expects that one side is all filters.
 * <p/>
 * A first flow splits the input lines into "num" and "char" and writes them to an intermediate
 * sequence file. A second flow reads that file, splits the stream into an untouched lhs and an
 * rhs that runs "num" through an Identity while emitting only "num" and "char", and merges both
 * sides in a single GroupBy on "num".
 *
 * @throws Exception propagated from staging the input, running either flow, or reading the sink
 */
public void testGroupBySplitPipe() throws Exception
{
  if( !new File( inputFileLower ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( inputFileLower );

  // first flow: split each line into "num" and "char" and store the result
  Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileLower );
  Tap splitTap = new Hfs( new SequenceFile( new Fields( "num", "char" ) ), outputPath + "/complex/intermediate", SinkMode.REPLACE );

  Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );
  Pipe split = new Each( "split", splitter );

  Flow splitFlow = new FlowConnector( getProperties() ).connect( source, splitTap, split );

  splitFlow.complete();

  // second flow: merge the intermediate data with a second branch of itself
  Tap sink = new Hfs( new TextLine(), outputPath + "/complex/groupbysplit/", true );

  Pipe lower = new Pipe( "lower" );

  Pipe lhs = new Pipe( "lhs", lower );

  Pipe rhs = new Pipe( "rhs", lower );

  // the output selector keeps only "num" and "char", so both branches declare the same fields
  rhs = new Each( rhs, new Fields( "num" ), new Identity( new Fields( "num2" ) ), new Fields( "num", "char" ) );

  Pipe groupBy = new GroupBy( Pipe.pipes( lhs, rhs ), new Fields( "num" ) );

  Flow flow = new FlowConnector( getProperties() ).connect( splitTap, sink, groupBy );

  flow.complete();

  validateLength( flow, 10, null );

  TupleEntryIterator iterator = flow.openSink();

  try
  {
    assertEquals( "not equal: tuple.get(1)", "1\ta", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "1\ta", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "2\tb", iterator.next().get( 1 ) );
  }
  finally
  {
    // release the sink even when an assertion above fails
    iterator.close();
  }
}
/**
 * Applies a {@code TestFunction} declaring the field "insert" to every "line", groups on "insert",
 * and validates the sink with an expected length of 10.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testLastEachNotModified() throws Exception
{
  final String input = inputFileApache;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  // NOTE(review): TestFunction is declared elsewhere; presumably it emits the given tuple
  // ("inserted") for every argument -- confirm against its source.
  Function inserter = new TestFunction( new Fields( "insert" ), new Tuple( "inserted" ) );

  Pipe assembly = new Each( new Pipe( "test" ), new Fields( "line" ), inserter );
  assembly = new GroupBy( assembly, new Fields( "insert" ) );

  Tap lines = new Hfs( new TextLine( new Fields( "offset", "line" ) ), input );

  // NOTE(review): outputPath already ends in "regression/", so this suffix repeats the
  // "regression" segment in the resulting path.
  Tap results = new Hfs( new TextLine(), outputPath + "/regression/lasteachmodified", true );

  Flow flow = new FlowConnector( getProperties() ).connect( lines, results, assembly );

  flow.complete();

  validateLength( flow, 10, null );
}
/**
 * Applies {@code Not( And( num ~ "1", char ~ "a" ) )} twice -- once with the default argument
 * selector and once with an explicit ("num", "char") selector -- and validates the sink.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testComplexLogicAnd() throws Exception
{
  final String input = inputFileLhs;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  RegexFilter numIsOne = new RegexFilter( "1", true, true );
  RegexFilter charIsA = new RegexFilter( "a", true, true );

  Filter filter = new Not( new And( new Fields( "num" ), numIsOne, new Fields( "char" ), charIsA ) );

  // compounding the filter for the Fields.ALL case.
  Pipe assembly = new Each( new Pipe( "test" ), filter );
  assembly = new Each( assembly, new Fields( "num", "char" ), filter );

  Tap source = new Hfs( new TextDelimited( new Fields( "num", "char" ), " " ), input );

  // NOTE(review): outputPath already ends in "regression/", so this suffix repeats the
  // "regression" segment in the resulting path.
  Tap sink = new Hfs( new TextDelimited( Fields.ALL, " " ), outputPath + "/regression/complexlogicand", true );

  Flow flow = new FlowConnector( getProperties() ).connect( source, sink, assembly );

  flow.complete();

  validateLength( flow, 1, 2, Pattern.compile( "1\ta" ) );
}
/**
 * Applies {@code Not( Or( num ~ "1", char ~ "a" ) )} twice -- once with the default argument
 * selector and once with an explicit ("num", "char") selector -- and validates the sink.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testComplexLogicOr() throws Exception
{
  final String input = inputFileLhs;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  RegexFilter numIsOne = new RegexFilter( "1", true, true );
  RegexFilter charIsA = new RegexFilter( "a", true, true );

  Filter filter = new Not( new Or( new Fields( "num" ), numIsOne, new Fields( "char" ), charIsA ) );

  // compounding the filter for the Fields.ALL case.
  Pipe assembly = new Each( new Pipe( "test" ), filter );
  assembly = new Each( assembly, new Fields( "num", "char" ), filter );

  Tap source = new Hfs( new TextDelimited( new Fields( "num", "char" ), " " ), input );

  // NOTE(review): outputPath already ends in "regression/", so this suffix repeats the
  // "regression" segment in the resulting path.
  Tap sink = new Hfs( new TextDelimited( Fields.ALL, " " ), outputPath + "/regression/complexlogicor", true );

  Flow flow = new FlowConnector( getProperties() ).connect( source, sink, assembly );

  flow.complete();

  validateLength( flow, 4, 2, Pattern.compile( "(1\t.)|(.\ta)" ) );
}
/**
 * Applies {@code Not( Xor( num ~ "1", char ~ "a" ) )} twice -- once with the default argument
 * selector and once with an explicit ("num", "char") selector -- and validates the sink.
 *
 * @throws Exception propagated from staging the input or from running the flow
 */
public void testComplexLogicXor() throws Exception
{
  final String input = inputFileLhs;

  if( !new File( input ).exists() )
  {
    fail( "data file not found" );
  }

  copyFromLocal( input );

  RegexFilter numIsOne = new RegexFilter( "1", true, true );
  RegexFilter charIsA = new RegexFilter( "a", true, true );

  Filter filter = new Not( new Xor( new Fields( "num" ), numIsOne, new Fields( "char" ), charIsA ) );

  // compounding the filter for the Fields.ALL case.
  Pipe assembly = new Each( new Pipe( "test" ), filter );
  assembly = new Each( assembly, new Fields( "num", "char" ), filter );

  Tap source = new Hfs( new TextDelimited( new Fields( "num", "char" ), " " ), input );

  // NOTE(review): outputPath already ends in "regression/", so this suffix repeats the
  // "regression" segment in the resulting path.
  Tap sink = new Hfs( new TextDelimited( Fields.ALL, " " ), outputPath + "/regression/complexlogicxor", true );

  Flow flow = new FlowConnector( getProperties() ).connect( source, sink, assembly );

  flow.complete();

  validateLength( flow, 3, 2, Pattern.compile( "(1\t.)|(.\ta)" ) );
}
}