/*
* Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
*
* Project and contact information: http://www.cascading.org/
*
* This file is part of the Cascading project.
*
* Cascading is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Cascading is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Cascading. If not, see <http://www.gnu.org/licenses/>.
*/
package cascading;
import java.io.File;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import cascading.cascade.Cascades;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.operation.Debug;
import cascading.operation.Filter;
import cascading.operation.Function;
import cascading.operation.Identity;
import cascading.operation.aggregator.Count;
import cascading.operation.aggregator.First;
import cascading.operation.expression.ExpressionFunction;
import cascading.operation.filter.And;
import cascading.operation.function.UnGroup;
import cascading.operation.regex.RegexFilter;
import cascading.operation.regex.RegexParser;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.InnerJoin;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.MultiSourceTap;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
/**
 * Tests of field-level pipe assemblies: grouping, chained Every/Each,
 * merging, splitting, co-grouping (cross join), un-grouping, and the
 * field-algebra result modes (ALL, REPLACE, SWAP).
 * <p>
 * Each test stages a local fixture file onto the (possibly clustered)
 * file system via {@code copyFromLocal}, runs a small {@link Flow}, and
 * validates the sink tuple count — some tests also spot-check the first
 * sink tuples through {@code flow.openSink()}.
 */
public class FieldedPipesTest extends ClusterTestCase
  {
  // local fixture files, staged into the test file system before each run
  String inputFileApache = "build/test/data/apache.10.txt";
  String inputFileIps = "build/test/data/ips.20.txt";
  String inputFileNums20 = "build/test/data/nums.20.txt";
  String inputFileNums10 = "build/test/data/nums.10.txt";
  String inputFileCritics = "build/test/data/critics.txt";
  String inputFileUpper = "build/test/data/upper.txt";
  String inputFileLower = "build/test/data/lower.txt";
  String inputFileLowerOffset = "build/test/data/lower-offset.txt";
  String inputFileJoined = "build/test/data/lower+upper.txt";
  String inputFileJoinedExtra = "build/test/data/extra+lower+upper.txt";
  String inputFileLhs = "build/test/data/lhs.txt";
  String inputFileRhs = "build/test/data/rhs.txt";
  String inputFileCross = "build/test/data/lhs+rhs-cross.txt";

  // every sink below writes under this root; each test must use a unique subdir
  String outputPath = "build/test/output/fields/";

  public FieldedPipesTest()
    {
    super( "fielded pipes", true ); // leave cluster testing enabled
    }

  /** Parses the leading IP from each log line, groups by it, and counts per group. */
  public void testSimpleGroup() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) );

    Tap sink = new Hfs( new TextLine(), outputPath + "/simple", true );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "groupcount.dot" );

    flow.complete();

    validateLength( flow.openSource(), 10 ); // validate source, this once, as a sanity check

    validateLength( flow, 8, null ); // 8 distinct IPs in the 10-line fixture
    }

  /** Chains four Every/Count aggregators after a single GroupBy. */
  public void testSimpleChain() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Count( new Fields( "count1" ) ) );
    pipe = new Every( pipe, new Count( new Fields( "count2" ) ) );
    pipe = new Every( pipe, new Count( new Fields( "count3" ) ) );
    pipe = new Every( pipe, new Count( new Fields( "count4" ) ) );

    Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "/simplechain", true );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "chainedevery.dot" );

    flow.complete();

    validateLength( flow, 8, 5 ); // 8 groups, 5 fields (ip + four counts)
    }

  /** Verifies an Each may follow a chain of Every aggregators on the same grouping. */
  public void testChainEndingWithEach() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Count( new Fields( "count1" ) ) );
    pipe = new Every( pipe, new Count( new Fields( "count2" ) ) );

    pipe = new Each( pipe, new Fields( "count1", "count2" ), new ExpressionFunction( new Fields( "sum" ), "count1 + count2", int.class ), Fields.ALL );

    Tap sink = new Hfs( new TextLine(), outputPath + "/chaineach", true );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "chainedevery.dot" );

    flow.complete();

    validateLength( flow, 8, null );
    }

  // also tests the RegexSplitter

  /** Runs a map-only flow (no GroupBy) that splits lines on whitespace. */
  public void testNoGroup() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new RegexSplitter( "\\s+" ), new Fields( 1 ) );

    Tap sink = new Hfs( new TextLine( 1 ), outputPath + "/simplesplit", true );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 10, null );

    TupleEntryIterator iterator = flow.openSink();

    assertEquals( "not equal: tuple.get(1)", "75.185.76.245", iterator.next().get( 1 ) );

    iterator.close();
    }

  /** A bare pass-through pipe: source copied straight to sink. */
  public void testCopy() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    Tap sink = new Hfs( new TextLine( 1 ), outputPath + "/copy", true );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 10, null );
    }

  /** Merges two sources through a single named GroupBy and checks interleaving on the key. */
  public void testSimpleMerge() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map<String, Tap> sources = new HashMap<String, Tap>();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), outputPath + "/complex/merge/", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );

    Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper ), new Fields( "num" ), null, false );

    Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

    flow.complete();

    validateLength( flow, 10, null );

    TupleEntryIterator iterator = flow.openSink();

    // branch order within a key is unspecified, so accept either source first
    Comparable line = iterator.next().get( 1 );
    assertTrue( "not equal: tuple.get(1)", line.equals( "1\ta" ) || line.equals( "1\tA" ) );
    line = iterator.next().get( 1 );
    assertTrue( "not equal: tuple.get(1)", line.equals( "1\ta" ) || line.equals( "1\tA" ) );

    iterator.close();
    }

  /**
   * Specifically tests GroupBy will return the correct grouping fields to the following Every
   *
   * @throws Exception
   */
  public void testSimpleMergeThree() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );
    copyFromLocal( inputFileLowerOffset );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );
    Tap sourceLowerOffset = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLowerOffset );

    Map<String, Tap> sources = new HashMap<String, Tap>();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );
    sources.put( "offset", sourceLowerOffset );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), outputPath + "/complex/mergethree/", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    Pipe pipeOffset = new Each( new Pipe( "offset" ), new Fields( "line" ), splitter );

    Pipe splice = new GroupBy( "merge", Pipe.pipes( pipeLower, pipeUpper, pipeOffset ), new Fields( "num" ) );

    splice = new Every( splice, new Fields( "char" ), new First( new Fields( "first" ) ) );

    // the Each below exercises that "num" (the grouping field) is still resolvable
    splice = new Each( splice, new Fields( "num", "first" ), new Identity() );

    Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

    flow.complete();

    validateLength( flow, 6, null );
    }

  /** UnGroup by explicit value-field selectors: one output tuple per selector group. */
  public void testUnGroup() throws Exception
    {
    if( !new File( inputFileJoined ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileJoined );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileJoined );
    Tap sink = new Hfs( new TextLine(), outputPath + "/ungrouped", true );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num", "lower", "upper" ) ) );

    pipe = new Each( pipe, new UnGroup( new Fields( "num", "char" ), new Fields( "num" ), Fields.fields( new Fields( "lower" ), new Fields( "upper" ) ) ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "ungroup.dot" );

    flow.complete();

    validateLength( flow, 10, null );
    }

  /** UnGroup by value-selector size: values taken one at a time after the group fields. */
  public void testUnGroupBySize() throws Exception
    {
    if( !new File( inputFileJoinedExtra ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileJoinedExtra );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileJoinedExtra );
    Tap sink = new Hfs( new TextLine(), outputPath + "/ungrouped_size", true );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexSplitter( new Fields( "num1", "num2", "lower", "upper" ) ) );

    pipe = new Each( pipe, new UnGroup( new Fields( "num1", "num2", "char" ), new Fields( "num1", "num2" ), 1 ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "ungroup.dot" );

    flow.complete();

    validateLength( flow, 10, null );

    TupleEntryIterator iterator = flow.openSink();

    Comparable line = iterator.next().get( 1 );
    assertTrue( "not equal: tuple.get(1)", line.equals( "1\t1\ta" ) );
    line = iterator.next().get( 1 );
    assertTrue( "not equal: tuple.get(1)", line.equals( "1\t1\tA" ) );

    iterator.close();
    }

  /** A single RegexFilter keeping only lines whose IP starts with "68". */
  public void testFilter() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink = new Hfs( new TextLine(), outputPath + "/filter", true );

    Pipe pipe = new Pipe( "test" );

    Filter filter = new RegexFilter( "^68.*" );

    pipe = new Each( pipe, new Fields( "line" ), filter );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "flow.dot" );

    flow.complete();

    validateLength( flow, 3, null );
    }

  /**
   * Composes two RegexFilters with And. A tuple is removed only when both child
   * filters vote to remove it, so lines starting with "68" survive — hence the
   * same expected count (3) as {@link #testFilter()}.
   */
  public void testLogicFilter() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink = new Hfs( new TextLine(), outputPath + "/logicfilter", true );

    Pipe pipe = new Pipe( "test" );

    Filter filter = new And( new RegexFilter( "^68.*$" ), new RegexFilter( "^1000.*$" ) );

    pipe = new Each( pipe, new Fields( "line" ), filter );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "flow.dot" );

    flow.complete();

    validateLength( flow, 3, null );
    }

  /** Parses full apache-common lines, filters to POSTs, and counts by method value. */
  public void testFilterComplex() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink = new Hfs( new TextLine(), outputPath + "/filtercomplex", true );

    Pipe pipe = new Pipe( "test" );

//    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
    pipe = new Each( pipe, new Fields( "line" ), TestConstants.APACHE_COMMON_PARSER );

    // NOTE(review): the same filter is applied twice in a row — presumably
    // exercising planner handling of duplicate operations; confirm before removing
    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) );
    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^POST" ) );

    pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL );

    pipe = new GroupBy( pipe, new Fields( "value" ) );

    pipe = new Every( pipe, new Count(), new Fields( "value", "count" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "filter.dot" );

    flow.complete();

    validateLength( flow, 1, null );
    }

  /**
   * Intentionally filters all values out to test next mr job behaves
   *
   * @throws Exception
   */
  public void testFilterAll() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    // fixed: was "/filtercomplex", which collided with testFilterComplex's sink
    // and let one test read/overwrite the other's output
    Tap sink = new Hfs( new TextLine(), outputPath + "/filterall", true );

    Pipe pipe = new Pipe( "test" );

    String regex = "^([^ ]*) +[^ ]* +[^ ]* +\\[([^]]*)\\] +\\\"([^ ]*) ([^ ]*) [^ ]*\\\" ([^ ]*) ([^ ]*).*$";
    Fields fieldDeclaration = new Fields( "ip", "time", "method", "event", "status", "size" );
    int[] groups = {1, 2, 3, 4, 5, 6};
    RegexParser function = new RegexParser( fieldDeclaration, regex, groups );
    pipe = new Each( pipe, new Fields( "line" ), function );

    pipe = new Each( pipe, new Fields( "method" ), new RegexFilter( "^fobar" ) ); // intentionally filtering all

    pipe = new GroupBy( pipe, new Fields( "method" ) );

    pipe = new Each( pipe, new Fields( "method" ), new Identity( new Fields( "value" ) ), Fields.ALL );

    pipe = new GroupBy( pipe, new Fields( "value" ) );

    pipe = new Every( pipe, new Count(), new Fields( "value", "count" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

//    flow.writeDOT( "filter.dot" );

    flow.complete();

    validateLength( flow, 0, null );
    }

//  public void testLimitFilter() throws Exception
//    {
//    if( !new File( inputFileApache ).exists() )
//      fail( "data file not found" );
//
//    copyFromLocal( inputFileApache );
//
//    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
//    Tap sink = new Lfs( new TextLine(), outputPath + "/limitfilter", true );
//
//    Pipe pipe = new Pipe( "test" );
//
//    Filter filter = new Limit( 7 );
//
//    pipe = new Each( pipe, new Fields( "line" ), filter );
//
//    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );
//
////    flow.writeDOT( "flow.dot" );
//
//    flow.complete();
//
//    validateLength( flow, 7, null );
//    }
//

  /** Inner-join CoGroup producing the full cross product of matching keys. */
  public void testCross() throws Exception
    {
    if( !new File( inputFileLhs ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLhs );
    copyFromLocal( inputFileRhs );

    Map<String, Tap> sources = new HashMap<String, Tap>();

    sources.put( "lhs", new Hfs( new TextLine(), inputFileLhs ) );
    sources.put( "rhs", new Hfs( new TextLine(), inputFileRhs ) );

    Pipe pipeLower = new Each( "lhs", new Fields( "line" ), new RegexSplitter( new Fields( "numLHS", "charLHS" ), " " ) );
    Pipe pipeUpper = new Each( "rhs", new Fields( "line" ), new RegexSplitter( new Fields( "numRHS", "charRHS" ), " " ) );

    Pipe cross = new CoGroup( pipeLower, new Fields( "numLHS" ), pipeUpper, new Fields( "numRHS" ), new InnerJoin() );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cross/", true );

    Flow flow = new FlowConnector( getProperties() ).connect( sources, sink, cross );

//    System.out.println( "flow =\n" + flow );

    flow.complete();

    validateLength( flow, 37, null );

    TupleEntryIterator iterator = flow.openSink();

    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA", iterator.next().get( 1 ) );
    assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tB", iterator.next().get( 1 ) );

    iterator.close();
    }

  /** Splits one filtered pipe into two branches, each with its own sink. */
  public void testSplit() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    // 46 192

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink1 = new Hfs( new TextLine(), outputPath + "/split1", true );
    Tap sink2 = new Hfs( new TextLine(), outputPath + "/split2", true );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

    Map<String, Tap> sources = new HashMap<String, Tap>();
    sources.put( "split", source );

    Map<String, Tap> sinks = new HashMap<String, Tap>();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    Flow flow = new FlowConnector( getProperties() ).connect( sources, sinks, left, right );

//    flow.writeDOT( "split.dot" );

    flow.complete();

    validateLength( flow, 1, "left" );
    validateLength( flow, 2, "right" );
    }

  /**
   * verifies non-safe rules apply in the proper place
   *
   * @throws Exception
   */
  public void testSplitNonSafe() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    // 46 192

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink1 = new Hfs( new TextLine(), outputPath + "/nonsafesplit1", true );
    Tap sink2 = new Hfs( new TextLine(), outputPath + "/nonsafesplit2", true );

    Pipe pipe = new Pipe( "split" );

    // run job on non-safe operation, forces 3 mr jobs.
    pipe = new Each( pipe, new TestFunction( new Fields( "ignore" ), new Tuple( 1 ), false ), new Fields( "line" ) );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

    Map<String, Tap> sources = new HashMap<String, Tap>();
    sources.put( "split", source );

    Map<String, Tap> sinks = new HashMap<String, Tap>();
    sinks.put( "left", sink1 );
    sinks.put( "right", sink2 );

    Flow flow = new FlowConnector( getProperties() ).connect( sources, sinks, left, right );

//    flow.writeDOT( "split.dot" );

    flow.complete();

    validateLength( flow, 1, "left" );
    validateLength( flow, 2, "right" );
    }

  /** Splits one pipe into two branches, then merges them back through a GroupBy. */
  public void testSplitSameSourceMerged() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    // 46 192

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink = new Hfs( new TextLine(), outputPath + "/splitsourcemerged", true );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "line" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "line" ), new RegexFilter( ".*102.*" ) );

    Pipe merged = new GroupBy( "merged", Pipe.pipes( left, right ), new Fields( "line" ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, merged );

//    flow.writeDOT( "splitmerged.dot" );

    flow.complete();

    validateLength( flow, 3 );
    }

  /**
   * verifies not inserting Identity between groups works
   *
   * @throws Exception
   */
  public void testSplitOut() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "num", "line" ) ), inputFileApache );

    Map<String, Tap> sources = new HashMap<String, Tap>();

    sources.put( "lower1", sourceLower );

    // using null pos so all fields are written
    Tap sink1 = new Hfs( new TextLine(), outputPath + "/splitout1", true );
    Tap sink2 = new Hfs( new TextLine(), outputPath + "/splitout2", true );

    Map<String, Tap> sinks = new HashMap<String, Tap>();

    sinks.put( "output1", sink1 );
    sinks.put( "output2", sink2 );

    Pipe pipeLower1 = new Pipe( "lower1" );

    Pipe left = new GroupBy( "output1", pipeLower1, new Fields( 0 ) );
    Pipe right = new GroupBy( "output2", left, new Fields( 0 ) );

    // NOTE(review): unlike the other tests this FlowConnector omits
    // getProperties() — possibly intentional (local-mode run); confirm
    Flow flow = new FlowConnector().connect( sources, sinks, Pipe.pipes( left, right ) );

//    flow.writeDOT( "splitout.dot" );

    flow.complete();

    validateLength( flow, 10, "output1" );
    validateLength( flow, 10, "output2" );
    }

  /** Splits after a group/count, mapping branches to sinks via Cascades helpers. */
  public void testSplitComplex() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    // 46 192

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink1 = new Hfs( new TextLine(), outputPath + "/splitcomp1", true );
    Tap sink2 = new Hfs( new TextLine(), outputPath + "/splitcomp2", true );

    Pipe pipe = new Pipe( "split" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Fields( "ip" ), new Count(), new Fields( "ip", "count" ) );

    pipe = new Each( pipe, new Fields( "ip" ), new RegexFilter( "^68.*" ) );

    Pipe left = new Each( new Pipe( "left", pipe ), new Fields( "ip" ), new RegexFilter( ".*46.*" ) );
    Pipe right = new Each( new Pipe( "right", pipe ), new Fields( "ip" ), new RegexFilter( ".*102.*" ) );

    Map sources = Cascades.tapsMap( "split", source );
    Map sinks = Cascades.tapsMap( Pipe.pipes( left, right ), Tap.taps( sink1, sink2 ) );

    Flow flow = new FlowConnector( getProperties() ).connect( sources, sinks, left, right );

//    flow.writeDOT( "splitcomplex.dot" );

    flow.complete();

    validateLength( flow, 1, "left" );
    validateLength( flow, 1, "right" );
    }

  /** Concatenates two sources through a MultiSourceTap into one pipe. */
  public void testConcatentation() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Tap source = new MultiSourceTap( sourceLower, sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " );

    // using null pos so all fields are written
    Tap sink = new Hfs( new TextLine(), outputPath + "/complex/concat/", true );

    Pipe pipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter );

    Pipe splice = new GroupBy( pipe, new Fields( "num" ) );

    Flow countFlow = new FlowConnector( getProperties() ).connect( source, sink, splice );

//    countFlow.writeDOT( "cogroup.dot" );
//    System.out.println( "countFlow =\n" + countFlow );

    countFlow.complete();

    validateLength( countFlow, 10, null );
    }

  /** Aggregators that emit multiple tuples per group: 8 groups x 2 x 3 results. */
  public void testGeneratorAggregator() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new TestAggregator( new Fields( "count1" ), new Fields( "ip" ), new Tuple( "first1" ), new Tuple( "first2" ) ) );
    pipe = new Every( pipe, new TestAggregator( new Fields( "count2" ), new Fields( "ip" ), new Tuple( "second" ), new Tuple( "second2" ), new Tuple( "second3" ) ) );

    Tap sink = new Hfs( new TextLine(), outputPath + "/generatoraggregator", true );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe );

    flow.complete();

    validateLength( flow, 8 * 2 * 3, null );
    }

  /**
   * If the sinks have the same scheme as a temp tap, replace the temp tap
   *
   * @throws Exception
   */
  public void testChainedTaps() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );

    Pipe pipe = new Each( new Pipe( "first" ), new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Each( new Pipe( "second", pipe ), new Fields( "ip" ), new RegexFilter( "7" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Each( new Pipe( "third", pipe ), new Fields( "ip" ), new RegexFilter( "6" ) );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    String path = outputPath + "/chainedtaps/";

    Tap sinkFirst = new Hfs( new SequenceFile( new Fields( "ip" ) ), path + "first", true );
    Tap sinkSecond = new Hfs( new SequenceFile( new Fields( "ip" ) ), path + "second", true );
    Tap sinkThird = new Hfs( new SequenceFile( new Fields( "ip" ) ), path + "third", true );

    Map<String, Tap> sinks = Cascades.tapsMap( new String[]{"first", "second",
                                                            "third"}, Tap.taps( sinkFirst, sinkSecond, sinkThird ) );

    Flow flow = new FlowConnector( getProperties() ).connect( source, sinks, pipe );

    assertEquals( "wrong number of steps", 3, flow.getSteps().size() );

//    flow.writeDOT( "chainedtaps.dot" );

    flow.complete();

    validateLength( flow, 3, null );
    }

  /**
   * Exercises Fields.REPLACE: each Each overwrites argument fields in place.
   * <p>
   * NOTE(review): unlike most tests this one neither calls copyFromLocal nor
   * passes getProperties() to the FlowConnector — presumably a local-only
   * test; confirm before aligning it with the cluster-enabled tests.
   */
  public void testReplace() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink = new Hfs( new TextLine( new Fields( "offset", "line" ), new Fields( "offset", "line" ) ), outputPath + "/replace", true );

    Pipe pipe = new Pipe( "test" );

    Function parser = new RegexParser( new Fields( 0 ), "^[^ ]*" );
    pipe = new Each( pipe, new Fields( "line" ), parser, Fields.REPLACE );
    pipe = new Each( pipe, new Fields( "line" ), new Identity( Fields.ARGS ), Fields.REPLACE );
    pipe = new Each( pipe, new Fields( "line" ), new Identity( new Fields( "line" ) ), Fields.REPLACE );

    pipe = new Each( pipe, new Debug( true ) );

    Flow flow = new FlowConnector().connect( source, sink, pipe );

//    flow.writeDOT( "simple.dot" );

    flow.complete();

    validateLength( flow, 10, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) );
    }

  /**
   * Exercises Fields.SWAP: argument fields are discarded and replaced by the
   * operation's output fields.
   * <p>
   * NOTE(review): like testReplace, no copyFromLocal/getProperties here —
   * presumably a local-only test; confirm.
   */
  public void testSwap() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );
    Tap sink = new Hfs( new TextLine( new Fields( "offset", "line" ), new Fields( "count", "ipaddress" ) ), outputPath + "/swap", true );

    Pipe pipe = new Pipe( "test" );

    Function parser = new RegexParser( new Fields( "ip" ), "^[^ ]*" );
    pipe = new Each( pipe, new Fields( "line" ), parser, Fields.SWAP );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Fields( "ip" ), new Count( new Fields( "count" ) ) );

    pipe = new Each( pipe, new Fields( "ip" ), new Identity( new Fields( "ipaddress" ) ), Fields.SWAP );

    Flow flow = new FlowConnector().connect( source, sink, pipe );

//    flow.writeDOT( "simple.dot" );

    flow.complete();

    validateLength( flow, 8, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) );
    }
  }