/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading; import java.io.File; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.regex.Pattern; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.operation.Aggregator; import cascading.operation.ConcreteCall; import cascading.operation.Filter; import cascading.operation.Function; import cascading.operation.Identity; import cascading.operation.aggregator.Count; import cascading.operation.expression.ExpressionFilter; import cascading.operation.function.UnGroup; import cascading.operation.regex.RegexFilter; import cascading.operation.regex.RegexParser; import cascading.operation.regex.RegexSplitter; import cascading.pipe.CoGroup; import cascading.pipe.Each; import cascading.pipe.Every; import cascading.pipe.GroupBy; import cascading.pipe.Pipe; import cascading.scheme.TextLine; import cascading.tap.Hfs; import cascading.tap.Tap; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntry; import cascading.tuple.TupleEntryIterator; import cascading.tuple.TupleListCollector; /** * These tests execute basic function using field positions, not names. so there will be duplicates with * FieldedPipestest */ public class BasicPipesTest extends CascadingTestCase { String inputFileApache = "build/test/data/apache.10.txt"; String inputFileIps = "build/test/data/ips.20.txt"; String inputFileNums = "build/test/data/nums.20.txt"; String inputFileUpper = "build/test/data/upper.txt"; String inputFileLower = "build/test/data/lower.txt"; String inputFileJoined = "build/test/data/lower+upper.txt"; String outputPath = "build/test/output/results"; public BasicPipesTest() { super( "build pipes" ); } /** * Test the count aggregator function * * @throws IOException */ public void testCount() throws Exception { runTestCount( new Fields( 1 ), new Fields( 0 ), new Fields( 0, 1 ) ); } public void testCount2() throws Exception { runTestCount( new Fields( 1 ), new Fields( "count" ), new Fields( 0, "count" ) ); } public void testCount3() throws Exception { runTestCount( new Fields( 1 ), new Fields( "count" ), Fields.ALL ); } public void testCount4() throws Exception { runTestCount( Fields.ALL, new Fields( "count" ), Fields.ALL ); } void runTestCount( Fields argumentSelector, Fields fieldDeclaration, Fields outputSelector ) throws Exception { if( !new File( inputFileIps ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileIps ); Tap sink = new Hfs( new TextLine(), outputPath + "/count", true ); Pipe pipe = new Pipe( "count" ); pipe = new GroupBy( pipe, new Fields( 1 ) ); pipe = new Every( pipe, argumentSelector, new Count( fieldDeclaration ), outputSelector ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "basic.dot" ); flow.start(); flow.complete(); TupleEntryIterator iterator = flow.openSink(); Function splitter = new RegexSplitter( Fields.size( 2 ) ); boolean found = false; while( iterator.hasNext() ) { Tuple tuple = iterator.next().getTuple(); // System.out.println( "tuple = " + tuple ); TupleListCollector tupleEntryCollector = new TupleListCollector( Fields.size( 2 ) ); Tuple tuple1 = tuple.get( new int[]{1} ); ConcreteCall operationCall = new ConcreteCall( new TupleEntry( tuple1 ), tupleEntryCollector ); splitter.prepare( null, operationCall ); splitter.operate( null, operationCall ); Tuple tupleEntry = tupleEntryCollector.iterator().next(); if( tupleEntry.get( 0 ).equals( "63.123.238.8" ) ) { found = true; assertEquals( "wrong count", "2", tupleEntry.get( 1 ) ); } } iterator.close(); if( !found ) fail( "never found ip" ); validateLength( flow, 17 ); } /** * A slightly more complex pipe * * @throws IOException */ public void testSimple() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileApache ); Tap sink = new Hfs( new TextLine( Fields.size( 1 ) ), outputPath + "/simple", true ); Pipe pipe = new Pipe( "test" ); Function parser = new RegexParser( "^[^ ]*" ); pipe = new Each( pipe, new Fields( 1 ), parser, new Fields( 0, 2 ) ); // test that selector against incoming creates proper outgoing pipe = new Each( pipe, new Fields( 1 ), new Identity() ); pipe = new GroupBy( pipe, new Fields( 0 ) ); Aggregator counter = new Count(); pipe = new Every( pipe, new Fields( 0 ), counter, new Fields( 0, 1 ) ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "simple.dot" ); flow.complete(); validateLength( flow, 8, 1 ); } /** * tests that the Fields.ARGS declarator properly resolves into a declarator * * @throws Exception */ public void testSimpleResult() throws Exception { if( !new File( inputFileLower ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileLower ); Tap sink = new Hfs( new TextLine( Fields.size( 1 ) ), outputPath + "/simpleresult", true ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Fields( 0 ), new ExpressionFilter( "$0 == 0", Long.class ) ); pipe = new Each( pipe, new Fields( 1 ), new Identity() ); pipe = new Each( pipe, Fields.ALL, new RegexFilter( "a|b|c" ) ); pipe = new GroupBy( pipe, new Fields( 0 ) ); Aggregator counter = new Count(); pipe = new Every( pipe, new Fields( 0 ), counter, new Fields( 0, 1 ) ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "simple.dot" ); flow.complete(); validateLength( flow, 2, 1 ); } public void testSimpleRelative() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileApache ); Tap sink = new Hfs( new TextLine(), outputPath + "/simplerelative", true ); Pipe pipe = new Pipe( "test" ); Function parser = new RegexParser( "^[^ ]*" ); pipe = new Each( pipe, new Fields( -1 ), parser, new Fields( -1 ) ); pipe = new GroupBy( pipe, new Fields( 0 ) ); Aggregator counter = new Count(); pipe = new Every( pipe, new Fields( 0 ), counter, new Fields( 0, 1 ) ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "simple.dot" ); flow.complete(); validateLength( flow, 8 ); } public void testCoGroup() throws Exception { if( !new File( inputFileLower ).exists() ) fail( "data file not found" ); Tap sourceLower = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileLower ); Tap sourceUpper = new Hfs( new TextLine(), inputFileUpper ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), outputPath + "/complex/cogroup/", true ); Function splitter = new RegexSplitter( Fields.size( 2 ), " " ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( 1 ), splitter, Fields.RESULTS ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( 1 ), splitter, Fields.RESULTS ); Pipe splice = new CoGroup( pipeLower, new Fields( 0 ), pipeUpper, new Fields( 0 ) ); Flow countFlow = new FlowConnector().connect( sources, sink, splice ); // System.out.println( "countFlow =\n" + countFlow ); // countFlow.writeDOT( "cogroup.dot" ); countFlow.complete(); validateLength( countFlow, 5 ); TupleEntryIterator iterator = countFlow.openSink(); assertEquals( "not equal: tuple.get(1)", "1\ta\t1\tA", iterator.next().get( 1 ) ); assertEquals( "not equal: tuple.get(1)", "2\tb\t2\tB", iterator.next().get( 1 ) ); iterator.close(); } public void testUnGroup() throws Exception { if( !new File( inputFileJoined ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileJoined ); Tap sink = new Hfs( new TextLine(), outputPath + "/ungrouped", true ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Fields( 1 ), new RegexSplitter( Fields.size( 3 ) ) ); pipe = new Each( pipe, new UnGroup( Fields.size( 2 ), new Fields( 0 ), Fields.fields( new Fields( 1 ), new Fields( 2 ) ) ) ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "ungroup.dot" ); flow.complete(); validateLength( flow, 10 ); } public void testFilterAll() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileApache ); Tap sink = new Hfs( new TextLine(), outputPath + "/filterall", true ); Pipe pipe = new Pipe( "test" ); Filter filter = new RegexFilter( ".*", true ); pipe = new Each( pipe, new Fields( 1 ), filter ); Flow flow = new FlowConnector().connect( source, sink, pipe ); flow.complete(); validateLength( flow, 0 ); } public void testFilter() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileApache ); Tap sink = new Hfs( new TextLine(), outputPath + "/filter", true ); Pipe pipe = new Pipe( "test" ); Filter filter = new RegexFilter( "^68.*" ); pipe = new Each( pipe, new Fields( 1 ), filter ); Flow flow = new FlowConnector().connect( source, sink, pipe ); flow.complete(); validateLength( flow, 3 ); } public void testSimpleChain() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileApache ); Tap sink = new Hfs( new TextLine(), outputPath + "/simple", true ); Pipe pipe = new Pipe( "test" ); Function parser = new RegexParser( "^[^ ]*" ); pipe = new Each( pipe, new Fields( 1 ), parser, new Fields( 2 ) ); pipe = new GroupBy( pipe, new Fields( 0 ) ); pipe = new Every( pipe, new Fields( 0 ), new Count(), new Fields( 0, 1 ) ); // add a second group to force a new map/red pipe = new GroupBy( pipe, new Fields( 0 ) ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "simplechain.dot" ); flow.complete(); validateLength( flow, 8 ); } public void testReplace() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileApache ); Tap sink = new Hfs( new TextLine(), outputPath + "/replace", true ); Pipe pipe = new Pipe( "test" ); Function parser = new RegexParser( Fields.ARGS, "^[^ ]*" ); pipe = new Each( pipe, new Fields( 1 ), parser, Fields.REPLACE ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "simple.dot" ); flow.complete(); validateLength( flow, 10, 2, Pattern.compile( "\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}" ) ); } public void testSwap() throws Exception { if( !new File( inputFileApache ).exists() ) fail( "data file not found" ); Tap source = new Hfs( new TextLine( Fields.size( 2 ) ), inputFileApache ); Tap sink = new Hfs( new TextLine(), outputPath + "/swap", true ); Pipe pipe = new Pipe( "test" ); Function parser = new RegexParser( new Fields( 0 ), "^[^ ]*" ); pipe = new Each( pipe, new Fields( 1 ), parser, Fields.SWAP ); Flow flow = new FlowConnector().connect( source, sink, pipe ); // flow.writeDOT( "simple.dot" ); flow.complete(); validateLength( flow, 10, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) ); } }