/* * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved. * * Project and contact information: http://www.cascading.org/ * * This file is part of the Cascading project. * * Cascading is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * Cascading is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with Cascading. If not, see <http://www.gnu.org/licenses/>. */ package cascading.tap; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.URI; import java.net.URISyntaxException; import java.util.regex.Pattern; import cascading.ClusterTestCase; import cascading.cascade.Cascade; import cascading.cascade.CascadeConnector; import cascading.flow.Flow; import cascading.flow.FlowConnector; import cascading.flow.MultiMapReducePlanner; import cascading.operation.Function; import cascading.operation.Identity; import cascading.operation.regex.RegexSplitter; import cascading.pipe.Each; import cascading.pipe.Pipe; import cascading.scheme.SequenceFile; import cascading.scheme.TextDelimited; import cascading.scheme.TextLine; import cascading.tuple.Fields; import cascading.tuple.Tuple; import cascading.tuple.TupleEntryIterator; import org.apache.hadoop.mapred.JobConf; /** * */ public class TapTest extends ClusterTestCase implements Serializable { String inputFileComments = "build/test/data/comments+lower.txt"; String inputFileJoined = "build/test/data/lower+upper.txt"; String inputFileCross = "build/test/data/lhs+rhs-cross.txt"; String inputFileUpper = "build/test/data/upper.txt"; String inputFileLower = "build/test/data/lower.txt"; String outputPath = "build/test/output/tap/"; public TapTest() { super( "tap tests", true ); } public void testDfs() throws URISyntaxException, IOException { Tap tap = new Dfs( new Fields( "foo" ), "some/path" ); assertTrue( "wrong scheme", tap.getQualifiedPath( MultiMapReducePlanner.getJobConf( getProperties() ) ).toUri().getScheme().equalsIgnoreCase( "hdfs" ) ); new Dfs( new Fields( "foo" ), "hdfs://localhost:5001/some/path" ); new Dfs( new Fields( "foo" ), new URI( "hdfs://localhost:5001/some/path" ) ); try { new Dfs( new Fields( "foo" ), "s3://localhost:5001/some/path" ); fail( "not valid url" ); } catch( Exception exception ) { } try { new Dfs( new Fields( "foo" ), new URI( "s3://localhost:5001/some/path" ) ); fail( "not valid url" ); } catch( Exception exception ) { } } public void testS3fs() throws URISyntaxException, IOException { // don't test qualified path, it tries to connect to s3 service new S3fs( new Fields( "foo" ), "s3://localhost:5001/some/path" ); new S3fs( new Fields( "foo" ), new URI( "s3://localhost:5001/some/path" ) ); try { new S3fs( new Fields( "foo" ), "hdfs://localhost:5001/some/path" ); fail( "not valid url" ); } catch( Exception exception ) { } try { new S3fs( new Fields( "foo" ), new URI( "hdfs://localhost:5001/some/path" ) ); fail( "not valid url" ); } catch( Exception exception ) { } } public void testLfs() throws URISyntaxException, IOException { Tap tap = new Lfs( new Fields( "foo" ), "some/path" ); assertTrue( "wrong scheme", tap.getQualifiedPath( MultiMapReducePlanner.getJobConf( getProperties() ) ).toUri().getScheme().equalsIgnoreCase( "file" ) ); new Lfs( new Fields( "foo" ), "file:///some/path" ); try { new Lfs( new Fields( "foo" ), "s3://localhost:5001/some/path" ); fail( "not valid url" ); } catch( Exception exception ) { } } public class CommentScheme extends TextLine { public CommentScheme() { } public CommentScheme( Fields sourceFields ) { super( sourceFields ); } @Override public Tuple source( Object key, Object value ) { if( value.toString().matches( "^\\s*#.*$" ) ) return null; return super.source( key, value ); } } public void testNullsFromScheme() throws IOException { if( !new File( inputFileComments ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileComments ); Tap source = new Hfs( new CommentScheme( new Fields( "line" ) ), inputFileComments ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new Identity() ); Tap sink = new Hfs( new TextLine( 1 ), outputPath + "/testnulls", true ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); validateLength( flow, 5, null ); TupleEntryIterator iterator = flow.openSink(); assertEquals( "not equal: tuple.get(1)", "1 a", iterator.next().get( 1 ) ); iterator.close(); // confirm the tuple iterator can handle nulls from the source validateLength( flow.openSource(), 5 ); } public void testTemplateTap() throws IOException { if( !new File( inputFileJoined ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileJoined ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileJoined ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new RegexSplitter( new Fields( "number", "lower", "upper" ), "\t" ) ); Tap sink = new Hfs( new TextLine( 1 ), outputPath + "/testtemplates", true ); sink = new TemplateTap( (Hfs) sink, "%s-%s", 1 ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); Tap test = new Hfs( new TextLine( 1 ), sink.getPath().toString() + "/1-a" ); validateLength( flow.openTapForRead( test ), 1 ); test = new Hfs( new TextLine( 1 ), sink.getPath().toString() + "/2-b" ); validateLength( flow.openTapForRead( test ), 1 ); } public void testTemplateTapTextDelimited() throws IOException { if( !new File( inputFileJoined ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileJoined ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileJoined ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new RegexSplitter( new Fields( "number", "lower", "upper" ), "\t" ) ); Tap sink = new Hfs( new TextDelimited( new Fields( "number", "lower", "upper" ), "+" ), outputPath + "/testdelimitedtemplates", true ); sink = new TemplateTap( (Hfs) sink, "%s-%s", 1 ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); Tap test = new Hfs( new TextLine( new Fields( "line" ) ), sink.getPath().toString() + "/1-a" ); validateLength( flow.openTapForRead( test ), 1, Pattern.compile( "[0-9]\\+[a-z]\\+[A-Z]" ) ); test = new Hfs( new TextLine( new Fields( "line" ) ), sink.getPath().toString() + "/2-b" ); validateLength( flow.openTapForRead( test ), 1, Pattern.compile( "[0-9]\\+[a-z]\\+[A-Z]" ) ); } public void testTemplateTapView() throws IOException { if( !new File( inputFileJoined ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileJoined ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileJoined ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new RegexSplitter( new Fields( "number", "lower", "upper" ), "\t" ) ); Tap sink = new Hfs( new SequenceFile( new Fields( "upper" ) ), outputPath + "/testtemplatesview", true ); sink = new TemplateTap( (Hfs) sink, "%s-%s", new Fields( "number", "lower" ), 1 ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); Tap test = new Hfs( new SequenceFile( new Fields( "upper" ) ), sink.getPath().toString() + "/1-a" ); validateLength( flow.openTapForRead( test ), 1, 1 ); test = new Hfs( new SequenceFile( new Fields( "upper" ) ), sink.getPath().toString() + "/2-b" ); validateLength( flow.openTapForRead( test ), 1, 1 ); TupleEntryIterator input = flow.openTapForRead( test ); // open 2-b assertEquals( "wrong value", "B", input.next().get( 0 ) ); input.close(); } public void testSinkDeclaredFields() throws IOException { if( !new File( inputFileCross ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileCross ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileCross ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new RegexSplitter( new Fields( "first", "second", "third" ), "\\s" ), Fields.ALL ); Tap sink = new Hfs( new TextLine( new Fields( "line" ), new Fields( "second", "first", "third" ) ), outputPath + "/declaredsinks", true ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); // flow.writeDOT( "declaredsinks.dot" ); flow.complete(); validateLength( flow, 37, null ); TupleEntryIterator iterator = flow.openSink(); String line = iterator.next().getString( 0 ); assertTrue( "not equal: wrong values", line.matches( "[a-z]\t[0-9]\t[A-Z]" ) ); iterator.close(); } public void testSinkUnknown() throws IOException { if( !new File( inputFileCross ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileCross ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileCross ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new RegexSplitter( new Fields( "first", "second", "third" ), "\\s" ), Fields.RESULTS ); Tap sink = new Hfs( new SequenceFile( Fields.UNKNOWN ), outputPath + "/unknownsinks", true ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); validateLength( flow, 37, null ); TupleEntryIterator iterator = flow.openSink(); String line = iterator.next().getTuple().toString(); assertTrue( "not equal: wrong values: " + line, line.matches( "[0-9]\t[a-z]\t[A-Z]" ) ); iterator.close(); } public void testMultiSinkTap() throws IOException { if( !new File( inputFileJoined ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileJoined ); Tap source = new Hfs( new TextLine( new Fields( "line" ) ), inputFileJoined ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new RegexSplitter( new Fields( "number", "lower", "upper" ), "\t" ) ); Tap lhsSink = new Hfs( new TextLine( new Fields( "offset", "line" ), new Fields( "number", "lower" ) ), outputPath + "/multisink/lhs", SinkMode.REPLACE ); Tap rhsSink = new Hfs( new TextLine( new Fields( "offset", "line" ), new Fields( "number", "upper" ) ), outputPath + "/multisink/rhs", SinkMode.REPLACE ); Tap sink = new MultiSinkTap( lhsSink, rhsSink ); Flow flow = new FlowConnector( getProperties() ).connect( source, sink, pipe ); flow.complete(); validateLength( flow.openTapForRead( lhsSink ), 5 ); validateLength( flow.openTapForRead( rhsSink ), 5 ); } public void testGlobHfs() throws Exception { if( !new File( inputFileLower ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileLower ); copyFromLocal( inputFileUpper ); GlobHfs source = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), "build/test/data/?{ppe[_r],owe?}.txt" ); assertEquals( 2, source.getTaps().length ); // show globhfs will just match a directory if ended with a / assertEquals( 1, new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), "build/test/?ata/" ).getTaps().length ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), outputPath + "/glob/", true ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = new FlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), outputPath + "/glob2/", true ); Flow nextFlow = new FlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector().connect( concatFlow, nextFlow ); cascade.complete(); // countFlow.writeDOT( "cogroup.dot" ); // System.out.println( "countFlow =\n" + countFlow ); validateLength( concatFlow, 10, null ); } public void testNestedMultiSource() throws Exception { if( !new File( inputFileLower ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileLower ); copyFromLocal( inputFileUpper ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), "build/test/data/?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), "build/test/data/?{owe?}.txt" ); MultiSourceTap source = new MultiSourceTap( source1, source2 ); assertEquals( 2, source.getTaps().length ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), outputPath + "/glob/", true ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), "\\s" ); Pipe concatPipe = new Each( new Pipe( "concat" ), new Fields( "line" ), splitter ); Flow concatFlow = new FlowConnector( getProperties() ).connect( "first", source, sink, concatPipe ); Tap nextSink = new Hfs( new TextLine(), outputPath + "/glob2/", true ); Flow nextFlow = new FlowConnector( getProperties() ).connect( "second", sink, nextSink, concatPipe ); Cascade cascade = new CascadeConnector().connect( concatFlow, nextFlow ); cascade.complete(); validateLength( concatFlow, 10, null ); } public void testMultiSourceIterator() throws Exception { if( !new File( inputFileLower ).exists() ) fail( "data file not found" ); copyFromLocal( inputFileLower ); copyFromLocal( inputFileUpper ); Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper ); Tap source = new MultiSourceTap( sourceLower, sourceUpper ); validateLength( source.openForRead( new JobConf() ), 10, null ); GlobHfs source1 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), "build/test/data/?{ppe[_r]}.txt" ); GlobHfs source2 = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), "build/test/data/?{owe?}.txt" ); source = new MultiSourceTap( source1, source2 ); validateLength( source.openForRead( new JobConf() ), 10, null ); GlobHfs sourceMulti = new GlobHfs( new TextLine( new Fields( "offset", "line" ) ), "build/test/data/?{ppe[_r],owe?}.txt" ); source = new MultiSourceTap( sourceMulti ); validateLength( source.openForRead( new JobConf() ), 10, null ); } }