/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tuple.hadoop;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import cascading.ClusterTestCase;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.MultiMapReducePlanner;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexParser;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.CoGroupClosure;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;

public class SerializedPipesTest extends ClusterTestCase
  {
  String inputFileApache = "build/test/data/apache.10.txt";
  String inputFileUpper = "build/test/data/upper.txt";
  String inputFileLower = "build/test/data/lower.txt";

  String outputPath = "build/test/output/tuple/";

  public static class InsertBytes extends BaseOperation implements Function
    {
    String asBytes;

    public InsertBytes( Fields fieldDeclaration, String asBytes )
      {
      super( fieldDeclaration );
      this.asBytes = asBytes;
      }

    public void operate( FlowProcess flowProcess, FunctionCall functionCall )
      {
      functionCall.getOutputCollector().add( new Tuple( new BytesWritable( asBytes.getBytes() ) ) );
      }
    }

  public static class ReplaceAsBytes extends BaseOperation implements Function
    {
    public ReplaceAsBytes( Fields fieldDeclaration )
      {
      super( fieldDeclaration );
      }

    public void operate( FlowProcess flowProcess, FunctionCall functionCall )
      {
      functionCall.getOutputCollector().add( new Tuple( new BytesWritable( functionCall.getArguments().getString( 0 ).getBytes() ) ) );
      }
    }
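  /**
   * Emits the given string as a raw byte[] value rather than a Writable. A plain
   * byte[] has no Hadoop serializer registered by default, so the raw-bytes tests
   * below register a byte[]-aware serialization (BytesSerialization, from this
   * package) for the value to survive the tuple stream.
   */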
  public static class InsertRawBytes extends BaseOperation<Long> implements Function<Long>
    {
    String asBytes;
    private boolean increment;

    public InsertRawBytes( Fields fieldDeclaration, String asBytes, boolean increment )
      {
      super( fieldDeclaration );
      this.asBytes = asBytes;
      this.increment = increment;
      }

    @Override
    public void prepare( FlowProcess flowProcess, OperationCall<Long> operationCall )
      {
      operationCall.setContext( increment ? 0L : -1L );
      }

    public void operate( FlowProcess flowProcess, FunctionCall<Long> functionCall )
      {
      String string = asBytes;

      if( functionCall.getContext() != -1 )
        {
        string = functionCall.getContext() + string;
        functionCall.setContext( functionCall.getContext() + 1 );
        }

      functionCall.getOutputCollector().add( new Tuple( (Object) string.getBytes() ) );
      }
    }

  public static class InsertBoolean extends BaseOperation implements Function
    {
    boolean asBoolean;

    public InsertBoolean( Fields fieldDeclaration, boolean asBoolean )
      {
      super( fieldDeclaration );
      this.asBoolean = asBoolean;
      }

    public void operate( FlowProcess flowProcess, FunctionCall functionCall )
      {
      functionCall.getOutputCollector().add( new Tuple( new BooleanWritable( asBoolean ) ) );
      }
    }

  public static class Container implements Serializable, Comparable<String>
    {
    String value;

    public Container( String value )
      {
      this.value = value;
      }

    @Override
    public int compareTo( String o )
      {
      return value.compareTo( o );
      }
    }

  public static class InsertTestText extends BaseOperation<Long> implements Function<Long>
    {
    private String testText;
    private boolean increment;
    private int moduloValueIsNull;
    private int moduloResultIsNull;

    public InsertTestText( Fields fieldDeclaration, String testText, boolean increment )
      {
      this( fieldDeclaration, testText, increment, -1, -1 );
      }

    public InsertTestText( Fields fieldDeclaration, String testText, boolean increment, int moduloValueIsNull, int moduloResultIsNull )
      {
      super( fieldDeclaration );
      this.testText = testText;
      this.increment = increment;
      this.moduloValueIsNull = moduloValueIsNull;
      this.moduloResultIsNull = moduloResultIsNull;
      }

    @Override
    public void prepare( FlowProcess flowProcess, OperationCall<Long> operationCall )
      {
      operationCall.setContext( increment ? 0L : -1L );
      }

    public void operate( FlowProcess flowProcess, FunctionCall<Long> functionCall )
      {
      String string = testText;

      if( functionCall.getContext() != -1 )
        {
        string = functionCall.getContext() + string;
        functionCall.setContext( functionCall.getContext() + 1 );

        if( moduloValueIsNull != -1 && functionCall.getContext() % moduloValueIsNull == 0 )
          string = null;
        }

      TestText result = null;

      if( moduloResultIsNull != -1 && functionCall.getContext() % moduloResultIsNull != 0 )
        result = new TestText( string );

      functionCall.getOutputCollector().add( new Tuple( result ) );
      }
    }

  public SerializedPipesTest()
    {
    super( "serialized pipes", true ); // leave cluster testing enabled
    }
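  /**
   * Groups Apache log lines by IP while passing BytesWritable and BooleanWritable
   * values through the flow. The addSerializationToken() call below maps
   * BooleanWritable to integer token 1000 so the tuple stream can write a compact
   * token in place of the full class name; 1000 is an arbitrary application-chosen id.
   */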
  public void testSimpleGroup() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new Each( pipe, new InsertBytes( new Fields( "bytes" ), "inserted text as bytes" ), Fields.ALL );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) );

    pipe = new Each( pipe, new InsertBoolean( new Fields( "boolean" ), false ), Fields.ALL );

    Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "/hadoop/serialization", true );

    Map<Object, Object> jobProperties = getProperties();

    TupleSerialization.addSerializationToken( jobProperties, 1000, BooleanWritable.class.getName() );

    Flow flow = new FlowConnector( jobProperties ).connect( source, sink, pipe );

//    flow.writeDOT( "groupcount.dot" );

    flow.complete();

    validateLength( flow.openSource(), 10 ); // validate source, this once, as a sanity check
    validateLength( flow, 8, null );
    }

  public void testCoGroupWritableAsKeyValue() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "/hadoop/writablekeyvalue", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new InsertBytes( new Fields( "group" ), "inserted text as bytes" ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertBytes( new Fields( "value" ), "inserted text as bytes" ), Fields.ALL );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new InsertBytes( new Fields( "group" ), "inserted text as bytes" ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertBytes( new Fields( "value" ), "inserted text as bytes" ), Fields.ALL );

    Pipe splice = new CoGroup( pipeLower, new Fields( "group" ), pipeUpper, new Fields( "group" ), Fields.size( 8 ) );

    Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

//    countFlow.writeDOT( "cogroup.dot" );
//    System.out.println( "countFlow =\n" + countFlow );

    countFlow.complete();

    validateLength( countFlow, 25, null );
    }

  public void testCoGroupBytesWritableAsKeyValue() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Tap sink = new Hfs( new TextLine(), outputPath + "/hadoop/byteswritablekeyvalue", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new Fields( "char" ), new ReplaceAsBytes( new Fields( "char" ) ), Fields.REPLACE );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new Fields( "char" ), new ReplaceAsBytes( new Fields( "char" ) ), Fields.REPLACE );

    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

    Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

//    countFlow.writeDOT( "cogroup.dot" );
//    System.out.println( "countFlow =\n" + countFlow );

    countFlow.complete();

    validateLength( countFlow, 5, null );

    TupleEntryIterator iterator = countFlow.openSink();

    assertEquals( "not equal: tuple.get(1)", "1\t61\t1\t41", iterator.next().get( 1 ) );

    iterator.close();
    }
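  /**
   * Sets CoGroupClosure.SPILL_THRESHOLD to 1 so the co-group's grouping collection
   * spills to disk on nearly every tuple, forcing the custom TestText values to be
   * round-tripped through the TestSerialization registered on io.serializations.
   */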
  public void testCoGroupSpillCustomWritable() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "/hadoop/customerwritable", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "group" ), "inserted text as bytes", false ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "value" ), "inserted text as bytes", false ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "text" ), "inserted text as custom text", false ), Fields.ALL );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "group" ), "inserted text as bytes", false ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "value" ), "inserted text as bytes", false ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "text" ), "inserted text as custom text", false ), Fields.ALL );

    Pipe splice = new CoGroup( pipeLower, new Fields( "group" ), pipeUpper, new Fields( "group" ), Fields.size( 10 ) );

    Map<Object, Object> properties = getProperties();

    properties.put( CoGroupClosure.SPILL_THRESHOLD, 1 );

//    String serializations = MultiMapReducePlanner.getJobConf( properties ).get( "io.serializations" );
//    serializations = Util.join( ",", serializations, JavaSerialization.class.getName() );
//    System.out.println( "serializations = " + serializations );
//    MultiMapReducePlanner.getJobConf( properties ).set( "io.serializations", serializations );

    MultiMapReducePlanner.getJobConf( properties ).set( "io.serializations", TestSerialization.class.getName() );

    Flow countFlow = new FlowConnector( properties ).connect( sources, sink, splice );

//    countFlow.writeDOT( "cogroup.dot" );
//    System.out.println( "countFlow =\n" + countFlow );

    countFlow.complete();

    validateLength( countFlow, 25, null );
    }

  public void testCoGroupRawAsKeyValue() throws Exception
    {
    invokeRawAsKeyValue( false, true, false, false );
    }

  public void testCoGroupRawAsKeyValueDefault() throws Exception
    {
    invokeRawAsKeyValue( true, true, false, false );
    }

  public void testCoGroupRawAsKeyValueDefaultIgnoreToken() throws Exception
    {
    invokeRawAsKeyValue( true, true, true, false );
    }

  public void testCoGroupRawAsKeyValueDefaultIgnoreTokenCompositeGrouping() throws Exception
    {
    invokeRawAsKeyValue( true, true, true, true );
    }

  public void testCoGroupRawAsKeyValueNoSecondary() throws Exception
    {
    invokeRawAsKeyValue( false, false, false, false );
    }

  public void testCoGroupRawAsKeyValueDefaultNoSecondary() throws Exception
    {
    invokeRawAsKeyValue( true, false, false, false );
    }

  public void testCoGroupRawAsKeyValueDefaultNoSecondaryCompositeGrouping() throws Exception
    {
    invokeRawAsKeyValue( true, false, false, true );
    }
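  /**
   * Shared body for the testCoGroupRawAsKeyValue* variants above.
   *
   * @param useDefaultComparator     when false, install TestTextComparator / BytesComparator on the grouping and sort fields
   * @param secondarySortOnValue     when true, follow the CoGroup with a GroupBy that secondary-sorts on the raw "value" field
   * @param ignoreSerializationToken when true, register the NoToken* serializations so no serialization token is used
   * @param compositeGrouping        when true, group on the composite ( "group", "num" ) key
   */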
  private void invokeRawAsKeyValue( boolean useDefaultComparator, boolean secondarySortOnValue, boolean ignoreSerializationToken, boolean compositeGrouping ) throws IOException
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Fields fields = new Fields( "num", "char", "group", "value", "num2", "char2", "group2", "value2" );
    Tap sink = new Hfs( new SequenceFile( fields ), outputPath + "/hadoop/rawbyteskeyvalue/" + useDefaultComparator + "/" + secondarySortOnValue + "/" + ignoreSerializationToken + "/" + compositeGrouping, true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "group" ), "inserted text as bytes", true, 3, 4 ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertRawBytes( new Fields( "value" ), "inserted text as bytes", true ), Fields.ALL );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "group" ), "inserted text as bytes", true, 3, 4 ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertRawBytes( new Fields( "value" ), "inserted text as bytes", true ), Fields.ALL );

    Fields groupFields = new Fields( "group" );

    if( compositeGrouping )
      groupFields = new Fields( "group", "num" );

    if( !useDefaultComparator )
      groupFields.setComparator( "group", new TestTextComparator() );

    Fields declaredFields = new Fields( "num", "char", "group", "value", "num2", "char2", "group2", "value2" );

    Pipe splice = new CoGroup( pipeLower, groupFields, pipeUpper, groupFields, declaredFields );

    // test sorting comparison
    Fields valueFields = new Fields( "value" );

    if( !useDefaultComparator )
      valueFields.setComparator( "value", new BytesComparator() );

    if( secondarySortOnValue )
      splice = new GroupBy( splice, groupFields, valueFields );
    else
      splice = new GroupBy( splice, groupFields );

    Map<Object, Object> properties = getProperties();

    if( !ignoreSerializationToken )
      {
      TupleSerialization.addSerialization( properties, TestSerialization.class.getName() );
      TupleSerialization.addSerialization( properties, BytesSerialization.class.getName() );
      }
    else
      {
      TupleSerialization.addSerialization( properties, NoTokenTestSerialization.class.getName() );
      TupleSerialization.addSerialization( properties, NoTokenTestBytesSerialization.class.getName() );
      }

    MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 1 );

    Flow flow = new FlowConnector( properties ).connect( sources, sink, splice );

    flow.complete();

    validateLength( flow, 5, null );

    // test the ordering
    TupleEntryIterator iterator = flow.openSink();
    TestText target = (TestText) iterator.next().getObject( "group" );
    String value = target == null ? null : target.value;
//    System.out.println( "value = " + value );

    while( iterator.hasNext() )
      {
      TestText nextTarget = (TestText) iterator.next().getObject( "group" );
      String next = nextTarget == null ? null : nextTarget.value;

      if( value != null && value.compareTo( next ) >= 0 )
        fail( "not increasing: " + value + " " + next );

      value = next;
//      System.out.println( "value = " + value );
      }

    iterator.close();
    }
  }