/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tuple.hadoop;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import cascading.ClusterTestCase;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.MultiMapReducePlanner;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.OperationCall;
import cascading.operation.aggregator.Count;
import cascading.operation.regex.RegexParser;
import cascading.operation.regex.RegexSplitter;
import cascading.pipe.CoGroup;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.cogroup.CoGroupClosure;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.TupleEntryIterator;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;

public class SerializedPipesTest extends ClusterTestCase
  {
  String inputFileApache = "build/test/data/apache.10.txt";
  String inputFileUpper = "build/test/data/upper.txt";
  String inputFileLower = "build/test/data/lower.txt";

  String outputPath = "build/test/output/tuple/";

  public static class InsertBytes extends BaseOperation implements Function
    {
    String asBytes;

    public InsertBytes( Fields fieldDeclaration, String asBytes )
      {
      super( fieldDeclaration );
      this.asBytes = asBytes;
      }

    public void operate( FlowProcess flowProcess, FunctionCall functionCall )
      {
      functionCall.getOutputCollector().add( new Tuple( new BytesWritable( asBytes.getBytes() ) ) );
      }
    }

  public static class ReplaceAsBytes extends BaseOperation implements Function
    {
    public ReplaceAsBytes( Fields fieldDeclaration )
      {
      super( fieldDeclaration );
      }

    public void operate( FlowProcess flowProcess, FunctionCall functionCall )
      {
      functionCall.getOutputCollector().add( new Tuple( new BytesWritable( functionCall.getArguments().getString( 0 ).getBytes() ) ) );
      }
    }
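  /**
   * Emits the given string as a raw byte[] value rather than a Writable. A plain
   * byte[] has no Hadoop serializer registered by default, so the raw-bytes tests
   * below register a byte[]-aware serialization (BytesSerialization, from this
   * package) for the value to survive the tuple stream.
   */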
  public static class InsertRawBytes extends BaseOperation<Long> implements Function<Long>
    {
    String asBytes;
    private boolean increment;

    public InsertRawBytes( Fields fieldDeclaration, String asBytes, boolean increment )
      {
      super( fieldDeclaration );
      this.asBytes = asBytes;
      this.increment = increment;
      }

    @Override
    public void prepare( FlowProcess flowProcess, OperationCall<Long> operationCall )
      {
      operationCall.setContext( increment ? 0L : -1L );
      }

    public void operate( FlowProcess flowProcess, FunctionCall<Long> functionCall )
      {
      String string = asBytes;

      if( functionCall.getContext() != -1 )
        {
        string = functionCall.getContext() + string;
        functionCall.setContext( functionCall.getContext() + 1 );
        }

      functionCall.getOutputCollector().add( new Tuple( (Object) string.getBytes() ) );
      }
    }

  public static class InsertBoolean extends BaseOperation implements Function
    {
    boolean asBoolean;

    public InsertBoolean( Fields fieldDeclaration, boolean asBoolean )
      {
      super( fieldDeclaration );
      this.asBoolean = asBoolean;
      }

    public void operate( FlowProcess flowProcess, FunctionCall functionCall )
      {
      functionCall.getOutputCollector().add( new Tuple( new BooleanWritable( asBoolean ) ) );
      }
    }

  public static class Container implements Serializable, Comparable<String>
    {
    String value;

    public Container( String value )
      {
      this.value = value;
      }

    @Override
    public int compareTo( String o )
      {
      return value.compareTo( o );
      }
    }

  public static class InsertTestText extends BaseOperation<Long> implements Function<Long>
    {
    private String testText;
    private boolean increment;
    private int moduloValueIsNull;
    private int moduloResultIsNull;

    public InsertTestText( Fields fieldDeclaration, String testText, boolean increment )
      {
      this( fieldDeclaration, testText, increment, -1, -1 );
      }

    public InsertTestText( Fields fieldDeclaration, String testText, boolean increment, int moduloValueIsNull, int moduloResultIsNull )
      {
      super( fieldDeclaration );
      this.testText = testText;
      this.increment = increment;
      this.moduloValueIsNull = moduloValueIsNull;
      this.moduloResultIsNull = moduloResultIsNull;
      }

    @Override
    public void prepare( FlowProcess flowProcess, OperationCall<Long> operationCall )
      {
      operationCall.setContext( increment ? 0L : -1L );
      }

    public void operate( FlowProcess flowProcess, FunctionCall<Long> functionCall )
      {
      String string = testText;

      if( functionCall.getContext() != -1 )
        {
        string = functionCall.getContext() + string;
        functionCall.setContext( functionCall.getContext() + 1 );

        if( moduloValueIsNull != -1 && functionCall.getContext() % moduloValueIsNull == 0 )
          string = null;
        }

      TestText result = null;

      if( moduloResultIsNull != -1 && functionCall.getContext() % moduloResultIsNull != 0 )
        result = new TestText( string );

      functionCall.getOutputCollector().add( new Tuple( result ) );
      }
    }

  public SerializedPipesTest()
    {
    super( "serialized pipes", true ); // leave cluster testing enabled
    }
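  /**
   * Groups Apache log lines by IP while passing BytesWritable and BooleanWritable
   * values through the flow. The addSerializationToken() call below maps
   * BooleanWritable to integer token 1000 so the tuple stream can write a compact
   * token in place of the full class name; 1000 is an arbitrary application-chosen id.
   */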
  public void testSimpleGroup() throws Exception
    {
    if( !new File( inputFileApache ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileApache );

    Tap source = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileApache );

    Pipe pipe = new Pipe( "test" );

    pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );

    pipe = new Each( pipe, new InsertBytes( new Fields( "bytes" ), "inserted text as bytes" ), Fields.ALL );

    pipe = new GroupBy( pipe, new Fields( "ip" ) );

    pipe = new Every( pipe, new Count(), new Fields( "ip", "count" ) );

    pipe = new Each( pipe, new InsertBoolean( new Fields( "boolean" ), false ), Fields.ALL );

    Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "/hadoop/serialization", true );

    Map<Object, Object> jobProperties = getProperties();

    TupleSerialization.addSerializationToken( jobProperties, 1000, BooleanWritable.class.getName() );

    Flow flow = new FlowConnector( jobProperties ).connect( source, sink, pipe );

//    flow.writeDOT( "groupcount.dot" );

    flow.complete();

    validateLength( flow.openSource(), 10 ); // validate source, this once, as a sanity check
    validateLength( flow, 8, null );
    }

  public void testCoGroupWritableAsKeyValue() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "/hadoop/writablekeyvalue", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new InsertBytes( new Fields( "group" ), "inserted text as bytes" ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertBytes( new Fields( "value" ), "inserted text as bytes" ), Fields.ALL );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new InsertBytes( new Fields( "group" ), "inserted text as bytes" ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertBytes( new Fields( "value" ), "inserted text as bytes" ), Fields.ALL );

    Pipe splice = new CoGroup( pipeLower, new Fields( "group" ), pipeUpper, new Fields( "group" ), Fields.size( 8 ) );

    Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

//    countFlow.writeDOT( "cogroup.dot" );
//    System.out.println( "countFlow =\n" + countFlow );

    countFlow.complete();

    validateLength( countFlow, 25, null );
    }

  public void testCoGroupBytesWritableAsKeyValue() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Tap sink = new Hfs( new TextLine(), outputPath + "/hadoop/byteswritablekeyvalue", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new Fields( "char" ), new ReplaceAsBytes( new Fields( "char" ) ), Fields.REPLACE );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new Fields( "char" ), new ReplaceAsBytes( new Fields( "char" ) ), Fields.REPLACE );

    Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), Fields.size( 4 ) );

    Flow countFlow = new FlowConnector( getProperties() ).connect( sources, sink, splice );

//    countFlow.writeDOT( "cogroup.dot" );
//    System.out.println( "countFlow =\n" + countFlow );

    countFlow.complete();

    validateLength( countFlow, 5, null );

    TupleEntryIterator iterator = countFlow.openSink();

    assertEquals( "not equal: tuple.get(1)", "1\t61\t1\t41", iterator.next().get( 1 ) );

    iterator.close();
    }
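  /**
   * Sets CoGroupClosure.SPILL_THRESHOLD to 1 so the co-group's grouping collection
   * spills to disk on nearly every tuple, forcing the custom TestText values to be
   * round-tripped through the TestSerialization registered on io.serializations.
   */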
  public void testCoGroupSpillCustomWritable() throws Exception
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Tap sink = new Hfs( new SequenceFile( Fields.ALL ), outputPath + "/hadoop/customerwritable", true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "group" ), "inserted text as bytes", false ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "value" ), "inserted text as bytes", false ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "text" ), "inserted text as custom text", false ), Fields.ALL );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "group" ), "inserted text as bytes", false ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "value" ), "inserted text as bytes", false ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "text" ), "inserted text as custom text", false ), Fields.ALL );

    Pipe splice = new CoGroup( pipeLower, new Fields( "group" ), pipeUpper, new Fields( "group" ), Fields.size( 10 ) );

    Map<Object, Object> properties = getProperties();

    properties.put( CoGroupClosure.SPILL_THRESHOLD, 1 );

//    String serializations = MultiMapReducePlanner.getJobConf( properties ).get( "io.serializations" );
//    serializations = Util.join( ",", serializations, JavaSerialization.class.getName() );
//    System.out.println( "serializations = " + serializations );
//    MultiMapReducePlanner.getJobConf( properties ).set( "io.serializations", serializations );

    MultiMapReducePlanner.getJobConf( properties ).set( "io.serializations", TestSerialization.class.getName() );

    Flow countFlow = new FlowConnector( properties ).connect( sources, sink, splice );

//    countFlow.writeDOT( "cogroup.dot" );
//    System.out.println( "countFlow =\n" + countFlow );

    countFlow.complete();

    validateLength( countFlow, 25, null );
    }

  public void testCoGroupRawAsKeyValue() throws Exception
    {
    invokeRawAsKeyValue( false, true, false, false );
    }

  public void testCoGroupRawAsKeyValueDefault() throws Exception
    {
    invokeRawAsKeyValue( true, true, false, false );
    }

  public void testCoGroupRawAsKeyValueDefaultIgnoreToken() throws Exception
    {
    invokeRawAsKeyValue( true, true, true, false );
    }

  public void testCoGroupRawAsKeyValueDefaultIgnoreTokenCompositeGrouping() throws Exception
    {
    invokeRawAsKeyValue( true, true, true, true );
    }

  public void testCoGroupRawAsKeyValueNoSecondary() throws Exception
    {
    invokeRawAsKeyValue( false, false, false, false );
    }

  public void testCoGroupRawAsKeyValueDefaultNoSecondary() throws Exception
    {
    invokeRawAsKeyValue( true, false, false, false );
    }

  public void testCoGroupRawAsKeyValueDefaultNoSecondaryCompositeGrouping() throws Exception
    {
    invokeRawAsKeyValue( true, false, false, true );
    }
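  /**
   * Shared body for the testCoGroupRawAsKeyValue* variants above.
   *
   * @param useDefaultComparator     when false, install TestTextComparator / BytesComparator on the grouping and sort fields
   * @param secondarySortOnValue     when true, follow the CoGroup with a GroupBy that secondary-sorts on the raw "value" field
   * @param ignoreSerializationToken when true, register the NoToken* serializations so no serialization token is used
   * @param compositeGrouping        when true, group on the composite ( "group", "num" ) key
   */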
  private void invokeRawAsKeyValue( boolean useDefaultComparator, boolean secondarySortOnValue, boolean ignoreSerializationToken, boolean compositeGrouping ) throws IOException
    {
    if( !new File( inputFileLower ).exists() )
      fail( "data file not found" );

    copyFromLocal( inputFileLower );
    copyFromLocal( inputFileUpper );

    Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileLower );
    Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), inputFileUpper );

    Map sources = new HashMap();

    sources.put( "lower", sourceLower );
    sources.put( "upper", sourceUpper );

    Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written

    Fields fields = new Fields( "num", "char", "group", "value", "num2", "char2", "group2", "value2" );
    Tap sink = new Hfs( new SequenceFile( fields ), outputPath + "/hadoop/rawbyteskeyvalue/" + useDefaultComparator + "/" + secondarySortOnValue + "/" + ignoreSerializationToken + "/" + compositeGrouping, true );

    Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter );
    pipeLower = new Each( pipeLower, new InsertTestText( new Fields( "group" ), "inserted text as bytes", true, 3, 4 ), Fields.ALL );
    pipeLower = new Each( pipeLower, new InsertRawBytes( new Fields( "value" ), "inserted text as bytes", true ), Fields.ALL );

    Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter );
    pipeUpper = new Each( pipeUpper, new InsertTestText( new Fields( "group" ), "inserted text as bytes", true, 3, 4 ), Fields.ALL );
    pipeUpper = new Each( pipeUpper, new InsertRawBytes( new Fields( "value" ), "inserted text as bytes", true ), Fields.ALL );

    Fields groupFields = new Fields( "group" );

    if( compositeGrouping )
      groupFields = new Fields( "group", "num" );

    if( !useDefaultComparator )
      groupFields.setComparator( "group", new TestTextComparator() );

    Fields declaredFields = new Fields( "num", "char", "group", "value", "num2", "char2", "group2", "value2" );

    Pipe splice = new CoGroup( pipeLower, groupFields, pipeUpper, groupFields, declaredFields );

    // test sorting comparison
    Fields valueFields = new Fields( "value" );

    if( !useDefaultComparator )
      valueFields.setComparator( "value", new BytesComparator() );

    if( secondarySortOnValue )
      splice = new GroupBy( splice, groupFields, valueFields );
    else
      splice = new GroupBy( splice, groupFields );

    Map<Object, Object> properties = getProperties();

    if( !ignoreSerializationToken )
      {
      TupleSerialization.addSerialization( properties, TestSerialization.class.getName() );
      TupleSerialization.addSerialization( properties, BytesSerialization.class.getName() );
      }
    else
      {
      TupleSerialization.addSerialization( properties, NoTokenTestSerialization.class.getName() );
      TupleSerialization.addSerialization( properties, NoTokenTestBytesSerialization.class.getName() );
      }

    MultiMapReducePlanner.getJobConf( properties ).setNumMapTasks( 1 );

    Flow flow = new FlowConnector( properties ).connect( sources, sink, splice );

    flow.complete();

    validateLength( flow, 5, null );

    // test the ordering
    TupleEntryIterator iterator = flow.openSink();
    TestText target = (TestText) iterator.next().getObject( "group" );
    String value = target == null ? null : target.value;
//    System.out.println( "value = " + value );

    while( iterator.hasNext() )
      {
      TestText nextTarget = (TestText) iterator.next().getObject( "group" );
      String next = nextTarget == null ? null : nextTarget.value;

      if( value != null && value.compareTo( next ) >= 0 )
        fail( "not increasing: " + value + " " + next );

      value = next;
//      System.out.println( "value = " + value );
      }

    iterator.close();
    }
  }