/*******************************************************************************
 *
 * Pentaho Big Data
 *
 * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.hadoop.mapreduce;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.RowProducer;
import org.pentaho.di.trans.SingleThreadedTransExecutor;
import org.pentaho.di.trans.step.BaseStepMeta;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.hadoop.mapreduce.converter.TypeConverterFactory;
import org.pentaho.hadoop.mapreduce.converter.spi.ITypeConverter;

/**
 * A reducer that runs a PDI (Kettle) transformation over the reduce input: keys and values are
 * injected into the transformation's injector step, and the rows produced by its output step are
 * emitted as the reduce output.
 */
@SuppressWarnings( "deprecation" )
public class GenericTransReduce<K extends WritableComparable<?>, V extends Iterator<Writable>, K2, V2>
  extends PentahoMapReduceBase<K2, V2> implements Reducer<K, V, K2, V2> {

  private static Logger logger = Logger.getLogger( GenericTransReduce.class );

  protected RowProducer rowProducer;
  protected Object value;
  protected InKeyValueOrdinals inOrdinals = null;
  protected TypeConverterFactory typeConverterFactory;
  protected ITypeConverter inConverterK = null;
  protected ITypeConverter inConverterV = null;
  protected RowMetaInterface injectorRowMeta;
  protected SingleThreadedTransExecutor executor;

  public GenericTransReduce() throws KettleException {
    super();
    this.setMRType( MROperations.Reduce );
    typeConverterFactory = new TypeConverterFactory();
  }

  public boolean isSingleThreaded() {
    return reduceSingleThreaded;
  }

  public String getInputStepName() {
    return reduceInputStepName;
  }

  public String getOutputStepName() {
    return reduceOutputStepName;
  }

  public void reduce( final K key, final Iterator<V> values, final OutputCollector<K2, V2> output,
      final Reporter reporter ) throws IOException {
    try {
      if ( debug ) {
        reporter.setStatus( "Begin processing record" );
      }

      // Just to make sure the configuration is not broken...
      if ( trans == null ) {
        throw new RuntimeException( "Error initializing transformation. See error log." ); //$NON-NLS-1$
      }

      // The transformation needs to be prepared and started...
      // Only ever initialize once!
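      // The isRunning() guard below ensures the transformation is prepared and wired up only on
      // the first reduce() call; subsequent calls for other keys reuse the running pipeline.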
      if ( !trans.isRunning() ) {
        shareVariableSpaceWithTrans( reporter );
        setTransLogLevel( reporter );
        prepareExecution( reporter );
        addInjectorAndProducerToTrans( key, values, output, reporter, getInputStepName(), getOutputStepName() );

        // If we're using the single-threaded engine we're going to keep pushing rows into our construct.
        // If not, we're going to re-create the Trans engine every time.
        if ( isSingleThreaded() ) {
          executor = new SingleThreadedTransExecutor( trans );

          // This validates whether or not each step is capable of running in single-threaded mode.
          boolean ok = executor.init();
          if ( !ok ) {
            throw new KettleException(
              "Unable to initialize the single threaded transformation, check the log for details." );
          }
          // The transformation is considered in a "running" state now.
        }
      }

      // The following two statements are the only things left to do for one set of data coming from Hadoop...

      // Inject the values, including the one we probed...
      injectValues( key, values, output, reporter );

      if ( isSingleThreaded() ) {
        // Signal to the executor that we have enough data in the pipeline to do one iteration.
        // All steps are executed once in a loop, in sequence, one after the other.
        executor.oneIteration();
      }
    } catch ( Exception e ) {
      // printException() sets the debug status and rethrows the exception wrapped in an IOException.
      printException( reporter, e );
    }
  }

  private void printException( Reporter reporter, Exception e ) throws IOException {
    e.printStackTrace( System.err );
    setDebugStatus( reporter, "An exception was raised" );
    throw new IOException( e );
  }

  private void disposeTransformation() {
    try {
      trans.stopAll();
    } catch ( Exception ex ) {
      ex.printStackTrace();
    }
    try {
      trans.cleanup();
    } catch ( Exception ex ) {
      ex.printStackTrace();
    }
  }

  private void injectValues( final K key, final Iterator<V> values, final OutputCollector<K2, V2> output,
      final Reporter reporter ) throws Exception {
    if ( rowProducer != null ) {
      // Execute row injection: we loop through the values to do this.

      // First inject the value that was probed earlier to determine its class...
      if ( value != null ) {
        if ( inOrdinals != null ) {
          injectValue( key, inOrdinals.getKeyOrdinal(), inConverterK, value, inOrdinals.getValueOrdinal(),
            inConverterV, injectorRowMeta, rowProducer, reporter );
        } else {
          injectValue( key, inConverterK, value, inConverterV, injectorRowMeta, rowProducer, reporter );
        }
      }

      // ... then inject the remaining values from the iterator.
      while ( values.hasNext() ) {
        value = values.next();
        if ( inOrdinals != null ) {
          injectValue( key, inOrdinals.getKeyOrdinal(), inConverterK, value, inOrdinals.getValueOrdinal(),
            inConverterV, injectorRowMeta, rowProducer, reporter );
        } else {
          injectValue( key, inConverterK, value, inConverterV, injectorRowMeta, rowProducer, reporter );
        }
      }

      // Make sure we don't pick up a bogus row next time this method is called without rows.
      value = null;
    }
  }

  private void prepareExecution( Reporter reporter ) throws KettleException {
    setDebugStatus( reporter, "Preparing transformation for execution" );
    trans.prepareExecution( null );
  }

  /**
   * Set the trans log level if ours is set.
   *
   * @param reporter
   */
  private void setTransLogLevel( Reporter reporter ) {
    if ( logLevel != null ) {
      setDebugStatus( reporter, "Setting the trans.logLevel to " + logLevel.toString() );
      trans.setLogLevel( logLevel );
    } else {
      setDebugStatus( reporter, getClass().getName() + ".logLevel is null. The trans log level will not be set." );
    }
  }

  /**
   * Share the variables from the PDI job.
   * We do this here instead of in createTrans() as MRUtil.recreateTrans() will not
   * copy "execution" trans information.
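   * Sharing the space makes variables set on the PDI job visible to the steps of this
   * reducer transformation.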
   */
  private void shareVariableSpaceWithTrans( Reporter reporter ) {
    if ( variableSpace != null ) {
      setDebugStatus( reporter, "Sharing the VariableSpace from the PDI job." );
      trans.shareVariablesWith( variableSpace );

      if ( debug ) {
        // List the variables (Arrays.asList never returns null, so no null check is needed here)
        List<String> variables = Arrays.asList( trans.listVariables() );
        Collections.sort( variables );
        setDebugStatus( reporter, "Variables: " );
        for ( String variable : variables ) {
          setDebugStatus( reporter, "  " + variable + " = " + trans.getVariable( variable ) );
        }
      }
    } else {
      setDebugStatus( reporter, "variableSpace is null. We are not going to share it with the trans." );
    }
  }

  private void addInjectorAndProducerToTrans( K key, Iterator<V> values, OutputCollector<K2, V2> output,
      Reporter reporter, String inputStepName, String outputStepName ) throws Exception {
    setDebugStatus( reporter, "Locating output step: " + outputStepName );
    StepInterface outputStep = trans.findRunThread( outputStepName );

    if ( outputStep != null ) {
      rowCollector = new OutputCollectorRowListener( output, outClassK, outClassV, reporter, debug );
      outputStep.addRowListener( rowCollector );

      injectorRowMeta = new RowMeta();
      setDebugStatus( reporter, "Locating input step: " + inputStepName );
      if ( inputStepName != null ) {
        // Set up row injection
        rowProducer = trans.addRowProducer( inputStepName, 0 );
        StepInterface inputStep = rowProducer.getStepInterface();
        StepMetaInterface inputStepMeta = inputStep.getStepMeta().getStepMetaInterface();

        inOrdinals = null;
        if ( inputStepMeta instanceof BaseStepMeta ) {
          setDebugStatus( reporter, "Generating converters from RowMeta for injection into the transformation" );

          // Convert to BaseStepMeta and use getFields(...) to get the row meta and therefore the expected input types
          ( (BaseStepMeta) inputStepMeta ).getFields( injectorRowMeta, null, null, null, null );

          inOrdinals = new InKeyValueOrdinals( injectorRowMeta );
          if ( inOrdinals.getKeyOrdinal() < 0 || inOrdinals.getValueOrdinal() < 0 ) {
            throw new KettleException( "key or value is not defined in transformation injector step" );
          }

          // Get a converter for the key if the value meta has a concrete Java class we can use.
          // If no converter can be found here we won't do any type conversion.
          if ( injectorRowMeta.getValueMeta( inOrdinals.getKeyOrdinal() ) != null ) {
            inConverterK = typeConverterFactory.getConverter( key.getClass(),
              injectorRowMeta.getValueMeta( inOrdinals.getKeyOrdinal() ) );
          }

          // We need to peek into the first value to get its class
          // (the combination of Iterator and generics makes this a pain).
          if ( values.hasNext() ) {
            value = values.next();
          }

          if ( value != null ) {
            // Get a converter for the value if the value meta has a concrete Java class we can use.
            // If no converter can be found here we won't do any type conversion.
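            // Note: the probed value is not lost. injectValues() injects it first, before
            // draining the rest of the iterator.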
            if ( injectorRowMeta.getValueMeta( inOrdinals.getValueOrdinal() ) != null ) {
              inConverterV = typeConverterFactory.getConverter( value.getClass(),
                injectorRowMeta.getValueMeta( inOrdinals.getValueOrdinal() ) );
            }
          }
        }

        trans.startThreads();
      } else {
        setDebugStatus( reporter, "No input stepname was defined" );
      }

      if ( getException() != null ) {
        setDebugStatus( reporter, "An exception was generated by the transformation" );
        // Bubble the exception from within Kettle up to Hadoop
        throw getException();
      }
    } else {
      if ( outputStepName != null ) {
        setDebugStatus( reporter, "Output step [" + outputStepName + "] could not be found" );
        throw new KettleException( "Output step not defined in transformation" );
      } else {
        setDebugStatus( reporter, "Output step name not specified" );
      }
    }
  }

  @Override
  public void close() throws IOException {
    // Signal the injector step that no more rows will be produced.
    if ( rowProducer != null ) {
      rowProducer.finished();
    }

    // Stop the executor if any is defined...
    if ( isSingleThreaded() && executor != null ) {
      try {
        executor.dispose();
      } catch ( KettleException e ) {
        e.printStackTrace( System.err );
        trans.getLogChannel().logError( "Error disposing of single threading transformation: ", e );
      }
    } else if ( !isSingleThreaded() && trans != null ) {
      if ( rowProducer != null ) {
        trans.waitUntilFinished();
      }
      disposeTransformation();
    }

    super.close();
  }
}
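
/*
 * Illustrative wiring sketch (an assumption for illustration, not part of this class): the PDI
 * job entry registers a reducer like this on an org.apache.hadoop.mapred.JobConf. The key and
 * value classes below are placeholders for whatever the transformation's output step emits.
 *
 *   JobConf conf = new JobConf();
 *   conf.setReducerClass( GenericTransReduce.class );
 *   conf.setOutputKeyClass( Text.class );
 *   conf.setOutputValueClass( IntWritable.class );
 */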