/*******************************************************************************
*
* Pentaho Big Data
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.hadoop.mapreduce;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.log4j.Logger;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.RowProducer;
import org.pentaho.di.trans.SingleThreadedTransExecutor;
import org.pentaho.di.trans.step.BaseStepMeta;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.hadoop.mapreduce.converter.TypeConverterFactory;
import org.pentaho.hadoop.mapreduce.converter.spi.ITypeConverter;
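/*
 * A minimal sketch of how this reducer is typically wired into a Hadoop job. The configuration property names
 * below are assumptions about what PentahoMapReduceBase reads from the JobConf, and "transXml" is a hypothetical
 * variable holding the serialized reduce transformation:
 *
 *   JobConf conf = new JobConf();
 *   conf.setReducerClass( GenericTransReduce.class );
 *   conf.set( "transformation-reduce-xml", transXml );              // serialized PDI transformation
 *   conf.set( "transformation-reduce-input-stepname", "Injector" ); // step that receives key/value rows
 *   conf.set( "transformation-reduce-output-stepname", "Output" );  // step whose rows become the reduce output
 */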
/**
 * A reducer that runs a PDI (Kettle) transformation for each group of input values: the key and values are
 * injected as rows into the transformation's input step, and rows arriving at its output step are emitted as
 * the reduce output.
 */
@SuppressWarnings( "deprecation" )
public class GenericTransReduce<K extends WritableComparable<?>, V extends Iterator<Writable>, K2, V2>
  extends PentahoMapReduceBase<K2, V2> implements Reducer<K, V, K2, V2> {
  private static final Logger logger = Logger.getLogger( GenericTransReduce.class );
  protected RowProducer rowProducer; // injects Hadoop key/value pairs as rows into the transformation
  protected Object value; // the value peeked from the iterator while the converters are resolved
  protected InKeyValueOrdinals inOrdinals = null; // positions of the key and value fields in the injector row
  protected TypeConverterFactory typeConverterFactory;
  protected ITypeConverter inConverterK = null; // converts the Hadoop key to the injector step's key type
  protected ITypeConverter inConverterV = null; // converts the Hadoop value to the injector step's value type
  protected RowMetaInterface injectorRowMeta; // the row layout expected by the injector (input) step
  protected SingleThreadedTransExecutor executor; // drives the transformation when running single threaded
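  /**
   * Creates the reducer and flags the base class to pick up the reduce-side configuration.
   *
   * @throws KettleException if the base class fails to initialize
   */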
public GenericTransReduce() throws KettleException {
super();
this.setMRType( MROperations.Reduce );
typeConverterFactory = new TypeConverterFactory();
}
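  /** @return true if the reduce side runs on the single threaded transformation engine */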
public boolean isSingleThreaded() {
return reduceSingleThreaded;
}
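  /** @return the name of the injector (input) step of the reduce transformation */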
public String getInputStepName() {
return reduceInputStepName;
}
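  /** @return the name of the output step of the reduce transformation */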
public String getOutputStepName() {
return reduceOutputStepName;
}
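  /**
   * Processes one key and its values: on the first call the transformation is prepared, wired to Hadoop and
   * started; every call then injects the key/value pairs as rows and, in single threaded mode, drives one
   * iteration of all steps.
   */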
  public void reduce( final K key, final Iterator<V> values, final OutputCollector<K2, V2> output,
      final Reporter reporter ) throws IOException {
try {
if ( debug ) {
reporter.setStatus( "Begin processing record" );
}
// Just to make sure the configuration is not broken...
if ( trans == null ) {
throw new RuntimeException( "Error initializing transformation. See error log." ); //$NON-NLS-1$
}
// The transformation needs to be prepared and started...
// Only ever initialize once!
if ( !trans.isRunning() ) {
shareVariableSpaceWithTrans( reporter );
setTransLogLevel( reporter );
prepareExecution( reporter );
addInjectorAndProducerToTrans( key, values, output, reporter, getInputStepName(), getOutputStepName() );
        // When using the single threaded engine we keep pushing rows into the same transformation construct.
        // Otherwise the Trans engine is re-created every time.
if ( isSingleThreaded() ) {
executor = new SingleThreadedTransExecutor( trans );
// This validates whether or not a step is capable of running in Single Threaded mode.
boolean ok = executor.init();
if ( !ok ) {
throw new KettleException( "Unable to initialize the single threaded transformation, check the log for details." );
}
// The transformation is considered in a "running" state now.
}
}
// The following 2 statements are the only things left to do for one set of data coming from Hadoop...
// Inject the values, including the one we probed...
injectValues( key, values, output, reporter );
if ( isSingleThreaded() ) {
// Signal to the executor that we have enough data in the pipeline to do one iteration.
// All steps are executed in a loop once in sequence, one after the other.
executor.oneIteration();
}
} catch ( Exception e ) {
      printException( reporter, e );
      throw new IOException( e );
}
}
  private void printException( Reporter reporter, Exception e ) {
    e.printStackTrace( System.err );
    setDebugStatus( reporter, "An exception was raised" );
  }
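  /**
   * Stops and cleans up the transformation, printing stack traces for, but otherwise ignoring, any errors
   * raised along the way.
   */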
private void disposeTransformation() {
try {
trans.stopAll();
} catch ( Exception ex ) {
ex.printStackTrace();
}
try {
trans.cleanup();
} catch ( Exception ex ) {
ex.printStackTrace();
}
}
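  /**
   * Injects the key and all values as rows into the transformation, including the value that was peeked at
   * while the type converters were being resolved.
   */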
  private void injectValues( final K key, final Iterator<V> values, final OutputCollector<K2, V2> output,
      final Reporter reporter ) throws Exception {
if ( rowProducer != null ) {
      // Execute row injection: inject the peeked value first, then loop through the remaining values.
      if ( value != null ) {
        if ( inOrdinals != null ) {
          injectValue( key, inOrdinals.getKeyOrdinal(), inConverterK, value, inOrdinals.getValueOrdinal(),
            inConverterV, injectorRowMeta, rowProducer, reporter );
        } else {
          injectValue( key, inConverterK, value, inConverterV, injectorRowMeta, rowProducer, reporter );
        }
      }
      while ( values.hasNext() ) {
        value = values.next();
        if ( inOrdinals != null ) {
          injectValue( key, inOrdinals.getKeyOrdinal(), inConverterK, value, inOrdinals.getValueOrdinal(),
            inConverterV, injectorRowMeta, rowProducer, reporter );
        } else {
          injectValue( key, inConverterK, value, inConverterV, injectorRowMeta, rowProducer, reporter );
        }
      }
      // Clear the peeked value so a stale row is not injected the next time this method is called.
      value = null;
}
}
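  /**
   * Prepares the transformation for execution (allocating steps and row sets) without starting its threads.
   */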
private void prepareExecution( Reporter reporter ) throws KettleException {
setDebugStatus( reporter, "Preparing transformation for execution" );
trans.prepareExecution( null );
}
  /**
   * Sets the transformation's log level if one was configured for this task.
   *
   * @param reporter the Hadoop reporter used for debug status updates
   */
private void setTransLogLevel( Reporter reporter ) {
if ( logLevel != null ) {
setDebugStatus( reporter, "Setting the trans.logLevel to " + logLevel.toString() );
trans.setLogLevel( logLevel );
} else {
setDebugStatus( reporter, getClass().getName() + ".logLevel is null. The trans log level will not be set." );
}
}
  /**
   * Shares the variables from the PDI job with the transformation. We do this here instead of in createTrans()
   * because MRUtil.recreateTrans() will not copy the "execution" trans information.
   */
private void shareVariableSpaceWithTrans( Reporter reporter ) {
if ( variableSpace != null ) {
setDebugStatus( reporter, "Sharing the VariableSpace from the PDI job." );
trans.shareVariablesWith( variableSpace );
if ( debug ) {
        // List the variables, sorted by name
        List<String> variables = Arrays.asList( trans.listVariables() );
        Collections.sort( variables );
        setDebugStatus( reporter, "Variables: " );
        for ( String variable : variables ) {
          setDebugStatus( reporter, "  " + variable + " = " + trans.getVariable( variable ) );
        }
}
} else {
setDebugStatus( reporter, "variableSpace is null. We are not going to share it with the trans." );
}
}
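  /**
   * Wires the transformation to Hadoop: attaches an OutputCollectorRowListener to the output step, sets up a
   * RowProducer on the input step, resolves the key/value field ordinals and type converters from the input
   * step's row meta, and starts the transformation threads.
   */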
  private void addInjectorAndProducerToTrans( K key, Iterator<V> values, OutputCollector<K2, V2> output,
      Reporter reporter, String inputStepName, String outputStepName ) throws Exception {
setDebugStatus( reporter, "Locating output step: " + outputStepName );
StepInterface outputStep = trans.findRunThread( outputStepName );
if ( outputStep != null ) {
rowCollector = new OutputCollectorRowListener( output, outClassK, outClassV, reporter, debug );
outputStep.addRowListener( rowCollector );
injectorRowMeta = new RowMeta();
setDebugStatus( reporter, "Locating input step: " + inputStepName );
if ( inputStepName != null ) {
// Setup row injection
rowProducer = trans.addRowProducer( inputStepName, 0 );
StepInterface inputStep = rowProducer.getStepInterface();
StepMetaInterface inputStepMeta = inputStep.getStepMeta().getStepMetaInterface();
inOrdinals = null;
if ( inputStepMeta instanceof BaseStepMeta ) {
setDebugStatus( reporter, "Generating converters from RowMeta for injection into the transformation" );
// Convert to BaseStepMeta and use getFields(...) to get the row meta and therefore the expected input types
( (BaseStepMeta) inputStepMeta ).getFields( injectorRowMeta, null, null, null, null );
inOrdinals = new InKeyValueOrdinals( injectorRowMeta );
if ( inOrdinals.getKeyOrdinal() < 0 || inOrdinals.getValueOrdinal() < 0 ) {
throw new KettleException( "key or value is not defined in transformation injector step" );
}
            // Get a converter for the key if the value meta has a concrete Java class we can use.
            // If no converter can be found here we won't do any type conversion.
            if ( injectorRowMeta.getValueMeta( inOrdinals.getKeyOrdinal() ) != null ) {
              inConverterK =
                typeConverterFactory.getConverter( key.getClass(), injectorRowMeta.getValueMeta( inOrdinals.getKeyOrdinal() ) );
            }
            // We need to peek at the first value to get its class (the combination of Iterator and generics
            // makes this a pain); the peeked value is injected later by injectValues().
if ( values.hasNext() ) {
value = values.next();
}
            if ( value != null ) {
              // Get a converter for the value if the value meta has a concrete Java class we can use.
              // If no converter can be found here we won't do any type conversion.
              if ( injectorRowMeta.getValueMeta( inOrdinals.getValueOrdinal() ) != null ) {
                inConverterV =
                  typeConverterFactory.getConverter( value.getClass(), injectorRowMeta.getValueMeta( inOrdinals.getValueOrdinal() ) );
              }
}
}
trans.startThreads();
} else {
        setDebugStatus( reporter, "No input step name was defined" );
}
if ( getException() != null ) {
setDebugStatus( reporter, "An exception was generated by the transformation" );
// Bubble the exception from within Kettle to Hadoop
throw getException();
}
} else {
if ( outputStepName != null ) {
setDebugStatus( reporter, "Output step [" + outputStepName + "] could not be found" );
throw new KettleException( "Output step not defined in transformation" );
} else {
setDebugStatus( reporter, "Output step name not specified" );
}
}
}
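  /**
   * Signals the end of input rows and shuts the transformation down: disposes the single threaded executor when
   * one is in use, otherwise waits for the normal engine to finish before cleaning up.
   */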
@Override
public void close() throws IOException {
if ( rowProducer != null ) {
rowProducer.finished();
}
// Stop the executor if any is defined...
if ( isSingleThreaded() && executor != null ) {
try {
executor.dispose();
} catch ( KettleException e ) {
e.printStackTrace( System.err );
        trans.getLogChannel().logError( "Error disposing of the single threaded transformation: ", e );
}
} else if ( !isSingleThreaded() && trans != null ) {
if ( rowProducer != null ) {
trans.waitUntilFinished();
}
disposeTransformation();
}
super.close();
}
}