/*******************************************************************************
*
* Pentaho Big Data
*
* Copyright (C) 2002-2015 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.hadoop.mapreduce;
import com.thoughtworks.xstream.XStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.logging.KettleLogStore;
import org.pentaho.di.core.logging.KettleLoggingEvent;
import org.pentaho.di.core.logging.LogLevel;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.variables.VariableSpace;
import org.pentaho.di.core.variables.Variables;
import org.pentaho.di.trans.RowProducer;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.step.BaseStepMeta;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.steps.missing.MissingTrans;
import org.pentaho.hadoop.mapreduce.converter.TypeConverterFactory;
import org.pentaho.hadoop.mapreduce.converter.spi.ITypeConverter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
/**
 * Map runner that uses the normal Kettle execution engine to process all input data during a single run.<p> This
 * relies on the interfaces ({@link MapRunnable} and {@link JobConf}) that were un-deprecated in Hadoop 0.21.0.
 */
public class PentahoMapRunnable<K1, V1, K2, V2> implements MapRunnable<K1, V1, K2, V2> {
public static final String KETTLE_PMR_PLUGIN_TIMEOUT = "KETTLE_PMR_PLUGIN_TIMEOUT";
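  // How long (in milliseconds) configure() waits for missing step plugins to become available before
  // failing. Defaults to five minutes; it can be overridden through the PDI job's variable space,
  // e.g. (hypothetical driver-side call): variableSpace.setVariable( "KETTLE_PMR_PLUGIN_TIMEOUT", "600000" );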
private long pluginWaitTimeout;
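  // Hadoop job counters for this mapper; presumably incremented by the OutputCollectorRowListener
  // as records flow through the transformation.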
  protected enum Counter {
INPUT_RECORDS, OUTPUT_RECORDS, OUT_RECORD_WITH_NULL_KEY, OUT_RECORD_WITH_NULL_VALUE
}
protected String transMapXml;
protected String transReduceXml;
protected String mapInputStepName;
protected String reduceInputStepName;
protected String mapOutputStepName;
protected String reduceOutputStepName;
protected Class<K2> outClassK;
protected Class<V2> outClassV;
protected String id = UUID.randomUUID().toString();
protected boolean debug = false;
// the transformation that will be used as a mapper or reducer
protected Trans trans;
protected VariableSpace variableSpace = null;
protected LogLevel logLevel;
protected OutputCollectorRowListener<K2, V2> rowCollector;
  private static final String ENVIRONMENT_VARIABLE_PREFIX = "java.system.";
  private static final String KETTLE_VARIABLE_PREFIX = "KETTLE_";
public PentahoMapRunnable() throws KettleException {
}
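  /*
   * Illustrative sketch (not part of this class): a driver is expected to populate the JobConf
   * keys that configure( JobConf ) reads below. The step names and local variables here are
   * hypothetical; only the key strings come from this class.
   *
   *   JobConf conf = new JobConf();
   *   conf.set( "debug", "true" );
   *   conf.set( "transformation-map-xml", mapTransXml );
   *   conf.set( "transformation-map-input-stepname", "Injector" );
   *   conf.set( "transformation-map-output-stepname", "Output" );
   *   conf.set( "logLevel", "BASIC" ); // must match a LogLevel enum constant name
   *   conf.set( "variableSpace", new XStream().toXML( variables ) );
   */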
public void configure( JobConf job ) {
pluginWaitTimeout = TimeUnit.MINUTES.toMillis( 5 );
debug = "true".equalsIgnoreCase( job.get( "debug" ) ); //$NON-NLS-1$
transMapXml = job.get( "transformation-map-xml" );
transReduceXml = job.get( "transformation-reduce-xml" );
mapInputStepName = job.get( "transformation-map-input-stepname" );
mapOutputStepName = job.get( "transformation-map-output-stepname" );
reduceInputStepName = job.get( "transformation-reduce-input-stepname" );
reduceOutputStepName = job.get( "transformation-reduce-output-stepname" );
String xmlVariableSpace = job.get( "variableSpace" );
outClassK = (Class<K2>) job.getMapOutputKeyClass();
outClassV = (Class<V2>) job.getMapOutputValueClass();
if ( !Const.isEmpty( xmlVariableSpace ) ) {
setDebugStatus( "PentahoMapRunnable(): variableSpace was retrieved from the job. The contents: " );
setDebugStatus( xmlVariableSpace );
// deserialize from xml to variable space
XStream xStream = new XStream();
setDebugStatus( "PentahoMapRunnable(): Setting classes variableSpace property.: " );
variableSpace = (VariableSpace) xStream.fromXML( xmlVariableSpace );
for ( String variableName : variableSpace.listVariables() ) {
if ( variableName.startsWith( KETTLE_VARIABLE_PREFIX ) ) {
System.setProperty( variableName, variableSpace.getVariable( variableName ) );
}
if ( KETTLE_PMR_PLUGIN_TIMEOUT.equals( variableName ) ) {
try {
pluginWaitTimeout = Long.parseLong( variableSpace.getVariable( variableName ) );
} catch ( Exception e ) {
System.out.println( "Unable to parse plugin wait timeout, defaulting to 5 minutes" );
}
}
}
} else {
setDebugStatus( "PentahoMapRunnable(): The PDI Job's variable space was not sent." );
variableSpace = new Variables();
}
// Check for environment variables in the userDefined variables
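    // For example (hypothetical entries): "java.system.user.timezone=UTC" becomes
    // System.setProperty( "user.timezone", "UTC" ), while "KETTLE_MAX_LOG_SIZE_IN_LINES=5000"
    // is passed through under its full name.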
Iterator<Entry<String, String>> iter = job.iterator();
while ( iter.hasNext() ) {
Entry<String, String> entry = iter.next();
if ( entry.getKey().startsWith( ENVIRONMENT_VARIABLE_PREFIX ) ) {
System.setProperty( entry.getKey().substring( ENVIRONMENT_VARIABLE_PREFIX.length() ), entry.getValue() );
} else if ( entry.getKey().startsWith( KETTLE_VARIABLE_PREFIX ) ) {
System.setProperty( entry.getKey(), entry.getValue() );
}
}
MRUtil.passInformationToTransformation( variableSpace, job );
setDebugStatus( "Job configuration" );
setDebugStatus( "Output key class: " + outClassK.getName() );
setDebugStatus( "Output value class: " + outClassV.getName() );
    // set the log level to the job's log level
String stringLogLevel = job.get( "logLevel" );
if ( !Const.isEmpty( stringLogLevel ) ) {
logLevel = LogLevel.valueOf( stringLogLevel );
setDebugStatus( "Log level set to " + stringLogLevel );
} else {
System.out.println( "Could not retrieve the log level from the job configuration. logLevel will not be set." );
}
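    // Repeatedly (re)create the transformation until no MissingTrans placeholder steps remain,
    // i.e. all required step plugins have registered, or until the plugin wait timeout elapses.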
long deadline = 0;
boolean first = true;
while ( true ) {
createTrans( job );
if ( first ) {
deadline = pluginWaitTimeout + System.currentTimeMillis();
System.out
.println( PentahoMapRunnable.class + ": Trans creation checking starting now " + new Date().toString() );
first = false;
}
List<MissingTrans> missingTranses = new ArrayList<MissingTrans>();
for ( StepMeta stepMeta : trans.getTransMeta().getSteps() ) {
StepMetaInterface stepMetaInterface = stepMeta.getStepMetaInterface();
if ( stepMetaInterface instanceof MissingTrans ) {
MissingTrans missingTrans = (MissingTrans) stepMetaInterface;
System.out.println(
MissingTrans.class + "{stepName: " + missingTrans.getStepName() + ", missingPluginId: " + missingTrans
.getMissingPluginId() + "}" );
missingTranses.add( missingTrans );
}
}
if ( missingTranses.size() == 0 ) {
System.out.println( PentahoMapRunnable.class + ": Done waiting on plugins now " + new Date().toString() );
break;
} else {
if ( System.currentTimeMillis() > deadline ) {
StringBuilder stringBuilder = new StringBuilder( "Failed to initialize plugins: " );
for ( MissingTrans missingTrans : missingTranses ) {
stringBuilder.append( missingTrans.getMissingPluginId() );
stringBuilder.append( " on step " ).append( missingTrans.getStepName() );
stringBuilder.append( ", " );
}
stringBuilder.setLength( stringBuilder.length() - 2 );
throw new RuntimeException( stringBuilder.toString() );
} else {
try {
            // clamp to zero in case the deadline passed between the check above and now
            Thread.sleep( Math.max( 0, Math.min( 100, deadline - System.currentTimeMillis() ) ) );
} catch ( InterruptedException e ) {
throw new RuntimeException( e );
}
}
}
}
}
public void injectValue( Object key, ITypeConverter inConverterK, Object value, ITypeConverter inConverterV,
RowMeta injectorRowMeta, RowProducer rowProducer, Reporter reporter ) throws Exception {
injectValue( key, 0, inConverterK, value, 1, inConverterV, injectorRowMeta, rowProducer, reporter );
}
public void injectValue( Object key, int keyOrdinal, ITypeConverter inConverterK, Object value, int valueOrdinal,
ITypeConverter inConverterV, RowMeta injectorRowMeta, RowProducer rowProducer,
Reporter reporter )
throws Exception {
Object[] row = new Object[ injectorRowMeta.size() ];
row[ keyOrdinal ] =
inConverterK != null ? inConverterK.convert( injectorRowMeta.getValueMeta( keyOrdinal ), key ) : key;
row[ valueOrdinal ] =
inConverterV != null ? inConverterV.convert( injectorRowMeta.getValueMeta( valueOrdinal ), value )
: value;
if ( debug ) {
setDebugStatus( reporter, "Injecting input record [" + row[ keyOrdinal ] + "] - [" + row[ valueOrdinal ] + "]" );
}
rowProducer.putRow( injectorRowMeta, row );
}
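  /*
   * Illustrative sketch (hypothetical values): for an injector row meta holding the key at
   * ordinal 0 and the value at ordinal 1, a record ( "k1", "v1" ) with no type converters is
   * injected as the two-element row { "k1", "v1" }:
   *
   *   injectValue( "k1", 0, null, "v1", 1, null, injectorRowMeta, rowProducer, reporter );
   */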
protected void createTrans( final Configuration conf ) {
try {
setDebugStatus( "Creating a transformation for a map." );
trans = MRUtil.getTrans( conf, transMapXml, false );
} catch ( KettleException ke ) {
throw new RuntimeException( "Error loading transformation", ke ); //$NON-NLS-1$
}
}
public String getTransMapXml() {
return transMapXml;
}
public void setTransMapXml( String transMapXml ) {
this.transMapXml = transMapXml;
}
public String getTransReduceXml() {
return transReduceXml;
}
public void setTransReduceXml( String transReduceXml ) {
this.transReduceXml = transReduceXml;
}
public String getMapInputStepName() {
return mapInputStepName;
}
public void setMapInputStepName( String mapInputStepName ) {
this.mapInputStepName = mapInputStepName;
}
public String getMapOutputStepName() {
return mapOutputStepName;
}
public void setMapOutputStepName( String mapOutputStepName ) {
this.mapOutputStepName = mapOutputStepName;
}
public String getReduceInputStepName() {
return reduceInputStepName;
}
public void setReduceInputStepName( String reduceInputStepName ) {
this.reduceInputStepName = reduceInputStepName;
}
public String getReduceOutputStepName() {
return reduceOutputStepName;
}
public void setReduceOutputStepName( String reduceOutputStepName ) {
this.reduceOutputStepName = reduceOutputStepName;
}
public Class<?> getOutClassK() {
return outClassK;
}
public void setOutClassK( Class<K2> outClassK ) {
this.outClassK = outClassK;
}
public Class<?> getOutClassV() {
return outClassV;
}
public void setOutClassV( Class<V2> outClassV ) {
this.outClassV = outClassV;
}
public Trans getTrans() {
return trans;
}
public void setTrans( Trans trans ) {
this.trans = trans;
}
public String getId() {
return id;
}
public void setId( String id ) {
this.id = id;
}
public Exception getException() {
return rowCollector != null ? rowCollector.getException() : null;
}
public void setDebugStatus( Reporter reporter, String message ) {
if ( debug ) {
System.out.println( message );
reporter.setStatus( message );
}
}
private void setDebugStatus( String message ) {
if ( debug ) {
System.out.println( message );
}
}
public void run( RecordReader<K1, V1> input, final OutputCollector<K2, V2> output, final Reporter reporter )
throws IOException {
try {
if ( trans == null ) {
throw new RuntimeException( "Error initializing transformation. See error log." ); //$NON-NLS-1$
} else {
// Clean up old logging
KettleLogStore.discardLines( trans.getLogChannelId(), true );
}
// Create a copy of trans so we don't continue to add new TransListeners and run into a
// ConcurrentModificationException
// when this mapper is reused "quickly"
trans = MRUtil.recreateTrans( trans );
String logLinePrefix = getClass().getName() + ".run: ";
setDebugStatus( logLinePrefix + " The transformation was just recreated." );
      // share the variables from the PDI job.
      // we do this here instead of in createTrans() as MRUtil.recreateTrans() will not
      // copy "execution" trans information.
if ( variableSpace != null ) {
setDebugStatus( "Sharing the VariableSpace from the PDI job." );
trans.shareVariablesWith( variableSpace );
if ( debug ) {
// list the variables
List<String> variables = Arrays.asList( trans.listVariables() );
Collections.sort( variables );
if ( variables != null ) {
setDebugStatus( "Variables: " );
for ( String variable : variables ) {
setDebugStatus( " " + variable + " = " + trans.getVariable( variable ) );
}
}
}
} else {
setDebugStatus( reporter, "variableSpace is null. We are not going to share it with the trans." );
}
      // set the trans log level if ours has been set
if ( logLevel != null ) {
setDebugStatus( "Setting the trans.logLevel to " + logLevel.toString() );
trans.setLogLevel( logLevel );
} else {
setDebugStatus( "logLevel is null. The trans log level will not be set." );
}
// allocate key & value instances that are re-used for all entries
K1 key = input.createKey();
V1 value = input.createValue();
setDebugStatus( reporter, "Preparing transformation for execution" );
trans.prepareExecution( null );
try {
setDebugStatus( reporter, "Locating output step: " + mapOutputStepName );
StepInterface outputStep = trans.findRunThread( mapOutputStepName );
if ( outputStep != null ) {
          rowCollector = new OutputCollectorRowListener<K2, V2>( output, outClassK, outClassV, reporter, debug );
// rowCollector = OutputCollectorRowListener.build(output, outputRowMeta, outClassK, outClassV,
// reporter, debug);
outputStep.addRowListener( rowCollector );
RowMeta injectorRowMeta = new RowMeta();
RowProducer rowProducer = null;
TypeConverterFactory typeConverterFactory = new TypeConverterFactory();
ITypeConverter inConverterK = null;
ITypeConverter inConverterV = null;
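          // The converters, when found, adapt the incoming Hadoop key/value objects to the Kettle
          // value metas of the injector row; a typical (not asserted here) case is converting an
          // org.apache.hadoop.io.Text key into the String that a Kettle String field expects.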
setDebugStatus( reporter, "Locating input step: " + mapInputStepName );
if ( mapInputStepName != null ) {
// Setup row injection
rowProducer = trans.addRowProducer( mapInputStepName, 0 );
StepInterface inputStep = rowProducer.getStepInterface();
StepMetaInterface inputStepMeta = inputStep.getStepMeta().getStepMetaInterface();
InKeyValueOrdinals inOrdinals = null;
if ( inputStepMeta instanceof BaseStepMeta ) {
setDebugStatus( reporter,
"Generating converters from RowMeta for injection into the mapper transformation" );
// Use getFields(...) to get the row meta and therefore the expected input types
inputStepMeta.getFields( injectorRowMeta, null, null, null, null );
inOrdinals = new InKeyValueOrdinals( injectorRowMeta );
if ( inOrdinals.getKeyOrdinal() < 0 || inOrdinals.getValueOrdinal() < 0 ) {
throw new KettleException( "key or value is not defined in transformation injector step" );
}
              // Get a converter for the Key if the value meta has a concrete Java class we can use.
              // If no converter can be found here we won't do any type conversion.
if ( injectorRowMeta.getValueMeta( inOrdinals.getKeyOrdinal() ) != null ) {
inConverterK = typeConverterFactory
.getConverter( key.getClass(), injectorRowMeta.getValueMeta( inOrdinals.getKeyOrdinal() ) );
}
              // Get a converter for the Value if the value meta has a concrete Java class we can use.
              // If no converter can be found here we won't do any type conversion.
if ( injectorRowMeta.getValueMeta( inOrdinals.getValueOrdinal() ) != null ) {
inConverterV = typeConverterFactory
.getConverter( value.getClass(), injectorRowMeta.getValueMeta( inOrdinals.getValueOrdinal() ) );
}
}
trans.startThreads();
if ( rowProducer != null ) {
while ( input.next( key, value ) ) {
if ( inOrdinals != null ) {
injectValue( key, inOrdinals.getKeyOrdinal(), inConverterK, value, inOrdinals.getValueOrdinal(),
inConverterV, injectorRowMeta, rowProducer, reporter );
} else {
injectValue( key, inConverterK, value, inConverterV, injectorRowMeta, rowProducer, reporter );
}
}
rowProducer.finished();
}
trans.waitUntilFinished();
setDebugStatus( reporter, "Mapper transformation has finished" );
if ( trans.getErrors() > 0 ) {
setDebugStatus( "Errors detected for mapper transformation" );
List<KettleLoggingEvent> logList = KettleLogStore
.getLogBufferFromTo( trans.getLogChannelId(), false, 0, KettleLogStore.getLastBufferLineNr() );
              StringBuilder buff = new StringBuilder();
for ( KettleLoggingEvent le : logList ) {
if ( le.getLevel() == LogLevel.ERROR ) {
buff.append( le.getMessage().toString() ).append( "\n" );
}
}
throw new Exception( "Errors were detected for mapper transformation:\n\n"
+ buff.toString() );
}
} else {
setDebugStatus( reporter, "No input stepname was defined" );
}
if ( getException() != null ) {
setDebugStatus( reporter, "An exception was generated by the mapper transformation" );
// Bubble the exception from within Kettle to Hadoop
throw getException();
}
} else {
if ( mapOutputStepName != null ) {
setDebugStatus( reporter, "Output step [" + mapOutputStepName + "]could not be found" );
throw new KettleException( "Output step not defined in transformation" );
} else {
setDebugStatus( reporter, "Output step name not specified" );
}
}
} finally {
try {
trans.stopAll();
} catch ( Exception ex ) {
ex.printStackTrace();
}
try {
trans.cleanup();
} catch ( Exception ex ) {
ex.printStackTrace();
}
}
} catch ( Exception e ) {
e.printStackTrace( System.err );
setDebugStatus( reporter, "An exception was generated by the mapper task" );
throw new IOException( e );
}
reporter.setStatus( "Completed processing record" );
}
}