/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.normaliser;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
/**
* Normalise de-normalised input data.
*
* @author Matt
* @since 5-apr-2003
*/
public class Normaliser extends BaseStep implements StepInterface {
  private static Class<?> PKG = NormaliserMeta.class; // for i18n purposes, needed by Translator2!!

  private NormaliserMeta meta;
  private NormaliserData data;

  /**
   * Creates the step; all arguments are passed straight through to the {@code BaseStep} constructor.
   */
  public Normaliser( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
    Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }

  /**
   * Reads one input row and writes one output row for every distinct type value configured in the step metadata.
   * Each output row holds the untouched input fields, then the type value, then the input fields mapped to that type.
   *
   * @param smi the step metadata, cast to {@link NormaliserMeta}
   * @param sdi the step data, cast to {@link NormaliserData}
   * @return {@code false} when there is no more input or when a configured field is missing from the input row;
   *         {@code true} otherwise
   * @throws KettleException propagated from the row and metadata calls made here
   */
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (NormaliserMeta) smi;
    data = (NormaliserData) sdi;

    Object[] r = getRow(); // get row from rowset, wait for our turn, indicate busy!
    if ( r == null ) { // no more input to be expected...
      setOutputDone();
      return false;
    }

    if ( first ) { // INITIALISE
      first = false;
      if ( !initialiseFieldMappings() ) {
        // The error has been logged and the transformation asked to stop.
        return false;
      }
    }

    // Modest performance improvement over millions of rows - don't recalculate on each loop iteration something
    // that doesn't change
    int typeOccSize = data.type_occ.size();
    int rowMetaSz = data.outputRowMeta.size();

    // Now do the normalization
    // Loop over the unique occurrences of the different types.
    //
    for ( int e = 0; e < typeOccSize; e++ ) {
      Object[] outputRowData = buildOutputRow( r, data.type_occ.get( e ), rowMetaSz );

      // The row is constructed, now give it to the next step(s)...
      //
      putRow( data.outputRowMeta, outputRowData );
    }

    if ( checkFeedback( getLinesRead() ) && log.isBasic() ) {
      logBasic( BaseMessages.getString( PKG, "Normaliser.Log.LineNumber" ) + getLinesRead() );
    }

    return true;
  }

  /**
   * One-time set-up done on the first row: derives the output row metadata, collects the distinct type values in
   * configuration order, maps every type value to the input indexes of its fields, and records which input fields
   * are copied unchanged.
   *
   * @return {@code true} when the set-up succeeded; {@code false} when a configured field was not found in the input
   *         row (an error is logged, the error count is set to 1 and {@code stopAll()} is called)
   * @throws KettleException propagated from {@code meta.getFields(...)}
   */
  private boolean initialiseFieldMappings() throws KettleException {
    data.inputRowMeta = getInputRowMeta();
    data.outputRowMeta = data.inputRowMeta.clone();
    meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );

    int normFieldsLength = meta.getNormaliserFields().length;
    data.typeToFieldIndex = new HashMap<>();

    // Get a unique list of occurrences...
    //
    data.type_occ = new ArrayList<>();
    data.maxlen = 0;

    for ( int i = 0; i < normFieldsLength; i++ ) {
      String typeValue = meta.getNormaliserFields()[i].getValue();
      if ( !data.type_occ.contains( typeValue ) ) {
        data.type_occ.add( typeValue );
      }
      // A null type value must not abort the step with a NullPointerException: the list and map used here accept
      // null, so such a field is simply grouped under the null type and does not contribute to maxlen.
      if ( typeValue != null && typeValue.length() > data.maxlen ) {
        data.maxlen = typeValue.length();
      }

      // This next section creates a map of arraylist objects. The key is the Type in the Normaliser
      // and the ArrayList is the list of indexes on the row of all fields that get normalized under that Type.
      // This eliminates the inner loop that iterated over all the fields finding the fields associated with the Type.
      // On a test data set with 2500 fields and about 36000 input rows (outputting over 22m rows), the time went from
      // 12min to about 1min 35sec.
      int dataFieldNr = data.inputRowMeta.indexOfValue( meta.getNormaliserFields()[i].getName() );
      if ( dataFieldNr < 0 ) {
        logError( BaseMessages.getString( PKG, "Normaliser.Log.CouldNotFindFieldInRow", meta.getNormaliserFields()[i].getName() ) );
        setErrors( 1 );
        stopAll();
        return false;
      }

      List<Integer> normFieldList = data.typeToFieldIndex.get( typeValue );
      if ( normFieldList == null ) {
        normFieldList = new ArrayList<>();
        data.typeToFieldIndex.put( typeValue, normFieldList );
      }
      normFieldList.add( dataFieldNr );
    }

    // Which fields are not impacted? We can just copy these, leave them alone.
    //
    data.copy_fieldnrs = new ArrayList<>();

    Set<String> normaliserFields = meta.getFieldNames();
    int irmSize = data.inputRowMeta.size();

    for ( int i = 0; i < irmSize; i++ ) {
      ValueMetaInterface v = data.inputRowMeta.getValueMeta( i );
      // Backwards compatibility - old loop called Const.indexofstring which uses equalsIgnoreCase
      if ( !normaliserFields.contains( v.getName().toLowerCase() ) ) {
        data.copy_fieldnrs.add( i );
      }
    }

    return true;
  }

  /**
   * Builds the output row for a single type value.
   *
   * @param r the current input row
   * @param typeValue the type value this output row is generated for
   * @param rowMetaSz the number of slots to allocate, i.e. the size of the output row metadata
   * @return a new array holding the copied fields, the type value and the fields mapped to that type, in that order
   */
  private Object[] buildOutputRow( Object[] r, String typeValue, int rowMetaSz ) {
    // Create an output row per type
    //
    Object[] outputRowData = new Object[rowMetaSz];
    int outputIndex = 0;

    // Copy the input row data, excluding the fields that are normalized...
    //
    int copyFldNrsSz = data.copy_fieldnrs.size();
    for ( int i = 0; i < copyFldNrsSz; i++ ) {
      int nr = data.copy_fieldnrs.get( i );
      outputRowData[outputIndex++] = r[nr];
    }

    // Add the typefield_value
    //
    outputRowData[outputIndex++] = typeValue;

    // Then add the normalized fields...
    //
    List<Integer> normFieldList = data.typeToFieldIndex.get( typeValue );
    int normFieldListSz = normFieldList.size();
    for ( int i = 0; i < normFieldListSz; i++ ) {
      outputRowData[outputIndex++] = r[normFieldList.get( i )];
    }

    return outputRowData;
  }

  /**
   * Stores the metadata and data objects on this step and delegates to {@code super.init(...)}.
   *
   * @param smi the step metadata, cast to {@link NormaliserMeta}
   * @param sdi the step data, cast to {@link NormaliserData}
   * @return the result of {@code super.init( smi, sdi )}
   */
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (NormaliserMeta) smi;
    data = (NormaliserData) sdi;

    return super.init( smi, sdi );
  }
}