/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.univariatestats;

import java.util.ArrayList;

import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Calculate univariate statistics based on one column of the input data.
 * <p>
 * Calculates N, mean, standard deviation, minimum, maximum, median and arbitrary percentiles. Percentiles can be
 * calculated using interpolation or a simple method. See <a
 * href="http://www.itl.nist.gov/div898/handbook/prc/section2/prc252.htm"> The Engineering Statistics Handbook</a> for
 * details.
 * <p>
 * The step consumes every input row, accumulating running totals per configured field, and emits a single output row
 * once the input is exhausted. If no input row is ever received, no output row is emitted.
 *
 * @author Mark Hall (mhall{[at]}pentaho.org)
 * @version 1.0
 */
public class UnivariateStats extends BaseStep implements StepInterface {

  private UnivariateStatsMeta m_meta;
  private UnivariateStatsData m_data;

  /**
   * Holds cached input values if median/percentiles are to be calculated. One slot per configured stats function;
   * a slot is only populated when that function requests the median or a percentile.
   */
  private ArrayList<Number>[] m_dataCache;

  /**
   * Creates a new <code>UnivariateStats</code> instance.
   *
   * @param stepMeta
   *          holds the step's meta data
   * @param stepDataInterface
   *          holds the step's temporary data
   * @param copyNr
   *          the number assigned to the step
   * @param transMeta
   *          meta data for the transformation
   * @param trans
   *          a <code>Trans</code> value
   */
  public UnivariateStats( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
    Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }

  /**
   * Process an incoming row of data.
   * <p>
   * On the first row the per-field state is initialized; every row then updates the running statistics. When the
   * input is exhausted the single output row is generated and emitted.
   *
   * @param smi
   *          a <code>StepMetaInterface</code> value (must be a <code>UnivariateStatsMeta</code>)
   * @param sdi
   *          a <code>StepDataInterface</code> value (must be a <code>UnivariateStatsData</code>)
   * @return <code>true</code> while more rows are expected, <code>false</code> once the input is exhausted
   * @exception KettleException
   *              if a configured source field is missing, unspecified or not numeric
   */
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    m_meta = (UnivariateStatsMeta) smi;
    m_data = (UnivariateStatsData) sdi;

    Object[] r = getRow(); // get row, set busy!
    if ( r == null ) {
      // no more input to be expected...

      if ( first ) {
        // Not a single row arrived, so the first-row initialization below never ran:
        // the output row meta, the field indexes and the data cache are all unset.
        // There is nothing to compute, so finish without emitting a row.
        setOutputDone();
        return false;
      }

      // compute the derived stats and generate an output row
      Object[] outputRow = generateOutputRow();
      // emit the single output row
      putRow( m_data.getOutputRowMeta(), outputRow );
      setOutputDone();

      // save memory
      m_dataCache = null;
      return false;
    }

    // Handle the first row
    if ( first ) {
      first = false;
      initializeFieldState();
    }

    accumulateRow( r );

    if ( log.isRowLevel() ) {
      // Arrays.toString renders the row's values; concatenating the array itself
      // would only print its identity hash.
      logRowlevel( "Read row #" + getLinesRead() + " : " + java.util.Arrays.toString( r ) );
    }

    if ( checkFeedback( getLinesRead() ) ) {
      logBasic( "Linenr " + getLinesRead() );
    }
    return true;
  }

  /**
   * Sets up the output row meta, the per-field running state and the median/percentile caches. Called once, when the
   * first input row has been read.
   *
   * @exception KettleException
   *              if a stats function has no source field, the source field cannot be found in the input, or the
   *              source field is not numeric
   */
  @SuppressWarnings( { "unchecked" } ) // generic array creation for m_dataCache
  private void initializeFieldState() throws KettleException {
    // Don't want to clone and add to the input meta data - want
    // to create a new row meta data for derived calculations
    RowMetaInterface outputMeta = new RowMeta();
    m_data.setInputRowMeta( getInputRowMeta() );
    m_data.setOutputRowMeta( outputMeta );

    // Determine the output format
    m_meta.getFields( m_data.getOutputRowMeta(), getStepname(), null, null, this, repository, metaStore );

    // Set up data cache for calculating median/percentiles
    m_dataCache = new ArrayList[m_meta.getNumFieldsToProcess()];

    // Initialize the step meta data
    FieldIndex[] fi = new FieldIndex[m_meta.getNumFieldsToProcess()];
    m_data.setFieldIndexes( fi );

    // allocate the field indexes in the data class and meta stats functions
    // in the step meta
    for ( int i = 0; i < m_meta.getNumFieldsToProcess(); i++ ) {
      UnivariateStatsMetaFunction usmf = m_meta.getInputFieldMetaFunctions()[i];

      //CHECKSTYLE:Indentation:OFF
      m_data.getFieldIndexes()[i] = new FieldIndex();

      // check that this univariate stats computation has been
      // defined on an input field
      if ( Utils.isEmpty( usmf.getSourceFieldName() ) ) {
        throw new KettleException( "There is no input field specified for stats calc #" + ( i + 1 ) );
      }

      int fieldIndex = m_data.getInputRowMeta().indexOfValue( usmf.getSourceFieldName() );
      if ( fieldIndex < 0 ) {
        throw new KettleStepException( "Unable to find the specified fieldname '" + usmf.getSourceFieldName()
          + "' for stats calc #" + ( i + 1 ) );
      }

      FieldIndex tempData = m_data.getFieldIndexes()[i];
      tempData.m_columnIndex = fieldIndex;

      ValueMetaInterface inputFieldMeta = m_data.getInputRowMeta().getValueMeta( fieldIndex );

      // check the type of the input field
      if ( !inputFieldMeta.isNumeric() ) {
        throw new KettleException( "The input field for stats calc #" + ( i + 1 ) + " is not numeric." );
      }

      // finish initializing: seed min/max with the extreme finite values so that
      // the first observed value always replaces them. Note that Double.MIN_VALUE is
      // the smallest POSITIVE double and must not be used as the seed for the maximum.
      tempData.m_min = Double.MAX_VALUE;
      tempData.m_max = -Double.MAX_VALUE;

      // set up caches if median/percentiles have been
      // requested
      if ( usmf.getCalcMedian() || usmf.getCalcPercentile() >= 0 ) {
        m_dataCache[i] = new ArrayList<Number>();
      }
    }
  }

  /**
   * Folds one input row into the running statistics of every configured field. Values that cannot be converted to a
   * number (or are null) are treated as missing and skipped.
   *
   * @param r
   *          the input row
   */
  private void accumulateRow( Object[] r ) {
    for ( int i = 0; i < m_meta.getNumFieldsToProcess(); i++ ) {
      UnivariateStatsMetaFunction usmf = m_meta.getInputFieldMetaFunctions()[i];
      if ( Utils.isEmpty( usmf.getSourceFieldName() ) ) {
        continue;
      }

      FieldIndex tempData = m_data.getFieldIndexes()[i];
      ValueMetaInterface metaI = getInputRowMeta().getValueMeta( tempData.m_columnIndex );

      Number input = null;
      try {
        input = metaI.getNumber( r[tempData.m_columnIndex] );
      } catch ( Exception ignored ) {
        // quietly ignore -- assume missing for anything not
        // parsable as a number
      }

      if ( input == null ) {
        // treat non-numeric values as missing
        continue;
      }

      // add to the cache?
      if ( usmf.getCalcMedian() || usmf.getCalcPercentile() >= 0 ) {
        m_dataCache[i].add( input );
      }

      // update stats
      double val = input.doubleValue();
      tempData.m_count++;
      tempData.m_sum += val;
      tempData.m_sumSq += ( val * val );
      if ( val < tempData.m_min ) {
        tempData.m_min = val;
      }
      if ( val > tempData.m_max ) {
        tempData.m_max = val;
      }
    }
  }

  /**
   * Generates an output row by concatenating, in configuration order, the output values produced for each stats
   * function that has a source field.
   *
   * @return an <code>Object[]</code> value holding all requested metrics
   */
  private Object[] generateOutputRow() {
    // First pass: size the result from the number of metrics each function requests
    int totalNumOutputFields = 0;
    for ( int i = 0; i < m_meta.getNumFieldsToProcess(); i++ ) {
      UnivariateStatsMetaFunction usmf = m_meta.getInputFieldMetaFunctions()[i];
      if ( !Utils.isEmpty( usmf.getSourceFieldName() ) ) {
        totalNumOutputFields += usmf.numberOfMetricsRequested();
      }
    }

    Object[] result = new Object[totalNumOutputFields];

    // Second pass: copy each function's computed values into the result
    int index = 0;
    for ( int i = 0; i < m_meta.getNumFieldsToProcess(); i++ ) {
      UnivariateStatsMetaFunction usmf = m_meta.getInputFieldMetaFunctions()[i];
      if ( !Utils.isEmpty( usmf.getSourceFieldName() ) ) {
        Object[] tempOut = m_data.getFieldIndexes()[i].generateOutputValues( usmf, m_dataCache[i] );
        for ( int j = 0; j < tempOut.length; j++ ) {
          result[index++] = tempOut[j];
        }
      }
    }

    return result;
  }

  /**
   * Initialize the step.
   *
   * @param smi
   *          a <code>StepMetaInterface</code> value (must be a <code>UnivariateStatsMeta</code>)
   * @param sdi
   *          a <code>StepDataInterface</code> value (must be a <code>UnivariateStatsData</code>)
   * @return the result of the superclass initialization
   */
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    m_meta = (UnivariateStatsMeta) smi;
    m_data = (UnivariateStatsData) sdi;

    return super.init( smi, sdi );
  }
}