/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2017 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.groupby;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.math.stat.descriptive.rank.Percentile;
import org.apache.commons.vfs2.FileObject;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.util.Utils;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.exception.KettlePluginException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueDataUtil;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaBase;
import org.pentaho.di.core.row.value.ValueMetaFactory;
import org.pentaho.di.core.row.value.ValueMetaInteger;
import org.pentaho.di.core.row.value.ValueMetaNone;
import org.pentaho.di.core.row.value.ValueMetaNumber;
import org.pentaho.di.core.row.value.ValueMetaString;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;

/**
 * Groups information based on aggregation rules (sum, count, ...).
 *
 * @author Matt
 * @since 2-jun-2003
 */
public class GroupBy extends BaseStep implements StepInterface {

  private static Class<?> PKG = GroupByMeta.class; // for i18n purposes, needed by Translator2!!

  private GroupByMeta meta;

  private GroupByData data;

  private boolean allNullsAreZero = false;

  private boolean minNullIsValued = false;

  public GroupBy( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );

    meta = (GroupByMeta) getStepMeta().getStepMetaInterface();
    data = (GroupByData) stepDataInterface;
  }

  @Override
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (GroupByMeta) smi;
    data = (GroupByData) sdi;

    Object[] r = getRow(); // get row!
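    // Note: the block below runs only while "first" is set. It resolves the Kettle variables that
    // control null handling, derives the output row metadata and caches the field indexes used
    // for grouping and aggregation. Later rows skip this work and only update the aggregates.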
    if ( first ) {
      String val = getVariable( Const.KETTLE_AGGREGATION_ALL_NULLS_ARE_ZERO, "N" );
      allNullsAreZero = ValueMetaBase.convertStringToBoolean( val );
      val = getVariable( Const.KETTLE_AGGREGATION_MIN_NULL_IS_VALUED, "N" );
      minNullIsValued = ValueMetaBase.convertStringToBoolean( val );

      // What is the output looking like?
      //
      data.inputRowMeta = getInputRowMeta();

      // In case we have 0 input rows, we still want to send out a single row aggregate.
      // However, the problem then is that we don't know the layout because we haven't received it
      // from the previous step over the row set.
      // So we need to calculate it based on the metadata...
      //
      if ( data.inputRowMeta == null ) {
        data.inputRowMeta = getTransMeta().getPrevStepFields( getStepMeta() );
      }
      data.outputRowMeta = data.inputRowMeta.clone();
      meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );

      // Do all the work we can beforehand:
      // calculate indexes, look up fields, etc.
      //
      data.counts = new long[ meta.getSubjectField().length ];
      data.subjectnrs = new int[ meta.getSubjectField().length ];

      data.cumulativeSumSourceIndexes = new ArrayList<>();
      data.cumulativeSumTargetIndexes = new ArrayList<>();

      data.cumulativeAvgSourceIndexes = new ArrayList<>();
      data.cumulativeAvgTargetIndexes = new ArrayList<>();

      for ( int i = 0; i < meta.getSubjectField().length; i++ ) {
        if ( meta.getAggregateType()[ i ] == GroupByMeta.TYPE_GROUP_COUNT_ANY ) {
          data.subjectnrs[ i ] = 0;
        } else {
          data.subjectnrs[ i ] = data.inputRowMeta.indexOfValue( meta.getSubjectField()[ i ] );
        }
        if ( ( r != null ) && ( data.subjectnrs[ i ] < 0 ) ) {
          logError( BaseMessages.getString( PKG, "GroupBy.Log.AggregateSubjectFieldCouldNotFound",
            meta.getSubjectField()[ i ] ) );
          setErrors( 1 );
          stopAll();
          return false;
        }

        if ( meta.getAggregateType()[ i ] == GroupByMeta.TYPE_GROUP_CUMULATIVE_SUM ) {
          data.cumulativeSumSourceIndexes.add( data.subjectnrs[ i ] );

          // The position of the target in the output row is the input row size + i
          //
          data.cumulativeSumTargetIndexes.add( data.inputRowMeta.size() + i );
        }
        if ( meta.getAggregateType()[ i ] == GroupByMeta.TYPE_GROUP_CUMULATIVE_AVERAGE ) {
          data.cumulativeAvgSourceIndexes.add( data.subjectnrs[ i ] );

          // The position of the target in the output row is the input row size + i
          //
          data.cumulativeAvgTargetIndexes.add( data.inputRowMeta.size() + i );
        }
      }

      data.previousSums = new Object[ data.cumulativeSumTargetIndexes.size() ];

      data.previousAvgSum = new Object[ data.cumulativeAvgTargetIndexes.size() ];
      data.previousAvgCount = new long[ data.cumulativeAvgTargetIndexes.size() ];

      data.groupnrs = new int[ meta.getGroupField().length ];
      for ( int i = 0; i < meta.getGroupField().length; i++ ) {
        data.groupnrs[ i ] = data.inputRowMeta.indexOfValue( meta.getGroupField()[ i ] );
        if ( ( r != null ) && ( data.groupnrs[ i ] < 0 ) ) {
          logError( BaseMessages.getString( PKG, "GroupBy.Log.GroupFieldCouldNotFound",
            meta.getGroupField()[ i ] ) );
          setErrors( 1 );
          stopAll();
          return false;
        }
      }

      // Create a metadata value for the counter Integers
      //
      data.valueMetaInteger = new ValueMetaInteger( "count" );
      data.valueMetaNumber = new ValueMetaNumber( "sum" );

      // Initialize the group metadata
      //
      initGroupMeta( data.inputRowMeta );
    }

    if ( first || data.newBatch ) {
      // Create a new group aggregate (init)
      //
      newAggregate( r );
    }

    if ( first ) {
      // for speed: groupMeta + aggMeta
      //
      data.groupAggMeta = new RowMeta();
      data.groupAggMeta.addRowMeta( data.groupMeta );
      data.groupAggMeta.addRowMeta( data.aggMeta );
    }

    if ( r == null ) { // no more input to be expected...
      // (or none received in the first place)
      handleLastOfGroup();
      setOutputDone();
      return false;
    }

    if ( first || data.newBatch ) {
      first = false;
      data.newBatch = false;
      data.previous = data.inputRowMeta.cloneRow( r ); // copy the row to previous
    } else {
      calcAggregate( data.previous );

      if ( meta.passAllRows() ) {
        addToBuffer( data.previous );
      }
    }

    if ( !sameGroup( data.previous, r ) ) {
      if ( meta.passAllRows() ) {
        // Not the same group: close output (if any)
        closeOutput();

        // Get all rows from the buffer!
        data.groupResult = getAggregateResult();

        Object[] row = getRowFromBuffer();

        long lineNr = 0;
        while ( row != null ) {
          int size = data.inputRowMeta.size();
          row = RowDataUtil.addRowData( row, size, data.groupResult );
          size += data.groupResult.length;

          lineNr++;

          if ( meta.isAddingLineNrInGroup() && !Utils.isEmpty( meta.getLineNrInGroupField() ) ) {
            Object lineNrValue = new Long( lineNr );
            // ValueMetaInterface lineNrValueMeta = new ValueMeta(meta.getLineNrInGroupField(),
            // ValueMetaInterface.TYPE_INTEGER);
            // lineNrValueMeta.setLength(9);
            row = RowDataUtil.addValueData( row, size, lineNrValue );
            size++;
          }

          addCumulativeSums( row );
          addCumulativeAverages( row );

          putRow( data.outputRowMeta, row );

          row = getRowFromBuffer();
        }
        closeInput();
      } else {
        Object[] result = buildResult( data.previous );
        if ( result != null ) {
          putRow( data.groupAggMeta, result ); // copy row to possible alternate rowset(s).
        }
      }
      newAggregate( r ); // Create a new group aggregate (init)
    }

    data.previous = data.inputRowMeta.cloneRow( r );

    if ( checkFeedback( getLinesRead() ) ) {
      if ( log.isBasic() ) {
        logBasic( BaseMessages.getString( PKG, "GroupBy.LineNumber" ) + getLinesRead() );
      }
    }

    return true;
  }

  private void handleLastOfGroup() throws KettleException {
    if ( meta.passAllRows() ) {
      // ALL ROWS

      if ( data.previous != null ) {
        calcAggregate( data.previous );
        addToBuffer( data.previous );
      }
      data.groupResult = getAggregateResult();

      Object[] row = getRowFromBuffer();

      long lineNr = 0;
      while ( row != null ) {
        int size = data.inputRowMeta.size();
        row = RowDataUtil.addRowData( row, size, data.groupResult );
        size += data.groupResult.length;
        lineNr++;

        if ( meta.isAddingLineNrInGroup() && !Utils.isEmpty( meta.getLineNrInGroupField() ) ) {
          Object lineNrValue = new Long( lineNr );
          // ValueMetaInterface lineNrValueMeta = new ValueMeta(meta.getLineNrInGroupField(),
          // ValueMetaInterface.TYPE_INTEGER);
          // lineNrValueMeta.setLength(9);
          row = RowDataUtil.addValueData( row, size, lineNrValue );
          size++;
        }

        addCumulativeSums( row );
        addCumulativeAverages( row );

        putRow( data.outputRowMeta, row );

        row = getRowFromBuffer();
      }
      closeInput();
    } else {
      // JUST THE GROUP + AGGREGATE

      // Don't forget the last set of rows...
      if ( data.previous != null ) {
        calcAggregate( data.previous );
      }

      Object[] result = buildResult( data.previous );

      if ( result != null ) {
        putRow( data.groupAggMeta, result );
      }
    }
  }

  private void addCumulativeSums( Object[] row ) throws KettleValueException {

    // We need to adjust this row with the cumulative sums
    //
    for ( int i = 0; i < data.cumulativeSumSourceIndexes.size(); i++ ) {
      int sourceIndex = data.cumulativeSumSourceIndexes.get( i );
      Object previousTarget = data.previousSums[ i ];
      Object sourceValue = row[ sourceIndex ];

      int targetIndex = data.cumulativeSumTargetIndexes.get( i );

      ValueMetaInterface sourceMeta = data.inputRowMeta.getValueMeta( sourceIndex );
      ValueMetaInterface targetMeta = data.outputRowMeta.getValueMeta( targetIndex );

      // If the first values were null, or this is the first time around, just take the source value...
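      // (e.g. for source values 2, null, 3 the cumulative sum column becomes 2, 2, 5:
      // a null source value simply keeps the previous running total)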
      //
      if ( targetMeta.isNull( previousTarget ) ) {
        row[ targetIndex ] = sourceMeta.convertToNormalStorageType( sourceValue );
      } else {
        // If the source value is null, just take the previous target value
        //
        if ( sourceMeta.isNull( sourceValue ) ) {
          row[ targetIndex ] = previousTarget;
        } else {
          row[ targetIndex ] = ValueDataUtil.plus( targetMeta, data.previousSums[ i ], sourceMeta, row[ sourceIndex ] );
        }
      }
      data.previousSums[ i ] = row[ targetIndex ];
    }
  }

  private void addCumulativeAverages( Object[] row ) throws KettleValueException {

    // We need to adjust this row with the cumulative averages
    //
    for ( int i = 0; i < data.cumulativeAvgSourceIndexes.size(); i++ ) {
      int sourceIndex = data.cumulativeAvgSourceIndexes.get( i );
      Object previousTarget = data.previousAvgSum[ i ];
      Object sourceValue = row[ sourceIndex ];

      int targetIndex = data.cumulativeAvgTargetIndexes.get( i );

      ValueMetaInterface sourceMeta = data.inputRowMeta.getValueMeta( sourceIndex );
      ValueMetaInterface targetMeta = data.outputRowMeta.getValueMeta( targetIndex );

      // If the first values were null, or this is the first time around, just take the source value...
      //
      Object sum = null;

      if ( targetMeta.isNull( previousTarget ) ) {
        sum = sourceMeta.convertToNormalStorageType( sourceValue );
      } else {
        // If the source value is null, just take the previous target value
        //
        if ( sourceMeta.isNull( sourceValue ) ) {
          sum = previousTarget;
        } else {
          if ( sourceMeta.isInteger() ) {
            sum = ValueDataUtil.plus( data.valueMetaInteger, data.previousAvgSum[ i ], sourceMeta, row[ sourceIndex ] );
          } else {
            sum = ValueDataUtil.plus( targetMeta, data.previousAvgSum[ i ], sourceMeta, row[ sourceIndex ] );
          }
        }
      }
      data.previousAvgSum[ i ] = sum;

      if ( !sourceMeta.isNull( sourceValue ) ) {
        data.previousAvgCount[ i ]++;
      }

      if ( sourceMeta.isInteger() ) {
        // Change to number as the exception
        //
        if ( sum == null ) {
          row[ targetIndex ] = null;
        } else {
          row[ targetIndex ] = new Double( ( (Long) sum ).doubleValue() / data.previousAvgCount[ i ] );
        }
      } else {
        row[ targetIndex ] = ValueDataUtil.divide( targetMeta, sum, data.valueMetaInteger, data.previousAvgCount[ i ] );
      }
    }
  }

  // Is the row r of the same group as previous?
  boolean sameGroup( Object[] previous, Object[] r ) throws KettleValueException {
    return data.inputRowMeta.compare( previous, r, data.groupnrs ) == 0;
  }

  /**
   * Used for JUnit tests in GroupByAggregationNullsTest
   *
   * @param row
   * @throws KettleValueException
   */
  @SuppressWarnings( "unchecked" )
  void calcAggregate( Object[] row ) throws KettleValueException {

    for ( int i = 0; i < data.subjectnrs.length; i++ ) {
      Object subj = row[ data.subjectnrs[ i ] ];
      ValueMetaInterface subjMeta = data.inputRowMeta.getValueMeta( data.subjectnrs[ i ] );
      Object value = data.agg[ i ];
      ValueMetaInterface valueMeta = data.aggMeta.getValueMeta( i );

      switch ( meta.getAggregateType()[ i ] ) {
        case GroupByMeta.TYPE_GROUP_SUM:
          data.agg[ i ] = ValueDataUtil.sum( valueMeta, value, subjMeta, subj );
          break;
        case GroupByMeta.TYPE_GROUP_AVERAGE:
          if ( !subjMeta.isNull( subj ) ) {
            data.agg[ i ] = ValueDataUtil.sum( valueMeta, value, subjMeta, subj );
            data.counts[ i ]++;
          }
          break;
        case GroupByMeta.TYPE_GROUP_MEDIAN:
        case GroupByMeta.TYPE_GROUP_PERCENTILE:
          if ( !subjMeta.isNull( subj ) ) {
            ( (List<Double>) data.agg[ i ] ).add( subjMeta.getNumber( subj ) );
          }
          break;
        case GroupByMeta.TYPE_GROUP_STANDARD_DEVIATION:
          if ( !subjMeta.isNull( subj ) ) {
            data.counts[ i ]++;
            double n = data.counts[ i ];
            double x = subjMeta.getNumber( subj );
            // for standard deviation null is exact 0
            double sum = value == null ?
              new Double( 0 ) : (Double) value;
            double mean = data.mean[ i ];

            double delta = x - mean;
            mean = mean + ( delta / n );
            sum = sum + delta * ( x - mean );

            data.mean[ i ] = mean;
            data.agg[ i ] = sum;
          }
          break;
        case GroupByMeta.TYPE_GROUP_COUNT_DISTINCT:
          if ( !subjMeta.isNull( subj ) ) {
            if ( data.distinctObjs == null ) {
              data.distinctObjs = new Set[ meta.getSubjectField().length ];
            }
            if ( data.distinctObjs[ i ] == null ) {
              data.distinctObjs[ i ] = new TreeSet<Object>();
            }
            Object obj = subjMeta.convertToNormalStorageType( subj );
            if ( !data.distinctObjs[ i ].contains( obj ) ) {
              data.distinctObjs[ i ].add( obj );
              // null is exact 0, or we will not be able to ++.
              value = value == null ? new Long( 0 ) : value;
              data.agg[ i ] = (Long) value + 1;
            }
          }
          break;
        case GroupByMeta.TYPE_GROUP_COUNT_ALL:
          if ( !subjMeta.isNull( subj ) ) {
            data.counts[ i ]++;
          }
          break;
        case GroupByMeta.TYPE_GROUP_COUNT_ANY:
          data.counts[ i ]++;
          break;
        case GroupByMeta.TYPE_GROUP_MIN: {
          if ( subj == null && !minNullIsValued ) {
            // PDI-10250 do not compare null
            break;
          }

          // PDI-15648 set the initial value for further comparing
          if ( value == null && subj != null && !minNullIsValued ) {
            data.agg[ i ] = subj;
            break;
          }

          if ( subjMeta.isSortedDescending() ) {
            // Account for negation in ValueMeta.compare() - See PDI-2302
            if ( subjMeta.compare( value, valueMeta, subj ) < 0 ) {
              data.agg[ i ] = subj;
            }
          } else {
            if ( subjMeta.compare( subj, valueMeta, value ) < 0 ) {
              data.agg[ i ] = subj;
            }
          }
          break;
        }
        case GroupByMeta.TYPE_GROUP_MAX:
          if ( subjMeta.isSortedDescending() ) {
            // Account for negation in ValueMeta.compare() - See PDI-2302
            if ( subjMeta.compare( value, valueMeta, subj ) > 0 ) {
              data.agg[ i ] = subj;
            }
          } else {
            if ( subjMeta.compare( subj, valueMeta, value ) > 0 ) {
              data.agg[ i ] = subj;
            }
          }
          break;
        case GroupByMeta.TYPE_GROUP_FIRST:
          if ( !( subj == null ) && value == null ) {
            data.agg[ i ] = subj;
          }
          break;
        case GroupByMeta.TYPE_GROUP_LAST:
          if ( !( subj == null ) ) {
            data.agg[ i ] = subj;
          }
          break;
        case GroupByMeta.TYPE_GROUP_FIRST_INCL_NULL:
          // This is on purpose.
          // The calculation of the first field is done when setting up a new group.
          // This is just the field of the first row.
          // if (linesWritten==0) value.setValue(subj);
          break;
        case GroupByMeta.TYPE_GROUP_LAST_INCL_NULL:
          data.agg[ i ] = subj;
          break;
        case GroupByMeta.TYPE_GROUP_CONCAT_COMMA:
          if ( !( subj == null ) ) {
            StringBuilder sb = (StringBuilder) value;
            if ( sb.length() > 0 ) {
              sb.append( ", " );
            }
            sb.append( subjMeta.getString( subj ) );
          }
          break;
        case GroupByMeta.TYPE_GROUP_CONCAT_STRING:
          if ( !( subj == null ) ) {
            String separator = "";
            if ( !Utils.isEmpty( meta.getValueField()[ i ] ) ) {
              separator = environmentSubstitute( meta.getValueField()[ i ] );
            }

            StringBuilder sb = (StringBuilder) value;
            if ( sb.length() > 0 ) {
              sb.append( separator );
            }
            sb.append( subjMeta.getString( subj ) );
          }
          break;
        default:
          break;
      }
    }
  }

  /**
   * Used for JUnit tests in GroupByAggregationNullsTest
   *
   * @param r
   */
  void newAggregate( Object[] r ) {
    // Put all the counters at 0
    for ( int i = 0; i < data.counts.length; i++ ) {
      data.counts[ i ] = 0;
    }
    data.distinctObjs = null;
    data.agg = new Object[ data.subjectnrs.length ];
    data.mean = new double[ data.subjectnrs.length ]; // sets all doubles to 0.0
    data.aggMeta = new RowMeta();

    for ( int i = 0; i < data.subjectnrs.length; i++ ) {
      ValueMetaInterface subjMeta = data.inputRowMeta.getValueMeta( data.subjectnrs[ i ] );
      Object v = null;
      ValueMetaInterface vMeta = null;
      int aggType = meta.getAggregateType()[ i ];
      switch ( aggType ) {
        case GroupByMeta.TYPE_GROUP_SUM:
        case GroupByMeta.TYPE_GROUP_AVERAGE:
        case GroupByMeta.TYPE_GROUP_CUMULATIVE_SUM:
        case GroupByMeta.TYPE_GROUP_CUMULATIVE_AVERAGE:
          if ( subjMeta.isNumeric() ) {
            try {
              vMeta = ValueMetaFactory.createValueMeta( meta.getAggregateField()[ i ], subjMeta.getType() );
            } catch ( KettlePluginException e ) {
              vMeta = new ValueMetaNone( meta.getAggregateField()[ i ] );
            }
          } else {
            vMeta = new ValueMetaNumber( meta.getAggregateField()[ i ] );
          }
          break;
        case GroupByMeta.TYPE_GROUP_MEDIAN:
        case GroupByMeta.TYPE_GROUP_PERCENTILE:
          vMeta = new ValueMetaNumber( meta.getAggregateField()[ i ] );
          v = new ArrayList<Double>();
          break;
        case GroupByMeta.TYPE_GROUP_STANDARD_DEVIATION:
          vMeta = new ValueMetaNumber( meta.getAggregateField()[ i ] );
          break;
        case GroupByMeta.TYPE_GROUP_COUNT_DISTINCT:
        case GroupByMeta.TYPE_GROUP_COUNT_ANY:
        case GroupByMeta.TYPE_GROUP_COUNT_ALL:
          vMeta = new ValueMetaInteger( meta.getAggregateField()[ i ] );
          break;
        case GroupByMeta.TYPE_GROUP_FIRST:
        case GroupByMeta.TYPE_GROUP_LAST:
        case GroupByMeta.TYPE_GROUP_FIRST_INCL_NULL:
        case GroupByMeta.TYPE_GROUP_LAST_INCL_NULL:
        case GroupByMeta.TYPE_GROUP_MIN:
        case GroupByMeta.TYPE_GROUP_MAX:
          vMeta = subjMeta.clone();
          vMeta.setName( meta.getAggregateField()[ i ] );
          v = r == null ? null : r[ data.subjectnrs[ i ] ];
          break;
        case GroupByMeta.TYPE_GROUP_CONCAT_COMMA:
          vMeta = new ValueMetaString( meta.getAggregateField()[ i ] );
          v = new StringBuilder();
          break;
        case GroupByMeta.TYPE_GROUP_CONCAT_STRING:
          vMeta = new ValueMetaString( meta.getAggregateField()[ i ] );
          v = new StringBuilder();
          break;
        default:
          // TODO: raise an error here because we cannot continue successfully; maybe the UI should validate this
          break;
      }

      if ( ( subjMeta != null )
        && ( aggType != GroupByMeta.TYPE_GROUP_COUNT_ALL && aggType != GroupByMeta.TYPE_GROUP_COUNT_DISTINCT
        && aggType != GroupByMeta.TYPE_GROUP_COUNT_ANY ) ) {
        vMeta.setLength( subjMeta.getLength(), subjMeta.getPrecision() );
      }
      data.agg[ i ] = v;
      data.aggMeta.addValueMeta( vMeta );
    }

    // Also clear the cumulative data...
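    // previousSums, previousAvgSum and previousAvgCount hold the per-group running state for the
    // cumulative sum / cumulative average columns, so they are reset whenever a new aggregate is
    // started.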
    //
    for ( int i = 0; i < data.previousSums.length; i++ ) {
      data.previousSums[ i ] = null;
    }
    for ( int i = 0; i < data.previousAvgCount.length; i++ ) {
      data.previousAvgCount[ i ] = 0L;
      data.previousAvgSum[ i ] = null;
    }
  }

  private Object[] buildResult( Object[] r ) throws KettleValueException {
    Object[] result = null;
    if ( r != null || meta.isAlwaysGivingBackOneRow() ) {
      result = RowDataUtil.allocateRowData( data.groupnrs.length );
      if ( r != null ) {
        for ( int i = 0; i < data.groupnrs.length; i++ ) {
          result[ i ] = r[ data.groupnrs[ i ] ];
        }
      }

      result = RowDataUtil.addRowData( result, data.groupnrs.length, getAggregateResult() );
    }

    return result;
  }

  private void initGroupMeta( RowMetaInterface previousRowMeta ) throws KettleValueException {
    data.groupMeta = new RowMeta();
    for ( int i = 0; i < data.groupnrs.length; i++ ) {
      data.groupMeta.addValueMeta( previousRowMeta.getValueMeta( data.groupnrs[ i ] ) );
    }
  }

  /**
   * Used for JUnit tests in GroupByAggregationNullsTest
   *
   * @return
   * @throws KettleValueException
   */
  Object[] getAggregateResult() throws KettleValueException {
    if ( data.subjectnrs == null ) {
      return new Object[ 0 ];
    }

    Object[] result = new Object[ data.subjectnrs.length ];
    for ( int i = 0; i < data.subjectnrs.length; i++ ) {
      Object ag = data.agg[ i ];
      switch ( meta.getAggregateType()[ i ] ) {
        case GroupByMeta.TYPE_GROUP_SUM:
          break;
        case GroupByMeta.TYPE_GROUP_AVERAGE:
          ag = ValueDataUtil.divide( data.aggMeta.getValueMeta( i ), ag,
            new ValueMetaInteger( "c" ), new Long( data.counts[ i ] ) );
          break;
        case GroupByMeta.TYPE_GROUP_MEDIAN:
        case GroupByMeta.TYPE_GROUP_PERCENTILE:
          double percentile = 50.0;
          if ( meta.getAggregateType()[ i ] == GroupByMeta.TYPE_GROUP_PERCENTILE ) {
            percentile = Double.parseDouble( meta.getValueField()[ i ] );
          }
          @SuppressWarnings( "unchecked" )
          List<Double> valuesList = (List<Double>) data.agg[ i ];
          double[] values = new double[ valuesList.size() ];
          for ( int v = 0; v < values.length; v++ ) {
            values[ v ] = valuesList.get( v );
          }
          ag = new Percentile().evaluate( values, percentile );
          break;
        case GroupByMeta.TYPE_GROUP_COUNT_ANY:
        case GroupByMeta.TYPE_GROUP_COUNT_ALL:
          ag = new Long( data.counts[ i ] );
          break;
        case GroupByMeta.TYPE_GROUP_COUNT_DISTINCT:
          break;
        case GroupByMeta.TYPE_GROUP_MIN:
          break;
        case GroupByMeta.TYPE_GROUP_MAX:
          break;
        case GroupByMeta.TYPE_GROUP_STANDARD_DEVIATION:
          if ( ag == null ) {
            // PMD-1037: when all input data is null, ag is null; break here to avoid an NPE when accessing it
            break;
          }
          double sum = (Double) ag / data.counts[ i ];
          ag = Double.valueOf( Math.sqrt( sum ) );
          break;
        case GroupByMeta.TYPE_GROUP_CONCAT_COMMA:
        case GroupByMeta.TYPE_GROUP_CONCAT_STRING:
          ag = ( (StringBuilder) ag ).toString();
          break;
        default:
          break;
      }

      if ( ag == null && allNullsAreZero ) {
        // PDI-10250, 6960: it seems all rows for the min function were null...
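        // (this is the behavior enabled by KETTLE_AGGREGATION_ALL_NULLS_ARE_ZERO=Y: an all-null
        // aggregate is replaced with the type-appropriate zero instead of staying null)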
        // get output subject meta based on original subject meta calculation
        ValueMetaInterface vm = data.aggMeta.getValueMeta( i );
        ag = ValueDataUtil.getZeroForValueMetaType( vm );
      }

      result[ i ] = ag;
    }

    return result;
  }

  // Method is defined as package-protected in order to be accessible by unit tests
  void addToBuffer( Object[] row ) throws KettleFileException {
    data.bufferList.add( row );
    if ( data.bufferList.size() > 5000 && data.rowsOnFile == 0 ) {
      String pathToTmp = environmentSubstitute( getMeta().getDirectory() );
      try {
        File ioFile = new File( pathToTmp );
        if ( !ioFile.exists() ) {
          // try to resolve as Apache VFS file
          pathToTmp = retrieveVfsPath( pathToTmp );
        }
        data.tempFile = File.createTempFile( getMeta().getPrefix(), ".tmp", new File( pathToTmp ) );
        data.fosToTempFile = new FileOutputStream( data.tempFile );
        data.dosToTempFile = new DataOutputStream( data.fosToTempFile );
        data.firstRead = true;
      } catch ( IOException e ) {
        throw new KettleFileException(
          BaseMessages.getString( PKG, "GroupBy.Exception.UnableToCreateTemporaryFile" ), e );
      }
      // OK, save the oldest rows to disk!
      Object[] oldest = data.bufferList.get( 0 );
      data.inputRowMeta.writeData( data.dosToTempFile, oldest );
      data.bufferList.remove( 0 );
      data.rowsOnFile++;
    }
  }

  // Method is defined as package-protected in order to be accessible by unit tests
  String retrieveVfsPath( String pathToTmp ) throws KettleFileException {
    FileObject vfsFile = KettleVFS.getFileObject( pathToTmp );
    String path = vfsFile.getName().getPath();
    return path;
  }

  private Object[] getRowFromBuffer() throws KettleFileException {
    if ( data.rowsOnFile > 0 ) {
      if ( data.firstRead ) {
        // Open the inputstream first...
        try {
          data.fisToTmpFile = new FileInputStream( data.tempFile );
          data.disToTmpFile = new DataInputStream( data.fisToTmpFile );
          data.firstRead = false;
        } catch ( IOException e ) {
          throw new KettleFileException(
            BaseMessages.getString( PKG, "GroupBy.Exception.UnableToReadBackRowFromTemporaryFile" ), e );
        }
      }

      // Read one row from the file!
      Object[] row;
      try {
        row = data.inputRowMeta.readData( data.disToTmpFile );
      } catch ( SocketTimeoutException e ) {
        throw new KettleFileException( e ); // Shouldn't happen on files
      }
      data.rowsOnFile--;

      return row;
    } else {
      if ( data.bufferList.size() > 0 ) {
        Object[] row = data.bufferList.get( 0 );
        data.bufferList.remove( 0 );
        return row;
      } else {
        return null; // Nothing left!
      }
    }
  }

  private void closeOutput() throws KettleFileException {
    try {
      if ( data.dosToTempFile != null ) {
        data.dosToTempFile.close();
        data.dosToTempFile = null;
      }
      if ( data.fosToTempFile != null ) {
        data.fosToTempFile.close();
        data.fosToTempFile = null;
      }
      data.firstRead = true;
    } catch ( IOException e ) {
      throw new KettleFileException(
        BaseMessages.getString( PKG, "GroupBy.Exception.UnableToCloseInputStream", data.tempFile.getPath() ), e );
    }
  }

  private void closeInput() throws KettleFileException {
    try {
      if ( data.fisToTmpFile != null ) {
        data.fisToTmpFile.close();
        data.fisToTmpFile = null;
      }
      if ( data.disToTmpFile != null ) {
        data.disToTmpFile.close();
        data.disToTmpFile = null;
      }
    } catch ( IOException e ) {
      throw new KettleFileException(
        BaseMessages.getString( PKG, "GroupBy.Exception.UnableToCloseInputStream", data.tempFile.getPath() ), e );
    }
  }

  @Override
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (GroupByMeta) smi;
    data = (GroupByData) sdi;

    if ( super.init( smi, sdi ) ) {
      data.bufferList = new ArrayList<>();
      data.rowsOnFile = 0;

      return true;
    }
    return false;
  }

  @Override
  public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
    if ( data.tempFile != null ) {
      try {
        closeInput();
        closeOutput();
      } catch ( KettleFileException e ) {
        log.logError( e.getLocalizedMessage() );
      }

      boolean tempFileDeleted = data.tempFile.delete();

      if ( !tempFileDeleted && log.isDetailed() ) {
        log.logDetailed(
          BaseMessages.getString( PKG, "GroupBy.Exception.UnableToDeleteTemporaryFile", data.tempFile.getPath() ) );
      }
    }

    super.dispose( smi, sdi );
  }

  @Override
  public void batchComplete() throws KettleException {
    handleLastOfGroup();
    data.newBatch = true;
  }

  /**
   * Used for JUnit tests in GroupByAggregationNullsTest
   *
   * @param allNullsAreZero the allNullsAreZero to set
   */
  void setAllNullsAreZero( boolean allNullsAreZero ) {
    this.allNullsAreZero = allNullsAreZero;
  }

  /**
   * Used for JUnit tests in GroupByAggregationNullsTest
   *
   * @param minNullIsValued the minNullIsValued to set
   */
  void setMinNullIsValued( boolean minNullIsValued ) {
    this.minNullIsValued = minNullIsValued;
  }

  public GroupByMeta getMeta() {
    return meta;
  }
}