/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.sort;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.vfs2.FileObject;
import org.apache.commons.vfs2.FileSystemException;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleFileException;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.vfs.KettleVFS;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
/**
* Sort the rows in the input-streams based on certain criteria
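 * <p>
 * Rows are collected in an in-memory buffer. When the buffer reaches the configured sort size, or
 * free memory drops below the configured limit, it is sorted and spilled to an (optionally
 * gzip-compressed) temporary file. Once all input has been read, the sorted runs are merged back
 * together and passed on to the next steps.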
*
* @author Matt
* @since 29-apr-2003
*/
public class SortRows extends BaseStep implements StepInterface {
private static Class<?> PKG = SortRows.class; // for i18n
private SortRowsMeta meta;
private SortRowsData data;
public SortRows( StepMeta stepMeta, StepDataInterface stepDataInterface,
int copyNr, TransMeta transMeta, Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
meta = (SortRowsMeta) getStepMeta().getStepMetaInterface();
data = (SortRowsData) stepDataInterface;
}
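  /**
   * Add a row to the in-memory sort buffer. Binary-storage key fields are first converted to their
   * native types so they can be compared. When the buffer reaches the configured sort size, or free
   * memory drops below the configured limit, the buffered rows are sorted and spilled to disk.
   */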
void addBuffer( RowMetaInterface rowMeta, Object[] r ) throws KettleException {
    // Do we need to convert some keys?
if ( data.convertKeysToNative != null ) {
for ( int i = 0; i < data.convertKeysToNative.length; i++ ) {
int index = data.convertKeysToNative[i];
r[index] = rowMeta.getValueMeta( index ).convertBinaryStringToNativeType( (byte[]) r[index] );
}
}
// Save row
data.buffer.add( r );
// Check the free memory every 1000 rows...
//
data.freeCounter++;
if ( data.sortSize <= 0 && data.freeCounter >= 1000 ) {
data.freeMemoryPct = Const.getPercentageFreeMemory();
data.freeCounter = 0;
      if ( log.isDetailed() ) {
        data.memoryReporting++;
        if ( data.memoryReporting >= 10 ) {
          logDetailed( BaseMessages.getString( PKG, "SortRows.Detailed.AvailableMemory", data.freeMemoryPct ) );
          data.memoryReporting = 0;
        }
      }
}
    // Buffer is full: sort & dump to disk
    boolean doSort = data.buffer.size() == data.sortSize;
    doSort |=
        data.freeMemoryPctLimit > 0 && data.freeMemoryPct < data.freeMemoryPctLimit
            && data.buffer.size() >= data.minSortSize;
    // Time to sort the buffer and write the data to disk...
    if ( doSort ) {
      if ( log.isDebug() ) {
        this.logDebug( BaseMessages.getString( PKG, "SortRows.Debug.StartDumpToDisk", data.freeMemoryPct,
            data.buffer.size() ) );
      }
      sortExternalRows();
    }
}
  /**
   * Sort the rows in the in-memory buffer and dump them to a temporary file on disk, then clear the
   * buffer for the next batch.
   */
  void sortExternalRows() throws KettleException {
    // The buffer was dumped recently and no new rows have arrived since.
    if ( data.buffer.isEmpty() ) {
      return;
    }
    // First sort the rows in the buffer...
quickSort( data.buffer );
// Then write them to disk...
    DataOutputStream dos;
    GZIPOutputStream gzos;
try {
FileObject fileObject =
KettleVFS.createTempFile( meta.getPrefix(), ".tmp", environmentSubstitute( meta.getDirectory() ),
getTransMeta() );
data.files.add( fileObject ); // Remember the files!
OutputStream outputStream = KettleVFS.getOutputStream( fileObject, false );
if ( data.compressFiles ) {
gzos = new GZIPOutputStream( new BufferedOutputStream( outputStream ) );
dos = new DataOutputStream( gzos );
} else {
dos = new DataOutputStream( new BufferedOutputStream( outputStream, 500000 ) );
gzos = null;
}
// Just write the data, nothing else
List<Integer> duplicates = new ArrayList<Integer>();
Object[] previousRow = null;
if ( meta.isOnlyPassingUniqueRows() ) {
int index = 0;
while ( index < data.buffer.size() ) {
Object[] row = data.buffer.get( index );
if ( previousRow != null ) {
int result = data.outputRowMeta.compare( row, previousRow, data.fieldnrs );
if ( result == 0 ) {
duplicates.add( index );
if ( log.isRowLevel() ) {
logRowlevel( BaseMessages.getString( PKG, "SortRows.RowLevel.DuplicateRowRemoved", data.outputRowMeta
.getString( row ) ) );
}
}
}
index++;
previousRow = row;
}
}
// How many records do we have left?
data.bufferSizes.add( data.buffer.size() - duplicates.size() );
int duplicatesIndex = 0;
      for ( int p = 0; p < data.buffer.size(); p++ ) {
boolean skip = false;
if ( duplicatesIndex < duplicates.size() ) {
if ( p == duplicates.get( duplicatesIndex ) ) {
skip = true;
duplicatesIndex++;
}
}
if ( !skip ) {
data.outputRowMeta.writeData( dos, data.buffer.get( p ) );
}
}
      if ( data.sortSize < 0 ) {
        if ( data.buffer.size() > data.minSortSize ) {
          // If we managed to sort this many rows once, we can do it again.
          data.minSortSize = data.buffer.size();
          // Memory usage goes up over time, even with garbage collection:
          // we need pointers, file handles, etc.
          // As such, we're going to lower the min sort size a bit.
          //
          data.minSortSize = (int) Math.round( data.minSortSize * 0.90 );
        }
      }
// Clear the list
data.buffer.clear();
// Close temp-file
dos.close(); // close data stream
if ( gzos != null ) {
gzos.close(); // close gzip stream
}
outputStream.close(); // close file stream
// How much memory do we have left?
//
data.freeMemoryPct = Const.getPercentageFreeMemory();
data.freeCounter = 0;
if ( data.sortSize <= 0 ) {
if ( log.isDetailed() ) {
logDetailed( BaseMessages.getString( PKG, "SortRows.Detailed.AvailableMemory", data.freeMemoryPct ) );
}
}
} catch ( Exception e ) {
throw new KettleException( "Error processing temp-file!", e );
}
data.getBufferIndex = 0;
}
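  // Wrap the GZIP stream in a DataInputStream, and remember the GZIP stream so it can be closed later.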
private DataInputStream getDataInputStream( GZIPInputStream gzipInputStream ) {
DataInputStream result = new DataInputStream( gzipInputStream );
data.gzis.add( gzipInputStream );
return result;
}
  /**
   * Get the sorted rows iteratively: on the first call all temp files are opened, and subsequent
   * calls keep returning rows until every temp file has been read to the end.
   */
Object[] getBuffer() throws KettleValueException {
Object[] retval;
// Open all files at once and read one row from each file...
if ( data.files.size() > 0 && ( data.dis.size() == 0 || data.fis.size() == 0 ) ) {
if ( log.isBasic() ) {
logBasic( BaseMessages.getString( PKG, "SortRows.Basic.OpeningTempFiles", data.files.size() ) );
}
try {
for ( int f = 0; f < data.files.size() && !isStopped(); f++ ) {
FileObject fileObject = data.files.get( f );
String filename = KettleVFS.getFilename( fileObject );
if ( log.isDetailed() ) {
logDetailed( BaseMessages.getString( PKG, "SortRows.Detailed.OpeningTempFile", filename ) );
}
InputStream fi = KettleVFS.getInputStream( fileObject );
DataInputStream di;
data.fis.add( fi );
if ( data.compressFiles ) {
di = getDataInputStream( new GZIPInputStream( new BufferedInputStream( fi ) ) );
} else {
di = new DataInputStream( new BufferedInputStream( fi, 50000 ) );
}
data.dis.add( di );
          // How many rows does this temp file contain?
int buffersize = data.bufferSizes.get( f );
if ( log.isDetailed() ) {
logDetailed( BaseMessages.getString( PKG, "SortRows.Detailed.FromFileExpectingRows",
filename, buffersize ) );
}
if ( buffersize > 0 ) {
Object[] row = data.outputRowMeta.readData( di );
data.rowbuffer.add( row ); // new row from input stream
data.tempRows.add( new RowTempFile( row, f ) );
}
}
// Sort the data row buffer
Collections.sort( data.tempRows, data.comparator );
} catch ( Exception e ) {
logError( BaseMessages.getString( PKG, "SortRows.Error.ErrorReadingBackTempFiles" ), e );
}
}
if ( data.files.size() == 0 ) {
// read from in-memory processing
if ( data.getBufferIndex < data.buffer.size() ) {
retval = data.buffer.get( data.getBufferIndex );
data.getBufferIndex++;
} else {
retval = null;
}
} else {
// read from disk processing
if ( data.rowbuffer.size() == 0 ) {
retval = null;
} else {
        // We now have one row per open temp file waiting: which one is the smallest?
//
if ( log.isRowLevel() ) {
for ( int i = 0; i < data.rowbuffer.size() && !isStopped(); i++ ) {
Object[] b = data.rowbuffer.get( i );
logRowlevel( BaseMessages
.getString( PKG, "SortRows.RowLevel.PrintRow", i, data.outputRowMeta.getString( b ) ) );
}
}
RowTempFile rowTempFile = data.tempRows.remove( 0 );
retval = rowTempFile.row;
int smallest = rowTempFile.fileNumber;
        // Read the next row from the temp file that the smallest row came from.
FileObject file = data.files.get( smallest );
DataInputStream di = data.dis.get( smallest );
InputStream fi = data.fis.get( smallest );
try {
Object[] row2 = data.outputRowMeta.readData( di );
RowTempFile extra = new RowTempFile( row2, smallest );
          // Keep tempRows sorted: binarySearch returns (-(insertion point) - 1) when the row is not found.
          int index = Collections.binarySearch( data.tempRows, extra, data.comparator );
          if ( index < 0 ) {
            data.tempRows.add( -index - 1, extra );
          } else {
            data.tempRows.add( index, extra );
          }
        } catch ( KettleFileException fe ) { // mostly an empty file or EOF
GZIPInputStream gzfi = ( data.compressFiles ) ? data.gzis.get( smallest ) : null;
try {
di.close();
fi.close();
if ( gzfi != null ) {
gzfi.close();
}
file.delete();
} catch ( IOException e ) {
logError( BaseMessages.getString( PKG, "SortRows.Error.UnableToCloseFile", smallest, file.toString() ) );
setErrors( 1 );
stopAll();
return null;
}
data.files.remove( smallest );
data.dis.remove( smallest );
data.fis.remove( smallest );
if ( gzfi != null ) {
data.gzis.remove( smallest );
}
          // Also update all file numbers in data.tempRows if they are larger
          // than smallest.
//
for ( RowTempFile rtf : data.tempRows ) {
if ( rtf.fileNumber > smallest ) {
rtf.fileNumber--;
}
}
} catch ( SocketTimeoutException e ) {
throw new KettleValueException( e ); // should never happen on local files
}
}
}
return retval;
}
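  /**
   * Read one row from the previous step(s) and add it to the sort buffer. On the first row, the key
   * field indexes, output row metadata and comparators are initialized. When group sort is enabled,
   * the buffer is sorted and flushed every time the group key fields change, so each pre-sorted
   * group is passed on separately.
   */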
@Override
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    // Wait until a row is available from the previous step(s).
Object[] r = getRow();
List<String> groupFields = null;
if ( first ) {
this.first = false;
      // Do we have any rows to process at all?
      if ( r == null ) {
        // It seems we don't, so we're done right away.
this.setOutputDone();
return false;
}
RowMetaInterface inputRowMeta = getInputRowMeta();
// do we have group numbers?
if ( meta.isGroupSortEnabled() ) {
data.newBatch = true;
        // Set the actual group field list instead of leaving it null.
groupFields = meta.getGroupFields();
data.groupnrs = new int[groupFields.size()];
for ( int i = 0; i < groupFields.size(); i++ ) {
data.groupnrs[i] = inputRowMeta.indexOfValue( groupFields.get( i ) );
if ( data.groupnrs[i] < 0 ) {
logError( BaseMessages.getString( PKG, "SortRows.Error.PresortedFieldNotFound", groupFields.get( i ) ) );
setErrors( 1 );
stopAll();
return false;
}
}
}
String[] fieldNames = meta.getFieldName();
data.fieldnrs = new int[fieldNames.length];
List<Integer> toConvert = new ArrayList<Integer>();
// Metadata
data.outputRowMeta = inputRowMeta.clone();
meta.getFields( data.outputRowMeta, getStepname(), null, null, this, repository, metaStore );
      data.comparator = new RowTempFileComparator( data.outputRowMeta, data.fieldnrs );
for ( int i = 0; i < fieldNames.length; i++ ) {
data.fieldnrs[i] = inputRowMeta.indexOfValue( fieldNames[i] );
if ( data.fieldnrs[i] < 0 ) {
throw new KettleException( BaseMessages.getString( PKG, "SortRowsMeta.CheckResult.StepFieldNotInInputStream",
meta.getFieldName()[i], getStepname() ) );
}
// do we need binary conversion for this type?
if ( inputRowMeta.getValueMeta( data.fieldnrs[i] ).isStorageBinaryString() ) {
toConvert.add( data.fieldnrs[i] );
}
}
data.convertKeysToNative = toConvert.isEmpty() ? null : new int[toConvert.size()];
int i = 0;
for ( Integer in : toConvert ) {
data.convertKeysToNative[i] = in;
i++;
}
data.rowComparator = new RowObjectArrayComparator( data.outputRowMeta, data.fieldnrs );
} // end if first
    // Not the first row, but it is null: end of input, flush and finish.
if ( r == null ) {
// flush result and set output done.
this.preSortBeforeFlush();
this.passBuffer();
this.setOutputDone();
return false;
}
// if Group Sort is not enabled then do the normal sort.
if ( !meta.isGroupSortEnabled() ) {
this.addBuffer( getInputRowMeta(), r );
} else {
// Otherwise do grouping sort
if ( data.newBatch ) {
data.newBatch = false;
setPrevious( r );
        // This lets the sort logic initialize its state.
this.addBuffer( getInputRowMeta(), r );
} else {
if ( this.sameGroup( data.previous, r ) ) {
          // setPrevious( r ); // no need to set it on every row
// this performs SortRows normal row collection functionality.
this.addBuffer( getInputRowMeta(), r );
} else {
this.preSortBeforeFlush();
// flush sorted block to next step:
this.passBuffer();
// new sorted block beginning
setPrevious( r );
data.newBatch = true;
this.addBuffer( getInputRowMeta(), r );
}
}
}
if ( checkFeedback( getLinesRead() ) ) {
if ( log.isBasic() ) {
logBasic( "Linenr " + getLinesRead() );
}
}
return true;
}
  /**
   * This method passes all rows in the buffer to the next steps. Usually a call to this method indicates that this
   * particular step is finishing its processing.
   */
void passBuffer() throws KettleException {
// Now we can start the output!
//
Object[] r = getBuffer();
Object[] previousRow = null;
    // Log the external merge; this is expected to be a time-consuming operation.
if ( log.isDebug() && !data.files.isEmpty() ) {
this.logDebug( BaseMessages.getString( PKG, "SortRows.Debug.ExternalMergeStarted" ) );
}
while ( r != null && !isStopped() ) {
if ( log.isRowLevel() ) {
logRowlevel( BaseMessages.getString( PKG, "SortRows.RowLevel.ReadRow", getInputRowMeta().getString( r ) ) );
}
// Do another verification pass for unique rows...
//
if ( meta.isOnlyPassingUniqueRows() ) {
if ( previousRow != null ) {
// See if this row is the same as the previous one as far as the keys
// are concerned.
// If so, we don't put forward this row.
int result = data.outputRowMeta.compare( r, previousRow, data.fieldnrs );
if ( result != 0 ) {
putRow( data.outputRowMeta, r ); // copy row to possible alternate
// rowset(s).
}
} else {
putRow( data.outputRowMeta, r ); // copy row to next steps
}
previousRow = r;
} else {
putRow( data.outputRowMeta, r ); // copy row to possible alternate
// rowset(s).
}
r = getBuffer();
}
if ( log.isDebug() && !data.files.isEmpty() ) {
this.logDebug( BaseMessages.getString( PKG, "SortRows.Debug.ExternalMergeFinished" ) );
}
// Clear out the buffer for the next batch
//
clearBuffers();
}
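  /**
   * Initialize the step: resolve the sort size and the free-memory limit from the step settings.
   * When neither is set, a free-memory limit of 25% is used as a safe default.
   */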
@Override
public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
meta = (SortRowsMeta) smi;
data = (SortRowsData) sdi;
if ( !super.init( smi, sdi ) ) {
return false;
}
data.sortSize = Const.toInt( environmentSubstitute( meta.getSortSize() ), -1 );
data.freeMemoryPctLimit = Const.toInt( meta.getFreeMemoryLimit(), -1 );
if ( data.sortSize <= 0 && data.freeMemoryPctLimit <= 0 ) {
// Prefer the memory limit as it should never fail
//
data.freeMemoryPctLimit = 25;
}
// In memory buffer
//
data.buffer = new ArrayList<Object[]>( 5000 );
// Buffer for reading from disk
//
data.rowbuffer = new ArrayList<Object[]>( 5000 );
data.compressFiles = getBooleanValueOfVariable( meta.getCompressFilesVariable(), meta.getCompressFiles() );
data.tempRows = new ArrayList<RowTempFile>();
data.minSortSize = 5000;
return true;
}
@Override
public void dispose( StepMetaInterface smi, StepDataInterface sdi ) {
clearBuffers();
super.dispose( smi, sdi );
}
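  /**
   * Clear the in-memory and on-disk buffers: close any open temp-file streams and delete the
   * temporary files themselves.
   */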
private void clearBuffers() {
// Clean out the sort buffer
data.buffer.clear();
data.getBufferIndex = 0;
data.rowbuffer.clear();
// close any open DataInputStream objects
if ( ( data.dis != null ) && ( data.dis.size() > 0 ) ) {
for ( DataInputStream dis : data.dis ) {
BaseStep.closeQuietly( dis );
}
}
// close any open InputStream objects
if ( ( data.fis != null ) && ( data.fis.size() > 0 ) ) {
for ( InputStream is : data.fis ) {
BaseStep.closeQuietly( is );
}
}
// remove temp files
for ( int f = 0; f < data.files.size(); f++ ) {
FileObject fileToDelete = data.files.get( f );
try {
if ( fileToDelete != null && fileToDelete.exists() ) {
fileToDelete.delete();
}
} catch ( FileSystemException e ) {
logError( e.getLocalizedMessage(), e );
}
}
}
  /**
   * Sort the entire list of buffered rows, if it is not empty.
   */
void quickSort( List<Object[]> elements ) throws KettleException {
if ( elements.size() > 0 ) {
Collections.sort( elements, data.rowComparator );
long nrConversions = 0L;
for ( ValueMetaInterface valueMeta : data.outputRowMeta.getValueMetaList() ) {
nrConversions += valueMeta.getNumberOfBinaryStringConversions();
valueMeta.setNumberOfBinaryStringConversions( 0L );
}
if ( log.isDetailed() ) {
logDetailed( BaseMessages.getString( PKG, "SortRows.Detailed.ReportNumberOfBinaryStringConv", nrConversions ) );
}
}
}
  /**
   * Calling this method signals to the step that all records have been passed to it. Specifically for steps like
   * "Sort Rows" it means that the buffered rows can be sorted and passed on.
   */
@Override
public void batchComplete() throws KettleException {
preSortBeforeFlush();
passBuffer();
setOutputDone();
}
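  /**
   * Prepare the buffered rows for flushing: if sorted runs were already spilled to disk, dump the
   * final in-memory batch to disk as well so that getBuffer() can merge all runs; otherwise a plain
   * in-memory sort is enough.
   */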
private void preSortBeforeFlush() throws KettleException {
if ( data.files.size() > 0 ) {
      // Dump the last batch to disk too, then read everything back from disk.
sortExternalRows();
} else {
// sort in memory
quickSort( data.buffer );
}
}
  /*
   * Group Fields Implementation
   */
// Is the row r of the same group as previous?
private boolean sameGroup( Object[] previous, Object[] r ) throws KettleValueException {
if ( r == null ) {
return false;
}
return getInputRowMeta().compare( previous, r, data.groupnrs ) == 0;
}
private void setPrevious( Object[] r ) throws KettleException {
if ( r != null ) {
this.data.previous = getInputRowMeta().cloneRow( r );
}
}
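  /**
   * Base class for the comparators below: holds the row metadata and the indexes of the key fields
   * to compare on.
   */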
private class SortRowsComparator {
protected RowMetaInterface rowMeta;
protected int[] fieldNrs;
SortRowsComparator( RowMetaInterface rowMeta, int[] fieldNrs ) {
this.rowMeta = rowMeta;
this.fieldNrs = fieldNrs;
}
}
  private class RowTempFileComparator extends SortRowsComparator implements Comparator<RowTempFile> {
    RowTempFileComparator( RowMetaInterface rowMeta, int[] fieldNrs ) {
super( rowMeta, fieldNrs );
}
@Override
public int compare( RowTempFile o1, RowTempFile o2 ) {
try {
return rowMeta.compare( o1.row, o2.row, fieldNrs );
} catch ( KettleValueException e ) {
logError( "Error comparing rows: " + e.toString() );
return 0;
}
}
}
private class RowObjectArrayComparator extends SortRowsComparator implements Comparator<Object[]> {
RowObjectArrayComparator( RowMetaInterface rowMeta, int[] fieldNrs ) {
super( rowMeta, fieldNrs );
}
@Override
public int compare( Object[] o1, Object[] o2 ) {
try {
return rowMeta.compare( o1, o2, fieldNrs );
} catch ( KettleValueException e ) {
logError( "Error comparing rows: " + e.toString() );
return 0;
}
}
}
}