/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.reservoirsampling;
import java.util.Arrays;
import java.util.List;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.steps.reservoirsampling.ReservoirSamplingData.PROC_MODE;
public class ReservoirSampling extends BaseStep implements StepInterface {
private ReservoirSamplingMeta m_meta;
private ReservoirSamplingData m_data;
/**
* Creates a new <code>ReservoirSampling</code> instance.
* <p>
*
* Implements the reservoir sampling algorithm "R" by Jeffrey Scott Vitter. (algorithm is implemented in
* ReservoirSamplingData.java
* <p>
* For more information see:<br>
* <br>
*
* Vitter, J. S. Random Sampling with a Reservoir. ACM Transactions on Mathematical Software, Vol. 11, No. 1, March
* 1985. Pages 37-57.
*
* @param stepMeta
* holds the step's meta data
* @param stepDataInterface
* holds the step's temporary data
* @param copyNr
* the number assigned to the step
* @param transMeta
* meta data for the transformation
* @param trans
* a <code>Trans</code> value
*/
public ReservoirSampling( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr,
TransMeta transMeta, Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
}
/**
* Process an incoming row of data.
*
* @param smi
* a <code>StepMetaInterface</code> value
* @param sdi
* a <code>StepDataInterface</code> value
* @return a <code>boolean</code> value
* @exception KettleException
* if an error occurs
*/
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
if ( m_data.getProcessingMode() == PROC_MODE.DISABLED ) {
setOutputDone();
m_data.cleanUp();
return ( false );
}
m_meta = (ReservoirSamplingMeta) smi;
m_data = (ReservoirSamplingData) sdi;
Object[] r = getRow();
// Handle the first row
if ( first ) {
first = false;
if ( r == null ) { // no input to be expected...
setOutputDone();
return false;
}
// Initialize the data object
m_data.setOutputRowMeta( getInputRowMeta().clone() );
String sampleSize = getTransMeta().environmentSubstitute( m_meta.getSampleSize() );
String seed = getTransMeta().environmentSubstitute( m_meta.getSeed() );
m_data.initialize( Integer.valueOf( sampleSize ), Integer.valueOf( seed ) );
// no real reason to determine the output fields here
// as we don't add/delete any fields
} // end (if first)
if ( m_data.getProcessingMode() == PROC_MODE.PASSTHROUGH ) {
if ( r == null ) {
setOutputDone();
m_data.cleanUp();
return ( false );
}
putRow( m_data.getOutputRowMeta(), r );
} else if ( m_data.getProcessingMode() == PROC_MODE.SAMPLING ) {
if ( r == null ) {
// Output the rows in the sample
List<Object[]> samples = m_data.getSample();
int numRows = ( samples != null ) ? samples.size() : 0;
logBasic( this.getStepname()
+ " Actual/Sample: " + numRows + "/" + m_data.m_k + " Seed:"
+ getTransMeta().environmentSubstitute( m_meta.m_randomSeed ) );
if ( samples != null ) {
for ( int i = 0; i < samples.size(); i++ ) {
Object[] sample = samples.get( i );
if ( sample != null ) {
putRow( m_data.getOutputRowMeta(), sample );
} else {
// user probably requested more rows in
// the sample than there were in total
// in the end. Just break in this case
break;
}
}
}
setOutputDone();
m_data.cleanUp();
return false;
}
// just pass the row to the data class for possible caching
// in the sample
m_data.processRow( r );
}
if ( log.isRowLevel() ) {
logRowlevel( "Read row #" + getLinesRead() + " : " + r );
}
if ( checkFeedback( getLinesRead() ) ) {
logBasic( "Line number " + getLinesRead() );
}
return true;
}
/**
* Initialize the step.
*
* @param smi
* a <code>StepMetaInterface</code> value
* @param sdi
* a <code>StepDataInterface</code> value
* @return a <code>boolean</code> value
*/
public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
m_meta = (ReservoirSamplingMeta) smi;
m_data = (ReservoirSamplingData) sdi;
if ( super.init( smi, sdi ) ) {
boolean remoteInput = getStepMeta().getRemoteInputSteps().size() > 0;
List<StepMeta> previous = getTransMeta().findPreviousSteps( getStepMeta() );
if ( !remoteInput && ( previous == null || previous.size() <= 0 ) ) {
m_data.setProcessingMode( PROC_MODE.DISABLED );
}
return true;
}
return false;
}
/**
* Run is where the action happens!
*/
public void run() {
logBasic( "Starting to run..." );
try {
// Wait
while ( processRow( m_meta, m_data ) ) {
if ( isStopped() ) {
break;
}
}
} catch ( Exception e ) {
logError( "Unexpected error : " + e.toString() );
logError( Const.getStackTracker( e ) );
setErrors( 1 );
stopAll();
} finally {
dispose( m_meta, m_data );
logBasic( "Finished, processing " + getLinesRead() + " rows" );
markStop();
}
}
}