/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.reservoirsampling;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.step.BaseStepData;
import org.pentaho.di.trans.step.StepDataInterface;
/**
 * Holds temporary data (i.e. sampled rows). Implements the reservoir sampling algorithm "R" by Jeffrey Scott Vitter.
 * <p>
 * The first <code>k</code> rows fill the reservoir; every later row replaces a randomly chosen reservoir slot with
 * probability <code>k / t</code>, where <code>t</code> is the one-based number of that row in the stream.
 * <p>
 * For more information see:<br>
 * <br>
 *
 * Vitter, J. S. Random Sampling with a Reservoir. ACM Transactions on Mathematical Software, Vol. 11, No. 1, March
 * 1985. Pages 37-57.
 *
 * @author Mark Hall (mhall{[at]}pentaho.org)
 * @version 1.0
 */
public class ReservoirSamplingData extends BaseStepData implements StepDataInterface {

  // the output data format
  protected RowMetaInterface m_outputRowMeta;

  // holds the sampled rows; null until initialize() is called and again after cleanUp()
  protected List<Object[]> m_sample = null;

  // the size of the sample
  protected int m_k;

  // the zero-based index of the next incoming row (i.e. the number of rows processed so far)
  protected int m_currentRow;

  // random number generator
  protected Random m_random;

  // state of processing
  protected PROC_MODE m_state;

  /**
   * Operational states of the step. {@link #initialize(int, int)} selects <code>SAMPLING</code> for a positive sample
   * size, <code>PASSTHROUGH</code> for a sample size of zero and <code>DISABLED</code> for a negative sample size.
   */
  public enum PROC_MODE {
    SAMPLING, PASSTHROUGH, DISABLED
  }

  /**
   * Set the meta data for the output format
   *
   * @param rmi
   *          a <code>RowMetaInterface</code> value
   */
  public void setOutputRowMeta( RowMetaInterface rmi ) {
    m_outputRowMeta = rmi;
  }

  /**
   * Get the output meta data
   *
   * @return a <code>RowMetaInterface</code> value
   */
  public RowMetaInterface getOutputRowMeta() {
    return m_outputRowMeta;
  }

  /**
   * Gets the sample as a list of rows. The returned list is the internal reservoir itself (not a copy) and holds only
   * the slots filled so far; it is never padded beyond the highest slot written.
   *
   * @return the sampled rows, or <code>null</code> before {@link #initialize(int, int)} or after {@link #cleanUp()}
   */
  public List<Object[]> getSample() {
    return m_sample;
  }

  /**
   * Initialize this data object. Resets the row counter, allocates a fresh reservoir, re-seeds the random number
   * generator and derives the processing mode from the sample size.
   *
   * @param sampleSize
   *          the number of rows to sample; zero selects pass-through mode, a negative value disables the step
   * @param seed
   *          the seed for the random number generator
   */
  public void initialize( int sampleSize, int seed ) {
    m_k = sampleSize;

    if ( m_k == 0 ) {
      m_state = PROC_MODE.PASSTHROUGH;
    } else if ( m_k < 0 ) {
      m_state = PROC_MODE.DISABLED;
    } else {
      m_state = PROC_MODE.SAMPLING;
    }

    // pre-size the reservoir when a real sample is requested
    m_sample = ( m_k > 0 ) ? new ArrayList<Object[]>( m_k ) : new ArrayList<Object[]>();
    m_currentRow = 0;
    m_random = new Random( seed );

    // throw away the first 100 random numbers
    for ( int i = 0; i < 100; i++ ) {
      m_random.nextDouble();
    }
  }

  /**
   * Determine the current operational state of the Reservoir Sampling step. Sampling, PassThrough(Do not wait until
   * end, pass through on the fly), Disabled.
   *
   * @return current operational state
   */
  public PROC_MODE getProcessingMode() {
    return m_state;
  }

  /**
   * Set this component to sample, pass through or be disabled
   *
   * @param state
   *          member of PROC_MODE enumeration indicating the desired operational state
   */
  public void setProcessingMode( PROC_MODE state ) {
    this.m_state = state;
  }

  /**
   * Here is where the action happens. Sampling is done using the "R" algorithm of Jeffrey Scott Vitter.
   * <p>
   * When the sample size is not positive the row is only counted and nothing is stored.
   *
   * @param row
   *          an incoming row
   */
  public void processRow( Object[] row ) {
    if ( m_currentRow < m_k ) {
      // Fill sample size with first available data
      setElement( m_sample, m_currentRow, row );
    } else if ( m_k > 0 ) {
      // m_currentRow is zero-based, so the incoming row is record number (m_currentRow + 1). Algorithm R keeps
      // record t with probability k / t; dividing by m_currentRow instead would always accept the first row after
      // the reservoir fills. The addition is done in double arithmetic so it cannot overflow.
      double acceptance = (double) m_k / ( (double) m_currentRow + 1.0 );
      double r = m_random.nextDouble();
      if ( r < acceptance ) {
        // Replace a uniformly chosen position within the sample
        r = m_random.nextDouble();
        int replace = (int) ( m_k * r );
        setElement( m_sample, replace, row );
      }
    }

    // Saturate instead of wrapping: a negative counter would re-enter the fill branch with a negative index.
    if ( m_currentRow < Integer.MAX_VALUE ) {
      m_currentRow++;
    }
  }

  /**
   * Stores <code>item</code> at position <code>idx</code>, growing the list only as far as needed: missing slots
   * below <code>idx</code> are filled with <code>null</code> and the item is appended when <code>idx</code> is the
   * next free position.
   *
   * @param list
   *          the reservoir to write to
   * @param idx
   *          the zero-based slot to write
   * @param item
   *          the row to store
   */
  private void setElement( List<Object[]> list, int idx, Object[] item ) {
    while ( list.size() < idx ) {
      list.add( null );
    }
    if ( idx == list.size() ) {
      list.add( item );
    } else {
      list.set( idx, item );
    }
  }

  /**
   * Drops the reference to the sampled rows. {@link #getSample()} returns <code>null</code> afterwards until
   * {@link #initialize(int, int)} is called again.
   */
  public void cleanUp() {
    m_sample = null;
  }
}