/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.trans.steps.mergejoin;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.step.errorhandling.StreamInterface;
/**
* Merge rows from 2 sorted streams and output joined rows with matched key fields. Use this instead of hash join is
* both your input streams are too big to fit in memory. Note that both the inputs must be sorted on the join key.
*
* This is a first prototype implementation that only handles two streams and inner join. It also always outputs all
* values from both streams. Ideally, we should: 1) Support any number of incoming streams 2) Allow user to choose the
* join type (inner, outer) for each stream 3) Allow user to choose which fields to push to next step 4) Have multiple
* output ports as follows: a) Containing matched records b) Unmatched records for each input port 5) Support incoming
* rows to be sorted either on ascending or descending order. The currently implementation only supports ascending
*
* @author Biswapesh
* @since 24-nov-2006
*/
public class MergeJoin extends BaseStep implements StepInterface {
private static Class<?> PKG = MergeJoinMeta.class; // for i18n purposes, needed by Translator2!!
private MergeJoinMeta meta;
private MergeJoinData data;
public MergeJoin( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
Trans trans ) {
super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
}
public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
meta = (MergeJoinMeta) smi;
data = (MergeJoinData) sdi;
int compare;
if ( first ) {
first = false;
// Find the RowSet to read from
//
List<StreamInterface> infoStreams = meta.getStepIOMeta().getInfoStreams();
data.oneRowSet = findInputRowSet( infoStreams.get( 0 ).getStepname() );
if ( data.oneRowSet == null ) {
throw new KettleException( BaseMessages.getString(
PKG, "MergeJoin.Exception.UnableToFindSpecifiedStep", infoStreams.get( 0 ).getStepname() ) );
}
data.twoRowSet = findInputRowSet( infoStreams.get( 1 ).getStepname() );
if ( data.twoRowSet == null ) {
throw new KettleException( BaseMessages.getString(
PKG, "MergeJoin.Exception.UnableToFindSpecifiedStep", infoStreams.get( 1 ).getStepname() ) );
}
data.one = getRowFrom( data.oneRowSet );
if ( data.one != null ) {
data.oneMeta = data.oneRowSet.getRowMeta();
} else {
data.one = null;
data.oneMeta = getTransMeta().getStepFields( infoStreams.get( 0 ).getStepname() );
}
data.two = getRowFrom( data.twoRowSet );
if ( data.two != null ) {
data.twoMeta = data.twoRowSet.getRowMeta();
} else {
data.two = null;
data.twoMeta = getTransMeta().getStepFields( infoStreams.get( 1 ).getStepname() );
}
// just for speed: oneMeta+twoMeta
//
data.outputRowMeta = new RowMeta();
data.outputRowMeta.mergeRowMeta( data.oneMeta.clone() );
data.outputRowMeta.mergeRowMeta( data.twoMeta.clone() );
if ( data.one != null ) {
// Find the key indexes:
data.keyNrs1 = new int[meta.getKeyFields1().length];
for ( int i = 0; i < data.keyNrs1.length; i++ ) {
data.keyNrs1[i] = data.oneMeta.indexOfValue( meta.getKeyFields1()[i] );
if ( data.keyNrs1[i] < 0 ) {
String message =
BaseMessages.getString( PKG, "MergeJoin.Exception.UnableToFindFieldInReferenceStream", meta
.getKeyFields1()[i] );
logError( message );
throw new KettleStepException( message );
}
}
}
if ( data.two != null ) {
// Find the key indexes:
data.keyNrs2 = new int[meta.getKeyFields2().length];
for ( int i = 0; i < data.keyNrs2.length; i++ ) {
data.keyNrs2[i] = data.twoMeta.indexOfValue( meta.getKeyFields2()[i] );
if ( data.keyNrs2[i] < 0 ) {
String message =
BaseMessages.getString( PKG, "MergeJoin.Exception.UnableToFindFieldInReferenceStream", meta
.getKeyFields2()[i] );
logError( message );
throw new KettleStepException( message );
}
}
}
// Calculate one_dummy... defaults to null
data.one_dummy = RowDataUtil.allocateRowData( data.oneMeta.size() + data.twoMeta.size() );
// Calculate two_dummy... defaults to null
//
data.two_dummy = new Object[data.twoMeta.size()];
}
if ( log.isRowLevel() ) {
logRowlevel( BaseMessages.getString( PKG, "MergeJoin.Log.DataInfo", data.oneMeta.getString( data.one ) + "" )
+ data.twoMeta.getString( data.two ) );
}
/*
* We can stop processing if any of the following is true: a) Both streams are empty b) First stream is empty and
* join type is INNER or LEFT OUTER c) Second stream is empty and join type is INNER or RIGHT OUTER
*/
if ( ( data.one == null && data.two == null )
|| ( data.one == null && data.one_optional == false ) || ( data.two == null && data.two_optional == false ) ) {
// Before we stop processing, we have to make sure that all rows from both input streams are depleted!
// If we don't do this, the transformation can stall.
//
while ( data.one != null && !isStopped() ) {
data.one = getRowFrom( data.oneRowSet );
}
while ( data.two != null && !isStopped() ) {
data.two = getRowFrom( data.twoRowSet );
}
setOutputDone();
return false;
}
if ( data.one == null ) {
compare = -1;
} else {
if ( data.two == null ) {
compare = 1;
} else {
int cmp = data.oneMeta.compare( data.one, data.twoMeta, data.two, data.keyNrs1, data.keyNrs2 );
compare = cmp > 0 ? 1 : cmp < 0 ? -1 : 0;
}
}
switch ( compare ) {
case 0:
/*
* We've got a match. This is what we do next (to handle duplicate keys correctly): Read the next record from
* both streams If any of the keys match, this means we have duplicates. We therefore Create an array of all
* rows that have the same keys Push a Cartesian product of the two arrays to output Else Just push the combined
* rowset to output
*/
data.one_next = getRowFrom( data.oneRowSet );
data.two_next = getRowFrom( data.twoRowSet );
int compare1 =
( data.one_next == null ) ? -1 : data.oneMeta.compare(
data.one, data.one_next, data.keyNrs1, data.keyNrs1 );
int compare2 =
( data.two_next == null ) ? -1 : data.twoMeta.compare(
data.two, data.two_next, data.keyNrs2, data.keyNrs2 );
if ( compare1 == 0 || compare2 == 0 ) { // Duplicate keys
if ( data.ones == null ) {
data.ones = new ArrayList<Object[]>();
} else {
data.ones.clear();
}
if ( data.twos == null ) {
data.twos = new ArrayList<Object[]>();
} else {
data.twos.clear();
}
data.ones.add( data.one );
if ( compare1 == 0 ) {
// First stream has duplicates
data.ones.add( data.one_next );
for ( ; !isStopped(); ) {
data.one_next = getRowFrom( data.oneRowSet );
if ( 0 != ( ( data.one_next == null ) ? -1 : data.oneMeta.compare(
data.one, data.one_next, data.keyNrs1, data.keyNrs1 ) ) ) {
break;
}
data.ones.add( data.one_next );
}
if ( isStopped() ) {
return false;
}
}
data.twos.add( data.two );
if ( compare2 == 0 ) { // Second stream has duplicates
data.twos.add( data.two_next );
for ( ; !isStopped(); ) {
data.two_next = getRowFrom( data.twoRowSet );
if ( 0 != ( ( data.two_next == null ) ? -1 : data.twoMeta.compare(
data.two, data.two_next, data.keyNrs2, data.keyNrs2 ) ) ) {
break;
}
data.twos.add( data.two_next );
}
if ( isStopped() ) {
return false;
}
}
for ( Iterator<Object[]> oneIter = data.ones.iterator(); oneIter.hasNext() && !isStopped(); ) {
Object[] one = oneIter.next();
for ( Iterator<Object[]> twoIter = data.twos.iterator(); twoIter.hasNext() && !isStopped(); ) {
Object[] two = twoIter.next();
Object[] oneBig = RowDataUtil.createResizedCopy( one, data.oneMeta.size() + data.twoMeta.size() );
Object[] combi = RowDataUtil.addRowData( oneBig, data.oneMeta.size(), two );
putRow( data.outputRowMeta, combi );
}
// Remove the rows as we merge them to keep the overall memory footprint minimal
oneIter.remove();
}
data.twos.clear();
} else {
// No duplicates
Object[] outputRowData = RowDataUtil.addRowData( data.one, data.oneMeta.size(), data.two );
putRow( data.outputRowMeta, outputRowData );
}
data.one = data.one_next;
data.two = data.two_next;
break;
case 1:
// if (log.isDebug()) logDebug("First stream has missing key");
/*
* First stream is greater than the second stream. This means: a) This key is missing in the first stream b)
* Second stream may have finished So, if full/right outer join is set and 2nd stream is not null, we push a
* record to output with only the values for the second row populated. Next, if 2nd stream is not finished, we
* get a row from it; otherwise signal that we are done
*/
if ( data.one_optional == true ) {
if ( data.two != null ) {
Object[] outputRowData = RowDataUtil.createResizedCopy( data.one_dummy, data.outputRowMeta.size() );
outputRowData = RowDataUtil.addRowData( outputRowData, data.oneMeta.size(), data.two );
putRow( data.outputRowMeta, outputRowData );
data.two = getRowFrom( data.twoRowSet );
} else if ( data.two_optional == false ) {
/*
* If we are doing right outer join then we are done since there are no more rows in the second set
*/
// Before we stop processing, we have to make sure that all rows from both input streams are depleted!
// If we don't do this, the transformation can stall.
//
while ( data.one != null && !isStopped() ) {
data.one = getRowFrom( data.oneRowSet );
}
while ( data.two != null && !isStopped() ) {
data.two = getRowFrom( data.twoRowSet );
}
setOutputDone();
return false;
} else {
/*
* We are doing full outer join so print the 1st stream and get the next row from 1st stream
*/
Object[] outputRowData = RowDataUtil.createResizedCopy( data.one, data.outputRowMeta.size() );
outputRowData = RowDataUtil.addRowData( outputRowData, data.oneMeta.size(), data.two_dummy );
putRow( data.outputRowMeta, outputRowData );
data.one = getRowFrom( data.oneRowSet );
}
} else if ( data.two == null && data.two_optional == true ) {
/**
* We have reached the end of stream 2 and there are records present in the first stream. Also, join is left
* or full outer. So, create a row with just the values in the first stream and push it forward
*/
Object[] outputRowData = RowDataUtil.createResizedCopy( data.one, data.outputRowMeta.size() );
outputRowData = RowDataUtil.addRowData( outputRowData, data.oneMeta.size(), data.two_dummy );
putRow( data.outputRowMeta, outputRowData );
data.one = getRowFrom( data.oneRowSet );
} else if ( data.two != null ) {
/*
* We are doing an inner or left outer join, so throw this row away from the 2nd stream
*/
data.two = getRowFrom( data.twoRowSet );
}
break;
case -1:
// if (log.isDebug()) logDebug("Second stream has missing key");
/*
* Second stream is greater than the first stream. This means: a) This key is missing in the second stream b)
* First stream may have finished So, if full/left outer join is set and 1st stream is not null, we push a
* record to output with only the values for the first row populated. Next, if 1st stream is not finished, we
* get a row from it; otherwise signal that we are done
*/
if ( data.two_optional == true ) {
if ( data.one != null ) {
Object[] outputRowData = RowDataUtil.createResizedCopy( data.one, data.outputRowMeta.size() );
outputRowData = RowDataUtil.addRowData( outputRowData, data.oneMeta.size(), data.two_dummy );
putRow( data.outputRowMeta, outputRowData );
data.one = getRowFrom( data.oneRowSet );
} else if ( data.one_optional == false ) {
/*
* We are doing a left outer join and there are no more rows in the first stream; so we are done
*/
// Before we stop processing, we have to make sure that all rows from both input streams are depleted!
// If we don't do this, the transformation can stall.
//
while ( data.one != null && !isStopped() ) {
data.one = getRowFrom( data.oneRowSet );
}
while ( data.two != null && !isStopped() ) {
data.two = getRowFrom( data.twoRowSet );
}
setOutputDone();
return false;
} else {
/*
* We are doing a full outer join so print the 2nd stream and get the next row from the 2nd stream
*/
Object[] outputRowData = RowDataUtil.createResizedCopy( data.one_dummy, data.outputRowMeta.size() );
outputRowData = RowDataUtil.addRowData( outputRowData, data.oneMeta.size(), data.two );
putRow( data.outputRowMeta, outputRowData );
data.two = getRowFrom( data.twoRowSet );
}
} else if ( data.one == null && data.one_optional == true ) {
/*
* We have reached the end of stream 1 and there are records present in the second stream. Also, join is right
* or full outer. So, create a row with just the values in the 2nd stream and push it forward
*/
Object[] outputRowData = RowDataUtil.createResizedCopy( data.one_dummy, data.outputRowMeta.size() );
outputRowData = RowDataUtil.addRowData( outputRowData, data.oneMeta.size(), data.two );
putRow( data.outputRowMeta, outputRowData );
data.two = getRowFrom( data.twoRowSet );
} else if ( data.one != null ) {
/*
* We are doing an inner or right outer join so a non-matching row in the first stream is of no use to us -
* throw it away and get the next row
*/
data.one = getRowFrom( data.oneRowSet );
}
break;
default:
logDebug( "We shouldn't be here!!" );
// Make sure we do not go into an infinite loop by continuing to read data
data.one = getRowFrom( data.oneRowSet );
data.two = getRowFrom( data.twoRowSet );
break;
}
if ( checkFeedback( getLinesRead() ) ) {
logBasic( BaseMessages.getString( PKG, "MergeJoin.LineNumber" ) + getLinesRead() );
}
return true;
}
/**
* @see StepInterface#init(org.pentaho.di.trans.step.StepMetaInterface , org.pentaho.di.trans.step.StepDataInterface)
*/
public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
meta = (MergeJoinMeta) smi;
data = (MergeJoinData) sdi;
if ( super.init( smi, sdi ) ) {
List<StreamInterface> infoStreams = meta.getStepIOMeta().getInfoStreams();
if ( infoStreams.get( 0 ).getStepMeta() == null || infoStreams.get( 1 ).getStepMeta() == null ) {
logError( BaseMessages.getString( PKG, "MergeJoin.Log.BothTrueAndFalseNeeded" ) );
return false;
}
String joinType = meta.getJoinType();
for ( int i = 0; i < MergeJoinMeta.join_types.length; ++i ) {
if ( joinType.equalsIgnoreCase( MergeJoinMeta.join_types[i] ) ) {
data.one_optional = MergeJoinMeta.one_optionals[i];
data.two_optional = MergeJoinMeta.two_optionals[i];
return true;
}
}
logError( BaseMessages.getString( PKG, "MergeJoin.Log.InvalidJoinType", meta.getJoinType() ) );
return false;
}
return true;
}
/**
* Checks whether incoming rows are join compatible. This essentially means that the keys being compared should be of
* the same datatype and both rows should have the same number of keys specified
*
* @param row1
* Reference row
* @param row2
* Row to compare to
*
* @return true when templates are compatible.
*/
protected boolean isInputLayoutValid( RowMetaInterface row1, RowMetaInterface row2 ) {
if ( row1 != null && row2 != null ) {
// Compare the key types
String[] keyFields1 = meta.getKeyFields1();
int nrKeyFields1 = keyFields1.length;
String[] keyFields2 = meta.getKeyFields2();
int nrKeyFields2 = keyFields2.length;
if ( nrKeyFields1 != nrKeyFields2 ) {
logError( "Number of keys do not match " + nrKeyFields1 + " vs " + nrKeyFields2 );
return false;
}
for ( int i = 0; i < nrKeyFields1; i++ ) {
ValueMetaInterface v1 = row1.searchValueMeta( keyFields1[i] );
if ( v1 == null ) {
return false;
}
ValueMetaInterface v2 = row2.searchValueMeta( keyFields2[i] );
if ( v2 == null ) {
return false;
}
if ( v1.getType() != v2.getType() ) {
return false;
}
}
}
// we got here, all seems to be ok.
return true;
}
}