/*! ******************************************************************************
 *
 * Pentaho Data Integration
 *
 * Copyright (C) 2002-2013 by Pentaho : http://www.pentaho.com
 *
 *******************************************************************************
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/

package org.pentaho.di.trans.steps.multimerge;

import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;

import org.pentaho.di.core.RowSet;
import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.exception.KettleStepException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.i18n.BaseMessages;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransHopMeta;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepIOMetaInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.pentaho.di.trans.step.errorhandling.StreamInterface;

/**
 * Merges rows from several sorted input streams and outputs joined rows with matched key fields. Use this instead
 * of a hash join if your input streams are too big to fit in memory. Note that all inputs must be sorted on the join
 * key.
 *
 * The step keeps the current row of every input stream in a priority queue. On each call of
 * {@link #processRow(StepMetaInterface, StepDataInterface)} it takes the smallest entry plus every entry that compares
 * equal to it, reads all further rows carrying that same key from those streams, and writes every combination of the
 * collected rows as one output row. Output rows always contain all values of all joined streams.
 *
 * The join type configured in the step metadata is translated in
 * {@link #init(StepMetaInterface, StepDataInterface)} into the <code>optional</code> flag of the step data: when the
 * flag is not set, a key only produces output if every stream has it (inner join); when it is set, streams without the
 * key contribute an empty placeholder row.
 *
 * Possible future improvements: 1) Allow the user to choose which fields to push to the next step 2) Have multiple
 * output ports as follows: a) Containing matched records b) Unmatched records for each input port 3) Support incoming
 * rows sorted in descending order; the current implementation is documented as supporting ascending order only.
 *
 * @author Biswapesh
 * @since 24-nov-2006
 */
public class MultiMergeJoin extends BaseStep implements StepInterface {
  private static Class<?> PKG = MultiMergeJoinMeta.class; // for i18n purposes, needed by Translator2!!

  private MultiMergeJoinMeta meta;
  private MultiMergeJoinData data;

  public MultiMergeJoin( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta,
    Trans trans ) {
    super( stepMeta, stepDataInterface, copyNr, transMeta, trans );
  }

  /**
   * Prepares the join state before the first row is processed: determines which input steps are connected through an
   * enabled hop, allocates the per-stream arrays in the step data, reads the first row of every stream and builds the
   * output row layout.
   *
   * @param smi
   *          the step metadata, expected to be a {@link MultiMergeJoinMeta}
   * @param sdi
   *          the step data, expected to be a {@link MultiMergeJoinData}
   * @return false when no input step with an enabled hop is left, true otherwise
   * @throws KettleException
   *           when an input step, its hop, its row set or one of its key fields cannot be found
   */
  private boolean processFirstRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (MultiMergeJoinMeta) smi;
    data = (MultiMergeJoinData) sdi;

    String[] inputStepNames = meta.getInputSteps();
    List<String> enabledStepNames = findEnabledInputSteps( inputStepNames );

    int streamSize = enabledStepNames.size();
    if ( streamSize == 0 ) {
      return false;
    }

    data.rowSets = new RowSet[streamSize];
    data.rows = new Object[streamSize][];
    data.metas = new RowMetaInterface[streamSize];
    data.rowLengths = new int[streamSize];
    MultiMergeJoinData.QueueComparator comparator = new MultiMergeJoinData.QueueComparator( data );
    data.queue = new PriorityQueue<MultiMergeJoinData.QueueEntry>( streamSize, comparator );
    data.results = new ArrayList<List<Object[]>>( streamSize );
    data.queueEntries = new MultiMergeJoinData.QueueEntry[streamSize];
    data.drainIndices = new int[streamSize];
    data.keyNrs = new int[streamSize][];
    data.dummy = new Object[streamSize][];
    data.outputRowMeta = new RowMeta();

    // i walks over all configured input steps, j only over the ones that take part in the join.
    for ( int i = 0, j = 0; i < inputStepNames.length; i++ ) {
      String inputStepName = inputStepNames[i];
      if ( !enabledStepNames.contains( inputStepName ) ) {
        // ignore step with disabled hop.
        continue;
      }

      MultiMergeJoinData.QueueEntry queueEntry = new MultiMergeJoinData.QueueEntry();
      queueEntry.index = j;
      data.queueEntries[j] = queueEntry;
      data.results.add( new ArrayList<Object[]>() );

      RowSet rowSet = findInputRowSet( inputStepName );
      if ( rowSet == null ) {
        throw new KettleException( BaseMessages.getString(
          PKG, "MultiMergeJoin.Exception.UnableToFindSpecifiedStep", inputStepName ) );
      }
      data.rowSets[j] = rowSet;

      Object[] row = getRowFrom( rowSet );
      data.rows[j] = row;

      RowMetaInterface rowMeta;
      if ( row == null ) {
        // The stream delivered no row at all: take the layout from the transformation metadata instead. Such a
        // stream is never added to the queue and gets no key indexes.
        rowMeta = getTransMeta().getStepFields( inputStepName );
        data.metas[j] = rowMeta;
      } else {
        queueEntry.row = row;
        rowMeta = rowSet.getRowMeta();
        // Key fields are configured per input step, hence index i and not j.
        data.keyNrs[j] = resolveKeyIndexes( rowMeta, meta.getKeyFields()[i], inputStepName );
        data.metas[j] = rowMeta;
        data.queue.add( queueEntry );
      }

      data.outputRowMeta.mergeRowMeta( rowMeta.clone() );
      data.rowLengths[j] = rowMeta.size();
      // Placeholder row used for this stream when it has no row for a key.
      data.dummy[j] = RowDataUtil.allocateRowData( rowMeta.size() );
      j++;
    }
    return true;
  }

  /**
   * Collects the names of the configured input steps whose hop to this step is enabled. Steps with a disabled hop are
   * logged at detailed level and left out.
   *
   * @param inputStepNames
   *          the input step names from the step metadata, indexed like the info streams
   * @return the names of the input steps that take part in the join
   * @throws KettleException
   *           when an info stream has no step attached or no hop to this step exists
   */
  private List<String> findEnabledInputSteps( String[] inputStepNames ) throws KettleException {
    TransMeta transMeta = getTransMeta();
    StepIOMetaInterface stepIOMeta = meta.getStepIOMeta();
    List<StreamInterface> infoStreams = stepIOMeta.getInfoStreams();
    StepMeta toStepMeta = meta.getParentStepMeta();

    List<String> enabledStepNames = new ArrayList<String>();
    for ( int i = 0; i < infoStreams.size(); i++ ) {
      String inputStepName = inputStepNames[i];
      StepMeta fromStepMeta = infoStreams.get( i ).getStepMeta();
      if ( fromStepMeta == null ) {
        // should not arrive here, should typically have been caught by init.
        throw new KettleException( BaseMessages.getString(
          PKG, "MultiMergeJoin.Log.UnableToFindReferenceStream", inputStepName ) );
      }

      // check the hop (the "true" flag presumably makes the lookup return disabled hops too; confirm in TransMeta)
      TransHopMeta transHopMeta = transMeta.findTransHop( fromStepMeta, toStepMeta, true );
      if ( transHopMeta == null ) {
        // there is no hop: this is unexpected and should typically have been caught by init.
        throw new KettleException( BaseMessages.getString(
          PKG, "MultiMergeJoin.Log.UnableToFindReferenceStream", inputStepName ) );
      } else if ( transHopMeta.isEnabled() ) {
        enabledStepNames.add( inputStepName );
      } else {
        logDetailed( BaseMessages.getString( PKG, "MultiMergeJoin.Log.IgnoringStep", inputStepName ) );
      }
    }
    return enabledStepNames;
  }

  /**
   * Translates the comma separated key field names of one input step into field indexes of its row layout.
   *
   * @param rowMeta
   *          the row layout of the input stream
   * @param keyField
   *          the comma separated key field names configured for the input step
   * @param inputStepName
   *          the name of the input step, used in the error message
   * @return the index of every key field within <code>rowMeta</code>
   * @throws KettleStepException
   *           when a key field does not exist in the row layout
   */
  private int[] resolveKeyIndexes( RowMetaInterface rowMeta, String keyField, String inputStepName )
    throws KettleStepException {
    String[] keyFieldParts = keyField.split( "," );
    int[] keyIndexes = new int[keyFieldParts.length];
    for ( int k = 0; k < keyFieldParts.length; k++ ) {
      keyIndexes[k] = rowMeta.indexOfValue( keyFieldParts[k] );
      if ( keyIndexes[k] < 0 ) {
        String message = BaseMessages.getString(
          PKG, "MultiMergeJoin.Exception.UnableToFindFieldInReferenceStream", keyFieldParts[k], inputStepName );
        logError( message );
        throw new KettleStepException( message );
      }
    }
    return keyIndexes;
  }

  /**
   * Processes one join key: joins all rows that carry the smallest key currently present in the input streams and
   * writes the joined rows to the output.
   *
   * @param smi
   *          the step metadata, expected to be a {@link MultiMergeJoinMeta}
   * @param sdi
   *          the step data, expected to be a {@link MultiMergeJoinData}
   * @return true when the step wants to be called again, false when the join is finished or the step was stopped
   * @throws KettleException
   *           when the first-row setup, reading, comparing or writing of rows fails
   */
  public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException {
    meta = (MultiMergeJoinMeta) smi;
    data = (MultiMergeJoinData) sdi;

    if ( first ) {
      if ( !processFirstRow( smi, sdi ) ) {
        setOutputDone();
        return false;
      }
      first = false;
    }

    if ( log.isRowLevel() ) {
      logCurrentRows();
    }

    /*
     * We can stop processing if any of the following is true: a) All streams are empty b) Any stream is empty and
     * the join type is INNER
     */
    int streamSize = data.metas.length;
    boolean callAgain = data.optional ? processOptionalJoin( streamSize ) : processInnerJoin( streamSize );
    if ( !callAgain ) {
      return false;
    }

    if ( checkFeedback( getLinesRead() ) ) {
      logBasic( BaseMessages.getString( PKG, "MultiMergeJoin.LineNumber" ) + getLinesRead() );
    }

    return true;
  }

  /**
   * Writes the current row of every stream to the log at row level.
   *
   * @throws KettleException
   *           when a row cannot be converted to a string
   */
  private void logCurrentRows() throws KettleException {
    StringBuilder metaString = new StringBuilder( BaseMessages.getString(
      PKG, "MultiMergeJoin.Log.DataInfo", data.metas[0].getString( data.rows[0] ) + "" ) );
    for ( int i = 1; i < data.metas.length; i++ ) {
      metaString.append( data.metas[i].getString( data.rows[i] ) );
    }
    logRowlevel( metaString.toString() );
  }

  /**
   * Handles one key when the <code>optional</code> flag of the step data is set: streams that do not have the key
   * contribute their placeholder row, so the key produces output even if only one stream has it.
   *
   * @param streamSize
   *          the number of joined streams
   * @return true when a key was processed, false when all streams are exhausted or the step was stopped
   * @throws KettleException
   *           when reading, comparing or writing rows fails
   */
  private boolean processOptionalJoin( int streamSize ) throws KettleException {
    if ( data.queue.isEmpty() ) {
      setOutputDone();
      return false;
    }

    int drainSize = pollEqualEntries();

    // rows from nonempty input streams match: get all equal rows and create result set
    for ( int i = 0; i < drainSize; i++ ) {
      if ( !collectEqualRows( data.drainIndices[i] ) ) {
        return false;
      }
    }

    // Streams without a row for this key take part with their placeholder row.
    for ( int i = 0; i < streamSize; i++ ) {
      if ( data.results.get( i ).isEmpty() ) {
        data.results.get( i ).add( data.dummy[i] );
      }
    }

    outputCombinations( streamSize );
    return true;
  }

  /**
   * Handles one key when the <code>optional</code> flag of the step data is not set (inner join): output is only
   * produced when every stream has the key, otherwise the rows carrying the key are skipped.
   *
   * @param streamSize
   *          the number of joined streams
   * @return true when a key was processed, false when a stream is exhausted or the step was stopped
   * @throws KettleException
   *           when reading, comparing or writing rows fails
   */
  private boolean processInnerJoin( int streamSize ) throws KettleException {
    if ( data.queue.size() < streamSize ) {
      // At least one stream has no more rows, so no further match is possible: read the remaining rows of all
      // streams without using them and finish.
      data.queue.clear();
      for ( int i = 0; i < streamSize; i++ ) {
        while ( data.rows[i] != null && !isStopped() ) {
          data.rows[i] = getRowFrom( data.rowSets[i] );
        }
      }
      setOutputDone();
      return false;
    }

    int drainSize = pollEqualEntries();

    if ( data.queue.isEmpty() ) {
      // rows from all input streams match: get all equal rows and create result set
      for ( int i = 0; i < streamSize; i++ ) {
        if ( !collectEqualRows( i ) ) {
          return false;
        }
      }
      outputCombinations( streamSize );
    } else {
      // mismatch found and no results can be generated: skip the rows with this key in the polled streams
      Object[] row = null;
      for ( int i = 0; i < drainSize; i++ ) {
        int index = data.drainIndices[i];
        while ( ( row = getRowFrom( data.rowSets[index] ) ) != null
          && data.metas[index].compare( data.rows[index], row, data.keyNrs[index] ) == 0 ) {
          if ( isStopped() ) {
            break;
          }
        }
        if ( isStopped() || row == null ) {
          // An exhausted stream is not queued again; the next call then sees an incomplete queue and finishes.
          break;
        }
        data.queueEntries[index].row = row;
        data.queue.add( data.queueEntries[index] );
      }
      if ( isStopped() ) {
        return false;
      }
    }
    return true;
  }

  /**
   * Removes the smallest entry and every entry comparing equal to it from the queue. The rows of the removed entries
   * are stored in <code>data.rows</code> and their stream indexes in the first slots of
   * <code>data.drainIndices</code>. The queue must not be empty.
   *
   * @return the number of entries removed from the queue
   */
  private int pollEqualEntries() {
    MultiMergeJoinData.QueueEntry minEntry = data.queue.poll();
    int drainSize = 1;
    data.rows[minEntry.index] = minEntry.row;
    data.drainIndices[0] = minEntry.index;

    MultiMergeJoinData.QueueComparator comparator = (MultiMergeJoinData.QueueComparator) data.queue.comparator();
    while ( !data.queue.isEmpty() && comparator.compare( data.queue.peek(), minEntry ) == 0 ) {
      MultiMergeJoinData.QueueEntry entry = data.queue.poll();
      data.rows[entry.index] = entry.row;
      data.drainIndices[drainSize++] = entry.index;
    }
    return drainSize;
  }

  /**
   * Adds the current row of a stream and all following rows with the same key to the result list of that stream.
   * The first row with a different key is put back into the queue; a stream that runs out of rows is not queued
   * again.
   *
   * @param index
   *          the index of the stream to read from
   * @return false when the step was stopped while reading, true otherwise
   * @throws KettleException
   *           when reading or comparing rows fails
   */
  private boolean collectEqualRows( int index ) throws KettleException {
    List<Object[]> streamResults = data.results.get( index );
    streamResults.add( data.rows[index] );

    Object[] row = null;
    while ( !isStopped()
      && ( ( row = getRowFrom( data.rowSets[index] ) ) != null
        && data.metas[index].compare( data.rows[index], row, data.keyNrs[index] ) == 0 ) ) {
      streamResults.add( row );
    }
    if ( isStopped() ) {
      return false;
    }
    if ( row != null ) {
      data.queueEntries[index].row = row;
      data.queue.add( data.queueEntries[index] );
    }
    return true;
  }

  /**
   * Writes one output row for every combination of the rows collected per stream and clears the collected rows
   * afterwards. Every result list must contain at least one row.
   *
   * @param streamSize
   *          the number of joined streams
   * @throws KettleException
   *           when writing a row fails
   */
  private void outputCombinations( int streamSize ) throws KettleException {
    for ( int i = 0; i < streamSize; i++ ) {
      data.drainIndices[i] = 0;
    }

    // data.drainIndices works like an odometer: the first position is advanced after every output row and carries
    // over into the next position when it wraps around. Once the last position wraps, all combinations are written.
    int current = 0;
    while ( true ) {
      for ( int i = 0; i < streamSize; i++ ) {
        data.rows[i] = data.results.get( i ).get( data.drainIndices[i] );
      }
      Object[] outputRow = RowDataUtil.createResizedCopy( data.rows, data.rowLengths );
      putRow( data.outputRowMeta, outputRow );

      while ( ++data.drainIndices[current] >= data.results.get( current ).size() ) {
        data.drainIndices[current] = 0;
        if ( ++current >= streamSize ) {
          break;
        }
      }
      if ( current >= streamSize ) {
        break;
      }
      current = 0;
    }

    for ( int i = 0; i < streamSize; i++ ) {
      data.results.get( i ).clear();
    }
  }

  /**
   * Initializes the step: checks that every info stream has a step attached and translates the configured join type
   * into the <code>optional</code> flag of the step data.
   *
   * @param smi
   *          the step metadata, expected to be a {@link MultiMergeJoinMeta}
   * @param sdi
   *          the step data, expected to be a {@link MultiMergeJoinData}
   * @return true when the step is ready to run; false when the base initialization fails, an info stream has no step
   *         attached, or the join type is missing or unknown
   * @see StepInterface#init(org.pentaho.di.trans.step.StepMetaInterface , org.pentaho.di.trans.step.StepDataInterface)
   */
  public boolean init( StepMetaInterface smi, StepDataInterface sdi ) {
    meta = (MultiMergeJoinMeta) smi;
    data = (MultiMergeJoinData) sdi;

    if ( !super.init( smi, sdi ) ) {
      // A failed base initialization must not be reported as success.
      return false;
    }

    StepIOMetaInterface stepIOMeta = meta.getStepIOMeta();
    String[] inputStepNames = meta.getInputSteps();
    List<StreamInterface> infoStreams = stepIOMeta.getInfoStreams();
    for ( int i = 0; i < infoStreams.size(); i++ ) {
      String inputStepName = inputStepNames[i];
      if ( infoStreams.get( i ).getStepMeta() == null ) {
        logError( BaseMessages.getString( PKG, "MultiMergeJoin.Log.UnableToFindReferenceStream", inputStepName ) );
        return false;
      }
    }

    String joinType = meta.getJoinType();
    if ( joinType != null ) {
      for ( int i = 0; i < MultiMergeJoinMeta.join_types.length; ++i ) {
        if ( joinType.equalsIgnoreCase( MultiMergeJoinMeta.join_types[i] ) ) {
          data.optional = MultiMergeJoinMeta.optionals[i];
          return true;
        }
      }
    }

    // A missing join type is reported like an unknown one instead of failing with a NullPointerException.
    logError( BaseMessages.getString( PKG, "MultiMergeJoin.Log.InvalidJoinType", meta.getJoinType() ) );
    return false;
  }

  /**
   * Checks whether the incoming row layouts are join compatible. This means that keys are configured for every
   * stream, that every stream has the same number of keys, and that keys at the same position have the same data type
   * in all streams.
   *
   * @param rows
   *          the row layouts of the input streams, in the order of the configured key fields; may be null, in which
   *          case nothing is checked
   *
   * @return true when the layouts are compatible or <code>rows</code> is null.
   */
  protected boolean isInputLayoutValid( RowMetaInterface[] rows ) {
    if ( rows != null ) {
      String[] keyFields = meta.getKeyFields();

      // check 1 : keys are configured for each stream
      if ( rows.length != keyFields.length ) {
        logError( "keys are not configured for all the streams " );
        return false;
      }

      // check 2 : number of keys is the same for each stream
      int prevCount = 0;
      List<String[]> keyList = new ArrayList<String[]>();
      for ( int i = 0; i < keyFields.length; i++ ) {
        String[] keys = keyFields[i].split( "," );
        keyList.add( keys );
        int count = keys.length;
        if ( i != 0 && prevCount != count ) {
          logError( "Number of keys do not match " );
          return false;
        } else {
          prevCount = count;
        }
      }

      // check 3 : compare the key types position by position across all streams
      for ( int i = 0; i < prevCount; i++ ) {
        ValueMetaInterface preValue = null;
        for ( int j = 0; j < rows.length; j++ ) {
          ValueMetaInterface v = rows[j].searchValueMeta( keyList.get( j )[i] );
          if ( v == null ) {
            return false;
          }
          if ( j != 0 && v.getType() != preValue.getType() ) {
            logError( "key data type do not match " );
            return false;
          } else {
            preValue = v;
          }
        }
      }
    }
    // we got here, all seems to be ok.
    return true;
  }
}