/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ package org.pentaho.di.trans.steps.fuzzymatch; import java.util.Iterator; import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.commons.codec.language.Metaphone; import org.apache.commons.codec.language.RefinedSoundex; import org.apache.commons.codec.language.Soundex; import org.apache.commons.lang.StringUtils; import org.pentaho.di.core.Const; import org.pentaho.di.core.util.Utils; import org.pentaho.di.core.RowSet; import org.pentaho.di.core.exception.KettleException; import org.pentaho.di.core.exception.KettleStepException; import org.pentaho.di.core.exception.KettleValueException; import org.pentaho.di.core.row.RowDataUtil; import org.pentaho.di.core.row.RowMeta; import org.pentaho.di.core.row.RowMetaInterface; import org.pentaho.di.core.row.ValueMetaInterface; import org.pentaho.di.i18n.BaseMessages; import org.pentaho.di.trans.Trans; import org.pentaho.di.trans.TransMeta; import org.pentaho.di.trans.step.BaseStep; import org.pentaho.di.trans.step.StepDataInterface; import org.pentaho.di.trans.step.StepInterface; import org.pentaho.di.trans.step.StepMeta; import org.pentaho.di.trans.step.StepMetaInterface; import com.wcohen.ss.Jaro; import com.wcohen.ss.JaroWinkler; import com.wcohen.ss.NeedlemanWunsch; /** * Performs a fuzzy match for each main stream field row An approximative match is done in a lookup stream * * @author Samatar * @since 03-mars-2008 */ public class FuzzyMatch extends BaseStep implements StepInterface { private static Class<?> PKG = FuzzyMatchMeta.class; // for i18n purposes, needed by Translator2!! private FuzzyMatchMeta meta; private FuzzyMatchData data; public FuzzyMatch( StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, Trans trans ) { super( stepMeta, stepDataInterface, copyNr, transMeta, trans ); } private boolean readLookupValues() throws KettleException { data.infoStream = meta.getStepIOMeta().getInfoStreams().get( 0 ); if ( data.infoStream.getStepMeta() == null ) { logError( BaseMessages.getString( PKG, "FuzzyMatch.Log.NoLookupStepSpecified" ) ); return false; } if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadingFromStream" ) + data.infoStream.getStepname() + "]" ); } boolean firstRun = true; // Which row set do we read from? // RowSet rowSet = findInputRowSet( data.infoStream.getStepname() ); Object[] rowData = getRowFrom( rowSet ); // rows are originating from "lookup_from" while ( rowData != null ) { if ( firstRun ) { data.infoMeta = rowSet.getRowMeta().clone(); // Check lookup field int indexOfLookupField = data.infoMeta.indexOfValue( environmentSubstitute( meta.getLookupField() ) ); if ( indexOfLookupField < 0 ) { // The field is unreachable ! throw new KettleException( BaseMessages.getString( PKG, "FuzzyMatch.Exception.CouldnotFindLookField", meta.getLookupField() ) ); } data.infoCache = new RowMeta(); ValueMetaInterface keyValueMeta = data.infoMeta.getValueMeta( indexOfLookupField ); keyValueMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_NORMAL ); data.infoCache.addValueMeta( keyValueMeta ); // Add key data.indexOfCachedFields[0] = indexOfLookupField; // Check additional fields if ( data.addAdditionalFields ) { ValueMetaInterface additionalFieldValueMeta; for ( int i = 0; i < meta.getValue().length; i++ ) { int fi = i + 1; data.indexOfCachedFields[fi] = data.infoMeta.indexOfValue( meta.getValue()[i] ); if ( data.indexOfCachedFields[fi] < 0 ) { // The field is unreachable ! throw new KettleException( BaseMessages.getString( PKG, "FuzzyMatch.Exception.CouldnotFindLookField", meta.getValue()[i] ) ); } additionalFieldValueMeta = data.infoMeta.getValueMeta( data.indexOfCachedFields[fi] ); additionalFieldValueMeta.setStorageType( ValueMetaInterface.STORAGE_TYPE_NORMAL ); data.infoCache.addValueMeta( additionalFieldValueMeta ); } data.nrCachedFields += meta.getValue().length; } } if ( log.isRowLevel() ) { logRowlevel( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadLookupRow" ) + rowSet.getRowMeta().getString( rowData ) ); } // Look up the keys in the source rows // and store values in cache Object[] storeData = new Object[data.nrCachedFields]; // Add key field if ( rowData[data.indexOfCachedFields[0]] == null ) { storeData[0] = ""; } else { ValueMetaInterface fromStreamRowMeta = rowSet.getRowMeta().getValueMeta( data.indexOfCachedFields[0] ); if ( fromStreamRowMeta.isStorageBinaryString() ) { storeData[0] = fromStreamRowMeta.convertToNormalStorageType( rowData[data.indexOfCachedFields[0]] ); } else { storeData[0] = rowData[data.indexOfCachedFields[0]]; } } // Add additional fields? for ( int i = 1; i < data.nrCachedFields; i++ ) { ValueMetaInterface fromStreamRowMeta = rowSet.getRowMeta().getValueMeta( data.indexOfCachedFields[i] ); if ( fromStreamRowMeta.isStorageBinaryString() ) { storeData[i] = fromStreamRowMeta.convertToNormalStorageType( rowData[data.indexOfCachedFields[i]] ); } else { storeData[i] = rowData[data.indexOfCachedFields[i]]; } } if ( isDebug() ) { logDebug( BaseMessages.getString( PKG, "FuzzyMatch.Log.AddingValueToCache", data.infoCache .getString( storeData ) ) ); } addToCache( storeData ); rowData = getRowFrom( rowSet ); if ( firstRun ) { firstRun = false; } } return true; } private Object[] lookupValues( RowMetaInterface rowMeta, Object[] row ) throws KettleException { if ( first ) { first = false; data.outputRowMeta = getInputRowMeta().clone(); meta.getFields( data.outputRowMeta, getStepname(), new RowMetaInterface[] { data.infoMeta }, null, this, repository, metaStore ); // Check lookup field data.indexOfMainField = getInputRowMeta().indexOfValue( environmentSubstitute( meta.getMainStreamField() ) ); if ( data.indexOfMainField < 0 ) { // The field is unreachable ! throw new KettleException( BaseMessages.getString( PKG, "FuzzyMatch.Exception.CouldnotFindMainField", meta .getMainStreamField() ) ); } } Object[] add = null; if ( row[ data.indexOfMainField ] == null ) { add = buildEmptyRow(); } else { try { add = getFromCache( row ); } catch ( Exception e ) { throw new KettleStepException( e ); } } return RowDataUtil.addRowData( row, rowMeta.size(), add ); } private void addToCache( Object[] value ) throws KettleException { try { data.look.add( value ); } catch ( java.lang.OutOfMemoryError o ) { // exception out of memory throw new KettleException( BaseMessages.getString( PKG, "FuzzyMatch.Error.JavaHeap", o.toString() ) ); } } private Object[] getFromCache( Object[] keyRow ) throws KettleValueException { if ( isDebug() ) { logDebug( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadingMainStreamRow", getInputRowMeta().getString( keyRow ) ) ); } Object[] retval = null; switch ( meta.getAlgorithmType() ) { case FuzzyMatchMeta.OPERATION_TYPE_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH: retval = doDistance( keyRow ); break; case FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE: case FuzzyMatchMeta.OPERATION_TYPE_METAPHONE: case FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX: case FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX: retval = doPhonetic( keyRow ); break; case FuzzyMatchMeta.OPERATION_TYPE_JARO: case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER: case FuzzyMatchMeta.OPERATION_TYPE_PAIR_SIMILARITY: retval = doSimilarity( keyRow ); break; default: break; } return retval; } private Object[] doDistance( Object[] row ) throws KettleValueException { // Reserve room Object[] rowData = buildEmptyRow(); Iterator<Object[]> it = data.look.iterator(); long distance = -1; // Object o=row[data.indexOfMainField]; String lookupvalue = getInputRowMeta().getString( row, data.indexOfMainField ); while ( it.hasNext() ) { // Get cached row data Object[] cachedData = it.next(); // Key value is the first value String cacheValue = (String) cachedData[0]; int cdistance = -1; String usecacheValue = cacheValue; String uselookupvalue = lookupvalue; if ( !meta.isCaseSensitive() ) { usecacheValue = cacheValue.toLowerCase(); uselookupvalue = lookupvalue.toLowerCase(); } switch ( meta.getAlgorithmType() ) { case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN: cdistance = Utils.getDamerauLevenshteinDistance( usecacheValue, uselookupvalue ); break; case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH: cdistance = Math.abs( (int) new NeedlemanWunsch().score( usecacheValue, uselookupvalue ) ); break; default: cdistance = StringUtils.getLevenshteinDistance( usecacheValue, uselookupvalue ); break; } if ( data.minimalDistance <= cdistance && cdistance <= data.maximalDistance ) { if ( meta.isGetCloserValue() ) { if ( cdistance < distance || distance == -1 ) { // Get closer value // minimal distance distance = cdistance; int index = 0; rowData[index++] = cacheValue; // Add metric value? if ( data.addValueFieldName ) { rowData[index++] = distance; } // Add additional return values? if ( data.addAdditionalFields ) { for ( int i = 0; i < meta.getValue().length; i++ ) { int nr = i + 1; int nf = i + index; rowData[nf] = cachedData[nr]; } } } } else { // get all values separated by values separator if ( rowData[0] == null ) { rowData[0] = cacheValue; } else { rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue; } } } } return rowData; } private Object[] doPhonetic( Object[] row ) { // Reserve room Object[] rowData = buildEmptyRow(); Iterator<Object[]> it = data.look.iterator(); Object o = row[data.indexOfMainField]; String lookupvalue = (String) o; String lookupValueMF = getEncodedMF( lookupvalue, meta.getAlgorithmType() ); while ( it.hasNext() ) { // Get cached row data Object[] cachedData = it.next(); // Key value is the first value String cacheValue = (String) cachedData[0]; String cacheValueMF = getEncodedMF( cacheValue, meta.getAlgorithmType() ); if ( lookupValueMF.equals( cacheValueMF ) ) { // Add match value int index = 0; rowData[index++] = cacheValue; // Add metric value? if ( data.addValueFieldName ) { rowData[index++] = cacheValueMF; } // Add additional return values? if ( data.addAdditionalFields ) { for ( int i = 0; i < meta.getValue().length; i++ ) { int nf = i + index; int nr = i + 1; rowData[nf] = cachedData[nr]; } } } } return rowData; } private String getEncodedMF( String value, Integer algorithmType ) { String encodedValueMF = ""; switch ( algorithmType ) { case FuzzyMatchMeta.OPERATION_TYPE_METAPHONE: encodedValueMF = ( new Metaphone() ).metaphone( value ); break; case FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE: encodedValueMF = ( ( new DoubleMetaphone() ).doubleMetaphone( value ) ); break; case FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX: encodedValueMF = ( new Soundex() ).encode( value ); break; case FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX: encodedValueMF = ( new RefinedSoundex() ).encode( value ); break; default: break; } return encodedValueMF; } private Object[] doSimilarity( Object[] row ) { // Reserve room Object[] rowData = buildEmptyRow(); // prepare to read from cache ... Iterator<Object[]> it = data.look.iterator(); double similarity = 0; // get current value from main stream Object o = row[data.indexOfMainField]; String lookupvalue = o == null ? "" : (String) o; while ( it.hasNext() ) { // Get cached row data Object[] cachedData = it.next(); // Key value is the first value String cacheValue = (String) cachedData[0]; double csimilarity = new Double( 0 ); switch ( meta.getAlgorithmType() ) { case FuzzyMatchMeta.OPERATION_TYPE_JARO: csimilarity = new Jaro().score( cacheValue, lookupvalue ); break; case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER: csimilarity = new JaroWinkler().score( cacheValue, lookupvalue ); break; default: // Letters pair similarity csimilarity = LetterPairSimilarity.getSimiliarity( cacheValue, lookupvalue ); break; } if ( data.minimalSimilarity <= csimilarity && csimilarity <= data.maximalSimilarity ) { if ( meta.isGetCloserValue() ) { if ( csimilarity > similarity || ( csimilarity == 0 && cacheValue.equals( lookupvalue ) ) ) { similarity = csimilarity; // Update match value int index = 0; rowData[index++] = cacheValue; // Add metric value? if ( data.addValueFieldName ) { rowData[index++] = new Double( similarity ); } // Add additional return values? if ( data.addAdditionalFields ) { for ( int i = 0; i < meta.getValue().length; i++ ) { int nf = i + index; int nr = i + 1; rowData[nf] = cachedData[nr]; } } } } else { // get all values separated by values separator if ( rowData[0] == null ) { rowData[0] = cacheValue; } else { rowData[0] = (String) rowData[0] + data.valueSeparator + cacheValue; } } } } return rowData; } /** * Build an empty row based on the meta-data... * * @return */ private Object[] buildEmptyRow() { Object[] rowData = RowDataUtil.allocateRowData( data.outputRowMeta.size() ); return rowData; } public boolean processRow( StepMetaInterface smi, StepDataInterface sdi ) throws KettleException { meta = (FuzzyMatchMeta) smi; data = (FuzzyMatchData) sdi; if ( data.readLookupValues ) { data.readLookupValues = false; // Read values from lookup step (look) if ( !readLookupValues() ) { logError( BaseMessages.getString( PKG, "FuzzyMatch.Log.UnableToReadDataFromLookupStream" ) ); setErrors( 1 ); stopAll(); return false; } if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.ReadValuesInMemory", data.look.size() ) ); } } Object[] r = getRow(); // Get row from input rowset & set row busy! if ( r == null ) { // no more input to be expected... if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.StoppedProcessingWithEmpty", getLinesRead() ) ); } setOutputDone(); return false; } try { // Do the actual lookup in the hastable. Object[] outputRow = lookupValues( getInputRowMeta(), r ); if ( outputRow == null ) { setOutputDone(); // signal end to receiver(s) return false; } putRow( data.outputRowMeta, outputRow ); // copy row to output rowset(s); if ( checkFeedback( getLinesRead() ) ) { if ( log.isBasic() ) { logBasic( BaseMessages.getString( PKG, "FuzzyMatch.Log.LineNumber" ) + getLinesRead() ); } } } catch ( KettleException e ) { boolean sendToErrorRow = false; String errorMessage = null; if ( getStepMeta().isDoingErrorHandling() ) { sendToErrorRow = true; errorMessage = e.toString(); } else { logError( BaseMessages.getString( PKG, "FuzzyMatch.Log.ErrorInStepRunning" ) + e.getMessage() ); setErrors( 1 ); stopAll(); setOutputDone(); // signal end to receiver(s) return false; } if ( sendToErrorRow ) { // Simply add this row to the error row putError( getInputRowMeta(), r, 1, errorMessage, meta.getMainStreamField(), "FuzzyMatch001" ); } } return true; } public boolean init( StepMetaInterface smi, StepDataInterface sdi ) { meta = (FuzzyMatchMeta) smi; data = (FuzzyMatchData) sdi; if ( super.init( smi, sdi ) ) { // Check lookup and main stream field if ( Utils.isEmpty( meta.getMainStreamField() ) ) { logError( BaseMessages.getString( PKG, "FuzzyMatch.Error.MainStreamFieldMissing" ) ); return false; } if ( Utils.isEmpty( meta.getLookupField() ) ) { logError( BaseMessages.getString( PKG, "FuzzyMatch.Error.LookupStreamFieldMissing" ) ); return false; } // Checks output fields String matchField = environmentSubstitute( meta.getOutputMatchField() ); if ( Utils.isEmpty( matchField ) ) { logError( BaseMessages.getString( PKG, "FuzzyMatch.Error.OutputMatchFieldMissing" ) ); return false; } // We need to add metrics (distance, similarity, ...) // only when the fieldname is provided // and user want to return the closer value data.addValueFieldName = ( !Utils.isEmpty( environmentSubstitute( meta.getOutputValueField() ) ) && meta.isGetCloserValue() ); // Set the number of fields to cache // default value is one int nrFields = 1; if ( meta.getValue() != null && meta.getValue().length > 0 ) { if ( meta.isGetCloserValue() || ( meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_DOUBLE_METAPHONE ) || ( meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_SOUNDEX ) || ( meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_REFINED_SOUNDEX ) || ( meta.getAlgorithmType() == FuzzyMatchMeta.OPERATION_TYPE_METAPHONE ) ) { // cache also additional fields data.addAdditionalFields = true; nrFields += meta.getValue().length; } } data.indexOfCachedFields = new int[nrFields]; switch ( meta.getAlgorithmType() ) { case FuzzyMatchMeta.OPERATION_TYPE_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_DAMERAU_LEVENSHTEIN: case FuzzyMatchMeta.OPERATION_TYPE_NEEDLEMAN_WUNSH: data.minimalDistance = Const.toInt( environmentSubstitute( meta.getMinimalValue() ), 0 ); if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MinimalDistance", data.minimalDistance ) ); } data.maximalDistance = Const.toInt( environmentSubstitute( meta.getMaximalValue() ), 5 ); if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MaximalDistance", data.maximalDistance ) ); } if ( !meta.isGetCloserValue() ) { data.valueSeparator = environmentSubstitute( meta.getSeparator() ); if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.Separator", data.valueSeparator ) ); } } break; case FuzzyMatchMeta.OPERATION_TYPE_JARO: case FuzzyMatchMeta.OPERATION_TYPE_JARO_WINKLER: case FuzzyMatchMeta.OPERATION_TYPE_PAIR_SIMILARITY: data.minimalSimilarity = Const.toDouble( environmentSubstitute( meta.getMinimalValue() ), 0 ); if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MinimalSimilarity", data.minimalSimilarity ) ); } data.maximalSimilarity = Const.toDouble( environmentSubstitute( meta.getMaximalValue() ), 1 ); if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.MaximalSimilarity", data.maximalSimilarity ) ); } if ( !meta.isGetCloserValue() ) { data.valueSeparator = environmentSubstitute( meta.getSeparator() ); if ( isDetailed() ) { logDetailed( BaseMessages.getString( PKG, "FuzzyMatch.Log.Separator", data.valueSeparator ) ); } } break; default: break; } data.readLookupValues = true; return true; } return false; } public void dispose( StepMetaInterface smi, StepDataInterface sdi ) { meta = (FuzzyMatchMeta) smi; data = (FuzzyMatchData) sdi; data.look.clear(); super.dispose( smi, sdi ); } }