/*! ******************************************************************************
*
* Pentaho Data Integration
*
* Copyright (C) 2002-2016 by Pentaho : http://www.pentaho.com
*
*******************************************************************************
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.pentaho.di.core.util;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import org.pentaho.di.core.Const;
import org.pentaho.di.core.exception.KettleValueException;
import org.pentaho.di.core.row.ValueMeta;
import org.pentaho.di.core.row.ValueMetaInterface;
import org.pentaho.di.core.row.value.ValueMetaBoolean;
import org.pentaho.di.core.row.value.ValueMetaDate;
import org.pentaho.di.core.row.value.ValueMetaInteger;
import org.pentaho.di.core.row.value.ValueMetaNumber;
import org.pentaho.di.core.row.value.ValueMetaString;
/**
* This class evaluates strings and extracts a data type. It allows you to criteria after which the analysis should be
* completed.
*
* @author matt
*/
public class StringEvaluator {
private Set<String> values;
private List<StringEvaluationResult> evaluationResults;
private int maxLength;
private int maxPrecision;
private int count;
private boolean tryTrimming;
private ValueMetaInterface stringMeta;
private String[] dateFormats;
private String[] numberFormats;
private static final String[] DEFAULT_NUMBER_FORMATS = new String[]
{
"#,###,###.#",
"#.#",
"#",
"#.0",
"#.00",
"#.000",
"#.0000",
"#.00000",
"#.000000",
" #.0#"
};
protected static final Pattern PRECISION_PATTERN = Pattern.compile( "[^0-9#]" );
public StringEvaluator() {
this( true );
}
public StringEvaluator( boolean tryTrimming ) {
this( tryTrimming, DEFAULT_NUMBER_FORMATS, Const.getDateFormats() );
}
public StringEvaluator( boolean tryTrimming, List<String> numberFormats, List<String> dateFormats ) {
this( tryTrimming, numberFormats.toArray( new String[ numberFormats.size() ] ), dateFormats
.toArray( new String[ dateFormats.size() ] ) );
}
public StringEvaluator( boolean tryTrimming, String[] numberFormats, String[] dateFormats ) {
this.tryTrimming = tryTrimming;
values = new HashSet<String>();
evaluationResults = new ArrayList<StringEvaluationResult>();
count = 0;
stringMeta = new ValueMetaString( "string" );
this.numberFormats = numberFormats;
this.dateFormats = dateFormats;
populateConversionMetaList();
}
public void evaluateString( String value ) {
count++;
if ( !values.contains( value ) ) {
values.add( value );
if ( value != null ) {
evaluateLength( value );
evaluatePrecision( value );
challengeConversions( value );
}
}
}
private void challengeConversions( String value ) {
List<StringEvaluationResult> all = new ArrayList<StringEvaluationResult>( evaluationResults );
ValueMetaInterface stringMetaClone = null;
for ( StringEvaluationResult cmm : all ) {
if ( cmm.getConversionMeta().isBoolean() ) {
// Boolean conversion never fails.
// If it's a Y, N, true, false it's a boolean otherwise it ain't.
//
String string;
if ( tryTrimming ) {
string = Const.trim( value );
} else {
string = value;
}
if ( StringUtils.isEmpty( value ) ) {
cmm.incrementNrNull();
} else if ( !( "Y".equalsIgnoreCase( string ) || "N".equalsIgnoreCase( string ) || "TRUE".equalsIgnoreCase(
string ) || "FALSE".equalsIgnoreCase( string ) ) ) {
evaluationResults.remove( cmm );
} else {
cmm.incrementSuccesses();
}
} else if ( cmm.getConversionMeta().isDate() ) {
String dateFormat = cmm.getConversionMeta().getConversionMask();
if ( !DateDetector.isValidDateFormatToStringDate( dateFormat, value, "en_US" ) ) {
evaluationResults.remove( cmm );
} else {
try {
Object object = DateDetector.getDateFromStringByFormat( value, dateFormat );
cmm.incrementSuccesses();
if ( cmm.getMin() == null || cmm.getConversionMeta().compare( cmm.getMin(), object ) > 0 ) {
cmm.setMin( object );
}
if ( cmm.getMax() == null || cmm.getConversionMeta().compare( cmm.getMax(), object ) < 0 ) {
cmm.setMax( object );
}
} catch ( ParseException e ) {
evaluationResults.remove( cmm );
} catch ( KettleValueException e ) {
evaluationResults.remove( cmm );
}
}
} else {
try {
if ( cmm.getConversionMeta().isNumeric() ) {
boolean stop = false;
int nrDots = 0;
int nrCommas = 0;
int pos = 0;
for ( char c : value.toCharArray() ) {
boolean currencySymbolMatch = !String.valueOf( c ).equals( cmm.getConversionMeta().getCurrencySymbol() )
&& c != '('
&& c != ')';
if ( !Character.isDigit( c )
&& c != '.'
&& c != ','
&& !Character.isSpaceChar( c )
&& currencySymbolMatch
&& ( pos > 0 && ( c == '+' || c == '-' ) ) // allow + & - at the 1st position
) {
evaluationResults.remove( cmm );
stop = true;
break;
}
// If the value contains a decimal or grouping symbol or some sort, it's not an integer
//
if ( ( c == '.' && cmm.getConversionMeta().isInteger() )
|| ( c == ',' && cmm.getConversionMeta().isInteger() ) ) {
evaluationResults.remove( cmm );
stop = true;
break;
}
if ( c == '.' ) {
nrDots++;
}
if ( c == ',' ) {
nrCommas++;
}
pos++;
}
if ( nrDots > 1 && nrCommas > 1 ) {
evaluationResults.remove( cmm );
stop = true;
}
if ( stop ) {
continue;
}
}
if ( stringMetaClone == null ) {
// avoid cloning each time
stringMetaClone = stringMeta.clone();
}
stringMetaClone.setConversionMetadata( cmm.getConversionMeta() );
stringMetaClone.setTrimType( cmm.getConversionMeta().getTrimType() );
Object object = stringMetaClone.convertDataUsingConversionMetaData( value );
// Still here? Evaluate the data...
// Keep track of null values, min, max, etc.
//
if ( cmm.getConversionMeta().isNull( object ) ) {
cmm.incrementNrNull();
} else {
cmm.incrementSuccesses();
}
if ( cmm.getMin() == null || cmm.getConversionMeta().compare( cmm.getMin(), object ) > 0 ) {
cmm.setMin( object );
}
if ( cmm.getMax() == null || cmm.getConversionMeta().compare( cmm.getMax(), object ) < 0 ) {
cmm.setMax( object );
}
} catch ( KettleValueException e ) {
// This one doesn't work, remove it from the list!
//
evaluationResults.remove( cmm );
}
}
}
}
private void evaluateLength( String value ) {
if ( value.length() > maxLength ) {
maxLength = value.length();
}
}
private void evaluatePrecision( String value ) {
int p = determinePrecision( value );
if ( p > maxPrecision ) {
maxPrecision = p;
}
}
private boolean containsInteger() {
for ( StringEvaluationResult result : evaluationResults ) {
if ( result.getConversionMeta().isInteger() && result.getNrSuccesses() > 0 ) {
return true;
}
}
return false;
}
private boolean containsNumber() {
for ( StringEvaluationResult result : evaluationResults ) {
if ( result.getConversionMeta().isNumber() && result.getNrSuccesses() > 0 ) {
return true;
}
}
return false;
}
private boolean containsDate() {
for ( StringEvaluationResult result : evaluationResults ) {
if ( result.getConversionMeta().isDate() && result.getNrSuccesses() > 0 ) {
return true;
}
}
return false;
}
public StringEvaluationResult getAdvicedResult() {
if ( evaluationResults.isEmpty() ) {
ValueMetaInterface adviced = new ValueMetaString( "adviced" );
adviced.setLength( maxLength );
int nrNulls = 0;
String min = null;
String max = null;
for ( String string : values ) {
if ( string != null ) {
if ( min == null || min.compareTo( string ) > 0 ) {
min = string;
}
if ( max == null || max.compareTo( string ) < 0 ) {
max = string;
}
} else {
nrNulls++;
}
}
StringEvaluationResult result = new StringEvaluationResult( adviced );
result.setNrNull( nrNulls );
result.setMin( min );
result.setMax( max );
return result;
} else {
// If there are Numbers and Integers, pick the integers...
//
if ( containsInteger() && containsNumber() ) {
for ( Iterator<StringEvaluationResult> iterator = evaluationResults.iterator(); iterator.hasNext(); ) {
StringEvaluationResult result = iterator.next();
if ( maxPrecision == 0 && result.getConversionMeta().isNumber() ) {
// no precision, don't bother with a number
iterator.remove();
} else if ( maxPrecision > 0 && result.getConversionMeta().isInteger() ) {
// precision is needed, can't use integer
iterator.remove();
}
}
}
// If there are Dates and Integers, pick the dates...
//
if ( containsInteger() && containsDate() ) {
for ( Iterator<StringEvaluationResult> iterator = evaluationResults.iterator(); iterator.hasNext(); ) {
StringEvaluationResult result = iterator.next();
if ( result.getConversionMeta().isInteger() ) {
iterator.remove();
}
}
}
Comparator<StringEvaluationResult> compare = null;
if ( containsDate() ) {
// want the longest format for dates
compare = new Comparator<StringEvaluationResult>() {
@Override
public int compare( StringEvaluationResult r1, StringEvaluationResult r2 ) {
Integer length1 =
r1.getConversionMeta().getConversionMask() == null ? 0 : r1
.getConversionMeta().getConversionMask().length();
Integer length2 =
r2.getConversionMeta().getConversionMask() == null ? 0 : r2
.getConversionMeta().getConversionMask().length();
return length2.compareTo( length1 );
}
};
} else {
// want the shortest format mask for numerics & integers
compare = new Comparator<StringEvaluationResult>() {
@Override
public int compare( StringEvaluationResult r1, StringEvaluationResult r2 ) {
Integer length1 =
r1.getConversionMeta().getConversionMask() == null ? 0 : r1
.getConversionMeta().getConversionMask().length();
Integer length2 =
r2.getConversionMeta().getConversionMask() == null ? 0 : r2
.getConversionMeta().getConversionMask().length();
return length1.compareTo( length2 );
}
};
}
Collections.sort( evaluationResults, compare );
StringEvaluationResult result = evaluationResults.get( 0 );
ValueMetaInterface conversionMeta = result.getConversionMeta();
if ( conversionMeta.isNumber() && conversionMeta.getCurrencySymbol() == null ) {
conversionMeta.setPrecision( maxPrecision );
if ( maxPrecision > 0 && maxLength > 0 ) {
conversionMeta.setLength( maxLength );
}
}
return result;
}
}
public String[] getDateFormats() {
return dateFormats;
}
public String[] getNumberFormats() {
return numberFormats;
}
private void populateConversionMetaList() {
int[] trimTypes;
if ( tryTrimming ) {
trimTypes = new int[] { ValueMetaInterface.TRIM_TYPE_NONE, ValueMetaInterface.TRIM_TYPE_BOTH, };
} else {
trimTypes = new int[] { ValueMetaInterface.TRIM_TYPE_NONE, };
}
for ( int trimType : trimTypes ) {
for ( String format : getDateFormats() ) {
ValueMetaInterface conversionMeta = new ValueMetaDate( "date" );
conversionMeta.setConversionMask( format );
conversionMeta.setTrimType( trimType );
conversionMeta.setDateFormatLenient( false );
evaluationResults.add( new StringEvaluationResult( conversionMeta ) );
}
EvalResultBuilder numberUsBuilder = new EvalResultBuilder( "number-us", ValueMetaInterface.TYPE_NUMBER, 15, trimType, ".", "," );
EvalResultBuilder numberEuBuilder = new EvalResultBuilder( "number-eu", ValueMetaInterface.TYPE_NUMBER, 15, trimType, ",", "." );
for ( String format : getNumberFormats() ) {
if ( format.equals( "#" ) || format.equals( "0" ) ) {
// skip the integer ones. we'll get those later
continue;
}
int precision = determinePrecision( format );
evaluationResults.add( numberUsBuilder.format( format, precision ).build() );
evaluationResults.add( numberEuBuilder.format( format, precision ).build() );
}
// Try the locale's Currency
DecimalFormat currencyFormat = ( (DecimalFormat) NumberFormat.getCurrencyInstance() );
ValueMetaInterface conversionMeta = new ValueMetaNumber( "number-currency" );
// replace the universal currency symbol with the locale's currency symbol for user recognition
String currencyMask = currencyFormat.toLocalizedPattern().replace( "\u00A4", currencyFormat.getCurrency().getSymbol() );
conversionMeta.setConversionMask( currencyMask );
conversionMeta.setTrimType( trimType );
conversionMeta.setDecimalSymbol( String.valueOf( currencyFormat.getDecimalFormatSymbols().getDecimalSeparator() ) );
conversionMeta.setGroupingSymbol( String.valueOf( currencyFormat.getDecimalFormatSymbols().getGroupingSeparator() ) );
conversionMeta.setCurrencySymbol( currencyFormat.getCurrency().getSymbol() );
conversionMeta.setLength( 15 );
int currencyPrecision = currencyFormat.getCurrency().getDefaultFractionDigits();
conversionMeta.setPrecision( currencyPrecision );
evaluationResults.add( new StringEvaluationResult( conversionMeta ) );
// add same mask w/o currency symbol
String currencyMaskAsNumeric = currencyMask.replaceAll( Pattern.quote( currencyFormat.getCurrency().getSymbol() ), "" );
evaluationResults.add( numberUsBuilder.format( currencyMaskAsNumeric, currencyPrecision ).build() );
evaluationResults.add( numberEuBuilder.format( currencyMaskAsNumeric, currencyPrecision ).build() );
// Integer
//
conversionMeta = new ValueMetaInteger( "integer" );
conversionMeta.setConversionMask( "#" );
conversionMeta.setLength( 15 );
evaluationResults.add( new StringEvaluationResult( conversionMeta ) );
conversionMeta = new ValueMetaInteger( "integer" );
conversionMeta.setConversionMask( " #" );
conversionMeta.setLength( 15 );
evaluationResults.add( new StringEvaluationResult( conversionMeta ) );
// Add support for left zero padded integers
//
for ( int i = 1; i <= 15; i++ ) {
String mask = " ";
for ( int x = 0; x < i; x++ ) {
mask += "0";
}
mask += ";-";
for ( int x = 0; x < i; x++ ) {
mask += "0";
}
conversionMeta = new ValueMetaInteger( "integer-zero-padded-" + i );
conversionMeta.setConversionMask( mask );
conversionMeta.setLength( i );
evaluationResults.add( new StringEvaluationResult( conversionMeta ) );
}
// Boolean
//
conversionMeta = new ValueMetaBoolean( "boolean" );
evaluationResults.add( new StringEvaluationResult( conversionMeta ) );
}
}
protected static int determinePrecision( String numericFormat ) {
if ( numericFormat != null ) {
char decimalSymbol = ( (DecimalFormat) NumberFormat.getInstance() ).getDecimalFormatSymbols().getDecimalSeparator();
int loc = numericFormat.lastIndexOf( decimalSymbol );
if ( loc >= 0 && loc < numericFormat.length() ) {
Matcher m = PRECISION_PATTERN.matcher( numericFormat.substring( loc + 1 ) );
int nonDigitLoc = numericFormat.length();
if ( m.find() ) {
nonDigitLoc = loc + 1 + m.start();
}
return numericFormat.substring( loc + 1, nonDigitLoc ).length();
} else {
return 0;
}
} else {
return 0;
}
}
/**
* @return The distinct set of string values
*/
public Set<String> getValues() {
return values;
}
/**
* PDI-7736: Only list of successful evaluations returned.
*
* @return The list of string evaluation results
*/
public List<StringEvaluationResult> getStringEvaluationResults() {
List<StringEvaluationResult> result = new ArrayList<>();
for ( StringEvaluationResult ev : evaluationResults ) {
if ( ev.getNrSuccesses() > 0 ) {
result.add( ev );
}
}
return result;
}
/**
* @return the number of values analyzed
*/
public int getCount() {
return count;
}
/**
* @return The maximum string length encountered
*/
public int getMaxLength() {
return maxLength;
}
private static class EvalResultBuilder {
private final String name;
private final int type;
private final int length;
private final int trimType;
private final String decimalSymbol;
private final String groupingSymbol;
private String format;
private int precision;
public StringEvaluationResult build() {
ValueMetaInterface meta = new ValueMeta( name, type );
meta.setConversionMask( format );
meta.setTrimType( trimType );
meta.setDecimalSymbol( decimalSymbol );
meta.setGroupingSymbol( groupingSymbol );
meta.setLength( length );
meta.setPrecision( precision );
return new StringEvaluationResult( meta );
}
public EvalResultBuilder( String name, int type, int length, int trimType, String decimalSymbol,
String groupingSymbol ) {
this.name = name;
this.type = type;
this.length = length;
this.trimType = trimType;
this.decimalSymbol = decimalSymbol;
this.groupingSymbol = groupingSymbol;
}
public EvalResultBuilder format( String format, int precision ) {
this.format = format;
this.precision = precision;
return this;
}
}
}