/* * Joinery -- Data frames for Java * Copyright (c) 2014, 2015 IBM Corp. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package joinery.impl; import java.text.DateFormat; import java.text.ParsePosition; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeSet; import joinery.DataFrame; import joinery.DataFrame.Function; import joinery.DataFrame.NumberDefault; public class Conversion { protected static int dummyVariableMaxLen = 8; public static int getDummyVariableMaxLen() { return dummyVariableMaxLen; } /** * Set the Max length of dummy part (after the $ sign) of the * column (variable) names generated by <pre>toModelMatrix</pre> * Observe that the final name can actually become longer to * avoid non-unique variable names which is a requirement * * @param dummyVariableMaxLen set to negative value for no limit */ public static void setDummyVariableMaxLen(int dummyVariableMaxLen) { Conversion.dummyVariableMaxLen = dummyVariableMaxLen; } public static <V> void convert(final DataFrame<V> df) { convert(df, NumberDefault.LONG_DEFAULT, null); } public static <V> void convert(final DataFrame<V> df, final NumberDefault numDefault, final String naString) { final Map<Integer, Function<V, ?>> conversions = new HashMap<>(); List<Function<V, ?>> converters; final int rows = df.length(); final int cols = df.size(); switch (numDefault) { case LONG_DEFAULT: converters = Arrays.<Function<V, ?>>asList( new LongConversion<V>(), new DoubleConversion<V>(), new BooleanConversion<V>(), new DateTimeConversion<V>()); break; case DOUBLE_DEFAULT: converters = Arrays.<Function<V, ?>>asList( new DoubleConversion<V>(), new LongConversion<V>(), new BooleanConversion<V>(), new DateTimeConversion<V>()); break; default: throw new IllegalArgumentException("Number default contains an Illegal value"); } NAConversion<V> naConverter = new NAConversion<>(naString); // find conversions for (int c = 0; c < cols; c++) { for (final Function<V, ?> conv : converters) { boolean all = true; for (int r = 0; r < rows; r++) { if (conv.apply(df.get(r, c)) == null && naConverter.apply(df.get(r, c)) != null) { all = false; break; } } if (all) { conversions.put(c, conv); break; } } } // apply conversions convert(df, conversions, naString); } @SafeVarargs public static <V> void convert(final DataFrame<V> df, final Class<? extends V> ... columnTypes) { final Map<Integer, Function<V, ?>> conversions = new HashMap<>(); for (int i = 0; i < columnTypes.length; i++) { final Class<? extends V> cls = columnTypes[i]; if (cls != null) { Function<V, ?> conv = null; if (Date.class.isAssignableFrom(cls)) { conv = new DateTimeConversion<V>(); } else if (Boolean.class.isAssignableFrom(cls)) { conv = new BooleanConversion<V>(); } else if (Long.class.isAssignableFrom(cls)) { conv = new LongConversion<V>(); } else if (Number.class.isAssignableFrom(cls)) { conv = new DoubleConversion<V>(); } else if (String.class.isAssignableFrom(cls)) { conv = new StringConversion<V>(); } conversions.put(i, conv); } } convert(df, conversions, null); } @SuppressWarnings("unchecked") public static <V> void convert(final DataFrame<V> df, final Map<Integer, Function<V, ?>> conversions, String naString) { final int rows = df.length(); final int cols = df.size(); for (int c = 0; c < cols; c++) { final Function<V, ?> conv = conversions.get(c); if (conv != null) { for (int r = 0; r < rows; r++) { df.set(r, c, (V)conv.apply(df.get(r, c))); } } else { NAConversion<V> naConverter = new NAConversion<>(naString); for (int r = 0; r < rows; r++) { df.set(r, c, (V)naConverter.apply(df.get(r, c))); } } } } public static <V> double[][] toModelMatrix(final DataFrame<V> df, double fillValue) { return toModelMatrixDataFrame(df).fillna(fillValue).toArray(double[][].class); } public static <V> double[][] toModelMatrix(final DataFrame<V> df, double fillValue, boolean addIntercept) { return toModelMatrixDataFrame(df, null, addIntercept, null, null).fillna(fillValue).toArray(double[][].class); } public static <V> double[][] toModelMatrix(final DataFrame<V> df, double fillValue, DataFrame<Object> template) { return toModelMatrixDataFrame(df, template, false, null, null).fillna(fillValue).toArray(double[][].class); } public static <V> double[][] toModelMatrix(final DataFrame<V> df, double fillValue, DataFrame<Object> template, boolean addIntercept) { return toModelMatrixDataFrame(df, template, addIntercept, null, null).fillna(fillValue).toArray(double[][].class); } public static <V> double[][] toModelMatrix(final DataFrame<V> df, double fillValue, DataFrame<Object> template, boolean addIntercept, Map<String, String> factorReferences) { return toModelMatrixDataFrame(df, template, addIntercept, null, null).fillna(fillValue).toArray(double[][].class); } public static <V> DataFrame<Number> toModelMatrixDataFrame(final DataFrame<V> df) { return toModelMatrixDataFrame(df, null, false, null, null); } public static <V> DataFrame<Number> toModelMatrixDataFrame(final DataFrame<V> df, DataFrame<Object> template, boolean addIntercept) { return toModelMatrixDataFrame(df, template, addIntercept, null, null); } /** * Encodes the DataFrame as a model matrix, converting nominal values * to dummy variables and optionally adds an intercept column. In addition * it is possible to send in a Map which contains reference categories * for all or a subset of the variables. The Map should contain pairs: * variable => reference factor. I.e if you have a column (variable) * called "color" and you want "red" to be the reference factor in that * column you should put the following in the Map: Map.put("color", "red") * * @param df Dataframe to be converted * @param template template DataFrame which has already been converted * @param addIntercept * @param factorReferences a Map of reference factor for each variable (null if none) * @param naString replaces null values in DF with this string (default if not supplied is NA) * @return a new DataFrame encoded as a model matrix */ public static <V> DataFrame<Number> toModelMatrixDataFrame(final DataFrame<V> df, DataFrame<Object> template, boolean addIntercept, Map<String, String> factorReferences, String naString) { DataFrame<Number> newDf = new DataFrame<>(); if(addIntercept) { // Add an intercept column newDf.add("DFMMAddedIntercept"); for (int i = 0; i < df.length(); i++) { newDf.append(Arrays.asList(1.0)); } } final List<Object> columns = new ArrayList<>(df.columns()); // Now convert Nominals (String columns) to dummy variables // Keep all others as is List<Class<?>> colTypes = df.types(); for (int column = 0; column < df.size(); column++) { List<V> col = df.col(column); String columnName = columns.get(column).toString(); if(Number.class.isAssignableFrom(colTypes.get(column))) { List<Number> nums = new ArrayList<>(); for (V num : col) { nums.add((Number)num); } newDf.add(columnName,nums); } else if (Date.class.isAssignableFrom(colTypes.get(column))) { List<Number> dates = new ArrayList<>(); for (V date : col) { dates.add(new Double(((Date)date).getTime())); } newDf.add(columnName,dates); } else if (Boolean.class.isAssignableFrom(colTypes.get(column))) { List<Number> bools = new ArrayList<>(); for (V tVal : col) { bools.add((Boolean)tVal ? 1.0 : 0.0); } newDf.add(columnName,bools); } else if (String.class.isAssignableFrom(colTypes.get(column))) { Set<String> namesUsed = new HashSet<String>(); List<Object> extra = template != null ? template.col(column) : null; VariableToDummyResult vr = variableToDummy(col, extra, columnName, factorReferences, naString); List<List<Number>> variable = vr.col; int cnt = 0; for(List<Number> var : variable) { String name = columnName + "$" + nameToValidName(vr.names[cnt++],namesUsed);; newDf.add(name, var); } } } return newDf; } protected static Object nameToValidName(String string, Set<String> namesUsed) { String result = string.replaceAll("[^\\p{Alpha}]", ""); if(dummyVariableMaxLen>0) result = result.substring(0,Math.min(result.length(),dummyVariableMaxLen)); int tryCnt = 0; String tmp = result; while(namesUsed.contains(result)) { result = tmp + tryCnt++; } namesUsed.add(result); return result; } protected static class VariableToDummyResult { List<List<Number>> col; String []names; public VariableToDummyResult(List<List<Number>> col, String[] names) { super(); this.col = col; this.names = names; } } @SuppressWarnings({ "rawtypes", "unchecked" }) protected static <V> VariableToDummyResult variableToDummy(List<V> colVals, List<Object> extra, String columnName, Map<String, String> references, String naString) { List<List<Number>> result = new ArrayList<List<Number>>(); List<String> col = new ArrayList<String>(); for (V value : colVals) { col.add(value == null ? (naString==null?"NA":naString) : value.toString()); } Set<String> factors = new TreeSet<>(col); if(extra!=null) factors.addAll(new TreeSet(extra)); if(references==null || references.get(columnName)==null) { factors.remove(col.get(col.size()-1)); } else { String ref = references.get(columnName); if(!factors.remove(references.get(columnName))) { throw new IllegalArgumentException("You specified '" + ref + "' as a references for '" + columnName + "' but it did not exist in this column"); } } // Convert the variable to noFactors - 1 // Since we have removed the reference from // factors we already have the -1 so it is // not needed below Iterator<String> uniqueIter = factors.iterator(); String [] names = new String[factors.size()]; for (int u = 0; u < factors.size(); u++) { String v = uniqueIter.next(); names[u] = v; List<Number> newDummy = new ArrayList<Number>(); for (int i = 0; i < col.size(); i++) { if(col.get(i).equals(v)) { newDummy.add(1.0); } else { newDummy.add(0.0); } } result.add(newDummy); } return new VariableToDummyResult(result,names); } public static <V> DataFrame<Boolean> isnull(final DataFrame<V> df) { return df.apply(new Function<V, Boolean>() { @Override public Boolean apply(final V value) { return value == null; } }); } public static <V> DataFrame<Boolean> notnull(final DataFrame<V> df) { return df.apply(new Function<V, Boolean>() { @Override public Boolean apply(final V value) { return value != null; } }); } private static class NAConversion<V> implements Function<V, V> { final String naString; public NAConversion(String naString) { this.naString = naString; } @Override public V apply(V value) { return naString != null && String.valueOf(value).equals(naString) ? null : value; } } private static final class StringConversion<V> implements Function<V, String> { @Override public String apply(final V value) { return String.valueOf(value); } } private static final class LongConversion<V> implements Function<V, Long> { @Override public Long apply(final V value) { try { return new Long(String.valueOf(value)); } catch (final NumberFormatException ignored) { } return null; } } private static final class DoubleConversion<V> implements Function<V, Double> { @Override public Double apply(final V value) { try { return new Double(String.valueOf(value)); } catch (final NumberFormatException ignored) { } return null; } } private static final class BooleanConversion<V> implements Function<V, Boolean> { @Override public Boolean apply(final V value) { final String str = String.valueOf(value); if (str.matches("t(r(u(e)?)?)?|y(e(s)?)?")) { return new Boolean(true); } else if (str.matches("f(a(l(s(e)?)?)?)?|n(o)?")) { return new Boolean(false); } return null; } } private static final class DateTimeConversion<V> implements Function<V, Date> { private final List<DateFormat> formats = Arrays.<DateFormat>asList( new SimpleDateFormat("y-M-d'T'HH:mm:ssXXX"), new SimpleDateFormat("y-M-d'T'HH:mm:ssZZZ"), new SimpleDateFormat("y-M-d"), new SimpleDateFormat("y-M-d hh:mm a"), new SimpleDateFormat("y-M-d HH:mm"), new SimpleDateFormat("y-M-d hh:mm:ss a"), new SimpleDateFormat("y-M-d HH:mm:ss"), new SimpleDateFormat("y/M/d hh:mm:ss a"), new SimpleDateFormat("y/M/d HH:mm:ss"), new SimpleDateFormat("y/M/d hh:mm a"), new SimpleDateFormat("y/M/d HH:mm"), new SimpleDateFormat("dd-MMM-yy hh.mm.ss.SSS a"), new SimpleDateFormat("dd-MMM-yy hh.mm.ss.SSSSSSSSS a"), new SimpleDateFormat("EEE MMM dd HH:mm:ss zzz yyyy"), DateFormat.getDateTimeInstance(), new SimpleDateFormat("y/M/d"), new SimpleDateFormat("M/d/y hh:mm:ss a"), new SimpleDateFormat("M/d/y HH:mm:ss"), new SimpleDateFormat("M/d/y hh:mm a"), new SimpleDateFormat("M/d/y HH:mm"), new SimpleDateFormat("M/d/y"), DateFormat.getDateInstance() ); @Override public Date apply(final V value) { final String source = String.valueOf(value); final ParsePosition pp = new ParsePosition(0); for (final DateFormat format : formats) { final Date dt = format.parse(source, pp); if (pp.getIndex() == source.length()) { return dt; } pp.setIndex(0); pp.setErrorIndex(-1); } return null; } } }