package com.linkedin.thirdeye.dataframe;

import com.linkedin.pinot.client.ResultSet;
import com.linkedin.pinot.client.ResultSetGroup;
import com.udojava.evalex.Expression;
import java.io.IOException;
import java.io.Reader;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;


/**
 * Container class for a data frame with multiple typed series with equivalent row count.
 */
public class DataFrame {
  // NOTE(review): this field is public and not final, so any caller can reassign it - confirm
  // whether that is intended before relying on it as a constant.
  /** Pattern a series name must fully match; checked whenever a series is added. */
  public static Pattern SERIES_NAME_PATTERN = Pattern.compile("([A-Za-z_]\\w*)");

  /** Name of the index series created by the index-building constructors. */
  public static final String COLUMN_INDEX_DEFAULT = "index";
  // NOTE(review): presumably suffixes for disambiguating columns in joins - not referenced in the
  // reviewed portion of the file, confirm against the join implementation.
  public static final String COLUMN_JOIN_LEFT = "_left";
  public static final String COLUMN_JOIN_RIGHT = "_right";

  // NOTE(review): presumably a limit used when rendering the frame as text - confirm against its use.
  public static final int DEFAULT_MAX_COLUMN_WIDTH = 30;

  /**
   * Strategy interface for resampling series with different native types with a common
   * strategy.
*/
  public interface ResamplingStrategy {
    /**
     * Resamples series {@code s} using {@code grouping}.
     *
     * @param grouping grouping to apply to the series
     * @param s series to resample
     * @return DataFrame with the resampled values; {@code resampledBy} reads them from the
     * series named {@code Series.GROUP_VALUE}
     */
    DataFrame apply(Series.SeriesGrouping grouping, Series s);
  }

  /**
   * Resampling by last value in each grouped interval
   */
  public static final class ResampleLast implements ResamplingStrategy {
    /**
     * Applies the grouping to {@code s} and aggregates it with the "last" function
     * matching the native type of the series.
     *
     * @throws IllegalArgumentException if the series type is not one of the handled types
     */
    @Override
    public DataFrame apply(Series.SeriesGrouping grouping, Series s) {
      switch(s.type()) {
        case DOUBLE:
          return grouping.applyTo(s).aggregate(new DoubleSeries.DoubleLast());
        case LONG:
          return grouping.applyTo(s).aggregate(new LongSeries.LongLast());
        case STRING:
          return grouping.applyTo(s).aggregate(new StringSeries.StringLast());
        case BOOLEAN:
          return grouping.applyTo(s).aggregate(new BooleanSeries.BooleanLast());
        default:
          throw new IllegalArgumentException(String.format("Cannot resample series type '%s'", s.type()));
      }
    }
  }

  /**
   * Container object for the grouping of multiple rows across different series
   * based on a common key.
   */
  public static final class DataFrameGrouping {
    final String keyName;
    final Series keys;
    final List<Series.Bucket> buckets;
    final DataFrame source;

    DataFrameGrouping(String keyName, Series keys, DataFrame source, List<Series.Bucket> buckets) {
      this.keyName = keyName;
      this.keys = keys;
      this.buckets = buckets;
      this.source = source;
    }

    /** Creates a grouping whose key series is named {@code Series.GROUP_KEY}. */
    DataFrameGrouping(Series keys, DataFrame source, List<Series.Bucket> buckets) {
      this(Series.GROUP_KEY, keys, source, buckets);
    }

    /**
     * Returns the size of the key series.
     *
     * @return key series size
     */
    public int size() {
      return this.keys.size();
    }

    /**
     * Returns the DataFrame this grouping was created with.
     *
     * @return source DataFrame
     */
    public DataFrame source() {
      return this.source;
    }

    /**
     * Returns {@code true} if the key series is empty, {@code false} otherwise.
     *
     * @return {@code true} if the key series is empty
     */
    public boolean isEmpty() {
      return this.keys.isEmpty();
    }

    /**
     * Returns a SeriesGrouping over the source series {@code seriesName} that shares
     * the keys and buckets of this grouping.
     *
     * @param seriesName name of the series in the source DataFrame
     * @return SeriesGrouping for the series
     */
    public Series.SeriesGrouping get(String seriesName) {
      return new Series.SeriesGrouping(this.keys, this.source.get(seriesName), this.buckets);
    }

    /**
     * Aggregates the source series {@code seriesName} with {@code function}. In the result,
     * the {@code Series.GROUP_KEY} series is renamed to the key name of this grouping, the
     * {@code Series.GROUP_VALUE} series is renamed to {@code seriesName}, and the key series
     * is set as index.
     *
     * @param seriesName name of the series to aggregate
     * @param function aggregation function
     * @return DataFrame with the aggregation result
     */
    public DataFrame aggregate(String seriesName, Series.Function function) {
      return this.get(seriesName).aggregate(function)
          .renameSeries(Series.GROUP_KEY, this.keyName)
          .renameSeries(Series.GROUP_VALUE, seriesName)
          .setIndex(this.keyName);
    }
  }

  /**
   * Builder for DataFrame in row-by-row sequence.
Constructs each column as a StringSeries
   * and attempts to infer a tighter native type on completion. A series name of the form
   * {@code "name:type"} fixes the column type explicitly and skips the inference.
   */
  public static final class Builder {
    final List<String> seriesNames;
    final List<Object[]> rows = new ArrayList<>();

    Builder(List<String> seriesNames) {
      this.seriesNames = seriesNames;
    }

    /**
     * Appends all {@code rows} to the builder.
     *
     * @param rows rows to append, one value per series name
     * @throws IllegalArgumentException if a row's length differs from the number of series names
     * @return reference to the builder (this)
     */
    public Builder append(Collection<Object[]> rows) {
      for(Object[] row : rows) {
        if (row.length != this.seriesNames.size())
          throw new IllegalArgumentException(String.format("Expected %d values, but got %d", seriesNames.size(), row.length));
        this.rows.add(row);
      }
      return this;
    }

    /**
     * @see Builder#append(Collection)
     */
    public Builder append(Object[]... rows) {
      return this.append(Arrays.asList(rows));
    }

    /**
     * Appends a single row.
     *
     * @see Builder#append(Collection)
     */
    public Builder append(Object... row) {
      return this.append(Collections.singleton(row));
    }

    /**
     * Builds the DataFrame from the appended rows, one series per series name.
     *
     * @throws IllegalArgumentException if an explicit type suffix does not name a series type
     * @return DataFrame holding the appended rows
     */
    public DataFrame build() {
      DataFrame df = new DataFrame();

      // infer column types
      for(int i=0; i<seriesNames.size(); i++) {
        String rawName = seriesNames.get(i);

        boolean isDynamicType = true;
        String name = rawName;
        Series.SeriesType type = Series.SeriesType.STRING;

        String[] parts = rawName.split(":", 2);
        if(parts.length == 2) {
          name = parts[0];
          // Locale.ROOT keeps the enum lookup independent of the default locale
          // (e.g. "string" must upper-case to "STRING" under a Turkish locale, too)
          type = Series.SeriesType.valueOf(parts[1].toUpperCase(java.util.Locale.ROOT));
          isDynamicType = false;
        }

        Series series = buildSeries(type, i);

        // infer type if not provided
        if(isDynamicType) {
          series = series.get(((StringSeries) series).inferType());
        }

        df.addSeries(name, series);
      }

      return df;
    }

    /** Builds column {@code columnIndex} of the appended rows as a series of native type {@code type}. */
    private Series buildSeries(Series.SeriesType type, int columnIndex) {
      switch (type) {
        case DOUBLE:
          return buildDoubleSeries(columnIndex);
        case LONG:
          return buildLongSeries(columnIndex);
        case STRING:
          return buildStringSeries(columnIndex);
        case BOOLEAN:
          return buildBooleanSeries(columnIndex);
        default:
          throw new IllegalArgumentException(String.format("Unknown series type '%s'", type));
      }
    }

    // TODO implement ObjectSeries

    private DoubleSeries buildDoubleSeries(int columnIndex) {
      double[] values = new double[this.rows.size()];
      int i = 0;
      for(Object[] r : this.rows) {
        values[i++] = toDouble(r[columnIndex]);
      }
      return
DoubleSeries.buildFrom(values);
    }

    /** Converts a raw cell: {@code null} maps to the series' NULL, numbers are used directly, anything else is parsed from its string form. */
    private static double toDouble(Object o) {
      if(o == null)
        return DoubleSeries.NULL;
      if(o instanceof Number)
        return ((Number)o).doubleValue();
      return StringSeries.getDouble(o.toString());
    }

    private LongSeries buildLongSeries(int columnIndex) {
      long[] values = new long[this.rows.size()];
      int i = 0;
      for(Object[] r : this.rows) {
        values[i++] = toLong(r[columnIndex]);
      }
      return LongSeries.buildFrom(values);
    }

    /** Converts a raw cell: {@code null} maps to the series' NULL, numbers are used directly, anything else is parsed from its string form. */
    private static long toLong(Object o) {
      if(o == null)
        return LongSeries.NULL;
      if(o instanceof Number)
        return ((Number)o).longValue();
      return StringSeries.getLong(o.toString());
    }

    private StringSeries buildStringSeries(int columnIndex) {
      String[] values = new String[this.rows.size()];
      int i = 0;
      for(Object[] r : this.rows) {
        values[i++] = toString(r[columnIndex]);
      }
      return StringSeries.buildFrom(values);
    }

    /** Converts a raw cell: {@code null} maps to the series' NULL, anything else goes through its string form. */
    private static String toString(Object o) {
      if(o == null)
        return StringSeries.NULL;
      return StringSeries.getString(o.toString());
    }

    private BooleanSeries buildBooleanSeries(int columnIndex) {
      byte[] values = new byte[this.rows.size()];
      int i = 0;
      for(Object[] r : this.rows) {
        values[i++] = toBoolean(r[columnIndex]);
      }
      return BooleanSeries.buildFrom(values);
    }

    /** Converts a raw cell: {@code null} maps to the series' NULL, numbers are true when non-zero, anything else is parsed from its string form. */
    private static byte toBoolean(Object o) {
      if(o == null)
        return BooleanSeries.NULL;
      if(o instanceof Number)
        return BooleanSeries.valueOf(((Number)o).doubleValue() != 0.0d);
      return StringSeries.getBoolean(o.toString());
    }
  }

  // name of the index series, or null if no index is set
  String indexName = null;
  // series by name; all series hold the same number of rows
  Map<String, Series> series = new HashMap<>();

  /**
   * Returns a DoubleSeries wrapping the values array
   *
   * @param values base array
   * @return DoubleSeries wrapping the array
   */
  public static DoubleSeries toSeries(double... values) {
    return DoubleSeries.buildFrom(values);
  }

  /**
   * Returns a LongSeries wrapping the values array
   *
   * @param values base array
   * @return LongSeries wrapping the array
   */
  public static LongSeries toSeries(long...
values) {
    return LongSeries.buildFrom(values);
  }

  /**
   * Returns a StringSeries wrapping the values array
   *
   * @param values base array
   * @return StringSeries wrapping the array
   */
  public static StringSeries toSeries(String... values) {
    return StringSeries.buildFrom(values);
  }

  /**
   * Returns a BooleanSeries wrapping the values array
   *
   * @param values base array
   * @return BooleanSeries wrapping the array
   */
  public static BooleanSeries toSeries(byte... values) {
    return BooleanSeries.buildFrom(values);
  }

  /**
   * Returns a BooleanSeries holding the values array (as converted to byte)
   *
   * @param values base array
   * @return BooleanSeries built from the array
   */
  public static BooleanSeries toSeries(boolean... values) {
    return BooleanSeries.builder().addBooleanValues(values).build();
  }

  /**
   * Returns a builder instance for DataFrame
   *
   * @param seriesNames series names of the DataFrame
   * @return DataFrame builder
   */
  public static Builder builder(String... seriesNames) {
    return new Builder(Arrays.asList(seriesNames));
  }

  /**
   * Returns a builder instance for DataFrame. The builder keeps a reference to
   * {@code seriesNames} rather than a copy.
   *
   * @param seriesNames series names of the DataFrame
   * @return DataFrame builder
   */
  public static Builder builder(List<String> seriesNames) {
    return new Builder(seriesNames);
  }

  /**
   * Returns a builder instance for DoubleSeries
   *
   * @return DoubleSeries builder
   */
  public static DoubleSeries.Builder buildDoubles() {
    return DoubleSeries.builder();
  }

  /**
   * Returns a builder instance for LongSeries
   *
   * @return LongSeries builder
   */
  public static LongSeries.Builder buildLongs() {
    return LongSeries.builder();
  }

  /**
   * Returns a builder instance for StringSeries
   *
   * @return StringSeries builder
   */
  public static StringSeries.Builder buildStrings() {
    return StringSeries.builder();
  }

  /**
   * Returns a builder instance for BooleanSeries
   *
   * @return BooleanSeries builder
   */
  public static BooleanSeries.Builder buildBooleans() {
    return BooleanSeries.builder();
  }

  /**
   * Creates a new DataFrame with a column "index" (as determined by {@code
COLUMN_INDEX_DEFAULT}) with * length {@code defaultIndexSize}, ranging from 0 to {@code defaultIndexSize - 1}. * * @param defaultIndexSize index column size */ public DataFrame(int defaultIndexSize) { long[] indexValues = new long[defaultIndexSize]; for(int i=0; i<defaultIndexSize; i++) { indexValues[i] = i; } this.addSeries(COLUMN_INDEX_DEFAULT, LongSeries.buildFrom(indexValues)); this.indexName = COLUMN_INDEX_DEFAULT; } /** * Creates a new DataFrame with a column "index" (as determined by {@code COLUMN_INDEX_DEFAULT}) that * wraps the array {@code indexValues}. * * @param indexValues index values */ public DataFrame(long... indexValues) { this.addSeries(COLUMN_INDEX_DEFAULT, LongSeries.buildFrom(indexValues)); this.indexName = COLUMN_INDEX_DEFAULT; } /** * Creates a new DataFrame with a column "index" (as determined by {@code COLUMN_INDEX_DEFAULT}) referencing * the Series {@code index}. * * @param index index series */ public DataFrame(Series index) { this.addSeries(COLUMN_INDEX_DEFAULT, index); this.indexName = COLUMN_INDEX_DEFAULT; } /** * Creates a new DataFrame that copies the properties of {@code df}. * * <br/><b>NOTE:</b> the copy is shallow, i.e. the contained series are not copied but referenced. * * @param df DataFrame to copy properties from */ public DataFrame(DataFrame df) { this.indexName = df.indexName; this.series = new HashMap<>(df.series); } /** * Creates a new DataFrame without any columns. The row count of the DataFrame is determined * by the first series added. */ public DataFrame() { // left blank } /** * Sets the index name to the specified series name in-place. * * @param seriesName index series name * @throws IllegalArgumentException if the series does not exist * @return reference to the modified DataFrame (this) */ public DataFrame setIndex(String seriesName) { assertSeriesExists(seriesName); this.indexName = seriesName; return this; } /** * Resets the index name to {@code null} in-place. 
*
   * @return reference to the modified DataFrame (this)
   */
  public DataFrame resetIndex() {
    this.indexName = null;
    return this;
  }

  /**
   * Returns the series referenced by indexName.
   *
   * @throws IllegalArgumentException if the series does not exist
   * @return index series
   */
  public Series getIndex() {
    return assertSeriesExists(this.indexName);
  }

  /**
   * Returns {@code true} if an index name is set (i.e. it is not {@code null}).
   * Otherwise, returns {@code false}.
   *
   * @return {@code true} if an index name is set, {@code false} otherwise
   */
  public boolean hasIndex() {
    return this.indexName != null;
  }

  /**
   * Returns the series name of the index, or {@code null} if no index name is set.
   *
   * @return index series name
   */
  public String getIndexName() {
    return this.indexName;
  }

  /**
   * Returns the row count of the DataFrame
   *
   * @return row count
   */
  public int size() {
    if(this.series.isEmpty())
      return 0;
    // all series share the same row count, so any one of them determines the size
    return this.series.values().iterator().next().size();
  }

  /**
   * Returns a copy of the DataFrame sliced from index {@code from} (inclusive) to index {@code to}
   * (exclusive).
   *
   * @param from start index (inclusive), must be >= 0
   * @param to end index (exclusive), must be <= size
   * @return sliced DataFrame copy
   */
  public DataFrame slice(int from, int to) {
    DataFrame df = new DataFrame(this);
    // keep the copied index name, but replace every series with its slice
    df.series.clear();
    for(Map.Entry<String, Series> e : this.series.entrySet()) {
      df.addSeries(e.getKey(), e.getValue().slice(from, to));
    }
    return df;
  }

  /**
   * Returns a copy of the DataFrame omitting any elements before index {@code from}.
   * If {@code from} is {@code 0}, the entire DataFrame is returned. If {@code from} is greater than
   * the DataFrame size, an empty DataFrame is returned.
   *
   * @param from start index of copy (inclusive)
   * @return DataFrame copy with elements from index {@code from}.
   */
  public DataFrame sliceFrom(int from) {
    return this.slice(from, this.size());
  }

  /**
   * Returns a copy of the DataFrame omitting any elements equal to or after index {@code n}.
* If {@code n} is equal or greater than the DataFrame size, the entire series is returned.
   * If {@code n} is {@code 0}, an empty DataFrame is returned.
   *
   * @param to end index of copy (exclusive)
   * @return DataFrame copy with elements before index {@code to}.
   */
  public DataFrame sliceTo(int to) {
    return this.slice(0, to);
  }

  /**
   * Returns a copy of the DataFrame containing (up to) {@code n} first rows.
   *
   * @param n number of rows to include
   * @return DataFrame copy with first {@code n} rows
   */
  public DataFrame head(int n) {
    return this.slice(0, n);
  }

  /**
   * Returns a copy of the DataFrame containing (up to) {@code n} last rows.
   *
   * @param n number of rows to include
   * @return DataFrame copy with last {@code n} rows
   */
  public DataFrame tail(int n) {
    // NOTE(review): for n > size this passes a negative start index to slice - confirm that
    // the series' slice implementation tolerates it.
    return this.slice(this.size() - n, this.size());
  }

  /**
   * Returns {@code true} if the DataFrame does not hold any rows. Otherwise, returns {@code false}.
   *
   * @return {@code true} if empty, {@code false} otherwise.
   */
  public boolean isEmpty() {
    return this.size() <= 0;
  }

  /**
   * Returns a deep copy of the DataFrame. Duplicates each series as well as the DataFrame itself.
   * <br/><b>NOTE:</b> use caution when applying this to large DataFrames.
   *
   * @return deep copy of DataFrame
   */
  public DataFrame copy() {
    DataFrame df = new DataFrame(this);
    // the shallow copy references the original series; replace each one by its own copy
    for(Map.Entry<String, Series> e : this.series.entrySet()) {
      df.addSeries(e.getKey(), e.getValue().copy());
    }
    return df;
  }

  /**
   * Adds a new series to the DataFrame in-place. The new series must have the same row count
   * as the DataFrame. If this is the first series added to an empty DataFrame, it determines
   * the DataFrame size. Further, {@code seriesName} must match the pattern {@code SERIES_NAME_PATTERN}.
   * If a series with {@code seriesName} already exists in the DataFrame it is replaced by
   * {@code series}.
* * @param seriesName series name * @param series series * @throws IllegalArgumentException if the series does not have the same size or the series name does not match the pattern * @return reference to the modified DataFrame (this) */ public DataFrame addSeries(String seriesName, Series series) { if(seriesName == null || !SERIES_NAME_PATTERN.matcher(seriesName).matches()) throw new IllegalArgumentException(String.format("Series name must match pattern '%s'", SERIES_NAME_PATTERN)); if(!this.series.isEmpty() && series.size() != this.size()) throw new IllegalArgumentException("DataFrame index and series must be of same length"); this.series.put(seriesName, series); return this; } /** * Adds a new series to the DataFrame in-place. Wraps {@code values} with a series before adding * it to the DataFrame with semantics similar to {@code addSeries(String seriesName, Series series)} * * @param seriesName series name * @param values series * @return reference to the modified DataFrame (this) */ public DataFrame addSeries(String seriesName, double... values) { return addSeries(seriesName, DataFrame.toSeries(values)); } /** * Adds a new series to the DataFrame in-place. Wraps {@code values} with a series before adding * it to the DataFrame with semantics similar to {@code addSeries(String seriesName, Series series)} * * @param seriesName series name * @param values series * @return reference to the modified DataFrame (this) */ public DataFrame addSeries(String seriesName, long... values) { return addSeries(seriesName, DataFrame.toSeries(values)); } /** * Adds a new series to the DataFrame in-place. Wraps {@code values} with a series before adding * it to the DataFrame with semantics similar to {@code addSeries(String seriesName, Series series)} * * @param seriesName series name * @param values series * @return reference to the modified DataFrame (this) */ public DataFrame addSeries(String seriesName, String... 
values) {
    return addSeries(seriesName, DataFrame.toSeries(values));
  }

  /**
   * Adds a new series to the DataFrame in-place. Wraps {@code values} with a series before adding
   * it to the DataFrame with semantics similar to {@code addSeries(String seriesName, Series series)}
   *
   * @param seriesName series name
   * @param values series
   * @return reference to the modified DataFrame (this)
   */
  public DataFrame addSeries(String seriesName, byte... values) {
    return addSeries(seriesName, DataFrame.toSeries(values));
  }

  /**
   * Adds a new series to the DataFrame in-place. Wraps {@code values} with a series before adding
   * it to the DataFrame with semantics similar to {@code addSeries(String seriesName, Series series)}
   *
   * @param seriesName series name
   * @param values series
   * @return reference to the modified DataFrame (this)
   */
  public DataFrame addSeries(String seriesName, boolean... values) {
    return addSeries(seriesName, DataFrame.toSeries(values));
  }

  /**
   * Removes a series from the DataFrame in-place. If the removed series is the index,
   * the index name is reset to {@code null}.
   *
   * @param seriesName name of the series to remove
   * @throws IllegalArgumentException if the series does not exist
   * @return reference to the modified DataFrame (this)
   */
  public DataFrame dropSeries(String seriesName) {
    assertSeriesExists(seriesName);
    this.series.remove(seriesName);
    if(seriesName.equals(this.indexName))
      this.indexName = null;
    return this;
  }

  /**
   * Renames a series in the DataFrame in-place. If a series with name {@code newName} already
   * exists it is replaced by the series referenced by {@code oldName}.
* * @param oldName name of existing series * @param newName new name of series * @throws IllegalArgumentException if the series referenced by {@code oldName} does not exist * @return reference to the modified DataFrame (this) */ public DataFrame renameSeries(String oldName, String newName) { Series s = assertSeriesExists(oldName); String indexName = this.indexName; this.dropSeries(oldName).addSeries(newName, s); if(oldName.equals(indexName)) this.indexName = newName; return this; } /** * Converts a series in the DataFrame to a new type. The DataFrame is modified in-place, but * the series is allocated new memory. * * @param seriesName name of existing series * @param type new native type of series * @throws IllegalArgumentException if the series does not exist * @return reference to the modified DataFrame (this) */ public DataFrame convertSeries(String seriesName, Series.SeriesType type) { this.series.put(seriesName, assertSeriesExists(seriesName).get(type)); return this; } /** * Returns the set of names of series contained in the DataFrame. * * @return series names */ public Set<String> getSeriesNames() { return Collections.unmodifiableSet(this.series.keySet()); } /** * Returns a copy of the mapping of series names to series encapsulated by this DataFrame * * @return series mapping */ public Map<String, Series> getSeries() { return Collections.unmodifiableMap(this.series); } /** * Returns the series referenced by {@code seriesName}. * * @param seriesName series name * @throws IllegalArgumentException if the series does not exist * @return series */ public Series get(String seriesName) { return assertSeriesExists(seriesName); } /** * Returns the series referenced by {@code seriesNames}. * * @param seriesNames series names * @throws IllegalArgumentException if any one series does not exist * @return series array */ public Series[] get(String... 
seriesNames) {
    Series[] series = new Series[seriesNames.length];
    int i = 0;
    // the result preserves the order of seriesNames
    for(String name : seriesNames) {
      series[i++] = assertSeriesExists(name);
    }
    return series;
  }

  /**
   * Returns {@code true} if the DataFrame contains a series {@code seriesName}. Otherwise,
   * return {@code false}.
   *
   * @param seriesName series name
   * @return {@code true} if series exists, {@code false} otherwise.
   */
  public boolean contains(String seriesName) {
    return this.series.containsKey(seriesName);
  }

  /**
   * Returns the series referenced by {@code seriesName}. If the series' native type is not
   * {@code DoubleSeries} it is converted transparently.
   *
   * @param seriesName series name
   * @throws IllegalArgumentException if the series does not exist
   * @return DoubleSeries
   */
  public DoubleSeries getDoubles(String seriesName) {
    return assertSeriesExists(seriesName).getDoubles();
  }

  /**
   * Returns the series referenced by {@code seriesName}. If the series' native type is not
   * {@code LongSeries} it is converted transparently.
   *
   * @param seriesName series name
   * @throws IllegalArgumentException if the series does not exist
   * @return LongSeries
   */
  public LongSeries getLongs(String seriesName) {
    return assertSeriesExists(seriesName).getLongs();
  }

  /**
   * Returns the series referenced by {@code seriesName}. If the series' native type is not
   * {@code StringSeries} it is converted transparently.
   *
   * @param seriesName series name
   * @throws IllegalArgumentException if the series does not exist
   * @return StringSeries
   */
  public StringSeries getStrings(String seriesName) {
    return assertSeriesExists(seriesName).getStrings();
  }

  /**
   * Returns the series referenced by {@code seriesName}. If the series' native type is not
   * {@code BooleanSeries} it is converted transparently.
*
   * @param seriesName series name
   * @throws IllegalArgumentException if the series does not exist
   * @return BooleanSeries
   */
  public BooleanSeries getBooleans(String seriesName) {
    return assertSeriesExists(seriesName).getBooleans();
  }

  /**
   * Returns the value at row {@code index} of the series referenced by {@code seriesName},
   * as returned by the series' {@code getDouble(index)}.
   *
   * @param seriesName series name
   * @param index row index
   * @return value as double
   */
  public double getDouble(String seriesName, int index) {
    return assertSeriesExists(seriesName).getDouble(index);
  }

  /**
   * Returns the value at row {@code index} of the series referenced by {@code seriesName},
   * as returned by the series' {@code getLong(index)}.
   *
   * @param seriesName series name
   * @param index row index
   * @return value as long
   */
  public long getLong(String seriesName, int index) {
    return assertSeriesExists(seriesName).getLong(index);
  }

  /**
   * Returns the value at row {@code index} of the series referenced by {@code seriesName},
   * as returned by the series' {@code getString(index)}.
   *
   * @param seriesName series name
   * @param index row index
   * @return value as String
   */
  public String getString(String seriesName, int index) {
    return assertSeriesExists(seriesName).getString(index);
  }

  /**
   * Returns the value at row {@code index} of the series referenced by {@code seriesName},
   * as returned by the series' {@code getBoolean(index)}.
   *
   * @param seriesName series name
   * @param index row index
   * @return value as byte-encoded boolean
   */
  public byte getBoolean(String seriesName, int index) {
    return assertSeriesExists(seriesName).getBoolean(index);
  }

  /**
   * Applies {@code function} to the series referenced by {@code seriesNames} row by row
   * and returns the results as a new series. The series' values are mapped to arguments
   * of {@code function} in the same order as they appear in {@code seriesNames}.
   * If the series' native types do not match the required input types of {@code function},
   * the series are converted transparently. The native type of the returned series is
   * determined by {@code function}'s output type.
   *
   * @param function function to apply to each row
   * @param seriesNames names of input series
   * @throws IllegalArgumentException if the series does not exist
   * @return series with evaluation results
   */
  public Series map(Series.Function function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public DoubleSeries map(Series.DoubleFunction function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public LongSeries map(Series.LongFunction function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public StringSeries map(Series.StringFunction function, String...
seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public BooleanSeries map(Series.BooleanFunction function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public BooleanSeries map(Series.BooleanFunctionEx function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public BooleanSeries map(Series.DoubleConditional function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public BooleanSeries map(Series.LongConditional function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public BooleanSeries map(Series.StringConditional function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public BooleanSeries map(Series.BooleanConditional function, String... seriesNames) {
    return map(function, names2series(seriesNames));
  }

  /**
   * Applies {@code function} to the series referenced by {@code seriesNames} row by row
   * and adds the result to the DataFrame as a new series with name {@code outputName}.
   * The series' values are mapped to arguments of {@code function} in the same order
   * as they appear in {@code seriesNames}.
   * If the series' native types do not match the required input types of {@code function},
   * the series are converted transparently. The native type of the returned series is
   * determined by {@code function}'s output type.
*
   * <br/><b>NOTE(review):</b> the input names are passed on unchanged; no fallback to
   * {@code outputName} for an empty {@code inputNames} is visible in this method - verify
   * before relying on it.
   *
   * @param function function to apply to each row
   * @param outputName name of output series
   * @param inputNames names of input series
   * @throws IllegalArgumentException if the series does not exist
   * @return reference to the modified DataFrame (this)
   */
  public DataFrame mapInPlace(Series.Function function, String outputName, String... inputNames) {
    return this.addSeries(outputName, map(function, names2series(inputNames)));
  }

  /**
   * Applies {@code function} to the series referenced by {@code seriesName} row by row
   * and adds the result to the DataFrame as a new series with the same name.
   * If the series' native types do not match the required input types of {@code function},
   * the series are converted transparently. The native type of the returned series is
   * determined by {@code function}'s output type.
   *
   * @param function function to apply to each row
   * @param seriesName name of series
   * @throws IllegalArgumentException if the series does not exist
   * @return reference to the modified DataFrame (this)
   */
  public DataFrame mapInPlace(Series.Function function, String seriesName) {
    return this.addSeries(seriesName, map(function, this.get(seriesName)));
  }

  /**
   * Applies {@code function} to {@code series} row by row
   * and returns the results as a new series. The series' values are mapped to arguments
   * of {@code function} in the same order as they appear in {@code series}.
   * If the series' native types do not match the required input types of {@code function},
   * the series are converted transparently. The native type of the returned series is
   * determined by {@code function}'s output type.
   *
   * @param function function to apply to each row
   * @param series input series for function
   * @throws IllegalArgumentException if the series does not exist
   * @return series with evaluation results
   */
  public static Series map(Series.Function function, Series... series) {
    return Series.map(function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
*/
  public static DoubleSeries map(Series.DoubleFunction function, Series... series) {
    // the cast to Series.Function selects the generic overload; without it this call
    // would resolve to this very method again
    return (DoubleSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static LongSeries map(Series.LongFunction function, Series... series) {
    return (LongSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static StringSeries map(Series.StringFunction function, Series... series) {
    return (StringSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static BooleanSeries map(Series.BooleanFunction function, Series... series) {
    return (BooleanSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static BooleanSeries map(Series.BooleanFunctionEx function, Series... series) {
    return (BooleanSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static BooleanSeries map(Series.DoubleConditional function, Series... series) {
    return (BooleanSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static BooleanSeries map(Series.LongConditional function, Series... series) {
    return (BooleanSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static BooleanSeries map(Series.StringConditional function, Series... series) {
    return (BooleanSeries)map((Series.Function)function, series);
  }

  /**
   * @see DataFrame#map(Series.Function, Series...)
   */
  public static BooleanSeries map(Series.BooleanConditional function, Series... series) {
    return (BooleanSeries)map((Series.Function)function, series);
  }

  /**
   * Applies {@code doubleExpression} compiled to an expression to the series referenced by
   * {@code seriesNames} row by row and returns the results as a new series.
The series' values
   * are mapped to variables in {@code doubleExpression} by series names. Only series referenced
   * by {@code seriesNames} can be referenced by the expression.
   * The series are converted to {@code DoubleSeries} transparently and the results
   * are returned as DoubleSeries as well.
   *
   * <br/><b>NOTE:</b> doubleExpression is compiled to an {@code EvalEx} expression.
   *
   * @param doubleExpression expression to be compiled and applied using EvalEx
   * @param seriesNames names of the series that are bound as variables of the expression
   * @throws IllegalArgumentException if the series does not exist
   * @return series with evaluation results
   */
  public DoubleSeries map(String doubleExpression, final String... seriesNames) {
    final Expression e = new Expression(doubleExpression);

    return (DoubleSeries)this.map(new Series.DoubleFunction() {
      @Override
      public double apply(double[] values) {
        // bind the i-th row value to the variable named after the i-th series
        for(int i=0; i<values.length; i++) {
          // NOTE(review): new BigDecimal(double) throws NumberFormatException for NaN and
          // infinite values - confirm that such values cannot reach this function.
          e.with(seriesNames[i], new BigDecimal(values[i]));
        }
        return e.eval().doubleValue();
      }
    }, seriesNames);
  }

  /**
   * Applies {@code doubleExpression} compiled to an expression to the series it references
   * row by row and returns the results as a new series. The series' values
   * are mapped to variables in {@code doubleExpression} by series names. All series contained
   * in the DataFrame can be referenced by the expression.
   * The series are converted to {@code DoubleSeries} transparently and the results
   * are returned as DoubleSeries as well.
   *
   * <br/><b>NOTE:</b> doubleExpression is compiled to an {@code EvalEx} expression.
   *
   * @param doubleExpression expression to be compiled and applied using EvalEx
   * @throws IllegalArgumentException if the series does not exist
   * @return series with evaluation results
   */
  public DoubleSeries map(String doubleExpression) {
    // the variables to bind are the series names extracted from the expression text
    Set<String> variables = extractSeriesNames(doubleExpression);
    return this.map(doubleExpression, variables.toArray(new String[variables.size()]));
  }

  /**
   * Returns a projection of the DataFrame.
   *
   * <br/><b>NOTE:</b> fromIndex <= -1 is filled with {@code null}.
* <br/><b>NOTE:</b> array with length 0 produces empty series. * * @param fromIndex array with indices to project from (must be <= series size) * @return DataFrame projection */ public DataFrame project(int[] fromIndex) { DataFrame newDataFrame = new DataFrame(this); newDataFrame.series.clear(); for(Map.Entry<String, Series> e : this.series.entrySet()) { newDataFrame.addSeries(e.getKey(), e.getValue().project(fromIndex)); } return newDataFrame; } /** * Returns a copy of the DataFrame sorted by series values referenced by {@code seriesNames}. * The resulting sorted order is the equivalent of applying a stable sort to the nth series * first, and then sorting iteratively by series until the 1st series. * * @param seriesNames 1st series, 2nd series, ..., nth series * @throws IllegalArgumentException if the series does not exist * @return sorted DataFrame copy */ public DataFrame sortedBy(String... seriesNames) { DataFrame df = this; for(int i=seriesNames.length-1; i>=0; i--) { df = df.project(assertSeriesExists(seriesNames[i]).sortedIndex()); } return df; } /** * Returns a copy of the DataFrame with the order of values in the series reversed. * * @return reversed DataFrame copy */ public DataFrame reverse() { DataFrame newDataFrame = new DataFrame(this); for(Map.Entry<String, Series> e : this.series.entrySet()) { newDataFrame.addSeries(e.getKey(), e.getValue().reverse()); } return newDataFrame; } /** * Returns a copy of the DataFrame with values resampled by {@code interval} using {@code strategy} * on the series referenced by {@code seriesName}. The method first applies an interval-based * grouping to the series and then aggregates the DataFrame using the specified strategy. If * the series referenced by {@code seriesName} is not of native type {@code LongSeries} it is * converted transparently. 
* * @param seriesName target series for resampling * @param interval resampling interval * @param strategy resampling strategy * @throws IllegalArgumentException if the series does not exist * @return resampled DataFrame copy */ public DataFrame resampledBy(String seriesName, long interval, ResamplingStrategy strategy) { DataFrame baseDataFrame = this.sortedBy(seriesName); Series.SeriesGrouping grouping = baseDataFrame.getLongs(seriesName).groupByInterval(interval); // resample series DataFrame newDataFrame = new DataFrame(this); newDataFrame.series.clear(); for(Map.Entry<String, Series> e : baseDataFrame.getSeries().entrySet()) { if(e.getKey().equals(seriesName)) continue; newDataFrame.addSeries(e.getKey(), strategy.apply(grouping, e.getValue()).get(Series.GROUP_VALUE)); } // new series newDataFrame.addSeries(seriesName, grouping.keys()); return newDataFrame; } /** * Returns a copy of the DataFrame with rows filtered by {@code series}. If the value of {@code series} * associated with a row is {@code true} the row is copied, otherwise it is set to {@code null}. * * @param series filter series * @return filtered DataFrame copy */ public DataFrame filter(BooleanSeries series) { if(series.size() != this.size()) throw new IllegalArgumentException("Series size must be equal to index size"); int[] fromIndex = new int[series.size()]; for(int i=0; i<series.size(); i++) { if(BooleanSeries.isTrue(series.values[i])) { fromIndex[i] = i; } else { fromIndex[i] = -1; } } return this.project(fromIndex); } public DataFrame filter(String seriesName) { return this.filter(this.getBooleans(seriesName)); } public DataFrame filter(Series.Conditional conditional, String... seriesNames) { return filter(conditional, names2series(seriesNames)); } public DataFrame filter(Series.Conditional conditional, Series... 
series) { return filter((BooleanSeries)Series.map(conditional, series)); } public DataFrame filterEquals(String seriesName, final double value) { return this.filter(new Series.DoubleConditional() { @Override public boolean apply(double... v) { return value == v[0]; } }, seriesName); } public DataFrame filterEquals(String seriesName, final long value) { return this.filter(new Series.LongConditional() { @Override public boolean apply(long... v) { return value == v[0]; } }, seriesName); } public DataFrame filterEquals(String seriesName, final String value) { return this.filter(new Series.StringConditional() { @Override public boolean apply(String... v) { return value.equals(v[0]); } }, seriesName); } public DataFrame filterEquals(String seriesName, final boolean value) { return this.filter(new Series.BooleanConditional() { @Override public boolean apply(boolean... v) { return value == v[0]; } }, seriesName); } /** * Returns a DataFrameGrouping based on the labels provided by {@code labels} row by row. * The size of {@code labels} must match the size of the DataFrame. * * @param labels grouping labels * @return DataFrameGrouping */ public DataFrameGrouping groupBy(Series labels) { Series.SeriesGrouping grouping = labels.groupByValue(); return new DataFrameGrouping(grouping.keys(), this, grouping.buckets); } /** * Returns a DataFrameGrouping based on the labels provided by the series referenced by * {@code seriesName} row by row. * * @param seriesName series containing grouping labels * @return DataFrameGrouping */ public DataFrameGrouping groupBy(String seriesName) { Series.SeriesGrouping grouping = this.get(seriesName).groupByValue(); return new DataFrameGrouping(seriesName, grouping.keys(), this, grouping.buckets); } /** * Returns a copy of the DataFrame omitting rows that contain a {@code null} value in any series. 
* * @return DataFrame copy without null rows */ public DataFrame dropNull() { BooleanSeries isNull = BooleanSeries.fillValues(this.size(), false); for(Series s : this.series.values()) { isNull = isNull.or(s.isNull()); } int[] fromIndex = new int[isNull.count(false)]; int countNotNull = 0; for(int i=0; i<this.size(); i++) { if(BooleanSeries.isFalse(isNull.getBoolean(i))) { fromIndex[countNotNull++] = i; } } return this.project(Arrays.copyOf(fromIndex, countNotNull)); } /** * Returns a copy of the DataFrame omitting series that contain a {@code null} value. * * @return DataFrame copy without null series */ public DataFrame dropNullColumns() { DataFrame df = new DataFrame(this); df.series.clear(); for(Map.Entry<String, Series> e : this.getSeries().entrySet()) { if(!e.getValue().hasNull()) df.addSeries(e.getKey(), e.getValue()); } return df; } /** * Returns a copy of the DataFrame with series {@code seriesName} replacing {@code null} * values with its native default value. * * @param seriesName * @return */ public DataFrame fillNull(String seriesName) { DataFrame df = new DataFrame(this); return df.addSeries(seriesName, assertSeriesExists(seriesName).fillNull()); } /* ************************************************************************** * Joins across data frames ***************************************************************************/ public DataFrame joinInner(DataFrame other) { assertIndex(this, other); return this.joinInner(other, this.getIndexName(), other.getIndexName()); } public DataFrame joinInner(DataFrame other, String onSeries) { return this.joinInner(other, onSeries, onSeries); } public DataFrame joinInner(DataFrame other, String onSeriesLeft, String onSeriesRight) { List<Series.JoinPair> pairs = this.get(onSeriesLeft).join(other.get(onSeriesRight), Series.JoinType.INNER); return DataFrame.join(this, other, pairs, onSeriesLeft, onSeriesRight); } public DataFrame joinLeft(DataFrame other) { assertIndex(this, other); return this.joinLeft(other, 
this.getIndexName(), other.getIndexName()); } public DataFrame joinLeft(DataFrame other, String onSeries) { return this.joinLeft(other, onSeries, onSeries); } public DataFrame joinLeft(DataFrame other, String onSeriesLeft, String onSeriesRight) { List<Series.JoinPair> pairs = this.get(onSeriesLeft).join(other.get(onSeriesRight), Series.JoinType.LEFT); return DataFrame.join(this, other, pairs, onSeriesLeft, onSeriesRight); } public DataFrame joinRight(DataFrame other) { assertIndex(this, other); return this.joinRight(other, this.getIndexName(), other.getIndexName()); } public DataFrame joinRight(DataFrame other, String onSeries) { return this.joinRight(other, onSeries, onSeries); } public DataFrame joinRight(DataFrame other, String onSeriesLeft, String onSeriesRight) { List<Series.JoinPair> pairs = this.get(onSeriesLeft).join(other.get(onSeriesRight), Series.JoinType.RIGHT); return DataFrame.join(this, other, pairs, onSeriesLeft, onSeriesRight); } public DataFrame joinOuter(DataFrame other) { assertIndex(this, other); return this.joinOuter(other, this.getIndexName(), other.getIndexName()); } public DataFrame joinOuter(DataFrame other, String onSeries) { return this.joinOuter(other, onSeries, onSeries); } public DataFrame joinOuter(DataFrame other, String onSeriesLeft, String onSeriesRight) { List<Series.JoinPair> pairs = this.get(onSeriesLeft).join(other.get(onSeriesRight), Series.JoinType.OUTER); return DataFrame.join(this, other, pairs, onSeriesLeft, onSeriesRight); } private static DataFrame join(DataFrame left, DataFrame right, List<Series.JoinPair> pairs, String onSeriesLeft, String onSeriesRight) { int[] fromIndexLeft = new int[pairs.size()]; int i=0; for(Series.JoinPair p : pairs) { fromIndexLeft[i++] = p.left; } int[] fromIndexRight = new int[pairs.size()]; int j=0; for(Series.JoinPair p : pairs) { fromIndexRight[j++] = p.right; } DataFrame leftData = left.project(fromIndexLeft); DataFrame rightData = right.project(fromIndexRight); Set<String> seriesLeft = 
left.getSeriesNames(); Set<String> seriesRight = right.getSeriesNames(); DataFrame joined = new DataFrame(); for(String name : seriesRight) { Series s = rightData.get(name); if(!seriesLeft.contains(name) || name.equals(onSeriesRight)) { joined.addSeries(name, s); } else { joined.addSeries(name + COLUMN_JOIN_RIGHT, s); } } for(String name : seriesLeft) { Series s = leftData.get(name); if(!seriesRight.contains(name) || name.equals(onSeriesLeft)) { joined.addSeries(name, s); } else { joined.addSeries(name + COLUMN_JOIN_LEFT, s); } } joined.setIndex(onSeriesLeft); return joined; } /** * Returns a copy of the DataFrame with data from {@code others} appended at the end. Matches * series by names and uses the native type of the original (this) DataFrame. If {@code others} * do not contain series with matching names, a sequence of {@code nulls} is appended. Any series * in {@code other} that are not matched by name are discarded. * * @param others DataFrames to append in sequence * @return copy of the DataFrame with appended data */ public DataFrame append(DataFrame... others) { DataFrame df = new DataFrame(this); df.series.clear(); for(String name : this.getSeriesNames()) { Series.Builder builder = this.get(name).getBuilder(); builder.addSeries(this.get(name)); for(DataFrame other : others) { if (other.contains(name)) { builder.addSeries(other.get(name)); } else { builder.addSeries(BooleanSeries.nulls(other.size())); } } df.addSeries(name, builder.build()); } return df; } @Override public String toString() { StringBuilder builder = new StringBuilder(); builder.append("DataFrame{\n"); builder.append(this.toString(DEFAULT_MAX_COLUMN_WIDTH, this.getSeriesNames().toArray(new String[0]))); builder.append("}"); return builder.toString(); } public String toString(int maxColumnWidth, String... 
seriesNames) { String[][] values = new String[this.size()][seriesNames.length]; int[] width = new int[seriesNames.length]; for(int i=0; i<seriesNames.length; i++) { Series s = assertSeriesExists(seriesNames[i]); width[i] = truncateToString(seriesNames[i], maxColumnWidth).length(); for(int j=0; j<this.size(); j++) { String itemValue = truncateToString(s.toString(j), maxColumnWidth); values[j][i] = itemValue; width[i] = Math.max(itemValue.length(), width[i]); } } StringBuilder sb = new StringBuilder(); // header for(int i=0; i<seriesNames.length; i++) { sb.append(String.format("%" + width[i] + "s", truncateToString(seriesNames[i], maxColumnWidth))); sb.append(" "); } sb.append("\n"); // values for(int j=0; j<this.size(); j++) { for(int i=0; i<seriesNames.length; i++) { Series s = this.get(seriesNames[i]); String item; switch(s.type()) { case DOUBLE: case LONG: case BOOLEAN: item = String.format("%" + width[i] + "s", values[j][i]); break; case STRING: item = String.format("%-" + width[i] + "s", values[j][i]); break; default: throw new IllegalArgumentException(String.format("Unknown series type '%s'", s.type())); } sb.append(item); sb.append(" "); } sb.append("\n"); } return sb.toString(); } static String truncateToString(String value, int maxWidth) { if(value.length() > maxWidth) value = value.substring(0, maxWidth - 3) + "..."; return value; } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } DataFrame dataFrame = (DataFrame) o; return series != null ? series.equals(dataFrame.series) : dataFrame.series == null; } @Override public int hashCode() { return series != null ? series.hashCode() : 0; } Series[] names2series(String... 
names) { Series[] inputSeries = new Series[names.length]; for(int i=0; i<names.length; i++) { inputSeries[i] = assertSeriesExists(names[i]); } return inputSeries; } Series assertSeriesExists(String name) { if(!series.containsKey(name)) throw new IllegalArgumentException(String.format("Unknown series '%s'", name)); return series.get(name); } void assertSameLength(Series s) { if(this.size() != s.size()) throw new IllegalArgumentException("Series size must be equals to DataFrame size"); } static void assertSameLength(Series... series) { for(int i=0; i<series.length-1; i++) { if (series[i].size() != series[i+1].size()) throw new IllegalArgumentException("Series size must be equals to DataFrame size"); } } static void assertIndex(DataFrame... dataframes) { for(DataFrame d : dataframes) if(!d.hasIndex()) throw new IllegalArgumentException("DataFrames must have a valid index"); } Set<String> extractSeriesNames(String doubleExpression) { Matcher m = SERIES_NAME_PATTERN.matcher(doubleExpression); Set<String> variables = new HashSet<>(); while(m.find()) { if(this.series.keySet().contains(m.group())) variables.add(m.group()); } return variables; } /* ************************************************************************** * DataFrame parsers ***************************************************************************/ /** * Reads in a CSV structured stream and returns it as a DataFrame. The native series type is * chosen to be as specific as possible based on the data ingested. * <br/><b>NOTE:</b> Expects the first line to contain * column headers. The column headers are transformed into series names by replacing non-word * character sequences with underscores ({@code "_"}). Leading digits in series names are also * escaped with a leading underscore. 
* * @param in input reader * @return CSV as DataFrame * @throws IOException if a read error is encountered * @throws IllegalArgumentException if the column headers cannot be transformed into valid series names */ public static DataFrame fromCsv(Reader in) throws IOException { Iterator<CSVRecord> it = CSVFormat.RFC4180.withFirstRecordAsHeader().parse(in).iterator(); if(!it.hasNext()) return new DataFrame(); CSVRecord first = it.next(); Set<String> headers = first.toMap().keySet(); // transform column headers into series names Map<String, String> header2name = new HashMap<>(); for(String h : headers) { // remove spaces String name = Pattern.compile("\\W+").matcher(h).replaceAll("_"); // underscore escape leading number if(Pattern.compile("\\A[0-9]").matcher(name).find()) name = "_" + name; if(!SERIES_NAME_PATTERN.matcher(name).matches()) { throw new IllegalArgumentException(String.format("Series name must match pattern '%s'", SERIES_NAME_PATTERN)); } header2name.put(h, name); } // read first line and initialize builders Map<String, StringSeries.Builder> builders = new HashMap<>(); for(String h : headers) { StringSeries.Builder builder = StringSeries.builder(); builder.addValues(first.get(h)); builders.put(h, builder); } while(it.hasNext()) { CSVRecord record = it.next(); for(String h : headers) { String value = record.get(h); builders.get(h).addValues(value); } } // construct dataframe and detect native data types DataFrame df = new DataFrame(); for(Map.Entry<String, StringSeries.Builder> e : builders.entrySet()) { StringSeries s = e.getValue().build(); Series conv = s.get(s.inferType()); String name = header2name.get(e.getKey()); df.addSeries(name, conv); } return df; } /** * Reads in a Pinot ResultSetGroup and returns it as a DataFrame. 
* * <br/><b>NOTE:</b> cannot parse a query result with multiple group aggregations * * @param resultSetGroup pinot query result * @return Pinot query result as DataFrame * @throws IllegalArgumentException if the result cannot be parsed */ public static DataFrame fromPinotResult(ResultSetGroup resultSetGroup) { if (resultSetGroup.getResultSetCount() <= 0) throw new IllegalArgumentException("Query did not return any results"); if (resultSetGroup.getResultSetCount() > 1) throw new IllegalArgumentException("Query returned multiple results"); ResultSet resultSet = resultSetGroup.getResultSet(0); DataFrame df = new DataFrame(); // TODO conditions not necessarily safe if(resultSet.getColumnCount() == 1 && resultSet.getRowCount() == 0) { // empty result } else if(resultSet.getColumnCount() == 1 && resultSet.getRowCount() == 1 && resultSet.getGroupKeyLength() == 0) { // aggregation result String function = resultSet.getColumnName(0); String value = resultSet.getString(0, 0); df.addSeries(function, DataFrame.toSeries(new String[] { value })); } else if(resultSet.getColumnCount() == 1 && resultSet.getGroupKeyLength() > 0) { // groupby result String function = resultSet.getColumnName(0); df.addSeries(function, makeGroupByValueSeries(resultSet)); for(int i=0; i<resultSet.getGroupKeyLength(); i++) { String groupKey = resultSet.getGroupKeyColumnName(i); df.addSeries(groupKey, makeGroupByGroupSeries(resultSet, i)); } } else if(resultSet.getColumnCount() >= 1 && resultSet.getGroupKeyLength() == 0) { // selection result for (int i = 0; i < resultSet.getColumnCount(); i++) { df.addSeries(resultSet.getColumnName(i), makeSelectionSeries(resultSet, i)); } } else { // defensive throw new IllegalStateException("Could not determine DataFrame shape from output"); } return df; } private static Series makeSelectionSeries(ResultSet resultSet, int colIndex) { int rowCount = resultSet.getRowCount(); if(rowCount <= 0) return StringSeries.empty(); //DataFrame.SeriesType type = 
inferType(resultSet.getString(0, colIndex)); String[] values = new String[rowCount]; for(int i=0; i<rowCount; i++) { values[i] = resultSet.getString(i, colIndex); } return DataFrame.toSeries(values); } private static Series makeGroupByValueSeries(ResultSet resultSet) { int rowCount = resultSet.getRowCount(); if(rowCount <= 0) return StringSeries.empty(); String[] values = new String[rowCount]; for(int i=0; i<rowCount; i++) { values[i] = resultSet.getString(i, 0); } return DataFrame.toSeries(values); } private static Series makeGroupByGroupSeries(ResultSet resultSet, int keyIndex) { int rowCount = resultSet.getRowCount(); if(rowCount <= 0) return StringSeries.empty(); String[] values = new String[rowCount]; for(int i=0; i<rowCount; i++) { values[i] = resultSet.getGroupKeyString(i, keyIndex); } return DataFrame.toSeries(values); } }