/* * Joinery -- Data frames for Java * Copyright (c) 2014, 2015 IBM Corp. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package joinery; import java.awt.Container; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.Set; import com.codahale.metrics.annotation.Timed; import joinery.impl.Aggregation; import joinery.impl.BlockManager; import joinery.impl.Combining; import joinery.impl.Comparison; import joinery.impl.Conversion; import joinery.impl.Display; import joinery.impl.Grouping; import joinery.impl.Index; import joinery.impl.Inspection; import joinery.impl.Pivoting; import joinery.impl.Selection; import joinery.impl.Serialization; import joinery.impl.Shaping; import joinery.impl.Shell; import joinery.impl.Sorting; import joinery.impl.SparseBitSet; import joinery.impl.Timeseries; import joinery.impl.Transforms; import joinery.impl.Views; /** * A data frame implementation in the spirit * of <a href="http://pandas.pydata.org">Pandas</a> or * <a 
href="http://cran.r-project.org/doc/manuals/r-release/R-intro.html#Data-frames"> * R</a> data frames. * * <p>Below is a simple motivating example. When working in Java, * data operations like the following should be easy. The code * below retrieves the S&P 500 daily market data for 2008 from * Yahoo! Finance and returns the average monthly close for * the three top months of the year.</p> * * <pre> {@code * > DataFrame.readCsv(String.format( * > "%s?s=%s&a=%d&b=%d&c=%d&d=%d&e=%d&f=%d", * > "http://real-chart.finance.yahoo.com/table.csv", * > "^GSPC", // symbol for S&P 500 * > 0, 2, 2008, // start date * > 11, 31, 2008 // end date * > )) * > .retain("Date", "Close") * > .groupBy(new KeyFunction<Object>() { * > public Object apply(List<Object> row) { * > return Date.class.cast(row.get(0)).getMonth(); * > } * > }) * > .mean() * > .sortBy("Close") * > .tail(3) * > .apply(new Function<Object, Number>() { * > public Number apply(Object value) { * > return Number.class.cast(value).intValue(); * > } * > }) * > .col("Close"); * [1370, 1378, 1403] }</pre> * * <p>Taking each step in turn: * <ol> * <li>{@link #readCsv(String)} reads csv data from files and urls</li> * <li>{@link #retain(Object...)} is used to * eliminate columns that are not needed</li> * <li>{@link #groupBy(KeyFunction)} with a key function * is used to group the rows by month</li> * <li>{@link #mean()} calculates the average close for each month</li> * <li>{@link #sortBy(Object...)} orders the rows according * to average closing price</li> * <li>{@link #tail(int)} returns the last three rows * (alternatively, sort in descending order and use head)</li> * <li>{@link #apply(Function)} is used to convert the * closing prices to integers (this is purely to ease * comparisons for verifying the results)</li> * <li>finally, {@link #col(Object)} is used to * extract the values as a list</li> * </ol> * </p> * * <p>Find more details on the * <a href="http://github.com/cardillo/joinery">github</a> * project page.</p> *
* @param <V> the type of values in this data frame
 */
public class DataFrame<V>
implements Iterable<List<V>> {
    /** Row labels of this data frame. */
    private final Index index;
    /** Column labels of this data frame. */
    private final Index columns;
    /** Storage for the values; addressed as (column, row) by the accessors below. */
    private final BlockManager<V> data;
    /** Grouping state carried by this data frame. */
    private final Grouping groups;

    /**
     * Construct an empty data frame.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>();
     * > df.isEmpty();
     * true }</pre>
     */
    public DataFrame() {
        // delegates to the list-of-columns constructor with no columns
        this(Collections.<List<V>>emptyList());
    }

    /**
     * Construct an empty data frame with the specified columns.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.columns();
     * [name, value] }</pre>
     *
     * @param columns the data frame column names.
     */
    public DataFrame(final String ... columns) {
        // widen to Object[] so the names are handed on as a List<Object>,
        // which resolves to the Collection-based constructor
        this(Arrays.asList((Object[])columns));
    }

    /**
     * Construct an empty data frame with the specified columns.
     *
     * <pre> {@code
     * > List<String> columns = new ArrayList<>();
     * > columns.add("name");
     * > columns.add("value");
     * > DataFrame<Object> df = new DataFrame<>(columns);
     * > df.columns();
     * [name, value] }</pre>
     *
     * @param columns the data frame column names.
     */
    public DataFrame(final Collection<?> columns) {
        // no row names and no data, only column names
        this(Collections.emptyList(), columns, Collections.<List<V>>emptyList());
    }

    /**
     * Construct a data frame containing the specified rows and columns.
     *
     * <pre> {@code
     * > List<String> rows = Arrays.asList("row1", "row2", "row3");
     * > List<String> columns = Arrays.asList("col1", "col2");
     * > DataFrame<Object> df = new DataFrame<>(rows, columns);
     * > df.get("row1", "col1");
     * null }</pre>
     *
     * @param index the row names
     * @param columns the column names
     */
    public DataFrame(final Collection<?> index, final Collection<?> columns) {
        // row and column names only; the data list is empty
        this(index, columns, Collections.<List<V>>emptyList());
    }

    /**
     * Construct a data frame from the specified list of columns.
* * <pre> {@code * > List<List<Object>> data = Arrays.asList( * > Arrays.<Object>asList("alpha", "bravo", "charlie"), * > Arrays.<Object>asList(1, 2, 3) * > ); * > DataFrame<Object> df = new DataFrame<>(data); * > df.row(0); * [alpha, 1] }</pre> * * @param data a list of columns containing the data elements. */ public DataFrame(final List<? extends List<? extends V>> data) { this(Collections.emptyList(), Collections.emptyList(), data); } /** * Construct a new data frame using the specified data and indices. * * @param index the row names * @param columns the column names * @param data the data */ public DataFrame(final Collection<?> index, final Collection<?> columns, final List<? extends List<? extends V>> data) { final BlockManager<V> mgr = new BlockManager<V>(data); mgr.reshape( Math.max(mgr.size(), columns.size()), Math.max(mgr.length(), index.size()) ); this.data = mgr; this.columns = new Index(columns, mgr.size()); this.index = new Index(index, mgr.length()); this.groups = new Grouping(); } private DataFrame(final Index index, final Index columns, final BlockManager<V> data, final Grouping groups) { this.index = index; this.columns = columns; this.data = data; this.groups = groups; } /** * Add new columns to the data frame. * * Any existing rows will have {@code null} values for the new columns. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>(); * > df.add("value"); * > df.columns(); * [value] }</pre> * * @param columns the new column names * @return the data frame with the columns added */ public DataFrame<V> add(final Object ... columns) { for (final Object column : columns) { final List<V> values = new ArrayList<V>(length()); for (int r = 0; r < values.size(); r++) { values.add(null); } add(column, values); } return this; } public DataFrame<V> add(final List<V> values) { return add(length(), values); } /** * Add a new column to the data frame containing the value provided. 
* * Any existing rows with indices greater than the size of the * specified column data will have {@code null} values for the new column. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>(); * > df.add("value", Arrays.<Object>asList(1)); * > df.columns(); * [value] }</pre> * * @param column the new column names * @param values the new column values * @return the data frame with the column added */ public DataFrame<V> add(final Object column, final List<V> values) { columns.add(column, data.size()); index.extend(values.size()); data.add(values); return this; } /** * Create a new data frame by leaving out the specified columns. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("name", "value", "category"); * > df.drop("category").columns(); * [name, value] }</pre> * * @param cols the names of columns to be removed * @return a shallow copy of the data frame with the columns removed */ public DataFrame<V> drop(final Object ... cols) { return drop(columns.indices(cols)); } /** * Create a new data frame by leaving out the specified columns. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("name", "value", "category"); * > df.drop(2).columns(); * [name, value] }</pre> * * @param cols the indices of the columns to be removed * @return a shallow copy of the data frame with the columns removed */ public DataFrame<V> drop(final Integer ... 
cols) { final List<Object> colnames = new ArrayList<>(columns.names()); final List<Object> todrop = new ArrayList<>(cols.length); for (final int col : cols) { todrop.add(colnames.get(col)); } colnames.removeAll(todrop); final List<List<V>> keep = new ArrayList<>(colnames.size()); for (final Object col : colnames) { keep.add(col(col)); } return new DataFrame<>( index.names(), colnames, keep ); } public DataFrame<V> dropna() { return dropna(Axis.ROWS); } public DataFrame<V> dropna(final Axis direction) { switch (direction) { case ROWS: return select(new Selection.DropNaPredicate<V>()); default: return transpose() .select(new Selection.DropNaPredicate<V>()) .transpose(); } } /** * Returns a view of the of data frame with NA's replaced with {@code fill}. * * @param fill the value used to replace missing values * @return the new data frame */ public DataFrame<V> fillna(final V fill) { return apply(new Views.FillNaFunction<V>(fill)); } /** * Create a new data frame containing only the specified columns. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("name", "value", "category"); * > df.retain("name", "category").columns(); * [name, category] }</pre> * * @param cols the columns to include in the new data frame * @return a new data frame containing only the specified columns */ public DataFrame<V> retain(final Object ... cols) { return retain(columns.indices(cols)); } /** * Create a new data frame containing only the specified columns. * * <pre> {@code * DataFrame<Object> df = new DataFrame<>("name", "value", "category"); * df.retain(0, 2).columns(); * [name, category] }</pre> * * @param cols the columns to include in the new data frame * @return a new data frame containing only the specified columns */ public DataFrame<V> retain(final Integer ... 
cols) { final Set<Integer> keep = new HashSet<Integer>(Arrays.asList(cols)); final Integer[] todrop = new Integer[size() - keep.size()]; for (int i = 0, c = 0; c < size(); c++) { if (!keep.contains(c)) { todrop[i++] = c; } } return drop(todrop); } /** * Re-index the rows of the data frame using the specified column index, * optionally dropping the column from the data. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("one", "two"); * > df.append("a", Arrays.asList("alpha", 1)); * > df.append("b", Arrays.asList("bravo", 2)); * > df.reindex(0, true) * > .index(); * [alpha, bravo] }</pre> * * @param col the column to use as the new index * @param drop true to remove the index column from the data, false otherwise * @return a new data frame with index specified */ public DataFrame<V> reindex(final Integer col, final boolean drop) { final DataFrame<V> df = Index.reindex(this, col); return drop ? df.drop(col) : df; } /** * Re-index the rows of the data frame using the specified column indices, * optionally dropping the columns from the data. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("one", "two", "three"); * > df.append("a", Arrays.asList("alpha", 1, 10)); * > df.append("b", Arrays.asList("bravo", 2, 20)); * > df.reindex(new Integer[] { 0, 1 }, true) * > .index(); * [[alpha, 1], [bravo, 2]] }</pre> * * @param cols the column to use as the new index * @param drop true to remove the index column from the data, false otherwise * @return a new data frame with index specified */ public DataFrame<V> reindex(final Integer[] cols, final boolean drop) { final DataFrame<V> df = Index.reindex(this, cols); return drop ? df.drop(cols) : df; } /** * Re-index the rows of the data frame using the specified column indices * and dropping the columns from the data. 
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("one", "two");
     * > df.append("a", Arrays.asList("alpha", 1));
     * > df.append("b", Arrays.asList("bravo", 2));
     * > df.reindex(0)
     * >   .index();
     * [alpha, bravo] }</pre>
     *
     * @param cols the columns to use as the new index
     * @return a new data frame with index specified
     */
    public DataFrame<V> reindex(final Integer ... cols) {
        // the varargs form always drops the index columns
        return reindex(cols, true);
    }

    /**
     * Re-index the rows of the data frame using the specified column name,
     * optionally dropping the column from the data.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("one", "two");
     * > df.append("a", Arrays.asList("alpha", 1));
     * > df.append("b", Arrays.asList("bravo", 2));
     * > df.reindex("one", true)
     * >   .index();
     * [alpha, bravo] }</pre>
     *
     * @param col the column to use as the new index
     * @param drop true to remove the index column from the data, false otherwise
     * @return a new data frame with index specified
     */
    public DataFrame<V> reindex(final Object col, final boolean drop) {
        // resolve the name to its position, then use the index-based overload
        return reindex(columns.get(col), drop);
    }

    /**
     * Re-index the rows of the data frame using the specified column names,
     * optionally dropping the columns from the data.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("one", "two", "three");
     * > df.append("a", Arrays.asList("alpha", 1, 10));
     * > df.append("b", Arrays.asList("bravo", 2, 20));
     * > df.reindex(new String[] { "one", "two" }, true)
     * >   .index();
     * [[alpha, 1], [bravo, 2]] }</pre>
     *
     * @param cols the columns to use as the new index
     * @param drop true to remove the index columns from the data, false otherwise
     * @return a new data frame with index specified
     */
    public DataFrame<V> reindex(final Object[] cols, final boolean drop) {
        // resolve the names to positions, then use the index-based overload
        return reindex(columns.indices(cols), drop);
    }

    /**
     * Re-index the rows of the data frame using the specified column names
     * and removing the columns from the data.
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("one", "two");
     * > df.append("a", Arrays.asList("alpha", 1));
     * > df.append("b", Arrays.asList("bravo", 2));
     * > df.reindex("one")
     * >   .index();
     * [alpha, bravo] }</pre>
     *
     * @param cols the columns to use as the new index
     * @return a new data frame with index specified
     */
    public DataFrame<V> reindex(final Object ... cols) {
        // the varargs form always drops the index columns
        return reindex(columns.indices(cols), true);
    }

    /**
     * Return a new data frame with the default index, row names will
     * be reset to the string value of their integer index.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("one", "two");
     * > df.append("a", Arrays.asList("alpha", 1));
     * > df.append("b", Arrays.asList("bravo", 2));
     * > df.resetIndex()
     * >   .index();
     * [0, 1] }</pre>
     *
     * @return a new data frame with the default index.
     */
    public DataFrame<V> resetIndex() {
        return Index.reset(this);
    }

    /**
     * Rename a single column of this data frame.
     *
     * @param old the current column name
     * @param name the new column name
     * @return this data frame
     */
    public DataFrame<V> rename(final Object old, final Object name) {
        return rename(Collections.singletonMap(old, name));
    }

    /**
     * Rename columns of this data frame by passing the mapping to the
     * column index.
     *
     * @param names a map of names, presumably from current column name
     *              to new column name (the mapping is interpreted by
     *              the column index)
     * @return this data frame
     */
    public DataFrame<V> rename(final Map<Object, Object> names) {
        columns.rename(names);
        return this;
    }

    /**
     * Append a row, given as an array, indexed by the specified name.
     *
     * @param name the row name to add to the index
     * @param row the row to append
     * @return the data frame with the new data appended
     */
    public DataFrame<V> append(final Object name, final V[] row) {
        return append(name, Arrays.asList(row));
    }

    /**
     * Append rows to the data frame.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.append(Arrays.asList("alpha", 1));
     * > df.append(Arrays.asList("bravo", 2));
     * > df.length();
     * 2 }</pre>
     *
     * @param row the row to append
     * @return the data frame with the new data appended
     */
    public DataFrame<V> append(final List<? extends V> row) {
        // unnamed rows are named after their position, the current row count
        return append(length(), row);
    }

    /**
     * Append rows indexed by the specified name to the data frame.
* * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("name", "value"); * > df.append("row1", Arrays.asList("alpha", 1)); * > df.append("row2", Arrays.asList("bravo", 2)); * > df.index(); * [row1, row2] }</pre> * * @param name the row name to add to the index * @param row the row to append * @return the data frame with the new data appended */ @Timed public DataFrame<V> append(final Object name, final List<? extends V> row) { final int len = length(); index.add(name, len); columns.extend(row.size()); data.reshape(columns.names().size(), len + 1); for (int c = 0; c < data.size(); c++) { data.set(c < row.size() ? row.get(c) : null, c, len); } return this; } /** * Reshape a data frame to the specified dimensions. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("0", "1", "2"); * > df.append("0", Arrays.asList(10, 20, 30)); * > df.append("1", Arrays.asList(40, 50, 60)); * > df.reshape(3, 2) * > .length(); * 3 }</pre> * * @param rows the number of rows the new data frame will contain * @param cols the number of columns the new data frame will contain * @return a new data frame with the specified dimensions */ public DataFrame<V> reshape(final Integer rows, final Integer cols) { return Shaping.reshape(this, rows, cols); } /** * Reshape a data frame to the specified indices. 
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("0", "1", "2");
     * > df.append("0", Arrays.asList(10, 20, 30));
     * > df.append("1", Arrays.asList(40, 50, 60));
     * > df.reshape(Arrays.asList("0", "1", "2"), Arrays.asList("0", "1"))
     * >   .length();
     * 3 }</pre>
     *
     * @param rows the names of rows the new data frame will contain
     * @param cols the names of columns the new data frame will contain
     * @return a new data frame with the specified indices
     */
    public DataFrame<V> reshape(final Collection<?> rows, final Collection<?> cols) {
        return Shaping.reshape(this, rows, cols);
    }

    /**
     * Return a new data frame created by performing a left outer join
     * of this data frame with the argument and using the row indices
     * as the join key.
     *
     * <pre> {@code
     * > DataFrame<Object> left = new DataFrame<>("a", "b");
     * > left.append("one", Arrays.asList(1, 2));
     * > left.append("two", Arrays.asList(3, 4));
     * > left.append("three", Arrays.asList(5, 6));
     * > DataFrame<Object> right = new DataFrame<>("c", "d");
     * > right.append("one", Arrays.asList(10, 20));
     * > right.append("two", Arrays.asList(30, 40));
     * > right.append("four", Arrays.asList(50, 60));
     * > left.join(right)
     * >     .index();
     * [one, two, three] }</pre>
     *
     * @param other the other data frame
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> join(final DataFrame<V> other) {
        // no key function is supplied (null); defaults to a LEFT join
        return join(other, JoinType.LEFT, null);
    }

    /**
     * Return a new data frame created by performing a join of this
     * data frame with the argument using the specified join type and
     * using the row indices as the join key.
     *
     * @param other the other data frame
     * @param join the join type
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> join(final DataFrame<V> other, final JoinType join) {
        // no key function is supplied (null)
        return join(other, join, null);
    }

    /**
     * Return a new data frame created by performing a left outer join of this
     * data frame with the argument using the specified key function.
     *
     * @param other the other data frame
     * @param on the function to generate the join keys
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> join(final DataFrame<V> other, final KeyFunction<V> on) {
        return join(other, JoinType.LEFT, on);
    }

    /**
     * Return a new data frame created by performing a join of this
     * data frame with the argument using the specified join type and
     * the specified key function.
     *
     * <p>All other {@code join} overloads delegate to this method.</p>
     *
     * @param other the other data frame
     * @param join the join type
     * @param on the function to generate the join keys
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> join(final DataFrame<V> other, final JoinType join, final KeyFunction<V> on) {
        return Combining.join(this, other, join, on);
    }

    /**
     * Return a new data frame created by performing a left outer join of
     * this data frame with the argument using the column values as the join key.
     *
     * @param other the other data frame
     * @param cols the indices of the columns to use as the join key
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> joinOn(final DataFrame<V> other, final Integer ... cols) {
        return joinOn(other, JoinType.LEFT, cols);
    }

    /**
     * Return a new data frame created by performing a join of this
     * data frame with the argument using the specified join type and
     * the column values as the join key.
     *
     * @param other the other data frame
     * @param join the join type
     * @param cols the indices of the columns to use as the join key
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> joinOn(final DataFrame<V> other, final JoinType join, final Integer ... cols) {
        return Combining.joinOn(this, other, join, cols);
    }

    /**
     * Return a new data frame created by performing a left outer join of
     * this data frame with the argument using the column values as the join key.
*
     * @param other the other data frame
     * @param cols the names of the columns to use as the join key
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> joinOn(final DataFrame<V> other, final Object ... cols) {
        return joinOn(other, JoinType.LEFT, cols);
    }

    /**
     * Return a new data frame created by performing a join of this
     * data frame with the argument using the specified join type and
     * the column values as the join key.
     *
     * @param other the other data frame
     * @param join the join type
     * @param cols the names of the columns to use as the join key
     * @return the result of the join operation as a new data frame
     */
    public final DataFrame<V> joinOn(final DataFrame<V> other, final JoinType join, final Object ... cols) {
        // the names are resolved against this data frame's columns
        return joinOn(other, join, columns.indices(cols));
    }

    /**
     * Return a new data frame created by performing a left outer join of this
     * data frame with the argument using the common, non-numeric columns
     * from each data frame as the join key.
     *
     * @param other the other data frame
     * @return the result of the merge operation as a new data frame
     */
    public final DataFrame<V> merge(final DataFrame<V> other) {
        return merge(other, JoinType.LEFT);
    }

    /**
     * Return a new data frame created by performing a join of this
     * data frame with the argument using the specified join type and
     * the common, non-numeric columns from each data frame as the join key.
     *
     * @param other the other data frame
     * @param join the join type
     * @return the result of the merge operation as a new data frame
     */
    public final DataFrame<V> merge(final DataFrame<V> other, final JoinType join) {
        return Combining.merge(this, other, join);
    }

    /**
     * Update the data frame in place by overwriting any values
     * with the non-null values provided by the data frame arguments.
     *
     * @param others the other data frames
     * @return this data frame with the overwritten values
     */
    @SafeVarargs
    public final DataFrame<V> update(final DataFrame<? extends V> ... others) {
        // same helper as coalesce(); only the boolean flag differs (true here)
        Combining.update(this, true, others);
        return this;
    }

    /**
     * Update the data frame in place by overwriting any null values with
     * any non-null values provided by the data frame arguments.
     *
     * @param others the other data frames
     * @return this data frame with the overwritten values
     */
    @SafeVarargs
    public final DataFrame<V> coalesce(final DataFrame<? extends V> ... others) {
        // same helper as update(); only the boolean flag differs (false here)
        Combining.update(this, false, others);
        return this;
    }

    /**
     * Return the size (number of columns) of the data frame.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.size();
     * 2 }</pre>
     *
     * @return the number of columns
     */
    public int size() {
        return data.size();
    }

    /**
     * Return the length (number of rows) of the data frame.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.append(Arrays.asList("alpha", 1));
     * > df.append(Arrays.asList("bravo", 2));
     * > df.append(Arrays.asList("charlie", 3));
     * > df.length();
     * 3 }</pre>
     *
     * @return the number of rows
     */
    public int length() {
        return data.length();
    }

    /**
     * Return {@code true} if the data frame contains no data.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>();
     * > df.isEmpty();
     * true }</pre>
     *
     * @return {@code true} if the data frame has no rows, regardless of
     *         how many columns it has
     */
    public boolean isEmpty() {
        return length() == 0;
    }

    /**
     * Return the index names for the data frame.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.append("row1", Arrays.asList("one", 1));
     * > df.index();
     * [row1] }</pre>
     *
     * @return the index names
     */
    public Set<Object> index() {
        return index.names();
    }

    /**
     * Return the column names for the data frame.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.columns();
     * [name, value] }</pre>
     *
     * @return the column names
     */
    public Set<Object> columns() {
        return columns.names();
    }

    /**
     * Return the value located by the (row, column) names.
* * <pre> {@code * > DataFrame<Object> df = new DataFrame<Object>( * > Arrays.asList("row1", "row2", "row3"), * > Arrays.asList("name", "value"), * > Arrays.asList( * > Arrays.asList("alpha", "bravo", "charlie"), * > Arrays.asList(10, 20, 30) * > ) * > ); * > df.get("row2", "name"); * bravo }</pre> * * @param row the row name * @param col the column name * @return the value */ public V get(final Object row, final Object col) { return get(index.get(row), columns.get(col)); } /** * Return the value located by the (row, column) coordinates. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<Object>( * > Collections.emptyList(), * > Arrays.asList("name", "value"), * > Arrays.asList( * > Arrays.asList("alpha", "bravo", "charlie"), * > Arrays.asList(10, 20, 30) * > ) * > ); * > df.get(1, 0); * bravo }</pre> * * @param row the row index * @param col the column index * @return the value */ public V get(final Integer row, final Integer col) { return data.get(col, row); } public DataFrame<V> slice(final Object rowStart, final Object rowEnd) { return slice(index.get(rowStart), index.get(rowEnd), 0, size()); } public DataFrame<V> slice(final Object rowStart, final Object rowEnd, final Object colStart, final Object colEnd) { return slice(index.get(rowStart), index.get(rowEnd), columns.get(colStart), columns.get(colEnd)); } public DataFrame<V> slice(final Integer rowStart, final Integer rowEnd) { return slice(rowStart, rowEnd, 0, size()); } public DataFrame<V> slice(final Integer rowStart, final Integer rowEnd, final Integer colStart, final Integer colEnd) { final SparseBitSet[] slice = Selection.slice(this, rowStart, rowEnd, colStart, colEnd); return new DataFrame<>( Selection.select(index, slice[0]), Selection.select(columns, slice[1]), Selection.select(data, slice[0], slice[1]), new Grouping() ); } /** * Set the value located by the names (row, column). 
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >         Arrays.asList("row1", "row2"),
     * >         Arrays.asList("col1", "col2")
     * >     );
     * > df.set("row1", "col2", new Integer(7));
     * > df.col(1);
     * [7, null] }</pre>
     *
     * @param row the row name
     * @param col the column name
     * @param value the new value
     */
    public void set(final Object row, final Object col, final V value) {
        // resolve both names to positions, then use the index-based overload
        set(index.get(row), columns.get(col), value);
    }

    /**
     * Set the value located by the coordinates (row, column).
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >         Arrays.asList("row1", "row2"),
     * >         Arrays.asList("col1", "col2")
     * >     );
     * > df.set(1, 0, new Integer(7));
     * > df.col(0);
     * [null, 7] }</pre>
     *
     * @param row the row index
     * @param col the column index
     * @param value the new value
     */
    public void set(final Integer row, final Integer col, final V value) {
        // note the argument order: the storage takes (value, column, row)
        data.set(value, col, row);
    }

    /**
     * Return a data frame column as a list.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >         Collections.emptyList(),
     * >         Arrays.asList("name", "value"),
     * >         Arrays.asList(
     * >             Arrays.<Object>asList("alpha", "bravo", "charlie"),
     * >             Arrays.<Object>asList(1, 2, 3)
     * >         )
     * >     );
     * > df.col("value");
     * [1, 2, 3] }</pre>
     *
     * @param column the column name
     * @return the list of values
     */
    public List<V> col(final Object column) {
        return col(columns.get(column));
    }

    /**
     * Return a data frame column as a list.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >         Collections.emptyList(),
     * >         Arrays.asList("name", "value"),
     * >         Arrays.asList(
     * >             Arrays.<Object>asList("alpha", "bravo", "charlie"),
     * >             Arrays.<Object>asList(1, 2, 3)
     * >         )
     * >     );
     * > df.col(1);
     * [1, 2, 3] }</pre>
     *
     * @param column the column index
     * @return the list of values
     */
    public List<V> col(final Integer column) {
        // the trailing flag is true for columns; row(Integer) passes false
        return new Views.SeriesListView<>(this, column, true);
    }

    /**
     * Return a data frame row as a list.
* * <pre> {@code * > DataFrame<Object> df = new DataFrame<>( * > Arrays.asList("row1", "row2", "row3"), * > Collections.emptyList(), * > Arrays.asList( * > Arrays.<Object>asList("alpha", "bravo", "charlie"), * > Arrays.<Object>asList(1, 2, 3) * > ) * > ); * > df.row("row2"); * [bravo, 2] }</pre> * * @param row the row name * @return the list of values */ public List<V> row(final Object row) { return row(index.get(row)); } /** * Return a data frame row as a list. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>( * > Collections.emptyList(), * > Collections.emptyList(), * > Arrays.asList( * > Arrays.<Object>asList("alpha", "bravo", "charlie"), * > Arrays.<Object>asList(1, 2, 3) * > ) * > ); * > df.row(1); * [bravo, 2] }</pre> * * @param row the row index * @return the list of values */ public List<V> row(final Integer row) { return new Views.SeriesListView<>(this, row, false); } /** * Select a subset of the data frame using a predicate function. * * <pre> {@code * > DataFrame<Object> df = new DataFrame<>("name", "value"); * > for (int i = 0; i < 10; i++) * > df.append(Arrays.asList("name" + i, i)); * > df.select(new Predicate<Object>() { * > @Override * > public Boolean apply(List<Object> values) { * > return Integer.class.cast(values.get(1)).intValue() % 2 == 0; * > } * > }) * > .col(1); * [0, 2, 4, 6, 8] } </pre> * * @param predicate a function returning true for rows to be included in the subset * @return a subset of the data frame */ public DataFrame<V> select(final Predicate<V> predicate) { final SparseBitSet selected = Selection.select(this, predicate); return new DataFrame<>( Selection.select(index, selected), columns, Selection.select(data, selected), new Grouping() ); } /** * Return a data frame containing the first ten rows of this data frame. 
* * <pre> {@code * > DataFrame<Integer> df = new DataFrame<>("value"); * > for (int i = 0; i < 20; i++) * > df.append(Arrays.asList(i)); * > df.head() * > .col("value"); * [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] }</pre> * * @return the new data frame */ public DataFrame<V> head() { return head(10); } /** * Return a data frame containing the first {@code limit} rows of this data frame. * * <pre> {@code * > DataFrame<Integer> df = new DataFrame<>("value"); * > for (int i = 0; i < 20; i++) * > df.append(Arrays.asList(i)); * > df.head(3) * > .col("value"); * [0, 1, 2] }</pre> * * @param limit the number of rows to include in the result * @return the new data frame */ public DataFrame<V> head(final int limit) { final SparseBitSet selected = new SparseBitSet(); selected.set(0, Math.min(limit, length())); return new DataFrame<>( Selection.select(index, selected), columns, Selection.select(data, selected), new Grouping() ); } /** * Return a data frame containing the last ten rows of this data frame. * * <pre> {@code * > DataFrame<Integer> df = new DataFrame<>("value"); * > for (int i = 0; i < 20; i++) * > df.append(Arrays.asList(i)); * > df.tail() * > .col("value"); * [10, 11, 12, 13, 14, 15, 16, 17, 18, 19] }</pre> * * @return the new data frame */ public DataFrame<V> tail() { return tail(10); } /** * Return a data frame containing the last {@code limit} rows of this data frame. 
* * <pre> {@code * > DataFrame<Integer> df = new DataFrame<>("value"); * > for (int i = 0; i < 20; i++) * > df.append(Arrays.asList(i)); * > df.tail(3) * > .col("value"); * [17, 18, 19] }</pre> * * @param limit the number of rows to include in the result * @return the new data frame */ public DataFrame<V> tail(final int limit) { final SparseBitSet selected = new SparseBitSet(); final int len = length(); selected.set(Math.max(len - limit, 0), len); return new DataFrame<>( Selection.select(index, selected), columns, Selection.select(data, selected), new Grouping() ); } /** * Return the values of the data frame as a flat list. * * <pre> {@code * > DataFrame<String> df = new DataFrame<>( * > Arrays.asList( * > Arrays.asList("one", "two"), * > Arrays.asList("alpha", "bravo") * > ) * > ); * > df.flatten(); * [one, two, alpha, bravo] }</pre> * * @return the list of values */ public List<V> flatten() { return new Views.FlatView<>(this); } /** * Transpose the rows and columns of the data frame. * * <pre> {@code * > DataFrame<String> df = new DataFrame<>( * > Arrays.asList( * > Arrays.asList("one", "two"), * > Arrays.asList("alpha", "bravo") * > ) * > ); * > df.transpose().flatten(); * [one, alpha, two, bravo] }</pre> * * @return a new data frame with the rows and columns transposed */ public DataFrame<V> transpose() { return new DataFrame<>( columns.names(), index.names(), new Views.ListView<>(this, true) ); } /** * Apply a function to each value in the data frame. 
*
     * <pre> {@code
     * > DataFrame<Number> df = new DataFrame<>(
     * >     Arrays.<List<Number>>asList(
     * >         Arrays.<Number>asList(1, 2),
     * >         Arrays.<Number>asList(3, 4)
     * >     )
     * > );
     * > df = df.apply(new Function<Number, Number>() {
     * >         public Number apply(Number value) {
     * >             return value.intValue() * value.intValue();
     * >         }
     * >     });
     * > df.flatten();
     * [1, 4, 9, 16] }</pre>
     *
     * @param function the function to apply
     * @return a new data frame with the function results
     */
    public <U> DataFrame<U> apply(final Function<V, U> function) {
        // index and column names are carried over; the values come from a
        // Views.TransformedView wrapping this frame and the function
        return new DataFrame<>(
                index.names(),
                columns.names(),
                new Views.TransformedView<V, U>(this, function, false)
            );
    }

    /**
     * Build a new data frame by applying a row function to every row
     * of this data frame.
     *
     * <p>Each input row is passed to {@code transform}, which returns a
     * list of output rows; every output row is appended to the result,
     * so one input row may contribute zero, one, or several rows.
     * The result has the same column names as this data frame.</p>
     *
     * @param transform the function mapping one input row to a list of output rows
     * @param <U> the value type of the resulting data frame
     * @return a new data frame containing all of the output rows
     */
    public <U> DataFrame<U> transform(final RowFunction<V, U> transform) {
        final DataFrame<U> transformed = new DataFrame<>(columns.names());
        final Iterator<Object> it = index().iterator();
        for (final List<V> row : this) {
            for (final List<U> trans : transform.apply(row)) {
                // one index name is consumed per OUTPUT row (not per input row);
                // once the names run out the current result length is used instead
                transformed.append(it.hasNext() ? it.next() : transformed.length(), trans);
            }
        }
        return transformed;
    }

    /**
     * Attempt to infer better types for object columns.
*
     * <p>The following conversions are performed where applicable:
     * <ul>
     *     <li>Floating point numbers are converted to {@code Double} values</li>
     *     <li>Whole numbers are converted to {@code Long} values</li>
     *     <li>True, false, yes, and no are converted to {@code Boolean} values</li>
     *     <li>Date strings in the following formats are converted to {@code Date} values:<br>
     *         {@literal 2000-01-01T00:00:00+1, 2000-01-01T00:00:00EST, 2000-01-01}</li>
     *     <li>Time strings in the following formats are converted to {@code Date} values:<br>
     *         {@literal 2000/01/01, 1/01/2000, 12:01:01 AM, 23:01:01, 12:01 AM, 23:01}</li>
     * </ul>
     * </p>
     *
     * <p>Note, the conversion process replaces existing values
     * with values of the converted type.</p>
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value", "date");
     * > df.append(Arrays.asList("one", "1", new Date()));
     * > df.convert();
     * > df.types();
     * [class java.lang.String, class java.lang.Long, class java.util.Date] }</pre>
     *
     * @return the data frame with the converted values
     */
    public DataFrame<V> convert() {
        // converts in place and returns this same instance for chaining
        Conversion.convert(this);
        return this;
    }

    /**
     * Attempt to infer better types for object columns, passing the
     * given numeric default and NA string through to
     * {@code Conversion.convert}.
     *
     * <p>Like {@link #convert()}, this method returns this same
     * instance rather than a copy.</p>
     *
     * @param numDefault the numeric default forwarded to the conversion
     *                   (presumably selects Long or Double for numbers; confirm in Conversion)
     * @param naString the NA string forwarded to the conversion
     *                 (presumably the text treated as a missing value; confirm in Conversion)
     * @return this data frame
     */
    public DataFrame<V> convert(final NumberDefault numDefault, final String naString) {
        Conversion.convert(this,numDefault,naString);
        return this;
    }

    /**
     * Convert columns based on the requested types.
     *
     * <p>Note, the conversion process replaces existing values
     * with values of the converted type.</p>
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("a", "b", "c");
     * > df.append(Arrays.asList("one", 1, 1.0));
     * > df.append(Arrays.asList("two", 2, 2.0));
     * > df.convert(
     * >     null,         // leave column "a" as is
     * >     Long.class,   // convert column "b" to Long
     * >     Number.class  // convert column "c" to Double
     * > );
     * > df.types();
     * [class java.lang.String, class java.lang.Long, class java.lang.Double] }</pre>
     *
     * @param columnTypes the requested type for each column, in column order;
     *                    per the example above, {@code null} leaves a column as is
     * @return the data frame with the converted values
     */
    @SafeVarargs
    public final DataFrame<V> convert(final Class<? extends V> ... columnTypes) {
        Conversion.convert(this, columnTypes);
        return this;
    }

    /**
     * Create a new data frame containing boolean values such that
     * {@code null} object references in the original data frame
     * yield {@code true} and valid references yield {@code false}.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<Object>(
     * >     Arrays.asList(
     * >         Arrays.asList("alpha", "bravo", null),
     * >         Arrays.asList(null, 2, 3)
     * >     )
     * > );
     * > df.isnull().row(0);
     * [false, true] }</pre>
     *
     * @return the new boolean data frame
     */
    public DataFrame<Boolean> isnull() {
        return Conversion.isnull(this);
    }

    /**
     * Create a new data frame containing boolean values such that
     * valid object references in the original data frame yield {@code true}
     * and {@code null} references yield {@code false}.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >     Arrays.asList(
     * >         Arrays.<Object>asList("alpha", "bravo", null),
     * >         Arrays.<Object>asList(null, 2, 3)
     * >     )
     * > );
     * > df.notnull().row(0);
     * [true, false] }</pre>
     *
     * @return the new boolean data frame
     */
    public DataFrame<Boolean> notnull() {
        return Conversion.notnull(this);
    }

    /**
     * Copy the values contained in the data frame into a
     * flat array of length {@code size() * length()}.
     *
     * @return the array
     */
    public Object[] toArray() {
        return toArray(new Object[size() * length()]);
    }

    /**
     * Copy the values contained in the data frame into the
     * specified array. If the length of the provided array is
     * less than {@code size() * length()} a
     * new array will be created.
* * @return the array */ public <U> U[] toArray(final U[] array) { return new Views.FlatView<>(this).toArray(array); } @SuppressWarnings("unchecked") public <U> U[][] toArray(final U[][] array) { if (array.length >= size() && array.length > 0 && array[0].length >= length()) { for (int c = 0; c < size(); c++) { for (int r = 0; r < length(); r++) { array[r][c] = (U)get(r, c); } } } return (U[][])toArray(array.getClass()); } /** * Copy the values of contained in the data frame into a * array of the specified type. If the type specified is * a two dimensional array, for example {@code double[][].class}, * a row-wise copy will be made. * * @throws IllegalArgumentException if the values are not assignable to the specified component type * @return the array */ public <U> U toArray(final Class<U> cls) { int dim = 0; Class<?> type = cls; while (type.getComponentType() != null) { type = type.getComponentType(); dim++; } final int size = size(); final int len = length(); if (dim == 1) { @SuppressWarnings("unchecked") final U array = (U)Array.newInstance(type, size * len); for (int c = 0; c < size; c++) { for (int r = 0; r < len; r++) { Array.set(array, c * len + r, data.get(c, r)); } } return array; } else if (dim == 2) { @SuppressWarnings("unchecked") final U array = (U)Array.newInstance(type, new int[] { len, size }); for (int r = 0; r < len; r++) { final Object aa = Array.get(array, r); for (int c = 0; c < size; c++) { Array.set(aa, c, get(r, c)); } Array.set(array, r, aa); } return array; } throw new IllegalArgumentException("class must be an array class"); } /** * Encodes the DataFrame as a model matrix, converting nominal values * to dummy variables but does not add an intercept column. * * More methods with additional parameters to control the conversion to * the model matrix are available in the <code>Conversion</code> class. 
*
     * @param fillValue value to replace NA's with
     * @return a model matrix
     */
    public double[][] toModelMatrix(final double fillValue) {
        return Conversion.toModelMatrix(this, fillValue);
    }

    /**
     * Encodes the DataFrame as a model matrix, converting nominal values
     * to dummy variables but does not add an intercept column.
     *
     * More methods with additional parameters to control the conversion to
     * the model matrix are available in the <code>Conversion</code> class.
     *
     * @return a model matrix
     */
    public DataFrame<Number> toModelMatrixDataFrame() {
        return Conversion.toModelMatrixDataFrame(this);
    }

    /**
     * Group the data frame rows by the specified column names.
     *
     * @param cols the column names
     * @return the grouped data frame
     */
    @Timed
    public DataFrame<V> groupBy(final Object ... cols) {
        // resolve the names to column indices and delegate
        return groupBy(columns.indices(cols));
    }

    /**
     * Group the data frame rows by the specified columns.
     *
     * @param cols the column indices
     * @return the grouped data frame
     */
    @Timed
    public DataFrame<V> groupBy(final Integer ... cols) {
        // the new frame is constructed from this frame's own index, columns
        // and data objects; only the Grouping is new
        return new DataFrame<>(
                index,
                columns,
                data,
                new Grouping(this, cols)
            );
    }

    /**
     * Group the data frame rows using the specified key function.
     *
     * @param function the function to reduce rows to grouping keys
     * @return the grouped data frame
     */
    @Timed
    public DataFrame<V> groupBy(final KeyFunction<V> function) {
        // as above: index, columns and data are passed through as-is
        return new DataFrame<>(
                index,
                columns,
                data,
                new Grouping(this, function)
            );
    }

    /**
     * Return the grouping currently attached to this data frame.
     *
     * @return the {@code Grouping} held by this data frame
     */
    public Grouping groups() {
        return groups;
    }

    /**
     * Return a map of group names to data frame for grouped
     * data frames. Observe that for this method to have any
     * effect a {@code groupBy} call must have been done before.
*
     * @return a map of group names to data frames
     */
    public Map<Object, DataFrame<V>> explode() {
        // LinkedHashMap keeps the groups in the order the Grouping yields them
        final Map<Object, DataFrame<V>> exploded = new LinkedHashMap<>();
        for (final Map.Entry<Object, SparseBitSet> entry : groups) {
            // each entry pairs a group key with the bit set used to select its rows
            final SparseBitSet selected = entry.getValue();
            exploded.put(entry.getKey(), new DataFrame<V>(
                    Selection.select(index, selected),
                    columns,
                    Selection.select(data, selected),
                    new Grouping()
                ));
        }
        return exploded;
    }

    /**
     * Apply an aggregate function to each group or the entire
     * data frame if the data is not grouped.
     *
     * @param function the aggregate function
     * @return the new data frame
     */
    public <U> DataFrame<V> aggregate(final Aggregate<V, U> function) {
        return groups.apply(this, function);
    }

    /**
     * Apply {@code Aggregation.Count} through the current grouping,
     * in the same way as {@link #aggregate(Aggregate)}.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> count() {
        return groups.apply(this, new Aggregation.Count<V>());
    }

    /**
     * Apply {@code Aggregation.Collapse} through the current grouping,
     * in the same way as {@link #aggregate(Aggregate)}.
     *
     * @return the new data frame
     */
    public DataFrame<V> collapse() {
        return groups.apply(this, new Aggregation.Collapse<V>());
    }

    /**
     * Apply {@code Aggregation.Unique} through the current grouping,
     * in the same way as {@link #aggregate(Aggregate)}.
     *
     * @return the new data frame
     */
    public DataFrame<V> unique() {
        return groups.apply(this, new Aggregation.Unique<V>());
    }

    /**
     * Compute the sum of the numeric columns for each group
     * or the entire data frame if the data is not grouped.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >     Collections.emptyList(),
     * >     Arrays.asList("name", "value"),
     * >     Arrays.asList(
     * >         Arrays.<Object>asList("alpha", "alpha", "alpha", "bravo", "bravo"),
     * >         Arrays.<Object>asList(1, 2, 3, 4, 5)
     * >     )
     * > );
     * > df.groupBy("name")
     * >   .sum()
     * >   .col("value");
     * [6.0, 9.0]} </pre>
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> sum() {
        return groups.apply(this, new Aggregation.Sum<V>());
    }

    /**
     * Compute the product of the numeric columns for each group
     * or the entire data frame if the data is not grouped.
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >     Collections.emptyList(),
     * >     Arrays.asList("name", "value"),
     * >     Arrays.asList(
     * >         Arrays.<Object>asList("alpha", "alpha", "alpha", "bravo", "bravo"),
     * >         Arrays.<Object>asList(1, 2, 3, 4, 5)
     * >     )
     * > );
     * > df.groupBy("name")
     * >   .prod()
     * >   .col("value");
     * [6.0, 20.0]} </pre>
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> prod() {
        return groups.apply(this, new Aggregation.Product<V>());
    }

    /**
     * Compute the mean of the numeric columns for each group
     * or the entire data frame if the data is not grouped.
     *
     * <pre> {@code
     * > DataFrame<Integer> df = new DataFrame<>("value");
     * > df.append("one", Arrays.asList(1));
     * > df.append("two", Arrays.asList(5));
     * > df.append("three", Arrays.asList(3));
     * > df.append("four", Arrays.asList(7));
     * > df.mean().col(0);
     * [4.0] }</pre>
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> mean() {
        return groups.apply(this, new Aggregation.Mean<V>());
    }

    /**
     * Compute the percentile of the numeric columns for each group
     * or the entire data frame if the data is not grouped.
     *
     * <p>NOTE(review): the example below calls {@code mean()}, not
     * {@code percentile(...)}; it appears to have been copied from
     * {@link #mean()} and does not demonstrate this method.
     * TODO replace it with a verified percentile example.</p>
     *
     * <pre> {@code
     * > DataFrame<Integer> df = new DataFrame<>("value");
     * > df.append("one", Arrays.asList(1));
     * > df.append("two", Arrays.asList(5));
     * > df.append("three", Arrays.asList(3));
     * > df.append("four", Arrays.asList(7));
     * > df.mean().col(0);
     * [4.0] }</pre>
     *
     * @param quantile the value handed to {@code Aggregation.Percentile}
     *                 (NOTE(review): confirm whether it expects 0-1 or 0-100)
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> percentile(final double quantile) {
        return groups.apply(this, new Aggregation.Percentile<V>(quantile));
    }

    /**
     * Compute the standard deviation of the numeric columns for each group
     * or the entire data frame if the data is not grouped.
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>(
     * >     Collections.emptyList(),
     * >     Arrays.asList("name", "value"),
     * >     Arrays.asList(
     * >         Arrays.<Object>asList("alpha", "alpha", "alpha", "bravo", "bravo", "bravo"),
     * >         Arrays.<Object>asList(1, 2, 3, 4, 6, 8)
     * >     )
     * > );
     * > df.groupBy("name")
     * >   .stddev()
     * >   .col("value");
     * [1.0, 2.0]} </pre>
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> stddev() {
        return groups.apply(this, new Aggregation.StdDev<V>());
    }

    /**
     * Apply {@code Aggregation.Variance} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> var() {
        return groups.apply(this, new Aggregation.Variance<V>());
    }

    /**
     * Apply {@code Aggregation.Skew} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> skew() {
        return groups.apply(this, new Aggregation.Skew<V>());
    }

    /**
     * Apply {@code Aggregation.Kurtosis} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> kurt() {
        return groups.apply(this, new Aggregation.Kurtosis<V>());
    }

    /**
     * Apply {@code Aggregation.Min} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> min() {
        return groups.apply(this, new Aggregation.Min<V>());
    }

    /**
     * Apply {@code Aggregation.Max} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> max() {
        return groups.apply(this, new Aggregation.Max<V>());
    }

    /**
     * Apply {@code Aggregation.Median} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> median() {
        return groups.apply(this, new Aggregation.Median<V>());
    }

    /**
     * Return the result of {@code Aggregation.cov} for this data frame.
     * Unlike the other statistics here, this call does not go through
     * the current grouping.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<Number> cov() {
        return Aggregation.cov(this);
    }

    /**
     * Apply {@code Transforms.CumulativeSum} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> cumsum() {
        return groups.apply(this, new Transforms.CumulativeSum<V>());
    }

    /**
     * Apply {@code Transforms.CumulativeProduct} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> cumprod() {
        return groups.apply(this, new Transforms.CumulativeProduct<V>());
    }

    /**
     * Apply {@code Transforms.CumulativeMin} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> cummin() {
        return groups.apply(this, new Transforms.CumulativeMin<V>());
    }

    /**
     * Apply {@code Transforms.CumulativeMax} to each group
     * or the entire data frame if the data is not grouped.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> cummax() {
        return groups.apply(this, new Transforms.CumulativeMax<V>());
    }

    /**
     * Apply {@code Aggregation.Describe} to each group or the entire
     * data frame, then pass the result through {@code Aggregation.describe}.
     *
     * @return the new data frame
     */
    @Timed
    public DataFrame<V> describe() {
        return Aggregation.describe(
            groups.apply(this, new Aggregation.Describe<V>()));
    }

    /**
     * Pivot using a single row column and a single column column,
     * all given by name.
     *
     * @param row the name of the column supplying the row keys
     * @param col the name of the column supplying the column keys
     * @param values the names of the value columns
     * @return the pivoted data frame
     */
    public DataFrame<V> pivot(final Object row, final Object col, final Object ... values) {
        return pivot(Collections.singletonList(row), Collections.singletonList(col), Arrays.asList(values));
    }

    /**
     * Pivot using lists of column names; the names are resolved to
     * column indices and the call is delegated to the index-based overload.
     *
     * @param rows the names of the columns supplying the row keys
     * @param cols the names of the columns supplying the column keys
     * @param values the names of the value columns
     * @return the pivoted data frame
     */
    public DataFrame<V> pivot(final List<Object> rows, final List<Object> cols, final List<Object> values) {
        return pivot(columns.indices(rows), columns.indices(cols), columns.indices(values));
    }

    /**
     * Pivot using a single row column and a single column column,
     * all given by index.
     *
     * @param row the index of the column supplying the row keys
     * @param col the index of the column supplying the column keys
     * @param values the indices of the value columns
     * @return the pivoted data frame
     */
    public DataFrame<V> pivot(final Integer row, final Integer col, final Integer ... values) {
        return pivot(new Integer[] { row }, new Integer[] { col }, values);
    }

    /**
     * Pivot using arrays of column indices; delegates to {@code Pivoting.pivot}.
     *
     * @param rows the indices of the columns supplying the row keys
     * @param cols the indices of the columns supplying the column keys
     * @param values the indices of the value columns
     * @return the pivoted data frame
     */
    @Timed
    public DataFrame<V> pivot(final Integer[] rows, final Integer[] cols, final Integer[] values) {
        return Pivoting.pivot(this, rows, cols, values);
    }

    /**
     * Pivot using key functions for the rows and columns and a map of
     * aggregates for the values; delegates to {@code Pivoting.pivot}.
     *
     * @param rows the function producing row keys
     * @param cols the function producing column keys
     * @param values aggregates keyed by integer
     *               (presumably the value column index; confirm in Pivoting)
     * @param <U> the value type of the resulting data frame
     * @return the pivoted data frame
     */
    @Timed
    public <U> DataFrame<U> pivot(final KeyFunction<V> rows, final KeyFunction<V> cols,
            final Map<Integer, Aggregate<V,U>> values) {
        return Pivoting.pivot(this, rows, cols, values);
    }

    /**
     * Sort the data frame by the specified column names.
     *
     * <p>A name given as a {@code String} starting with {@code "-"} sorts
     * that column in descending order (the leading {@code "-"} is removed
     * before the column is looked up); every other name sorts ascending.
     * Columns are applied in the order given.</p>
     *
     * @param cols the column names, optionally prefixed with {@code "-"}
     * @return the result of {@code Sorting.sort} for the requested columns
     */
    public DataFrame<V> sortBy(final Object ... cols) {
        // LinkedHashMap preserves the caller's sort-key order
        final Map<Integer, SortDirection> sortCols = new LinkedHashMap<>();
        for (final Object col : cols) {
            // non-String names can never carry the "-" prefix
            final String str = col instanceof String ? String.class.cast(col) : "";
            final SortDirection dir = str.startsWith("-") ?
                    SortDirection.DESCENDING : SortDirection.ASCENDING;
            final int c = columns.get(str.startsWith("-") ? str.substring(1) : col);
            sortCols.put(c, dir);
        }
        return Sorting.sort(this, sortCols);
    }

    /**
     * Sort the data frame by the specified column indices.
     *
     * <p>A negative index sorts the column at its absolute value in
     * descending order; a non-negative index sorts ascending. Because
     * {@code -0 == 0}, column 0 can only be sorted ascending through
     * this overload.</p>
     *
     * @param cols the column indices, negated for descending order
     * @return the result of {@code Sorting.sort} for the requested columns
     */
    @Timed
    public DataFrame<V> sortBy(final Integer ... cols) {
        final Map<Integer, SortDirection> sortCols = new LinkedHashMap<>();
        for (final int c : cols) {
            final SortDirection dir = c < 0 ?
                    SortDirection.DESCENDING : SortDirection.ASCENDING;
            sortCols.put(Math.abs(c), dir);
        }
        return Sorting.sort(this, sortCols);
    }

    /**
     * Sort the data frame rows using the specified comparator.
     *
     * @param comparator the comparator applied to rows
     * @return the result of {@code Sorting.sort} for the comparator
     */
    public DataFrame<V> sortBy(final Comparator<List<V>> comparator) {
        return Sorting.sort(this, comparator);
    }

    /**
     * Return the types for each of the data frame columns.
     *
     * @return the list of column types
     */
    public List<Class<?>> types() {
        return Inspection.types(this);
    }

    /**
     * Return a data frame containing only columns with numeric data.
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.append(Arrays.asList("one", 1));
     * > df.append(Arrays.asList("two", 2));
     * > df.numeric().columns();
     * [value] }</pre>
     *
     * @return a data frame containing only the numeric columns
     */
    public DataFrame<Number> numeric() {
        // bit set from Inspection.numeric, used to pick the column names to keep
        final SparseBitSet numeric = Inspection.numeric(this);
        final Set<Object> keep = Selection.select(columns, numeric).names();
        // cast(Number.class) is an unchecked reinterpretation of the retained frame
        return retain(keep.toArray(new Object[keep.size()]))
                .cast(Number.class);
    }

    /**
     * Return a data frame containing only columns with non-numeric data.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.append(Arrays.asList("one", 1));
     * > df.append(Arrays.asList("two", 2));
     * > df.nonnumeric().columns();
     * [name] }</pre>
     *
     * @return a data frame containing only the non-numeric columns
     */
    public DataFrame<V> nonnumeric() {
        // bit set from Inspection.nonnumeric, used to pick the column names to keep
        final SparseBitSet nonnumeric = Inspection.nonnumeric(this);
        final Set<Object> keep = Selection.select(columns, nonnumeric).names();
        return retain(keep.toArray(new Object[keep.size()]));
    }

    /**
     * Return an iterator over the rows of the data frame.  Also used
     * implicitly with {@code foreach} loops.
     *
     * <pre> {@code
     * > DataFrame<Integer> df = new DataFrame<>(
     * >     Arrays.asList(
     * >         Arrays.asList(1, 2),
     * >         Arrays.asList(3, 4)
     * >     )
     * > );
     * > List<Integer> results = new ArrayList<>();
     * > for (List<Integer> row : df)
     * >     results.add(row.get(0));
     * > results;
     * [1, 2] }</pre>
     *
     * @return an iterator over the rows of the data frame.
*/
    @Override
    public ListIterator<List<V>> iterator() {
        return iterrows();
    }

    /**
     * Return an iterator over the rows of the data frame;
     * this is the iterator returned by {@link #iterator()}.
     *
     * @return a list iterator over the rows
     */
    public ListIterator<List<V>> iterrows() {
        return new Views.ListView<>(this, true).listIterator();
    }

    /**
     * Return an iterator over the columns of the data frame.
     *
     * <p>NOTE(review): built from the same {@code Views.ListView} as
     * {@link #iterrows()} with the flag inverted; confirm the orientation
     * against {@code Views.ListView}.</p>
     *
     * @return a list iterator over the columns
     */
    public ListIterator<List<V>> itercols() {
        return new Views.ListView<>(this, false).listIterator();
    }

    /**
     * Return an iterator over the data frame that yields maps of values.
     *
     * <p>NOTE(review): presumably one map of column name to value per
     * row; confirm against {@code Views.MapView}.</p>
     *
     * @return a list iterator over maps of values
     */
    public ListIterator<Map<Object, V>> itermap() {
        return new Views.MapView<>(this, true).listIterator();
    }

    /**
     * Return an iterator over the individual values of the data frame,
     * in the order of the flat view used by {@code flatten()}.
     *
     * @return a list iterator over the values
     */
    public ListIterator<V> itervalues() {
        return new Views.FlatView<>(this).listIterator();
    }

    /**
     * Cast this data frame to the specified type.
     *
     * <p>This is an unchecked cast of this same instance: no values are
     * converted or verified, and {@code cls} is used only to select the
     * type parameter.</p>
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<>("name", "value");
     * > df.append(Arrays.asList("one", "1"));
     * > DataFrame<String> dfs = df.cast(String.class);
     * > dfs.get(0, 0).getClass().getName();
     * java.lang.String }</pre>
     *
     * @param cls the class naming the desired value type
     * @param <T> the desired value type
     * @return the data frame cast to the specified type
     */
    @SuppressWarnings("unchecked")
    public <T> DataFrame<T> cast(final Class<T> cls) {
        return (DataFrame<T>)this;
    }

    /**
     * Return a map of index names to rows.
     *
     * <pre> {@code
     * > DataFrame<Integer> df = new DataFrame<>("value");
     * > df.append("alpha", Arrays.asList(1));
     * > df.append("bravo", Arrays.asList(2));
     * > df.map();
     * {alpha=[1], bravo=[2]}}</pre>
     *
     * @return a map of index names to rows.
     */
    public Map<Object, List<V>> map() {
        final Map<Object, List<V>> m = new LinkedHashMap<Object, List<V>>();

        final int len = length();
        final Iterator<Object> names = index.names().iterator();
        for (int r = 0; r < len; r++) {
            // fall back to the row position when the index has no more names
            final Object name = names.hasNext() ? names.next() : r;
            m.put(name, row(r));
        }

        return m;
    }

    /**
     * Return a map from the values of one column to the list of values
     * of another column found in the same rows, with both columns given by name.
     *
     * @param key the name of the column supplying the map keys
     * @param value the name of the column supplying the map values
     * @return a map of key column values to lists of value column values
     */
    public Map<V, List<V>> map(final Object key, final Object value) {
        return map(columns.get(key), columns.get(value));
    }

    /**
     * Return a map from the values of one column to the list of values
     * of another column found in the same rows, with both columns given by index.
     *
     * <p>Keys appear in order of first occurrence and each list keeps
     * the row order.</p>
     *
     * @param key the index of the column supplying the map keys
     * @param value the index of the column supplying the map values
     * @return a map of key column values to lists of value column values
     */
    public Map<V, List<V>> map(final Integer key, final Integer value) {
        final Map<V, List<V>> m = new LinkedHashMap<V, List<V>>();

        final int len = length();
        for (int r = 0; r < len; r++) {
            final V name = data.get(key, r);
            List<V> values = m.get(name);
            if (values == null) {
                values = new ArrayList<V>();
                m.put(name, values);
            }
            values.add(data.get(value, r));
        }

        return m;
    }

    /**
     * Return a data frame containing only the first row for each distinct
     * combination of values in the specified columns, given by name.
     *
     * @param cols the names of the columns that make up the uniqueness key
     * @return a new data frame with the duplicate rows removed
     */
    public DataFrame<V> unique(final Object ... cols) {
        return unique(columns.indices(cols));
    }

    /**
     * Return a data frame containing only the first row for each distinct
     * combination of values in the specified columns, given by index.
     *
     * <p>The result has the same column names as this data frame and
     * keeps the rows in their original order.</p>
     *
     * @param cols the indices of the columns that make up the uniqueness key
     * @return a new data frame with the duplicate rows removed
     */
    public DataFrame<V> unique(final Integer ... cols) {
        final DataFrame<V> unique = new DataFrame<V>(columns.names());
        final Set<List<V>> seen = new HashSet<List<V>>();

        final int len = length();
        for (int r = 0; r < len; r++) {
            // A fresh key per row: elements of a HashSet must not be mutated
            // after insertion, otherwise distinct keys whose hash codes collide
            // are wrongly reported as already seen.
            final List<V> key = new ArrayList<V>(cols.length);
            for (final int c : cols) {
                key.add(data.get(c, r));
            }
            // add() returns true only the first time this key is encountered
            if (seen.add(key)) {
                unique.append(row(r));
            }
        }

        return unique;
    }

    /**
     * Equivalent to {@code diff(1)}.
     *
     * @return the result of {@code Timeseries.diff} with a period of one
     */
    public DataFrame<V> diff() {
        return diff(1);
    }

    /**
     * Return the result of {@code Timeseries.diff} for this data frame.
     *
     * @param period the period passed to {@code Timeseries.diff}
     *               (presumably the row offset to difference against; confirm in Timeseries)
     * @return the new data frame
     */
    public DataFrame<V> diff(final int period) {
        return Timeseries.diff(this, period);
    }

    /**
     * Equivalent to {@code percentChange(1)}.
     *
     * @return the result of {@code Timeseries.percentChange} with a period of one
     */
    public DataFrame<V> percentChange() {
        return percentChange(1);
    }

    /**
     * Return the result of {@code Timeseries.percentChange} for this data frame.
     *
     * @param period the period passed to {@code Timeseries.percentChange}
     * @return the new data frame
     */
    public DataFrame<V> percentChange(final int period) {
        return Timeseries.percentChange(this, period);
    }

    /**
     * Equivalent to {@code rollapply(function, 1)}.
     *
     * @param function the function applied to each list of values
     * @return the result of {@code Timeseries.rollapply} with a period of one
     */
    public DataFrame<V> rollapply(final Function<List<V>, V> function) {
        return rollapply(function, 1);
    }

    /**
     * Return the result of {@code Timeseries.rollapply} for this data frame.
     *
     * @param function the function applied to each list of values
     * @param period the period passed to {@code Timeseries.rollapply}
     *               (presumably the rolling window size; confirm in Timeseries)
     * @return the new data frame
     */
    public DataFrame<V> rollapply(final Function<List<V>, V> function, final int period) {
        return Timeseries.rollapply(this, function, period);
    }

    /**
     * Display the numeric columns of this data frame
     * as a line chart in a new swing frame.
*
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<Object>(
     * >     Collections.emptyList(),
     * >     Arrays.asList("name", "value"),
     * >     Arrays.asList(
     * >         Arrays.asList("alpha", "bravo", "charlie"),
     * >         Arrays.asList(10, 20, 30)
     * >     )
     * > );
     * > df.plot();
     * } </pre>
     *
     */
    public final void plot() {
        // same as plot(PlotType.LINE)
        plot(PlotType.LINE);
    }

    /**
     * Display the numeric columns of this data frame
     * as a chart in a new swing frame using the specified type.
     *
     * <pre> {@code
     * > DataFrame<Object> df = new DataFrame<Object>(
     * >     Collections.emptyList(),
     * >     Arrays.asList("name", "value"),
     * >     Arrays.asList(
     * >         Arrays.asList("alpha", "bravo", "charlie"),
     * >         Arrays.asList(10, 20, 30)
     * >     )
     * > );
     * > df.plot(PlotType.AREA);
     * } </pre>
     *
     * @param type the type of plot to display
     */
    public final void plot(final PlotType type) {
        Display.plot(this, type);
    }

    /**
     * Draw the numeric columns of this data frame
     * as a chart in the specified {@link Container}.
     *
     * @param container the container to use for the chart
     */
    public final void draw(final Container container) {
        // line chart by default, matching plot()
        Display.draw(this, container, PlotType.LINE);
    }

    /**
     * Draw the numeric columns of this data frame as a chart
     * in the specified {@link Container} using the specified type.
* * @param container the container to use for the chart * @param type the type of plot to draw */ public final void draw(final Container container, final PlotType type) { Display.draw(this, container, type); } public final void show() { Display.show(this); } public static final <V> DataFrame<String> compare(final DataFrame<V> df1, final DataFrame<V> df2) { return Comparison.compare(df1, df2); } public static final DataFrame<Object> readCsv(final String file) throws IOException { return Serialization.readCsv(file); } public static final DataFrame<Object> readCsv(final InputStream input) throws IOException { return Serialization.readCsv(input); } public static final DataFrame<Object> readCsv(final String file, final String separator) throws IOException { return Serialization.readCsv(file, separator, NumberDefault.LONG_DEFAULT); } public static final DataFrame<Object> readCsv(final InputStream input, final String separator) throws IOException { return Serialization.readCsv(input, separator, NumberDefault.LONG_DEFAULT, null); } public static final DataFrame<Object> readCsv(final InputStream input, final String separator, final String naString) throws IOException { return Serialization.readCsv(input, separator, NumberDefault.LONG_DEFAULT, naString); } public static final DataFrame<Object> readCsv(final InputStream input, final String separator, final String naString, final boolean hasHeader) throws IOException { return Serialization.readCsv(input, separator, NumberDefault.LONG_DEFAULT, naString, hasHeader); } public static final DataFrame<Object> readCsv(final String file, final String separator, final String naString, final boolean hasHeader) throws IOException { return Serialization.readCsv(file, separator, NumberDefault.LONG_DEFAULT, naString, hasHeader); } public static final DataFrame<Object> readCsv(final String file, final String separator, final NumberDefault numberDefault, final String naString, final boolean hasHeader) throws IOException { return 
Serialization.readCsv(file, separator, numberDefault, naString, hasHeader); } public static final DataFrame<Object> readCsv(final String file, final String separator, final NumberDefault longDefault) throws IOException { return Serialization.readCsv(file, separator, longDefault); } public static final DataFrame<Object> readCsv(final String file, final String separator, final NumberDefault longDefault, final String naString) throws IOException { return Serialization.readCsv(file, separator, longDefault, naString); } public static final DataFrame<Object> readCsv(final InputStream input, final String separator, final NumberDefault longDefault) throws IOException { return Serialization.readCsv(input, separator, longDefault, null); } public final void writeCsv(final String file) throws IOException { Serialization.writeCsv(this, new FileOutputStream(file)); } public final void writeCsv(final OutputStream output) throws IOException { Serialization.writeCsv(this, output); } public static final DataFrame<Object> readXls(final String file) throws IOException { return Serialization.readXls(file); } public static final DataFrame<Object> readXls(final InputStream input) throws IOException { return Serialization.readXls(input); } public final void writeXls(final String file) throws IOException { Serialization.writeXls(this, new FileOutputStream(file)); } public final void writeXls(final OutputStream output) throws IOException { Serialization.writeXls(this, output); } public final String toString(final int limit) { return Serialization.toString(this, limit); } @Override public String toString() { return toString(10); } /** * A function that is applied to objects (rows or values) * in a {@linkplain DataFrame data frame}. 
*
     * <p>Implementors define {@link #apply(Object)} to perform
     * the desired calculation and return the result.</p>
     *
     * @param <I> the type of the input values
     * @param <O> the type of the output values
     * @see DataFrame#apply(Function)
     * @see DataFrame#aggregate(Aggregate)
     */
    public interface Function<I, O> {
        /**
         * Perform computation on the specified
         * input value and return the result.
         *
         * @param value the input value
         * @return the result
         */
        O apply(I value);
    }

    /**
     * A function that converts one {@linkplain DataFrame data frame}
     * row into a list of output rows.
     *
     * @param <I> the type of the input values
     * @param <O> the type of the output values
     * @see DataFrame#transform(RowFunction)
     */
    public interface RowFunction<I, O> {
        /**
         * Convert the specified row into a list of output rows.
         *
         * @param values the values of the input row
         * @return the output rows, each as a list of values
         */
        List<List<O>> apply(List<I> values);
    }

    /**
     * A function that converts {@linkplain DataFrame data frame}
     * rows to index or group keys.
     *
     * <p>Implementors define {@link #apply(Object)} to accept
     * a data frame row as input and return a key value, most
     * commonly used by {@link DataFrame#groupBy(KeyFunction)}.</p>
     *
     * @param <I> the type of the input values
     * @see DataFrame#groupBy(KeyFunction)
     */
    public interface KeyFunction<I>
    extends Function<List<I>, Object> { }

    /**
     * A function that converts lists of {@linkplain DataFrame data frame}
     * values to aggregate results.
     *
     * <p>Implementors define {@link #apply(Object)} to accept
     * a list of data frame values as input and return an aggregate
     * result.</p>
     *
     * @param <I> the type of the input values
     * @param <O> the type of the result
     * @see DataFrame#aggregate(Aggregate)
     */
    public interface Aggregate<I, O>
    extends Function<List<I>, O> { }

    /**
     * An interface used to filter a {@linkplain DataFrame data frame}.
     *
     * <p>Implementors define {@link #apply(Object)} to
     * return {@code true} for rows that should be included
     * in the filtered data frame.</p>
     *
     * @param <I> the type of the input values
     * @see DataFrame#select(Predicate)
     */
    public interface Predicate<I>
    extends Function<List<I>, Boolean> { }

    /**
     * An enumeration of sort directions, used by the
     * {@code sortBy} methods to describe each sort column.
     */
    public enum SortDirection {
        ASCENDING,
        DESCENDING
    }

    /**
     * An enumeration of join types for joining data frames together.
*/
    public enum JoinType {
        INNER,
        OUTER,
        LEFT,
        RIGHT
    }

    /**
     * The chart styles available when displaying a data frame.
     */
    public enum PlotType {
        SCATTER,
        SCATTER_WITH_TREND,
        LINE,
        LINE_AND_POINTS,
        AREA,
        BAR,
        GRID,
        GRID_WITH_TREND
    }

    /**
     * The two axes of a data frame.
     */
    public enum Axis {
        ROWS,
        COLUMNS
    }

    /**
     * The numeric default selected when reading or converting data.
     */
    public static enum NumberDefault {
        LONG_DEFAULT,
        DOUBLE_DEFAULT
    }

    /**
     * Command line entry point for joinery.
     *
     * The first argument names the command and every further
     * argument is read as csv input:
     * <dl>
     *     <dt>show</dt><dd>display the specified data frame as a swing table</dd>
     *     <dt>plot</dt><dd>display the specified data frame as a chart</dd>
     *     <dt>compare</dt><dd>merge the specified data frames and output the result</dd>
     *     <dt>shell</dt><dd>launch an interactive javascript shell for exploring data</dd>
     * </dl>
     *
     * When the command is missing, unknown, or given the wrong number
     * of inputs, a usage message is printed to standard error and the
     * process exits with status 255.
     *
     * @param args file paths or urls of csv input data
     * @throws IOException if an error occurs reading input
     */
    public static final void main(final String[] args)
    throws IOException {
        // every argument after the command is loaded up front
        final List<DataFrame<Object>> frames = new ArrayList<>();
        int next = 1;
        while (next < args.length) {
            frames.add(DataFrame.readCsv(args[next]));
            next++;
        }

        // equalsIgnoreCase(null) is false, so a missing command matches nothing
        final String command = args.length > 0 ? args[0] : null;
        final int inputs = frames.size();

        if ("plot".equalsIgnoreCase(command) && inputs == 1) {
            frames.get(0).plot();
            return;
        }

        if ("show".equalsIgnoreCase(command) && inputs == 1) {
            frames.get(0).show();
            return;
        }

        if ("compare".equalsIgnoreCase(command) && inputs == 2) {
            System.out.println(DataFrame.compare(frames.get(0), frames.get(1)));
            return;
        }

        if ("shell".equalsIgnoreCase(command)) {
            Shell.repl(frames);
            return;
        }

        System.err.printf(
                "usage: %s [compare|plot|show|shell] [csv-file ...]\n",
                DataFrame.class.getCanonicalName()
            );
        System.exit(255);
    }
}