/*
* Joinery -- Data frames for Java
* Copyright (c) 2014, 2015 IBM Corp.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package joinery;
import java.awt.Container;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.Array;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import com.codahale.metrics.annotation.Timed;
import joinery.impl.Aggregation;
import joinery.impl.BlockManager;
import joinery.impl.Combining;
import joinery.impl.Comparison;
import joinery.impl.Conversion;
import joinery.impl.Display;
import joinery.impl.Grouping;
import joinery.impl.Index;
import joinery.impl.Inspection;
import joinery.impl.Pivoting;
import joinery.impl.Selection;
import joinery.impl.Serialization;
import joinery.impl.Shaping;
import joinery.impl.Shell;
import joinery.impl.Sorting;
import joinery.impl.SparseBitSet;
import joinery.impl.Timeseries;
import joinery.impl.Transforms;
import joinery.impl.Views;
/**
* A data frame implementation in the spirit
* of <a href="http://pandas.pydata.org">Pandas</a> or
* <a href="http://cran.r-project.org/doc/manuals/r-release/R-intro.html#Data-frames">
* R</a> data frames.
*
* <p>Below is a simple motivating example. When working in Java,
* data operations like the following should be easy. The code
* below retrieves the S&P 500 daily market data for 2008 from
* Yahoo! Finance and returns the average monthly close for
* the three top months of the year.</p>
*
* <pre> {@code
* > DataFrame.readCsv(String.format(
* > "%s?s=%s&a=%d&b=%d&c=%d&d=%d&e=%d&f=%d",
* > "http://real-chart.finance.yahoo.com/table.csv",
* > "^GSPC", // symbol for S&P 500
* > 0, 2, 2008, // start date
* > 11, 31, 2008 // end date
* > ))
* > .retain("Date", "Close")
* > .groupBy(new KeyFunction<Object>() {
* > public Object apply(List<Object> row) {
* > return Date.class.cast(row.get(0)).getMonth();
* > }
* > })
* > .mean()
* > .sortBy("Close")
* > .tail(3)
* > .apply(new Function<Object, Number>() {
* > public Number apply(Object value) {
* > return Number.class.cast(value).intValue();
* > }
* > })
* > .col("Close");
* [1370, 1378, 1403] }</pre>
*
* <p>Taking each step in turn:
* <ol>
* <li>{@link #readCsv(String)} reads csv data from files and urls</li>
* <li>{@link #retain(Object...)} is used to
* eliminate columns that are not needed</li>
* <li>{@link #groupBy(KeyFunction)} with a key function
* is used to group the rows by month</li>
* <li>{@link #mean()} calculates the average close for each month</li>
* <li>{@link #sortBy(Object...)} orders the rows according
* to average closing price</li>
* <li>{@link #tail(int)} returns the last three rows
* (alternatively, sort in descending order and use head)</li>
* <li>{@link #apply(Function)} is used to convert the
* closing prices to integers (this is purely to ease
* comparisons for verifying the results)</li>
* <li>finally, {@link #col(Object)} is used to
* extract the values as a list</li>
* </ol>
* </p>
*
* <p>Find more details on the
* <a href="http://github.com/cardillo/joinery">github</a>
* project page.</p>
*
* @param <V> the type of values in this data frame
*/
public class DataFrame<V>
implements Iterable<List<V>> {
// Row labels; row names are resolved to integer positions through this index.
private final Index index;
// Column labels; column names are resolved to integer positions through this index.
private final Index columns;
// Cell storage; cells are addressed as (column, row).
private final BlockManager<V> data;
// Grouping state; the primary public constructor installs a new Grouping().
private final Grouping groups;
/**
* Construct an empty data frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>();
* > df.isEmpty();
* true }</pre>
*/
public DataFrame() {
// Delegate with an empty list of column data: no rows, no columns.
this(Collections.<List<V>>emptyList());
}
/**
* Construct an empty data frame with the specified columns.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.columns();
* [name, value] }</pre>
*
* @param columns the data frame column names.
*/
public DataFrame(final String ... columns) {
// Wrap the names in a list and delegate to the collection-of-names constructor.
this(Arrays.asList((Object[])columns));
}
/**
* Construct an empty data frame with the specified columns.
*
* <pre> {@code
* > List<String> columns = new ArrayList<>();
* > columns.add("name");
* > columns.add("value");
* > DataFrame<Object> df = new DataFrame<>(columns);
* > df.columns();
* [name, value] }</pre>
*
* @param columns the data frame column names.
*/
public DataFrame(final Collection<?> columns) {
// No row names and no data; only the column names are supplied.
this(Collections.emptyList(), columns, Collections.<List<V>>emptyList());
}
/**
* Construct a data frame containing the specified rows and columns.
*
* <pre> {@code
* > List<String> rows = Arrays.asList("row1", "row2", "row3");
* > List<String> columns = Arrays.asList("col1", "col2");
* > DataFrame<Object> df = new DataFrame<>(rows, columns);
* > df.get("row1", "col1");
* null }</pre>
*
* @param index the row names
* @param columns the column names
*/
public DataFrame(final Collection<?> index, final Collection<?> columns) {
// Row and column names only; the data list is empty, so the primary
// constructor sizes the storage from the two name collections.
this(index, columns, Collections.<List<V>>emptyList());
}
/**
* Construct a data frame from the specified list of columns.
*
* <pre> {@code
* > List<List<Object>> data = Arrays.asList(
* > Arrays.<Object>asList("alpha", "bravo", "charlie"),
* > Arrays.<Object>asList(1, 2, 3)
* > );
* > DataFrame<Object> df = new DataFrame<>(data);
* > df.row(0);
* [alpha, 1] }</pre>
*
* @param data a list of columns containing the data elements.
*/
public DataFrame(final List<? extends List<? extends V>> data) {
// Data only; empty row and column name collections are passed along.
this(Collections.emptyList(), Collections.emptyList(), data);
}
/**
* Construct a new data frame using the specified data and indices.
*
* @param index the row names
* @param columns the column names
* @param data the data
*/
public DataFrame(final Collection<?> index, final Collection<?> columns,
        final List<? extends List<? extends V>> data) {
    final BlockManager<V> blocks = new BlockManager<V>(data);
    // Size the storage to cover both the supplied data and the
    // supplied row/column names, whichever is larger.
    final int width = Math.max(blocks.size(), columns.size());
    final int height = Math.max(blocks.length(), index.size());
    blocks.reshape(width, height);
    this.groups = new Grouping();
    this.data = blocks;
    // The index sizes are read after the reshape, so they use the final shape.
    this.columns = new Index(columns, blocks.size());
    this.index = new Index(index, blocks.length());
}
private DataFrame(final Index index, final Index columns, final BlockManager<V> data, final Grouping groups) {
    // Internal constructor: adopts the supplied parts as-is, no copying or reshaping.
    this.data = data;
    this.groups = groups;
    this.columns = columns;
    this.index = index;
}
/**
* Add new columns to the data frame.
*
* Any existing rows will have {@code null} values for the new columns.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>();
* > df.add("value");
* > df.columns();
* [value] }</pre>
*
* @param columns the new column names
* @return the data frame with the columns added
*/
public DataFrame<V> add(final Object ... columns) {
    for (final Object column : columns) {
        // The capacity argument only pre-sizes the backing array; the list
        // itself starts empty. The padding loop must therefore be bounded by
        // the frame's row count, not by values.size() (which starts at 0 and
        // would grow with every add).
        final int rows = length();
        final List<V> values = new ArrayList<V>(rows);
        for (int r = 0; r < rows; r++) {
            values.add(null);
        }
        // A fresh mutable list is created per column because it is handed
        // over to the frame's storage by add(Object, List).
        add(column, values);
    }
    return this;
}
/**
* Add a new, unnamed column to the data frame containing the values provided.
*
* The column is named after the current value of {@link #length()}.
* NOTE(review): naming the column after the row count rather than the
* column count looks unintentional -- confirm before relying on the name.
*
* @param values the new column values
* @return the data frame with the column added
*/
public DataFrame<V> add(final List<V> values) {
    final Object defaultName = length();
    return add(defaultName, values);
}
/**
* Add a new column to the data frame containing the value provided.
*
* Any existing rows with indices greater than the size of the
* specified column data will have {@code null} values for the new column.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>();
* > df.add("value", Arrays.<Object>asList(1));
* > df.columns();
* [value] }</pre>
*
* @param column the new column name
* @param values the new column values
* @return the data frame with the column added
*/
public DataFrame<V> add(final Object column, final List<V> values) {
    // The new column takes the next free position, i.e. the current column count.
    final int position = data.size();
    columns.add(column, position);
    // Make sure the row index covers every supplied value before storing them.
    final int rows = values.size();
    index.extend(rows);
    data.add(values);
    return this;
}
/**
* Create a new data frame by leaving out the specified columns.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value", "category");
* > df.drop("category").columns();
* [name, value] }</pre>
*
* @param cols the names of columns to be removed
* @return a shallow copy of the data frame with the columns removed
*/
public DataFrame<V> drop(final Object ... cols) {
    // Translate the names to positions, then reuse the positional overload.
    final Integer[] positions = columns.indices(cols);
    return drop(positions);
}
/**
* Create a new data frame by leaving out the specified columns.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value", "category");
* > df.drop(2).columns();
* [name, value] }</pre>
*
* @param cols the indices of the columns to be removed
* @return a shallow copy of the data frame with the columns removed
*/
public DataFrame<V> drop(final Integer ... cols) {
    // Start from all column names and strike out the ones at the given positions.
    final List<Object> remaining = new ArrayList<>(columns.names());
    final List<Object> doomed = new ArrayList<>(cols.length);
    for (int i = 0; i < cols.length; i++) {
        final int position = cols[i];
        doomed.add(remaining.get(position));
    }
    remaining.removeAll(doomed);

    // Collect the surviving columns, in their original order.
    final List<List<V>> survivors = new ArrayList<>(remaining.size());
    for (final Object name : remaining) {
        survivors.add(col(name));
    }
    return new DataFrame<>(index.names(), remaining, survivors);
}
/**
* Apply {@link #dropna(Axis)} along {@code Axis.ROWS}.
*
* @return the resulting data frame
*/
public DataFrame<V> dropna() {
    final Axis rows = Axis.ROWS;
    return dropna(rows);
}
/**
* Filter the data frame with {@code Selection.DropNaPredicate} along the
* given axis. For {@code Axis.ROWS} the predicate is applied to the rows
* directly; for any other axis the frame is transposed, filtered and
* transposed back.
* NOTE(review): by its name the predicate removes entries containing
* missing values -- confirm against {@code Selection.DropNaPredicate}.
*
* @param direction the axis along which to filter
* @return the filtered data frame
*/
public DataFrame<V> dropna(final Axis direction) {
    final Selection.DropNaPredicate<V> filter = new Selection.DropNaPredicate<V>();
    switch (direction) {
        case ROWS: {
            return select(filter);
        }
        default: {
            // Flip so columns become rows, filter, then flip back.
            final DataFrame<V> flipped = transpose();
            final DataFrame<V> filtered = flipped.select(filter);
            return filtered.transpose();
        }
    }
}
/**
* Returns a view of the data frame with NA's replaced with {@code fill}.
*
* @param fill the value used to replace missing values
* @return the new data frame
*/
public DataFrame<V> fillna(final V fill) {
    // Value-level function carrying the replacement; applied to every cell.
    final Views.FillNaFunction<V> replacer = new Views.FillNaFunction<V>(fill);
    return apply(replacer);
}
/**
* Create a new data frame containing only the specified columns.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value", "category");
* > df.retain("name", "category").columns();
* [name, category] }</pre>
*
* @param cols the columns to include in the new data frame
* @return a new data frame containing only the specified columns
*/
public DataFrame<V> retain(final Object ... cols) {
    // Translate the names to positions, then reuse the positional overload.
    final Integer[] positions = columns.indices(cols);
    return retain(positions);
}
/**
* Create a new data frame containing only the specified columns.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value", "category");
* > df.retain(0, 2).columns();
* [name, category] }</pre>
*
* @param cols the columns to include in the new data frame
* @return a new data frame containing only the specified columns
*/
public DataFrame<V> retain(final Integer ... cols) {
    // Retaining is expressed as dropping the complement of the requested positions.
    final Set<Integer> wanted = new HashSet<Integer>(Arrays.asList(cols));
    final int width = size();
    final Integer[] unwanted = new Integer[width - wanted.size()];
    int next = 0;
    for (int c = 0; c < width; c++) {
        if (wanted.contains(c)) {
            continue;
        }
        unwanted[next++] = c;
    }
    return drop(unwanted);
}
/**
* Re-index the rows of the data frame using the specified column index,
* optionally dropping the column from the data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("one", "two");
* > df.append("a", Arrays.asList("alpha", 1));
* > df.append("b", Arrays.asList("bravo", 2));
* > df.reindex(0, true)
* > .index();
* [alpha, bravo] }</pre>
*
* @param col the column to use as the new index
* @param drop true to remove the index column from the data, false otherwise
* @return a new data frame with index specified
*/
public DataFrame<V> reindex(final Integer col, final boolean drop) {
    final DataFrame<V> reindexed = Index.reindex(this, col);
    if (!drop) {
        return reindexed;
    }
    // The column now serves as the index, so remove it from the data.
    return reindexed.drop(col);
}
/**
* Re-index the rows of the data frame using the specified column indices,
* optionally dropping the columns from the data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("one", "two", "three");
* > df.append("a", Arrays.asList("alpha", 1, 10));
* > df.append("b", Arrays.asList("bravo", 2, 20));
* > df.reindex(new Integer[] { 0, 1 }, true)
* > .index();
* [[alpha, 1], [bravo, 2]] }</pre>
*
* @param cols the column to use as the new index
* @param drop true to remove the index column from the data, false otherwise
* @return a new data frame with index specified
*/
public DataFrame<V> reindex(final Integer[] cols, final boolean drop) {
    final DataFrame<V> reindexed = Index.reindex(this, cols);
    if (!drop) {
        return reindexed;
    }
    // The columns now serve as the index, so remove them from the data.
    return reindexed.drop(cols);
}
/**
* Re-index the rows of the data frame using the specified column indices
* and dropping the columns from the data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("one", "two");
* > df.append("a", Arrays.asList("alpha", 1));
* > df.append("b", Arrays.asList("bravo", 2));
* > df.reindex(0)
* > .index();
* [alpha, bravo] }</pre>
*
* @param cols the column to use as the new index
* @return a new data frame with index specified
*/
public DataFrame<V> reindex(final Integer ... cols) {
    // The varargs form always drops the key columns.
    final boolean dropKeyColumns = true;
    return reindex(cols, dropKeyColumns);
}
/**
* Re-index the rows of the data frame using the specified column name,
* optionally dropping the column from the data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("one", "two");
* > df.append("a", Arrays.asList("alpha", 1));
* > df.append("b", Arrays.asList("bravo", 2));
* > df.reindex("one", true)
* > .index();
* [alpha, bravo] }</pre>
*
* @param col the column to use as the new index
* @param drop true to remove the index column from the data, false otherwise
* @return a new data frame with index specified
*/
public DataFrame<V> reindex(final Object col, final boolean drop) {
    // Resolve the column name to its position and reuse the positional overload.
    final Integer position = columns.get(col);
    return reindex(position, drop);
}
/**
* Re-index the rows of the data frame using the specified column names,
* optionally dropping the columns from the data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("one", "two", "three");
* > df.append("a", Arrays.asList("alpha", 1, 10));
* > df.append("b", Arrays.asList("bravo", 2, 20));
* > df.reindex(new String[] { "one", "two" }, true)
* > .index();
* [[alpha, 1], [bravo, 2]] }</pre>
*
* @param cols the column to use as the new index
* @param drop true to remove the index column from the data, false otherwise
* @return a new data frame with index specified
*/
public DataFrame<V> reindex(final Object[] cols, final boolean drop) {
    // Resolve the column names to positions and reuse the positional overload.
    final Integer[] positions = columns.indices(cols);
    return reindex(positions, drop);
}
/**
* Re-index the rows of the data frame using the specified column names
* and removing the columns from the data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("one", "two");
* > df.append("a", Arrays.asList("alpha", 1));
* > df.append("b", Arrays.asList("bravo", 2));
* > df.reindex("one")
* > .index();
* [alpha, bravo] }</pre>
*
* @param cols the column to use as the new index
* @return a new data frame with index specified
*/
public DataFrame<V> reindex(final Object ... cols) {
    // Resolve names to positions; the varargs form always drops the key columns.
    final Integer[] positions = columns.indices(cols);
    final boolean dropKeyColumns = true;
    return reindex(positions, dropKeyColumns);
}
/**
* Return a new data frame with the default index; row names will
* be reset to the string value of their integer index.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("one", "two");
* > df.append("a", Arrays.asList("alpha", 1));
* > df.append("b", Arrays.asList("bravo", 2));
* > df.resetIndex()
* > .index();
* [0, 1] }</pre>
*
* @return a new data frame with the default index.
*/
public DataFrame<V> resetIndex() {
    // The work is done by Index.reset; this frame is passed in as the source.
    final DataFrame<V> reset = Index.reset(this);
    return reset;
}
/**
* Rename a single column in place.
*
* @param old the current column name
* @param name the new column name
* @return this data frame
*/
public DataFrame<V> rename(final Object old, final Object name) {
    // Express the single rename as a one-entry mapping.
    final Map<Object, Object> mapping = Collections.singletonMap(old, name);
    return rename(mapping);
}
/**
* Rename columns in place using the supplied mapping, which is handed
* to the column index.
* NOTE(review): presumably keys are current names and values are the
* replacements, as in {@link #rename(Object, Object)} -- confirm against
* {@code Index.rename}.
*
* @param names the rename mapping
* @return this data frame
*/
public DataFrame<V> rename(final Map<Object, Object> names) {
    // Only the column index changes; the data itself is untouched here.
    this.columns.rename(names);
    return this;
}
/**
* Append a row, given as an array, indexed by the specified name.
*
* @param name the row name to add to the index
* @param row the row values
* @return the data frame with the new data appended
*/
public DataFrame<V> append(final Object name, final V[] row) {
    // View the array as a list and reuse the list-based overload.
    final List<V> values = Arrays.asList(row);
    return append(name, values);
}
/**
* Append rows to the data frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.append(Arrays.asList("alpha", 1));
* > df.append(Arrays.asList("bravo", 2));
* > df.length();
* 2 }</pre>
*
* @param row the row to append
* @return the data frame with the new data appended
*/
public DataFrame<V> append(final List<? extends V> row) {
    // Unnamed rows are named after their position, i.e. the current row count.
    final int nextRow = length();
    return append(nextRow, row);
}
/**
* Append rows indexed by the specified name to the data frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.append("row1", Arrays.asList("alpha", 1));
* > df.append("row2", Arrays.asList("bravo", 2));
* > df.index();
* [row1, row2] }</pre>
*
* @param name the row name to add to the index
* @param row the row to append
* @return the data frame with the new data appended
*/
@Timed
public DataFrame<V> append(final Object name, final List<? extends V> row) {
    // The new row goes right after the last existing one.
    final int target = length();
    index.add(name, target);
    // Extend the column index to the row's width, then make room for one more row.
    columns.extend(row.size());
    data.reshape(columns.names().size(), target + 1);
    for (int c = 0; c < data.size(); c++) {
        // Columns beyond the supplied values are filled with null.
        final V cell;
        if (c < row.size()) {
            cell = row.get(c);
        } else {
            cell = null;
        }
        data.set(cell, c, target);
    }
    return this;
}
/**
* Reshape a data frame to the specified dimensions.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("0", "1", "2");
* > df.append("0", Arrays.asList(10, 20, 30));
* > df.append("1", Arrays.asList(40, 50, 60));
* > df.reshape(3, 2)
* > .length();
* 3 }</pre>
*
* @param rows the number of rows the new data frame will contain
* @param cols the number of columns the new data frame will contain
* @return a new data frame with the specified dimensions
*/
public DataFrame<V> reshape(final Integer rows, final Integer cols) {
    // Delegates to Shaping.reshape with this frame and the requested dimensions.
    final DataFrame<V> reshaped = Shaping.reshape(this, rows, cols);
    return reshaped;
}
/**
* Reshape a data frame to the specified indices.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("0", "1", "2");
* > df.append("0", Arrays.asList(10, 20, 30));
* > df.append("1", Arrays.asList(40, 50, 60));
* > df.reshape(Arrays.asList("0", "1", "2"), Arrays.asList("0", "1"))
* > .length();
* 3 }</pre>
*
* @param rows the names of rows the new data frame will contain
* @param cols the names of columns the new data frame will contain
* @return a new data frame with the specified indices
*/
public DataFrame<V> reshape(final Collection<?> rows, final Collection<?> cols) {
    // Delegates to Shaping.reshape with this frame and the requested names.
    final DataFrame<V> reshaped = Shaping.reshape(this, rows, cols);
    return reshaped;
}
/**
* Return a new data frame created by performing a left outer join
* of this data frame with the argument and using the row indices
* as the join key.
*
* <pre> {@code
* > DataFrame<Object> left = new DataFrame<>("a", "b");
* > left.append("one", Arrays.asList(1, 2));
* > left.append("two", Arrays.asList(3, 4));
* > left.append("three", Arrays.asList(5, 6));
* > DataFrame<Object> right = new DataFrame<>("c", "d");
* > right.append("one", Arrays.asList(10, 20));
* > right.append("two", Arrays.asList(30, 40));
* > right.append("four", Arrays.asList(50, 60));
* > left.join(right)
* > .index();
* [one, two, three] }</pre>
*
* @param other the other data frame
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> join(final DataFrame<V> other) {
    // Defaults: left outer join, no key function.
    final JoinType how = JoinType.LEFT;
    return join(other, how, null);
}
/**
* Return a new data frame created by performing a join of this
* data frame with the argument using the specified join type and
* using the row indices as the join key.
*
* @param other the other data frame
* @param join the join type
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> join(final DataFrame<V> other, final JoinType join) {
    // No key function is supplied.
    final KeyFunction<V> noKeyFunction = null;
    return join(other, join, noKeyFunction);
}
/**
* Return a new data frame created by performing a left outer join of this
* data frame with the argument using the specified key function.
*
* @param other the other data frame
* @param on the function to generate the join keys
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> join(final DataFrame<V> other, final KeyFunction<V> on) {
    // Default join type is a left outer join.
    final JoinType how = JoinType.LEFT;
    return join(other, how, on);
}
/**
* Return a new data frame created by performing a join of this
* data frame with the argument using the specified join type and
* the specified key function.
*
* @param other the other data frame
* @param join the join type
* @param on the function to generate the join keys
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> join(final DataFrame<V> other, final JoinType join, final KeyFunction<V> on) {
    // All join overloads end up here; Combining.join does the work.
    final DataFrame<V> joined = Combining.join(this, other, join, on);
    return joined;
}
/**
* Return a new data frame created by performing a left outer join of
* this data frame with the argument using the column values as the join key.
*
* @param other the other data frame
* @param cols the indices of the columns to use as the join key
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> joinOn(final DataFrame<V> other, final Integer ... cols) {
    // Default join type is a left outer join.
    final JoinType how = JoinType.LEFT;
    return joinOn(other, how, cols);
}
/**
* Return a new data frame created by performing a join of this
* data frame with the argument using the specified join type and
* the column values as the join key.
*
* @param other the other data frame
* @param join the join type
* @param cols the indices of the columns to use as the join key
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> joinOn(final DataFrame<V> other, final JoinType join, final Integer ... cols) {
    // All joinOn overloads end up here; Combining.joinOn does the work.
    final DataFrame<V> joined = Combining.joinOn(this, other, join, cols);
    return joined;
}
/**
* Return a new data frame created by performing a left outer join of
* this data frame with the argument using the column values as the join key.
*
* @param other the other data frame
* @param cols the names of the columns to use as the join key
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> joinOn(final DataFrame<V> other, final Object ... cols) {
    // Default join type is a left outer join.
    final JoinType how = JoinType.LEFT;
    return joinOn(other, how, cols);
}
/**
* Return a new data frame created by performing a join of this
* data frame with the argument using the specified join type and
* the column values as the join key.
*
* @param other the other data frame
* @param join the join type
* @param cols the names of the columns to use as the join key
* @return the result of the join operation as a new data frame
*/
public final DataFrame<V> joinOn(final DataFrame<V> other, final JoinType join, final Object ... cols) {
    // Names are resolved against this frame's columns, then the positional overload is used.
    final Integer[] positions = columns.indices(cols);
    return joinOn(other, join, positions);
}
/**
* Return a new data frame created by performing a left outer join of this
* data frame with the argument using the common, non-numeric columns
* from each data frame as the join key.
*
* @param other the other data frame
* @return the result of the merge operation as a new data frame
*/
public final DataFrame<V> merge(final DataFrame<V> other) {
    // Default join type is a left outer join.
    final JoinType how = JoinType.LEFT;
    return merge(other, how);
}
/**
* Return a new data frame created by performing a join of this
* data frame with the argument using the specified join type and
* the common, non-numeric columns from each data frame as the join key.
*
* @param other the other data frame
* @param join the join type
* @return the result of the merge operation as a new data frame
*/
public final DataFrame<V> merge(final DataFrame<V> other, final JoinType join) {
    // Combining.merge does the work, including choosing the key columns.
    final DataFrame<V> merged = Combining.merge(this, other, join);
    return merged;
}
/**
* Update the data frame in place by overwriting any values
* with the non-null values provided by the data frame arguments.
*
* @param others the other data frames
* @return this data frame with the overwritten values
*/
@SafeVarargs
public final DataFrame<V> update(final DataFrame<? extends V> ... others) {
    // Flag is true here and false in coalesce(); both share Combining.update.
    final boolean overwriteExisting = true;
    Combining.update(this, overwriteExisting, others);
    return this;
}
/**
* Update the data frame in place by overwriting any null values with
* any non-null values provided by the data frame arguments.
*
* @param others the other data frames
* @return this data frame with the overwritten values
*/
@SafeVarargs
public final DataFrame<V> coalesce(final DataFrame<? extends V> ... others) {
    // Flag is false here and true in update(); both share Combining.update.
    final boolean overwriteExisting = false;
    Combining.update(this, overwriteExisting, others);
    return this;
}
/**
* Return the size (number of columns) of the data frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.size();
* 2 }</pre>
*
* @return the number of columns
*/
public int size() {
    // The column count is whatever the underlying storage reports as its size.
    final int columnCount = data.size();
    return columnCount;
}
/**
* Return the length (number of rows) of the data frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.append(Arrays.asList("alpha", 1));
* > df.append(Arrays.asList("bravo", 2));
* > df.append(Arrays.asList("charlie", 3));
* > df.length();
* 3 }</pre>
*
* @return the number of rows
*/
public int length() {
    // The row count is whatever the underlying storage reports as its length.
    final int rowCount = data.length();
    return rowCount;
}
/**
* Return {@code true} if the data frame contains no data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>();
* > df.isEmpty();
* true }</pre>
*
* @return {@code true} if the data frame has no rows, {@code false} otherwise
*/
public boolean isEmpty() {
    // Emptiness is judged by row count only; columns are not considered.
    final int rowCount = length();
    return rowCount == 0;
}
/**
* Return the index names for the data frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.append("row1", Arrays.asList("one", 1));
* > df.index();
* [row1] }</pre>
*
* @return the index names
*/
public Set<Object> index() {
    // Returns whatever the row index exposes as its names; no copy is made here.
    final Set<Object> rowNames = index.names();
    return rowNames;
}
/**
* Return the column names for the data frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.columns();
* [name, value] }</pre>
*
* @return the column names
*/
public Set<Object> columns() {
    // Returns whatever the column index exposes as its names; no copy is made here.
    final Set<Object> columnNames = columns.names();
    return columnNames;
}
/**
* Return the value located by the (row, column) names.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<Object>(
* > Arrays.asList("row1", "row2", "row3"),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.asList("alpha", "bravo", "charlie"),
* > Arrays.asList(10, 20, 30)
* > )
* > );
* > df.get("row2", "name");
* bravo }</pre>
*
* @param row the row name
* @param col the column name
* @return the value
*/
public V get(final Object row, final Object col) {
    // Resolve both names to positions, then read by coordinates.
    final Integer rowPos = index.get(row);
    final Integer colPos = columns.get(col);
    return get(rowPos, colPos);
}
/**
* Return the value located by the (row, column) coordinates.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<Object>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.asList("alpha", "bravo", "charlie"),
* > Arrays.asList(10, 20, 30)
* > )
* > );
* > df.get(1, 0);
* bravo }</pre>
*
* @param row the row index
* @param col the column index
* @return the value
*/
public V get(final Integer row, final Integer col) {
    // Storage is addressed column-first, so the arguments are swapped.
    final V value = data.get(col, row);
    return value;
}
/**
* Slice the data frame by row names, keeping all columns.
*
* The row names are resolved to positions and passed, together with the
* column range {@code 0} to {@link #size()}, to the positional overload.
*
* @param rowStart the name of the first row of the slice
* @param rowEnd the name of the row marking the end of the slice
* @return the sliced data frame
*/
public DataFrame<V> slice(final Object rowStart, final Object rowEnd) {
    final Integer from = index.get(rowStart);
    final Integer to = index.get(rowEnd);
    return slice(from, to, 0, size());
}
/**
* Slice the data frame by row and column names.
*
* All four names are resolved to positions and passed to the
* positional overload.
*
* @param rowStart the name of the first row of the slice
* @param rowEnd the name of the row marking the end of the slice
* @param colStart the name of the first column of the slice
* @param colEnd the name of the column marking the end of the slice
* @return the sliced data frame
*/
public DataFrame<V> slice(final Object rowStart, final Object rowEnd, final Object colStart, final Object colEnd) {
    final Integer rowFrom = index.get(rowStart);
    final Integer rowTo = index.get(rowEnd);
    final Integer colFrom = columns.get(colStart);
    final Integer colTo = columns.get(colEnd);
    return slice(rowFrom, rowTo, colFrom, colTo);
}
/**
* Slice the data frame by row positions, keeping all columns.
*
* The column range passed along is {@code 0} to {@link #size()}.
*
* @param rowStart the position of the first row of the slice
* @param rowEnd the position marking the end of the slice
* @return the sliced data frame
*/
public DataFrame<V> slice(final Integer rowStart, final Integer rowEnd) {
    final int allColumns = size();
    return slice(rowStart, rowEnd, 0, allColumns);
}
/**
* Slice the data frame by row and column positions.
*
* The result is a new data frame with a new, separate {@code Grouping}.
* NOTE(review): whether the end positions are inclusive or exclusive is
* decided by {@code Selection.slice} -- confirm there.
*
* @param rowStart the position of the first row of the slice
* @param rowEnd the position marking the end of the row range
* @param colStart the position of the first column of the slice
* @param colEnd the position marking the end of the column range
* @return the sliced data frame
*/
public DataFrame<V> slice(final Integer rowStart, final Integer rowEnd, final Integer colStart, final Integer colEnd) {
    // Element 0 selects rows, element 1 selects columns.
    final SparseBitSet[] selected = Selection.slice(this, rowStart, rowEnd, colStart, colEnd);
    final SparseBitSet rowBits = selected[0];
    final SparseBitSet colBits = selected[1];
    return new DataFrame<>(
        Selection.select(index, rowBits),
        Selection.select(columns, colBits),
        Selection.select(data, rowBits, colBits),
        new Grouping()
    );
}
/**
* Set the value located by the names (row, column).
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Arrays.asList("row1", "row2"),
* > Arrays.asList("col1", "col2")
* > );
* > df.set("row1", "col2", new Integer(7));
* > df.col(1);
* [7, null] }</pre>
*
* @param row the row name
* @param col the column name
* @param value the new value
*/
public void set(final Object row, final Object col, final V value) {
    // Resolve both names to positions, then write by coordinates.
    final Integer rowPos = index.get(row);
    final Integer colPos = columns.get(col);
    set(rowPos, colPos, value);
}
/**
* Set the value located by the coordinates (row, column).
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Arrays.asList("row1", "row2"),
* > Arrays.asList("col1", "col2")
* > );
* > df.set(1, 0, new Integer(7));
* > df.col(0);
* [null, 7] }</pre>
*
* @param row the row index
* @param col the column index
* @param value the new value
*/
public void set(final Integer row, final Integer col, final V value) {
    // Storage takes the value first and is addressed column-first.
    this.data.set(value, col, row);
}
/**
* Return a data frame column as a list.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "bravo", "charlie"),
* > Arrays.<Object>asList(1, 2, 3)
* > )
* > );
* > df.col("value");
* [1, 2, 3] }</pre>
*
* @param column the column name
* @return the list of values
*/
public List<V> col(final Object column) {
    // Resolve the name to a position and reuse the positional overload.
    final Integer position = columns.get(column);
    return col(position);
}
/**
* Return a data frame column as a list.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "bravo", "charlie"),
* > Arrays.<Object>asList(1, 2, 3)
* > )
* > );
* > df.col(1);
* [1, 2, 3] }</pre>
*
* @param column the column index
* @return the list of values
*/
public List<V> col(final Integer column) {
    // The flag is true for columns; row(Integer) passes false.
    final List<V> series = new Views.SeriesListView<>(this, column, true);
    return series;
}
/**
* Return a data frame row as a list.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Arrays.asList("row1", "row2", "row3"),
* > Collections.emptyList(),
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "bravo", "charlie"),
* > Arrays.<Object>asList(1, 2, 3)
* > )
* > );
* > df.row("row2");
* [bravo, 2] }</pre>
*
* @param row the row name
* @return the list of values
*/
public List<V> row(final Object row) {
    // Resolve the name to a position and reuse the positional overload.
    final Integer position = index.get(row);
    return row(position);
}
/**
* Return a data frame row as a list.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Collections.emptyList(),
* > Collections.emptyList(),
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "bravo", "charlie"),
* > Arrays.<Object>asList(1, 2, 3)
* > )
* > );
* > df.row(1);
* [bravo, 2] }</pre>
*
* @param row the row index
* @return the list of values
*/
public List<V> row(final Integer row) {
    // The flag is false for rows; col(Integer) passes true.
    final List<V> series = new Views.SeriesListView<>(this, row, false);
    return series;
}
/**
* Select a subset of the data frame using a predicate function.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > for (int i = 0; i < 10; i++)
* > df.append(Arrays.asList("name" + i, i));
* > df.select(new Predicate<Object>() {
* > @Override
* > public Boolean apply(List<Object> values) {
* > return Integer.class.cast(values.get(1)).intValue() % 2 == 0;
* > }
* > })
* > .col(1);
* [0, 2, 4, 6, 8] } </pre>
*
* @param predicate a function returning true for rows to be included in the subset
* @return a subset of the data frame
*/
public DataFrame<V> select(final Predicate<V> predicate) {
    // Bit set of the rows picked by the predicate.
    final SparseBitSet matches = Selection.select(this, predicate);
    // Rows are filtered; the column index is passed through unchanged and
    // the result gets a new, separate Grouping.
    return new DataFrame<>(
        Selection.select(this.index, matches),
        this.columns,
        Selection.select(this.data, matches),
        new Grouping()
    );
}
/**
* Return a data frame containing the first ten rows of this data frame.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>("value");
* > for (int i = 0; i < 20; i++)
* > df.append(Arrays.asList(i));
* > df.head()
* > .col("value");
* [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] }</pre>
*
* @return the new data frame
*/
public DataFrame<V> head() {
    // Default number of leading rows.
    final int defaultLimit = 10;
    return head(defaultLimit);
}
/**
* Return a data frame containing the first {@code limit} rows of this data frame.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>("value");
* > for (int i = 0; i < 20; i++)
* > df.append(Arrays.asList(i));
* > df.head(3)
* > .col("value");
* [0, 1, 2] }</pre>
*
* @param limit the number of rows to include in the result
* @return the new data frame
*/
public DataFrame<V> head(final int limit) {
    // Never select past the last row when limit exceeds the row count.
    final int stop = Math.min(limit, length());
    final SparseBitSet leading = new SparseBitSet();
    leading.set(0, stop);
    return new DataFrame<>(
        Selection.select(this.index, leading),
        this.columns,
        Selection.select(this.data, leading),
        new Grouping()
    );
}
/**
* Return a data frame containing the last ten rows of this data frame.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>("value");
* > for (int i = 0; i < 20; i++)
* > df.append(Arrays.asList(i));
* > df.tail()
* > .col("value");
* [10, 11, 12, 13, 14, 15, 16, 17, 18, 19] }</pre>
*
* @return the new data frame
*/
public DataFrame<V> tail() {
    // default number of trailing rows
    final int defaultLimit = 10;
    return tail(defaultLimit);
}
/**
* Return a data frame containing the last {@code limit} rows of this data frame.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>("value");
* > for (int i = 0; i < 20; i++)
* > df.append(Arrays.asList(i));
* > df.tail(3)
* > .col("value");
* [17, 18, 19] }</pre>
*
* @param limit the number of rows to include in the result
* @return the new data frame
*/
public DataFrame<V> tail(final int limit) {
    final SparseBitSet trailing = new SparseBitSet();
    final int end = length();
    // never start before the first row
    final int start = Math.max(end - limit, 0);
    trailing.set(start, end);
    return new DataFrame<>(
            Selection.select(index, trailing),
            columns,
            Selection.select(data, trailing),
            new Grouping()
        );
}
/**
* Return the values of the data frame as a flat list.
*
* <pre> {@code
* > DataFrame<String> df = new DataFrame<>(
* > Arrays.asList(
* > Arrays.asList("one", "two"),
* > Arrays.asList("alpha", "bravo")
* > )
* > );
* > df.flatten();
* [one, two, alpha, bravo] }</pre>
*
* @return the list of values
*/
public List<V> flatten() {
    // the result is a Views.FlatView wrapper around this data frame
    final List<V> flat = new Views.FlatView<>(this);
    return flat;
}
/**
* Transpose the rows and columns of the data frame.
*
* <pre> {@code
* > DataFrame<String> df = new DataFrame<>(
* > Arrays.asList(
* > Arrays.asList("one", "two"),
* > Arrays.asList("alpha", "bravo")
* > )
* > );
* > df.transpose().flatten();
* [one, alpha, two, bravo] }</pre>
*
* @return a new data frame with the rows and columns transposed
*/
public DataFrame<V> transpose() {
    // the axes swap roles: column names become the new index
    // and the row names become the new columns
    final Set<Object> newIndex = columns.names();
    final Set<Object> newColumns = index.names();
    final List<List<V>> values = new Views.ListView<>(this, true);
    return new DataFrame<>(newIndex, newColumns, values);
}
/**
* Apply a function to each value in the data frame.
*
* <pre> {@code
* > DataFrame<Number> df = new DataFrame<>(
* > Arrays.<List<Number>>asList(
* > Arrays.<Number>asList(1, 2),
* > Arrays.<Number>asList(3, 4)
* > )
* > );
* > df = df.apply(new Function<Number, Number>() {
* > public Number apply(Number value) {
* > return value.intValue() * value.intValue();
* > }
* > });
* > df.flatten();
* [1, 4, 9, 16] }</pre>
*
* @param function the function to apply
* @return a new data frame with the function results
*/
public <U> DataFrame<U> apply(final Function<V, U> function) {
    // row and column names carry over unchanged
    final Set<Object> rowNames = index.names();
    final Set<Object> columnNames = columns.names();
    final Views.TransformedView<V, U> mapped =
            new Views.TransformedView<V, U>(this, function, false);
    return new DataFrame<>(rowNames, columnNames, mapped);
}
/**
 * Apply a row function to each row of this data frame and collect
 * the rows it returns into a new data frame with the same column names.
 *
 * <p>Each input row may yield any number of output rows. Every output
 * row is named with the next unused name of this data frame's index;
 * once the names run out the current length of the result is used.</p>
 *
 * @param transform the function producing output rows for an input row
 * @return the new data frame
 */
public <U> DataFrame<U> transform(final RowFunction<V, U> transform) {
    final DataFrame<U> result = new DataFrame<>(columns.names());
    final Iterator<Object> names = index().iterator();
    for (final List<V> row : this) {
        final List<List<U>> produced = transform.apply(row);
        for (final List<U> out : produced) {
            final Object name;
            if (names.hasNext()) {
                name = names.next();
            } else {
                name = result.length();
            }
            result.append(name, out);
        }
    }
    return result;
}
/**
* Attempt to infer better types for object columns.
*
* <p>The following conversions are performed where applicable:
* <ul>
* <li>Floating point numbers are converted to {@code Double} values</li>
* <li>Whole numbers are converted to {@code Long} values</li>
* <li>True, false, yes, and no are converted to {@code Boolean} values</li>
* <li>Date strings in the following formats are converted to {@code Date} values:<br>
* {@literal 2000-01-01T00:00:00+1, 2000-01-01T00:00:00EST, 2000-01-01}</li>
* <li>Time strings in the following formats are converted to {@code Date} values:<br>
* {@literal 2000/01/01, 1/01/2000, 12:01:01 AM, 23:01:01, 12:01 AM, 23:01}</li>
* </ul>
* </p>
*
* <p>Note, the conversion process replaces existing values
* with values of the converted type.</p>
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value", "date");
* > df.append(Arrays.asList("one", "1", new Date()));
* > df.convert();
* > df.types();
* [class java.lang.String, class java.lang.Long, class java.util.Date] }</pre>
*
* @return the data frame with the converted values
*/
public DataFrame<V> convert() {
    // converts in place and returns this same instance
    final DataFrame<V> self = this;
    Conversion.convert(self);
    return self;
}
/**
 * Convert the values of this data frame using the specified number
 * default and NA string, by delegating to {@code Conversion.convert}.
 *
 * @param numDefault the number default passed to the conversion
 * @param naString the NA string passed to the conversion
 *        (presumably the text that marks missing values; confirm in {@code Conversion})
 * @return this data frame
 */
public DataFrame<V> convert(final NumberDefault numDefault, final String naString) {
    final DataFrame<V> self = this;
    Conversion.convert(self, numDefault, naString);
    return self;
}
/**
* Convert columns based on the requested types.
*
* <p>Note, the conversion process replaces existing values
* with values of the converted type.</p>
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("a", "b", "c");
* > df.append(Arrays.asList("one", 1, 1.0));
* > df.append(Arrays.asList("two", 2, 2.0));
* > df.convert(
* > null, // leave column "a" as is
* > Long.class, // convert column "b" to Long
* > Number.class // convert column "c" to Double
* > );
* > df.types();
* [class java.lang.String, class java.lang.Long, class java.lang.Double] }</pre>
*
* @param columnTypes the requested type for each column, in column order; {@code null} leaves a column as is
* @return the data frame with the converted values
*/
@SafeVarargs
public final DataFrame<V> convert(final Class<? extends V> ... columnTypes) {
    // converts in place and returns this same instance
    final DataFrame<V> self = this;
    Conversion.convert(self, columnTypes);
    return self;
}
/**
* Create a new data frame containing boolean values such that
* {@code null} object references in the original data frame
* yield {@code true} and valid references yield {@code false}.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<Object>(
* > Arrays.asList(
* > Arrays.asList("alpha", "bravo", null),
* > Arrays.asList(null, 2, 3)
* > )
* > );
* > df.isnull().row(0);
* [false, true] }</pre>
*
* @return the new boolean data frame
*/
public DataFrame<Boolean> isnull() {
    final DataFrame<Boolean> nulls = Conversion.isnull(this);
    return nulls;
}
/**
* Create a new data frame containing boolean values such that
* valid object references in the original data frame yield {@code true}
* and {@code null} references yield {@code false}.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "bravo", null),
* > Arrays.<Object>asList(null, 2, 3)
* > )
* > );
* > df.notnull().row(0);
* [true, false] }</pre>
*
* @return the new boolean data frame
*/
public DataFrame<Boolean> notnull() {
    final DataFrame<Boolean> present = Conversion.notnull(this);
    return present;
}
/**
* Copy the values contained in the data frame into a
* flat array of length {@code #size()} * {@code #length()}.
*
* @return the array
*/
public Object[] toArray() {
    // one slot per cell
    final int cells = size() * length();
    final Object[] target = new Object[cells];
    return toArray(target);
}
/**
* Copy the values contained in the data frame into the
* specified array. If the length of the provided array is
* less than {@code #size()} * {@code #length()} a
* new array will be created.
*
* @param array the array to copy the values into, if it is large enough
* @return the array
*/
public <U> U[] toArray(final U[] array) {
    // the copy is performed by the flat view's own toArray
    final List<V> flat = new Views.FlatView<>(this);
    return flat.toArray(array);
}
/**
 * Copy the values contained in the data frame into the specified
 * two dimensional array, indexed as {@code array[row][column]}.
 *
 * <p>If the provided array has at least {@code length()} rows and each
 * of those rows has at least {@code size()} columns, the values are
 * copied into it and the same array is returned. Otherwise a new array
 * of the same class is created, see {@link #toArray(Class)}.</p>
 *
 * @param array the array to copy the values into, if it is large enough
 * @return the provided array if it is large enough, a new array otherwise
 */
@SuppressWarnings("unchecked")
public <U> U[][] toArray(final U[][] array) {
    final int rows = length();
    final int cols = size();

    // the target is written as array[r][c], so the outer dimension must
    // cover the rows and every row that is used must cover the columns
    boolean fits = array.length >= rows;
    for (int r = 0; fits && r < rows; r++) {
        fits = array[r] != null && array[r].length >= cols;
    }
    if (!fits) {
        return (U[][])toArray(array.getClass());
    }

    for (int r = 0; r < rows; r++) {
        for (int c = 0; c < cols; c++) {
            array[r][c] = (U)get(r, c);
        }
    }
    // hand back the array that was filled rather than discarding the copy
    return array;
}
/**
* Copy the values contained in the data frame into an
* array of the specified type. If the type specified is
* a two dimensional array, for example {@code double[][].class},
* a row-wise copy will be made.
*
* @throws IllegalArgumentException if the values are not assignable to the specified component type
* @return the array
*/
public <U> U toArray(final Class<U> cls) {
    // count the array dimensions and find the innermost component type
    Class<?> component = cls;
    int dims = 0;
    for (Class<?> next = cls.getComponentType(); next != null; next = next.getComponentType()) {
        component = next;
        dims++;
    }

    final int cols = size();
    final int rows = length();

    switch (dims) {
        case 1: {
            // flat copy, one column after another
            @SuppressWarnings("unchecked")
            final U flat = (U)Array.newInstance(component, cols * rows);
            int offset = 0;
            for (int c = 0; c < cols; c++) {
                for (int r = 0; r < rows; r++) {
                    Array.set(flat, offset++, data.get(c, r));
                }
            }
            return flat;
        }
        case 2: {
            // row-wise copy into a rows x cols matrix
            @SuppressWarnings("unchecked")
            final U matrix = (U)Array.newInstance(component, rows, cols);
            for (int r = 0; r < rows; r++) {
                final Object line = Array.get(matrix, r);
                for (int c = 0; c < cols; c++) {
                    Array.set(line, c, get(r, c));
                }
                Array.set(matrix, r, line);
            }
            return matrix;
        }
        default:
            throw new IllegalArgumentException("class must be an array class");
    }
}
/**
* Encodes the DataFrame as a model matrix, converting nominal values
* to dummy variables but does not add an intercept column.
*
* More methods with additional parameters to control the conversion to
* the model matrix are available in the <code>Conversion</code> class.
*
* @param fillValue value to replace NA's with
* @return a model matrix
*/
public double[][] toModelMatrix(final double fillValue) {
    final double[][] matrix = Conversion.toModelMatrix(this, fillValue);
    return matrix;
}
/**
* Encodes the DataFrame as a model matrix, converting nominal values
* to dummy variables but does not add an intercept column.
*
* More methods with additional parameters to control the conversion to
* the model matrix are available in the <code>Conversion</code> class.
*
* @return a model matrix
*/
public DataFrame<Number> toModelMatrixDataFrame() {
    final DataFrame<Number> matrix = Conversion.toModelMatrixDataFrame(this);
    return matrix;
}
/**
* Group the data frame rows by the specified column names.
*
* @param cols the column names
* @return the grouped data frame
*/
@Timed
public DataFrame<V> groupBy(final Object ... cols) {
    // resolve the names to column positions, then group by position
    final Integer[] positions = columns.indices(cols);
    return groupBy(positions);
}
/**
* Group the data frame rows by the specified columns.
*
* @param cols the column indices
* @return the grouped data frame
*/
@Timed
public DataFrame<V> groupBy(final Integer ... cols) {
    // index, columns and data are passed through unchanged; only the grouping differs
    final Grouping byColumns = new Grouping(this, cols);
    return new DataFrame<>(index, columns, data, byColumns);
}
/**
* Group the data frame rows using the specified key function.
*
* @param function the function to reduce rows to grouping keys
* @return the grouped data frame
*/
@Timed
public DataFrame<V> groupBy(final KeyFunction<V> function) {
    // index, columns and data are passed through unchanged; only the grouping differs
    final Grouping byKey = new Grouping(this, function);
    return new DataFrame<>(index, columns, data, byKey);
}
/**
 * Return the grouping associated with this data frame.
 *
 * @return the grouping
 */
public Grouping groups() {
    return this.groups;
}
/**
* Return a map of group names to data frame for grouped
* data frames. Observe that for this method to have any
* effect a {@code groupBy} call must have been done before.
*
* @return a map of group names to data frames
*/
public Map<Object, DataFrame<V>> explode() {
    // LinkedHashMap keeps the groups in the order the grouping yields them
    final Map<Object, DataFrame<V>> byGroup = new LinkedHashMap<>();
    for (final Map.Entry<Object, SparseBitSet> group : groups) {
        final Object name = group.getKey();
        final SparseBitSet members = group.getValue();
        final DataFrame<V> part = new DataFrame<V>(
                Selection.select(index, members),
                columns,
                Selection.select(data, members),
                new Grouping()
            );
        byGroup.put(name, part);
    }
    return byGroup;
}
/**
* Apply an aggregate function to each group or the entire
* data frame if the data is not grouped.
*
* @param function the aggregate function
* @return the new data frame
*/
public <U> DataFrame<V> aggregate(final Aggregate<V, U> function) {
    final DataFrame<V> aggregated = groups.apply(this, function);
    return aggregated;
}
/**
 * Apply the {@code Aggregation.Count} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> count() {
    final Aggregation.Count<V> counter = new Aggregation.Count<V>();
    return groups.apply(this, counter);
}
/**
 * Apply the {@code Aggregation.Collapse} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
public DataFrame<V> collapse() {
    final Aggregation.Collapse<V> collapser = new Aggregation.Collapse<V>();
    return groups.apply(this, collapser);
}
/**
 * Apply the {@code Aggregation.Unique} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
public DataFrame<V> unique() {
    final Aggregation.Unique<V> distinct = new Aggregation.Unique<V>();
    return groups.apply(this, distinct);
}
/**
* Compute the sum of the numeric columns for each group
* or the entire data frame if the data is not grouped.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "alpha", "alpha", "bravo", "bravo"),
* > Arrays.<Object>asList(1, 2, 3, 4, 5)
* > )
* > );
* > df.groupBy("name")
* > .sum()
* > .col("value");
* [6.0, 9.0]} </pre>
*
* @return the new data frame
*/
@Timed
public DataFrame<V> sum() {
    final Aggregation.Sum<V> summer = new Aggregation.Sum<V>();
    return groups.apply(this, summer);
}
/**
* Compute the product of the numeric columns for each group
* or the entire data frame if the data is not grouped.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "alpha", "alpha", "bravo", "bravo"),
* > Arrays.<Object>asList(1, 2, 3, 4, 5)
* > )
* > );
* > df.groupBy("name")
* > .prod()
* > .col("value");
* [6.0, 20.0]} </pre>
*
* @return the new data frame
*/
@Timed
public DataFrame<V> prod() {
    final Aggregation.Product<V> product = new Aggregation.Product<V>();
    return groups.apply(this, product);
}
/**
* Compute the mean of the numeric columns for each group
* or the entire data frame if the data is not grouped.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>("value");
* > df.append("one", Arrays.asList(1));
* > df.append("two", Arrays.asList(5));
* > df.append("three", Arrays.asList(3));
* > df.append("four", Arrays.asList(7));
* > df.mean().col(0);
* [4.0] }</pre>
*
* @return the new data frame
*/
@Timed
public DataFrame<V> mean() {
    final Aggregation.Mean<V> average = new Aggregation.Mean<V>();
    return groups.apply(this, average);
}
/**
* Compute the percentile of the numeric columns for each group
* or the entire data frame if the data is not grouped.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>("value");
* > df.append("one", Arrays.asList(1));
* > df.append("two", Arrays.asList(5));
* > df.append("three", Arrays.asList(3));
* > df.append("four", Arrays.asList(7));
* > df.mean().col(0);
* [4.0] }</pre>
*
* @param quantile the percentile to compute, passed unchanged to {@code Aggregation.Percentile}
* @return the new data frame
*/
@Timed
public DataFrame<V> percentile(final double quantile) {
    final Aggregation.Percentile<V> percentile = new Aggregation.Percentile<V>(quantile);
    return groups.apply(this, percentile);
}
/**
* Compute the standard deviation of the numeric columns for each group
* or the entire data frame if the data is not grouped.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.<Object>asList("alpha", "alpha", "alpha", "bravo", "bravo", "bravo"),
* > Arrays.<Object>asList(1, 2, 3, 4, 6, 8)
* > )
* > );
* > df.groupBy("name")
* > .stddev()
* > .col("value");
* [1.0, 2.0]} </pre>
*
* @return the new data frame
*/
@Timed
public DataFrame<V> stddev() {
    final Aggregation.StdDev<V> deviation = new Aggregation.StdDev<V>();
    return groups.apply(this, deviation);
}
/**
 * Apply the {@code Aggregation.Variance} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> var() {
    final Aggregation.Variance<V> variance = new Aggregation.Variance<V>();
    return groups.apply(this, variance);
}
/**
 * Apply the {@code Aggregation.Skew} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> skew() {
    final Aggregation.Skew<V> skewness = new Aggregation.Skew<V>();
    return groups.apply(this, skewness);
}
/**
 * Apply the {@code Aggregation.Kurtosis} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> kurt() {
    final Aggregation.Kurtosis<V> kurtosis = new Aggregation.Kurtosis<V>();
    return groups.apply(this, kurtosis);
}
/**
 * Apply the {@code Aggregation.Min} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> min() {
    final Aggregation.Min<V> minimum = new Aggregation.Min<V>();
    return groups.apply(this, minimum);
}
/**
 * Apply the {@code Aggregation.Max} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> max() {
    final Aggregation.Max<V> maximum = new Aggregation.Max<V>();
    return groups.apply(this, maximum);
}
/**
 * Apply the {@code Aggregation.Median} aggregate to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> median() {
    final Aggregation.Median<V> middle = new Aggregation.Median<V>();
    return groups.apply(this, middle);
}
/**
 * Compute the result of {@code Aggregation.cov} for this data frame.
 * Unlike the other aggregates, the grouping is not consulted here.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<Number> cov() {
    final DataFrame<Number> covariance = Aggregation.cov(this);
    return covariance;
}
/**
 * Apply the {@code Transforms.CumulativeSum} function to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> cumsum() {
    final Transforms.CumulativeSum<V> running = new Transforms.CumulativeSum<V>();
    return groups.apply(this, running);
}
/**
 * Apply the {@code Transforms.CumulativeProduct} function to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> cumprod() {
    final Transforms.CumulativeProduct<V> running = new Transforms.CumulativeProduct<V>();
    return groups.apply(this, running);
}
/**
 * Apply the {@code Transforms.CumulativeMin} function to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> cummin() {
    final Transforms.CumulativeMin<V> running = new Transforms.CumulativeMin<V>();
    return groups.apply(this, running);
}
/**
 * Apply the {@code Transforms.CumulativeMax} function to each group
 * or the entire data frame if the data is not grouped.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> cummax() {
    final Transforms.CumulativeMax<V> running = new Transforms.CumulativeMax<V>();
    return groups.apply(this, running);
}
/**
 * Apply the {@code Aggregation.Describe} aggregate to each group
 * or the entire data frame if the data is not grouped, then pass
 * the result through {@code Aggregation.describe}.
 *
 * @return the new data frame
 */
@Timed
public DataFrame<V> describe() {
    final DataFrame<V> stats = groups.apply(this, new Aggregation.Describe<V>());
    return Aggregation.describe(stats);
}
/**
 * Pivot this data frame using a single row column name, a single
 * column column name and any number of value column names.
 *
 * @param row the name of the column providing the row keys
 * @param col the name of the column providing the column keys
 * @param values the names of the value columns
 * @return the pivoted data frame
 */
public DataFrame<V> pivot(final Object row, final Object col, final Object ... values) {
    final List<Object> rows = Collections.singletonList(row);
    final List<Object> cols = Collections.singletonList(col);
    final List<Object> vals = Arrays.asList(values);
    return pivot(rows, cols, vals);
}
/**
 * Pivot this data frame using the specified column names; the names
 * are resolved to column positions before pivoting.
 *
 * @param rows the names of the columns providing the row keys
 * @param cols the names of the columns providing the column keys
 * @param values the names of the value columns
 * @return the pivoted data frame
 */
public DataFrame<V> pivot(final List<Object> rows, final List<Object> cols, final List<Object> values) {
    final Integer[] rowPositions = columns.indices(rows);
    final Integer[] colPositions = columns.indices(cols);
    final Integer[] valuePositions = columns.indices(values);
    return pivot(rowPositions, colPositions, valuePositions);
}
/**
 * Pivot this data frame using a single row column index, a single
 * column column index and any number of value column indices.
 *
 * @param row the index of the column providing the row keys
 * @param col the index of the column providing the column keys
 * @param values the indices of the value columns
 * @return the pivoted data frame
 */
public DataFrame<V> pivot(final Integer row, final Integer col, final Integer ... values) {
    final Integer[] rows = new Integer[] { row };
    final Integer[] cols = new Integer[] { col };
    return pivot(rows, cols, values);
}
/**
 * Pivot this data frame using the specified column indices
 * by delegating to {@code Pivoting.pivot}.
 *
 * @param rows the indices of the columns providing the row keys
 * @param cols the indices of the columns providing the column keys
 * @param values the indices of the value columns
 * @return the pivoted data frame
 */
@Timed
public DataFrame<V> pivot(final Integer[] rows, final Integer[] cols, final Integer[] values) {
    final DataFrame<V> pivoted = Pivoting.pivot(this, rows, cols, values);
    return pivoted;
}
/**
 * Pivot this data frame using key functions for the row and column
 * keys and a map of column index to aggregate for the values,
 * by delegating to {@code Pivoting.pivot}.
 *
 * @param rows the function producing row keys
 * @param cols the function producing column keys
 * @param values the aggregates to apply, keyed by column index
 * @return the pivoted data frame
 */
@Timed
public <U> DataFrame<U> pivot(final KeyFunction<V> rows, final KeyFunction<V> cols, final Map<Integer, Aggregate<V,U>> values) {
    final DataFrame<U> pivoted = Pivoting.pivot(this, rows, cols, values);
    return pivoted;
}
/**
 * Sort the data frame by the specified column names.
 *
 * <p>A {@code String} name prefixed with {@code "-"} sorts that column
 * in descending order (the prefix is stripped before the column lookup);
 * every other name sorts in ascending order.</p>
 *
 * @param cols the column names, in order of precedence
 * @return the sorted data frame
 */
public DataFrame<V> sortBy(final Object ... cols) {
    // LinkedHashMap preserves the order in which the sort columns were given
    final Map<Integer, SortDirection> order = new LinkedHashMap<>();
    for (final Object col : cols) {
        final boolean descending =
                col instanceof String && ((String)col).startsWith("-");
        final Object name = descending ? ((String)col).substring(1) : col;
        final int position = columns.get(name);
        order.put(position, descending ? SortDirection.DESCENDING : SortDirection.ASCENDING);
    }
    return Sorting.sort(this, order);
}
/**
 * Sort the data frame by the specified column indices.
 *
 * <p>A negative index sorts the column at its absolute value in
 * descending order; other indices sort ascending. Note that column
 * {@code 0} can therefore only be sorted ascending with this method.</p>
 *
 * @param cols the column indices, in order of precedence
 * @return the sorted data frame
 */
@Timed
public DataFrame<V> sortBy(final Integer ... cols) {
    // LinkedHashMap preserves the order in which the sort columns were given
    final Map<Integer, SortDirection> order = new LinkedHashMap<>();
    for (int i = 0; i < cols.length; i++) {
        final int c = cols[i];
        final boolean descending = c < 0;
        order.put(Math.abs(c), descending ? SortDirection.DESCENDING : SortDirection.ASCENDING);
    }
    return Sorting.sort(this, order);
}
/**
 * Sort the data frame rows using the specified comparator.
 *
 * @param comparator the comparator used to order rows
 * @return the sorted data frame
 */
public DataFrame<V> sortBy(final Comparator<List<V>> comparator) {
    final DataFrame<V> sorted = Sorting.sort(this, comparator);
    return sorted;
}
/**
* Return the types for each of the data frame columns.
*
* @return the list of column types
*/
public List<Class<?>> types() {
    final List<Class<?>> columnTypes = Inspection.types(this);
    return columnTypes;
}
/**
* Return a data frame containing only columns with numeric data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.append(Arrays.asList("one", 1));
* > df.append(Arrays.asList("two", 2));
* > df.numeric().columns();
* [value] }</pre>
*
* @return a data frame containing only the numeric columns
*/
public DataFrame<Number> numeric() {
    // mask of the numeric columns, turned into the names to retain
    final SparseBitSet mask = Inspection.numeric(this);
    final Set<Object> names = Selection.select(columns, mask).names();
    final Object[] keep = names.toArray(new Object[names.size()]);
    final DataFrame<V> retained = retain(keep);
    return retained.cast(Number.class);
}
/**
* Return a data frame containing only columns with non-numeric data.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.append(Arrays.asList("one", 1));
* > df.append(Arrays.asList("two", 2));
* > df.nonnumeric().columns();
* [name] }</pre>
*
* @return a data frame containing only the non-numeric columns
*/
public DataFrame<V> nonnumeric() {
    // mask of the non-numeric columns, turned into the names to retain
    final SparseBitSet mask = Inspection.nonnumeric(this);
    final Set<Object> names = Selection.select(columns, mask).names();
    final Object[] keep = names.toArray(new Object[names.size()]);
    return retain(keep);
}
/**
* Return an iterator over the rows of the data frame. Also used
* implicitly with {@code foreach} loops.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>(
* > Arrays.asList(
* > Arrays.asList(1, 2),
* > Arrays.asList(3, 4)
* > )
* > );
* > List<Integer> results = new ArrayList<>();
* > for (List<Integer> row : df)
* > results.add(row.get(0));
* > results;
* [1, 2] }</pre>
*
* @return an iterator over the rows of the data frame.
*/
@Override
public ListIterator<List<V>> iterator() {
    // for-each loops over a data frame visit its rows
    final ListIterator<List<V>> rows = iterrows();
    return rows;
}
/**
 * Return an iterator over the rows of the data frame.
 *
 * @return an iterator over the rows of the data frame
 */
public ListIterator<List<V>> iterrows() {
    final List<List<V>> rows = new Views.ListView<>(this, true);
    return rows.listIterator();
}
/**
 * Return an iterator over the columns of the data frame.
 *
 * @return an iterator over the columns of the data frame
 */
public ListIterator<List<V>> itercols() {
    // same list view as iterrows() with the transpose flag turned off
    final List<List<V>> cols = new Views.ListView<>(this, false);
    return cols.listIterator();
}
/**
 * Return an iterator over the entries of a {@code Views.MapView}
 * of this data frame, each entry being a map of names to values.
 *
 * @return an iterator over the maps
 */
public ListIterator<Map<Object, V>> itermap() {
    final List<Map<Object, V>> maps = new Views.MapView<>(this, true);
    return maps.listIterator();
}
/**
 * Return an iterator over the individual values of the data frame,
 * in the order of the flat view also returned by {@link #flatten()}.
 *
 * @return an iterator over the values
 */
public ListIterator<V> itervalues() {
    final List<V> flat = new Views.FlatView<>(this);
    return flat.listIterator();
}
/**
* Cast this data frame to the specified type.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<>("name", "value");
* > df.append(Arrays.asList("one", "1"));
* > DataFrame<String> dfs = df.cast(String.class);
* > dfs.get(0, 0).getClass().getName();
* java.lang.String }</pre>
*
* @param cls the class of the value type to cast the data frame to
* @return the data frame cast to the specified type
*/
@SuppressWarnings("unchecked")
public <T> DataFrame<T> cast(final Class<T> cls) {
    // purely a compile-time re-typing of this instance: cls is not
    // consulted and no value is checked or converted here
    final DataFrame<?> self = this;
    return (DataFrame<T>)self;
}
/**
* Return a map of index names to rows.
*
* <pre> {@code
* > DataFrame<Integer> df = new DataFrame<>("value");
* > df.append("alpha", Arrays.asList(1));
* > df.append("bravo", Arrays.asList(2));
* > df.map();
* {alpha=[1], bravo=[2]}}</pre>
*
* @return a map of index names to rows.
*/
public Map<Object, List<V>> map() {
    final Map<Object, List<V>> byName = new LinkedHashMap<Object, List<V>>();
    final int len = length();
    final Iterator<Object> names = index.names().iterator();
    int r = 0;
    while (r < len) {
        // fall back to the row number once the index names run out
        final Object name;
        if (names.hasNext()) {
            name = names.next();
        } else {
            name = r;
        }
        byName.put(name, row(r));
        r++;
    }
    return byName;
}
/**
 * Return a map from the values of one column to the list of
 * values of another column found in the same rows.
 *
 * @param key the name of the column providing the map keys
 * @param value the name of the column providing the map values
 * @return a map of key column values to lists of value column values
 */
public Map<V, List<V>> map(final Object key, final Object value) {
    final Integer keyColumn = columns.get(key);
    final Integer valueColumn = columns.get(value);
    return map(keyColumn, valueColumn);
}
/**
 * Return a map from the values of one column to the list of
 * values of another column found in the same rows. Keys appear
 * in the order they are first encountered.
 *
 * @param key the index of the column providing the map keys
 * @param value the index of the column providing the map values
 * @return a map of key column values to lists of value column values
 */
public Map<V, List<V>> map(final Integer key, final Integer value) {
    final Map<V, List<V>> grouped = new LinkedHashMap<V, List<V>>();
    final int len = length();
    for (int r = 0; r < len; r++) {
        final V name = data.get(key, r);
        // start a new bucket the first time a key value is seen
        if (!grouped.containsKey(name)) {
            grouped.put(name, new ArrayList<V>());
        }
        grouped.get(name).add(data.get(value, r));
    }
    return grouped;
}
/**
 * Return a data frame containing only the first row for each distinct
 * combination of values in the specified columns.
 *
 * @param cols the column names
 * @return the new data frame
 */
public DataFrame<V> unique(final Object ... cols) {
    // resolve the names to column positions, then de-duplicate by position
    final Integer[] positions = columns.indices(cols);
    return unique(positions);
}
/**
 * Return a data frame containing only the first row for each distinct
 * combination of values in the specified columns.
 *
 * <p>Rows are compared on the values of the specified columns only and
 * are kept in their original order.</p>
 *
 * @param cols the column indices
 * @return the new data frame
 */
public DataFrame<V> unique(final Integer ... cols) {
    final DataFrame<V> unique = new DataFrame<V>(columns.names());
    final Set<List<V>> seen = new HashSet<List<V>>();

    final int len = length();
    for (int r = 0; r < len; r++) {
        // build a fresh key for every row: a list stored in a hash set must
        // not be mutated afterwards, otherwise every entry aliases the same
        // object and keys with colliding hash codes are treated as equal
        final List<V> key = new ArrayList<V>(cols.length);
        for (final int c : cols) {
            key.add(data.get(c, r));
        }
        // add() reports whether the key was new, so one lookup suffices
        if (seen.add(key)) {
            unique.append(row(r));
        }
    }

    return unique;
}
/**
 * Equivalent to {@link #diff(int)} with a period of one.
 *
 * @return the new data frame
 */
public DataFrame<V> diff() {
    final int defaultPeriod = 1;
    return diff(defaultPeriod);
}
/**
 * Compute the result of {@code Timeseries.diff} for this
 * data frame using the specified period.
 *
 * @param period the period passed to {@code Timeseries.diff}
 * @return the new data frame
 */
public DataFrame<V> diff(final int period) {
    final DataFrame<V> differences = Timeseries.diff(this, period);
    return differences;
}
/**
 * Equivalent to {@link #percentChange(int)} with a period of one.
 *
 * @return the new data frame
 */
public DataFrame<V> percentChange() {
    final int defaultPeriod = 1;
    return percentChange(defaultPeriod);
}
/**
 * Compute the result of {@code Timeseries.percentChange} for this
 * data frame using the specified period.
 *
 * @param period the period passed to {@code Timeseries.percentChange}
 * @return the new data frame
 */
public DataFrame<V> percentChange(final int period) {
    final DataFrame<V> changes = Timeseries.percentChange(this, period);
    return changes;
}
/**
 * Equivalent to {@link #rollapply(Function, int)} with a period of one.
 *
 * @param function the function to apply to each list of values
 * @return the new data frame
 */
public DataFrame<V> rollapply(final Function<List<V>, V> function) {
    final int defaultPeriod = 1;
    return rollapply(function, defaultPeriod);
}
/**
 * Compute the result of {@code Timeseries.rollapply} for this
 * data frame using the specified function and period.
 *
 * @param function the function to apply to each list of values
 * @param period the period passed to {@code Timeseries.rollapply}
 * @return the new data frame
 */
public DataFrame<V> rollapply(final Function<List<V>, V> function, final int period) {
    final DataFrame<V> rolled = Timeseries.rollapply(this, function, period);
    return rolled;
}
/**
* Display the numeric columns of this data frame
* as a line chart in a new swing frame.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<Object>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.asList("alpha", "bravo", "charlie"),
* > Arrays.asList(10, 20, 30)
* > )
* > );
* > df.plot();
* } </pre>
*
*/
public final void plot() {
    // line charts are the default plot type
    final PlotType defaultType = PlotType.LINE;
    plot(defaultType);
}
/**
* Display the numeric columns of this data frame
* as a chart in a new swing frame using the specified type.
*
* <pre> {@code
* > DataFrame<Object> df = new DataFrame<Object>(
* > Collections.emptyList(),
* > Arrays.asList("name", "value"),
* > Arrays.asList(
* > Arrays.asList("alpha", "bravo", "charlie"),
* > Arrays.asList(10, 20, 30)
* > )
* > );
* > df.plot(PlotType.AREA);
* } </pre>
* @param type the type of plot to display
*/
public final void plot(final PlotType type) {
    final DataFrame<V> self = this;
    Display.plot(self, type);
}
/**
* Draw the numeric columns of this data frame
* as a chart in the specified {@link Container}.
*
* @param container the container to use for the chart
*/
public final void draw(final Container container) {
    // line charts are the default plot type
    draw(container, PlotType.LINE);
}
/**
* Draw the numeric columns of this data frame as a chart
* in the specified {@link Container} using the specified type.
*
* @param container the container to use for the chart
* @param type the type of plot to draw
*/
public final void draw(final Container container, final PlotType type) {
    final DataFrame<V> self = this;
    Display.draw(self, container, type);
}
/**
 * Display this data frame by delegating to {@code Display.show}.
 */
public final void show() {
    final DataFrame<V> self = this;
    Display.show(self);
}
/**
 * Compare two data frames by delegating to {@code Comparison.compare}.
 *
 * @param df1 the first data frame
 * @param df2 the second data frame
 * @return a data frame of strings holding the comparison result
 */
public static final <V> DataFrame<String> compare(final DataFrame<V> df1, final DataFrame<V> df2) {
    final DataFrame<String> comparison = Comparison.compare(df1, df2);
    return comparison;
}
/**
 * Read a data frame from the specified csv file
 * by delegating to {@code Serialization.readCsv}.
 *
 * @param file the csv input to read
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final String file)
    throws IOException {
    final DataFrame<Object> frame = Serialization.readCsv(file);
    return frame;
}
/**
 * Read a data frame from the specified csv input stream
 * by delegating to {@code Serialization.readCsv}.
 *
 * @param input the csv input to read
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final InputStream input)
    throws IOException {
    final DataFrame<Object> frame = Serialization.readCsv(input);
    return frame;
}
/**
 * Read a data frame from the specified csv file using the specified
 * separator and {@code NumberDefault.LONG_DEFAULT}.
 *
 * @param file the csv input to read
 * @param separator the separator passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final String file, final String separator)
    throws IOException {
    // same as the three argument overload with the long number default
    return readCsv(file, separator, NumberDefault.LONG_DEFAULT);
}
/**
 * Read a data frame from the specified csv input stream using the
 * specified separator, {@code NumberDefault.LONG_DEFAULT} and no NA string.
 *
 * @param input the csv input to read
 * @param separator the separator passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final InputStream input, final String separator)
    throws IOException {
    // same as the three argument stream overload with the long number default
    return readCsv(input, separator, NumberDefault.LONG_DEFAULT);
}
/**
 * Read a data frame from the specified csv input stream using the
 * specified separator and NA string and {@code NumberDefault.LONG_DEFAULT}.
 *
 * @param input the csv input to read
 * @param separator the separator passed to the csv reader
 * @param naString the NA string passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final InputStream input, final String separator, final String naString)
    throws IOException {
    final NumberDefault numbers = NumberDefault.LONG_DEFAULT;
    return Serialization.readCsv(input, separator, numbers, naString);
}
/**
 * Read a data frame from the specified csv input stream using the
 * specified separator, NA string and header flag and
 * {@code NumberDefault.LONG_DEFAULT}.
 *
 * @param input the csv input to read
 * @param separator the separator passed to the csv reader
 * @param naString the NA string passed to the csv reader
 * @param hasHeader the header flag passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final InputStream input, final String separator, final String naString, final boolean hasHeader)
    throws IOException {
    final NumberDefault numbers = NumberDefault.LONG_DEFAULT;
    return Serialization.readCsv(input, separator, numbers, naString, hasHeader);
}
/**
 * Read a data frame from the specified csv file using the specified
 * separator, NA string and header flag and
 * {@code NumberDefault.LONG_DEFAULT}.
 *
 * @param file the csv input to read
 * @param separator the separator passed to the csv reader
 * @param naString the NA string passed to the csv reader
 * @param hasHeader the header flag passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final String file, final String separator, final String naString, final boolean hasHeader)
    throws IOException {
    // same as the five argument overload with the long number default
    return readCsv(file, separator, NumberDefault.LONG_DEFAULT, naString, hasHeader);
}
/**
 * Read a data frame from the specified csv file using the specified
 * separator, number default, NA string and header flag.
 *
 * @param file the csv input to read
 * @param separator the separator passed to the csv reader
 * @param numberDefault the number default passed to the csv reader
 * @param naString the NA string passed to the csv reader
 * @param hasHeader the header flag passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final String file, final String separator, final NumberDefault numberDefault, final String naString, final boolean hasHeader)
    throws IOException {
    final DataFrame<Object> frame =
            Serialization.readCsv(file, separator, numberDefault, naString, hasHeader);
    return frame;
}
/**
 * Read a data frame from the specified csv file using the
 * specified separator and number default.
 *
 * @param file the csv input to read
 * @param separator the separator passed to the csv reader
 * @param longDefault the number default passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final String file, final String separator, final NumberDefault longDefault)
    throws IOException {
    final DataFrame<Object> frame = Serialization.readCsv(file, separator, longDefault);
    return frame;
}
/**
 * Read a data frame from the specified csv file using the
 * specified separator, number default and NA string.
 *
 * @param file the csv input to read
 * @param separator the separator passed to the csv reader
 * @param longDefault the number default passed to the csv reader
 * @param naString the NA string passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final String file, final String separator, final NumberDefault longDefault, final String naString)
    throws IOException {
    final DataFrame<Object> frame =
            Serialization.readCsv(file, separator, longDefault, naString);
    return frame;
}
/**
 * Read a data frame from the specified csv input stream using the
 * specified separator and number default and no NA string.
 *
 * @param input the csv input to read
 * @param separator the separator passed to the csv reader
 * @param longDefault the number default passed to the csv reader
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readCsv(final InputStream input, final String separator, final NumberDefault longDefault)
    throws IOException {
    // a null NA string is passed through to the reader
    final DataFrame<Object> frame =
            Serialization.readCsv(input, separator, longDefault, null);
    return frame;
}
/**
 * Write this data frame as csv to the specified file.
 *
 * <p>The file stream opened by this method is always closed before
 * the method returns, including when serialization fails.</p>
 *
 * @param file the path of the file to write
 * @throws IOException if an error occurs opening or writing the file
 */
public final void writeCsv(final String file)
    throws IOException {
    // this method opens the stream, so it owns closing it; closing an
    // already closed FileOutputStream is a no-op, so this is safe even
    // if the serializer closes the stream itself
    try (OutputStream output = new FileOutputStream(file)) {
        Serialization.writeCsv(this, output);
    }
}
/**
 * Write this data frame as csv to the specified output stream
 * by delegating to {@code Serialization.writeCsv}.
 *
 * @param output the stream to write to
 * @throws IOException if an error occurs writing the output
 */
public final void writeCsv(final OutputStream output)
    throws IOException {
    final DataFrame<V> self = this;
    Serialization.writeCsv(self, output);
}
/**
 * Read a data frame from the specified xls file
 * by delegating to {@code Serialization.readXls}.
 *
 * @param file the xls input to read
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readXls(final String file)
    throws IOException {
    final DataFrame<Object> frame = Serialization.readXls(file);
    return frame;
}
/**
 * Read a data frame from the specified xls input stream
 * by delegating to {@code Serialization.readXls}.
 *
 * @param input the xls input to read
 * @return the new data frame
 * @throws IOException if an error occurs reading the input
 */
public static final DataFrame<Object> readXls(final InputStream input)
    throws IOException {
    final DataFrame<Object> frame = Serialization.readXls(input);
    return frame;
}
/**
 * Write this data frame as xls to the specified file.
 *
 * <p>The file stream opened by this method is always closed before
 * the method returns, including when serialization fails.</p>
 *
 * @param file the path of the file to write
 * @throws IOException if an error occurs opening or writing the file
 */
public final void writeXls(final String file)
    throws IOException {
    // this method opens the stream, so it owns closing it; closing an
    // already closed FileOutputStream is a no-op, so this is safe even
    // if the serializer closes the stream itself
    try (OutputStream output = new FileOutputStream(file)) {
        Serialization.writeXls(this, output);
    }
}
/**
 * Write this data frame as xls to the specified output stream
 * by delegating to {@code Serialization.writeXls}.
 *
 * @param output the stream to write to
 * @throws IOException if an error occurs writing the output
 */
public final void writeXls(final OutputStream output)
    throws IOException {
    final DataFrame<V> self = this;
    Serialization.writeXls(self, output);
}
/**
 * Return a string representation of this data frame produced by
 * {@code Serialization.toString} with the specified limit.
 *
 * @param limit the limit passed to the serializer
 *        (presumably the maximum number of rows to include; confirm in {@code Serialization})
 * @return the string representation
 */
public final String toString(final int limit) {
    final String text = Serialization.toString(this, limit);
    return text;
}
@Override
public String toString() {
    // default limit for the string representation
    final int defaultLimit = 10;
    return toString(defaultLimit);
}
/**
* A function that is applied to objects (rows or values)
* in a {@linkplain DataFrame data frame}.
*
* <p>Implementors define {@link #apply(Object)} to perform
* the desired calculation and return the result.</p>
*
* @param <I> the type of the input values
* @param <O> the type of the output values
* @see DataFrame#apply(Function)
* @see DataFrame#aggregate(Aggregate)
*/
public interface Function<I, O> {
/**
* Perform the computation on the specified
* input and return the result.
*
* <p>Depending on the caller the input is a single value or a
* list of values; the sub-interfaces of this interface fix the
* input type to {@code List<I>}.</p>
*
* @param value the input value
* @return the result
*/
O apply(I value);
}
/**
* A function that converts a single {@linkplain DataFrame data frame}
* row into any number of output rows.
*
* @param <I> the type of the input values
* @param <O> the type of the output values
* @see DataFrame#transform(RowFunction)
*/
public interface RowFunction<I, O> {
/**
* Produce the output rows for the specified input row.
*
* @param values the values of the input row
* @return the list of output rows, each a list of values
*/
List<List<O>> apply(List<I> values);
}
/**
* A function that converts {@linkplain DataFrame data frame}
* rows to index or group keys.
*
* <p>Implementors define {@link #apply(Object)} to accept
* a data frame row as input and return a key value, most
* commonly used by {@link DataFrame#groupBy(KeyFunction)}.</p>
*
* @param <I> the type of the input values
* @see DataFrame#groupBy(KeyFunction)
*/
// specialization of Function: the input is a list of values, the result is the key object
public interface KeyFunction<I>
extends Function<List<I>, Object> { }
/**
* A function that converts lists of {@linkplain DataFrame data frame}
* values to aggregate results.
*
* <p>Implementors define {@link #apply(Object)} to accept
* a list of data frame values as input and return an aggregate
* result.</p>
*
* @param <I> the type of the input values
* @param <O> the type of the result
* @see DataFrame#aggregate(Aggregate)
*/
// specialization of Function: the input is a list of values, the result is the aggregate
public interface Aggregate<I, O>
extends Function<List<I>, O> { }
/**
* An interface used to filter a {@linkplain DataFrame data frame}.
*
* <p>Implementors define {@link #apply(Object)} to
* return {@code true} for rows that should be included
* in the filtered data frame.</p>
*
* @param <I> the type of the input values
* @see DataFrame#select(Predicate)
*/
// specialization of Function: the input is a list of values, the result is a Boolean verdict
public interface Predicate<I>
extends Function<List<I>, Boolean> { }
public enum SortDirection {
ASCENDING,
DESCENDING
}
/**
* An enumeration of join types for joining data frames together.
*/
public enum JoinType {
// NOTE(review): the join implementation is outside this part of the file;
// the names presumably follow the usual SQL join terminology, confirm there
INNER,
OUTER,
LEFT,
RIGHT
}
/**
* An enumeration of plot types for displaying data frames with charts.
*/
public enum PlotType {
SCATTER,
SCATTER_WITH_TREND,
// LINE is the type used by plot() and draw(Container) when none is specified
LINE,
LINE_AND_POINTS,
AREA,
BAR,
GRID,
GRID_WITH_TREND
}
/**
* An enumeration of data frame axes.
*/
public enum Axis {
/** The row axis of a data frame. */
ROWS,
/** The column axis of a data frame. */
COLUMNS
}
/**
* An enumeration of number defaults, passed through to
* {@code Conversion.convert} and {@code Serialization.readCsv}.
*
* <p>NOTE(review): presumably selects whether numeric text defaults
* to {@code Long} or {@code Double} values; confirm against the
* conversion implementation.</p>
*/
public static enum NumberDefault {
/** The default used by the {@code readCsv} overloads that take no number default. */
LONG_DEFAULT,
DOUBLE_DEFAULT
}
/**
* Entry point to joinery as a command line tool.
*
* The available commands are:
* <dl>
* <dt>show</dt><dd>display the specified data frame as a swing table</dd>
* <dt>plot</dt><dd>display the specified data frame as a chart</dd>
* <dt>compare</dt><dd>merge the specified data frames and output the result</dd>
* <dt>shell</dt><dd>launch an interactive javascript shell for exploring data</dd>
* </dl>
*
* @param args file paths or urls of csv input data
* @throws IOException if an error occurs reading input
*/
public static final void main(final String[] args)
    throws IOException {
    // the first argument is the command, every further argument is a csv input
    final String command = args.length > 0 ? args[0] : null;
    final List<DataFrame<Object>> frames = new ArrayList<>();
    for (int i = 1; i < args.length; i++) {
        frames.add(DataFrame.readCsv(args[i]));
    }
    final int count = frames.size();

    // plot and show take exactly one data frame
    if ("plot".equalsIgnoreCase(command) && count == 1) {
        frames.get(0).plot();
        return;
    }

    if ("show".equalsIgnoreCase(command) && count == 1) {
        frames.get(0).show();
        return;
    }

    // compare takes exactly two data frames
    if ("compare".equalsIgnoreCase(command) && count == 2) {
        System.out.println(DataFrame.compare(frames.get(0), frames.get(1)));
        return;
    }

    // the shell accepts any number of data frames
    if ("shell".equalsIgnoreCase(command)) {
        Shell.repl(frames);
        return;
    }

    // unknown command or wrong number of inputs
    System.err.printf(
            "usage: %s [compare|plot|show|shell] [csv-file ...]\n",
            DataFrame.class.getCanonicalName()
        );
    System.exit(255);
}
}