package nl.helixsoft.stats; import java.io.IOException; import java.io.OutputStream; import java.util.List; import nl.helixsoft.recordstream.Predicate; import nl.helixsoft.recordstream.Record; import nl.helixsoft.recordstream.RecordMetaData; import nl.helixsoft.recordstream.RecordStream; /** * A table of data, ready for statistical operations. * <p> * All data is kept in memory * <p> * Each column can have headers * <p> * Each column is of a single type. * <p> * Implementations may use native type arrays (double[] or int[]) for efficiency. * <p> * There are facilities for statistical functions and plotting. * <p> * Some operations modify the data frame in-place. These methods typically return "this" to allow chaining operations. * <p> * Operations like cut() ... return a copy of the DataFrame object. * * NOTE: implementing both TableModel and Iterable<Record> turned out not to be so hot because groovy inserts its own iterator() method in TableModels. * Currently implementing neither. * If you want a Iterable<Record>, see @link{DataFrame.asRecordIterable} * If you want a TableModel, use DataFrameOperation.asTableModel (df, editable) */ public interface DataFrame { /** * Get a single row by index. */ public Record getRow(int rowIdx); /** get a name for each row, may return null */ public List<String> getRowNames(); /** get the name of a row by index. */ public String getRowName (int rowIx); @Deprecated // use ColumnHeader instead public RecordMetaData getMetaData(); /** * Extract specified colums by index * returns a new DataFrame object. */ public DataFrame cut (int... columnIdx); /** * Extract specified colums by column name. * returns a new DataFrame object. */ public DataFrame cut (String... columnName); /** * Extract specified rows by index * The list of indices may contain duplicate values or re-ordered values. * returns a new DataFrame object. */ public DataFrame select (int... rowIdx); /** * Extract specified rows by index. * The input list may contain duplicate values, may re-order values, or may contain null values. * returns a new DataFrame object. */ public DataFrame select (List<Integer> rowIdx); /** * Performs a merge (a.k.a. JOIN in SQL terms) with another table. * returns a new DataFrame object. * This is a FULL JOIN - Rows where the primary key doesn't exists in either this or the other, are filled with null values. * * @deprecated : use DataFrameOperation instead * */ public DataFrame merge (DataFrame that, int onThisColumn, int onThatColumn); /** shortCut in cases where the column name is the same * * @deprecated : use DataFrameOperation instead */ @Deprecated public DataFrame merge (DataFrame that, String onColumn); /** * return column names as list * ... use getColumnHeader instead... */ @Deprecated List<String> getColumnNames(); public Object getColumnHeader(int colIx); public Header getColumnHeader(); /** Replace the column header with a new value. Note: modifies DataFrame in place, returns this */ public DataFrame setColumnHeader(int colIx, String value); /** * Turn an array of column names into an array of column indices */ public int[] getColumnIndexes(String... columnNames); public int getColumnIndex(String columnName); public void toOutputStream (OutputStream os) throws IOException; /** * Add a column * @return a new dataframe // TODO - or modify in place? */ public <T> DataFrame cbind(List<T> column); //TODO: current implementation modifies in place and returns copy of this, unlike cbind which creates a copy. public DataFrame rbind(Object... row); //TODO: these are very similar... do we need both??? //asRecordStream returns a copy of the data in the current implementation, but that is very inefficient. public RecordStream asRecordStream(); public Iterable<Record> asRecordIterable(); public int getColumnCount(); public int getRowCount(); public Object getValueAt(int rowIndex, int columnIndex); public void setValueAt(Object aValue, int rowIndex, int columnIndex); @Deprecated /** use getColumnHeader.toString() instead */ public String getColumnName(int columnIndex); public <T> Column<T> getColumn(Class<T> clazz, int columnIndex); public <T> Factor<T> getColumnAsFactor(Class<T> clazz, int columnIndex); public DataFrame sort (int columnIndex); public DataFrame sort (String columnName); /** * Ideas: * * statistical * * sum * stddev * sqsum * avg * ... * any aggregate function * * * toLongFormat * toWideFormat * * Grouping: factors * apply an aggregate function by group * * sorting * * -- efficiency * colToIntArray - get column as int array * colToDoubleArray * colToStringArray * colToObjectArray * * Implement iteration, Collection interface * Implement TableModel * * Plotting * * Change Events... */ //TODO: possibly better as Stream<Record> ??? public List<Record> filter(Predicate<Record> predicate); }