package nl.helixsoft.stats;
import java.io.FileNotFoundException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import junit.framework.TestCase;
import nl.helixsoft.recordstream.ReduceFunctions;
import nl.helixsoft.recordstream.StreamException;
import nl.helixsoft.recordstream.TabularIO;
import nl.helixsoft.stats.DataFrameOperation.JoinType;
/**
 * Tests for all types of DataFrame and DataFrameOperation.
 * <p>
 * The fixtures are instance fields built with
 * {@link DataFrameOperation#fromArray}; the test methods compare the
 * outcome of an operation on a fixture against a hand-written expected
 * frame using {@link #assertDataFrameEquals(DataFrame, DataFrame)}.
 */
public class TestDataFrame extends TestCase
{
	/**
	 * Asserts that two data frames have the same number of columns and rows,
	 * the same column headers, and equal values in every cell.
	 * <p>
	 * This helper is intentionally not called {@code assertSame}: that name
	 * is already used by JUnit for a reference-identity check, whereas this
	 * method compares content.
	 *
	 * @param expected the frame holding the expected headers and values
	 * @param observed the frame produced by the code under test
	 */
	private static void assertDataFrameEquals (DataFrame expected, DataFrame observed)
	{
		// Check the dimensions first, so that the cell loop below never
		// indexes outside the observed frame.
		assertEquals ("Number of columns is not the same", expected.getColumnCount(), observed.getColumnCount());
		assertEquals ("Number of rows is not the same", expected.getRowCount(), observed.getRowCount());

		final int colCount = expected.getColumnCount();
		final int rowCount = expected.getRowCount();
		for (int col = 0; col < colCount; ++col)
		{
			assertEquals ("Column name mismatch", expected.getColumnHeader(col), observed.getColumnHeader(col));
			for (int row = 0; row < rowCount; ++row)
			{
				assertEquals ("Value mismatch at (" + row + "," + col + ")",
						expected.getValueAt(row, col), observed.getValueAt(row, col));
			}
		}
	}

	/** Fixture: seven rows pairing a number (1.0 - 7.0) with a weekday name. */
	DataFrame days = DataFrameOperation.fromArray(
			new String[] { "number", "day" },
			new Object[][] {
					{ 1.0, "Monday" },
					{ 2.0, "Tuesday" },
					{ 3.0, "Wednesday" },
					{ 4.0, "Thursday" },
					{ 5.0, "Friday" },
					{ 6.0, "Saturday" },
					{ 7.0, "Sunday" },
			});

	/** Column headers of the fixture are reported in declaration order. */
	public void testHeader()
	{
		assertEquals ("number", days.getColumnHeader(0).toString());
		assertEquals ("day", days.getColumnHeader(1).toString());
	}

	/** A DefaultColumnView on column n reports the header of column n. */
	public void testColumnView()
	{
		// Expected value goes first, so that a failure message labels the
		// values correctly. The header is compared through toString(), the
		// same way testHeader does it.
		Column<?> c = new DefaultColumnView(days, 0);
		assertEquals ("number", c.getHeader().toString());

		Column<?> c2 = new DefaultColumnView(days, 1);
		assertEquals ("day", c2.getHeader().toString());
	}

	/** Sorting on the "day" column orders the rows alphabetically by day name. */
	public void testRowSort()
	{
		DataFrame result = days.sort("day");

		DataFrame expect = DataFrameOperation.fromArray(
				new String[] { "number", "day" },
				new Object[][] {
						{ 5.0, "Friday" },
						{ 1.0, "Monday" },
						{ 6.0, "Saturday" },
						{ 7.0, "Sunday" },
						{ 4.0, "Thursday" },
						{ 2.0, "Tuesday" },
						{ 3.0, "Wednesday" },
				});

		assertDataFrameEquals (expect, result);
	}

	/** cut(0) is expected to yield a frame with only the first column. */
	public void testCut()
	{
		DataFrame result = days.cut(0);

		DataFrame expect = DataFrameOperation.fromArray(
				new String[] { "number" },
				new Object[][] {
						{ 1.0 },
						{ 2.0 },
						{ 3.0 },
						{ 4.0 },
						{ 5.0 },
						{ 6.0 },
						{ 7.0 },
				});

		assertDataFrameEquals (expect, result);
	}

	/**
	 * Smoke test only: calls cbind with a list, an array and a matrix of four
	 * rows each. It passes as long as the call does not throw.
	 */
	public void testCbind()
	{
		List<Integer> ints = new ArrayList<Integer>();
		for (int x : new int[] { 1, 2, 3, 4 }) { ints.add(x); }

		String[] data = new String[] {"one", "two", "three", "four"};
		Matrix m1 = new Matrix(4, 4);

		DataFrame df = DataFrameOperation.cbind(ints, data, m1);
		//TODO: assert on the contents of df
	}

	/**
	 * Binds two single-row frames with identical headers. The comparison
	 * against the expected two-row frame is still disabled, so for now this
	 * only checks that rbind does not throw.
	 */
	public void testRowBind()
	{
		DataFrame df1 = DataFrameOperation.fromArray(
				new String[] { "number", "string" },
				new Object[][] {
						{ 1.0, "Hello" }
				});

		DataFrame df2 = DataFrameOperation.fromArray(
				new String[] { "number", "string" },
				new Object[][] {
						{ 2.0, "World" }
				});

		DataFrame result = DataFrameOperation.rbind(df1, df2);

		DataFrame expect = DataFrameOperation.fromArray(
				new String[] { "number", "string" },
				new Object[][] {
						{ 1.0, "Hello" },
						{ 2.0, "World" },
				});

//		assertDataFrameEquals (expect, result); //TODO
	}

	/** Fixture in long format: one row per (year, quarter, project) with a day count. */
	DataFrame dfLong = DataFrameOperation.fromArray(
			new String[] { "year", "quarter", "project", "days" },
			new Object[][] {
					{ "2014", "Spring", "Project1", 5 },
					{ "2014", "Spring", "Project2", 3 },
					{ "2014", "Fall", "Project1", 6 },
					{ "2014", "Fall", "Project2", 4 },
					{ "2015", "Spring", "Project1", 1 },
					{ "2015", "Spring", "Project2", 2 },
					{ "2015", "Fall", "Project1", 8 },
					{ "2015", "Fall", "Project2", 9 },
			});

	/**
	 * Smoke test only: converts the long fixture to wide format and writes
	 * the result to standard output. Nothing is asserted yet.
	 */
	public void testWide() throws StreamException, FileNotFoundException
	{
		//TODO: compare against an expected frame instead of printing
		DataFrame result = DataFrameOperation.wideFormat(dfLong)
				.withColumnFactor("year", "quarter")
				.withRowFactor("project")
				.withValue("days")
				.get();

		TabularIO.write(result).to(System.out).go();
	}

	/** Grouping the long fixture by year and summing "days" gives one total per year. */
	public void testCollapse() throws StreamException
	{
		DataFrame result = DataFrameOperation.groupBy(dfLong, "year").agg("days", ReduceFunctions.INT_SUM).get();

		// 2014: 5 + 3 + 6 + 4 = 18, 2015: 1 + 2 + 8 + 9 = 20
		DataFrame expect = DataFrameOperation.fromArray(
				new String[] { "year", "days" },
				new Object[][] {
						{ "2014", 18 },
						{ "2015", 20 },
				});

		assertDataFrameEquals (expect, result);
	}

	/**
	 * Test the DataFrame.select method: row indices may repeat, and a null
	 * index is expected to produce a row of nulls.
	 */
	public void testSelect() throws StreamException
	{
		DataFrame result = days.select(Arrays.asList (new Integer[] { 5, 5, null, 3 }));

		DataFrame expect = DataFrameOperation.fromArray(
				new String[] { "number", "day" },
				new Object[][] {
						{ 6.0, "Saturday" },
						{ 6.0, "Saturday" },
						{ null, null },
						{ 4.0, "Thursday" },
				});

		assertDataFrameEquals (expect, result);
	}

	/** Left-hand join fixture: id "1" occurs twice, id "2" once. */
	DataFrame dfLeft = DataFrameOperation.fromArray(
			new String[] { "id", "letter" },
			new Object[][] {
					{ "1", "A" },
					{ "1", "a" },
					{ "2", "B" }
			});

	/** Right-hand join fixture: shares id "1" with dfLeft, id "3" is unique to it. */
	DataFrame dfRight = DataFrameOperation.fromArray(
			new String[] { "id", "roman" },
			new Object[][] {
					{ "1", "I" },
					{ "3", "III" }
			});

	/**
	 * Test DataFrameOperation.merge on the "id" column for the full, left,
	 * right and inner join variants. Each result is sorted by "id" before it
	 * is compared with the expected frame.
	 */
	public void testJoin() throws StreamException
	{
		DataFrame result, expect;

		// Full join: every id from either side, nulls where a side has no match.
		result = DataFrameOperation.merge(dfLeft, dfRight).onColumn("id").fullJoin().get().sort("id");
		expect = DataFrameOperation.fromArray(
				new String[] { "id", "letter", "roman" },
				new Object[][] {
						{ "1", "A", "I" },
						{ "1", "a", "I" },
						{ "2", "B", null },
						{ "3", null, "III" }
				});
		assertDataFrameEquals (expect, result);

		// Left join: all rows of dfLeft, id "3" is dropped.
		result = DataFrameOperation.merge(dfLeft, dfRight).onColumn("id").leftJoin().get().sort("id");
		expect = DataFrameOperation.fromArray(
				new String[] { "id", "letter", "roman" },
				new Object[][] {
						{ "1", "A", "I" },
						{ "1", "a", "I" },
						{ "2", "B", null },
				});
		assertDataFrameEquals (expect, result);

		// Right join: all rows of dfRight, id "2" is dropped.
		result = DataFrameOperation.merge(dfLeft, dfRight).onColumn("id").rightJoin().get().sort("id");
		expect = DataFrameOperation.fromArray(
				new String[] { "id", "letter", "roman" },
				new Object[][] {
						{ "1", "A", "I" },
						{ "1", "a", "I" },
						{ "3", null, "III" }
				});
		assertDataFrameEquals (expect, result);

		// Inner join: only id "1", which is present on both sides.
		result = DataFrameOperation.merge(dfLeft, dfRight).onColumn("id").innerJoin().get().sort("id");
		expect = DataFrameOperation.fromArray(
				new String[] { "id", "letter", "roman" },
				new Object[][] {
						{ "1", "A", "I" },
						{ "1", "a", "I" },
				});
		assertDataFrameEquals (expect, result);
	}

	/** Placeholder: no behaviour is exercised yet, so this always passes. */
	public void testJoinNulls()
	{
		//TODO: not implemented
	}

	/** Placeholder: no behaviour is exercised yet, so this always passes. */
	public void testFactors()
	{
		//TODO: not implemented
//		DataFrameOperation.merge (df1, df2);
	}
}