/*
* Apache License
* Version 2.0, January 2004
* http://www.apache.org/licenses/
*
* Copyright 2013 Aurelian Tutuianu
* Copyright 2014 Aurelian Tutuianu
* Copyright 2015 Aurelian Tutuianu
* Copyright 2016 Aurelian Tutuianu
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package rapaio.io;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import rapaio.data.*;
import rapaio.datasets.Datasets;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.junit.Assert.*;
/**
 * Tests for the {@link Csv} reader: header detection, single-line tokenizing
 * with and without quotes, default type fallback, row/column filtering,
 * explicit variable types and NA value handling.
 *
 * @author <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a>
 */
public class CsvTest {

    /** Classpath resource (resolved relative to {@link Datasets}) used by most tests. */
    private static final String IRIS = "iris-r.csv";

    /** Absolute tolerance used for numeric comparisons (note: 10e-12 equals 1e-11). */
    private static final double TOL = 10e-12;

    /** Variable names asserted for the {@code csv-test.csv} resource. */
    private static final String[] CSV_TEST_VAR_NAMES =
            {"Year", "Make", "Model", "Description", "Price"};

    /** Reader instance re-created before every test. */
    private Csv persistence;

    @Before
    public void setUp() {
        persistence = new Csv().withTrimSpaces(true).withEscapeChar('\"');
    }

    @After
    public void tearDown() {
        // nothing to release; kept so the fixture lifecycle stays explicit
    }

    /**
     * Reads {@code csv-test.csv} and checks the variable count and names.
     *
     * @throws IOException if the resource cannot be read; propagated so that the
     *                     real cause shows up in the test report instead of being
     *                     replaced by a generic assertion failure
     */
    @Test
    public void testHeader() throws IOException {
        Frame f = persistence.read(getClass(), "csv-test.csv");
        assertNotNull(f);
        assertEquals(5, f.varCount());
        assertArrayEquals(CSV_TEST_VAR_NAMES, f.varNames());
    }

    /** Tokenizing of a single line when quote handling is disabled. */
    @Test
    public void testLineWithoutQuotas() {
        checkLine(lineParser(false, false),
                " , ,,,", new String[]{" ", " ", "", ""});
        checkLine(lineParser(false, true),
                " , ,,,", new String[]{"", "", "", ""});
        checkLine(lineParser(false, true),
                " ana , are , mere ", new String[]{"ana", "are", "mere"});
        checkLine(lineParser(false, false),
                " ana , are , mere ", new String[]{" ana ", " are ", " mere "});
        checkLine(lineParser(false, false),
                "ana,are,mere", new String[]{"ana", "are", "mere"});
    }

    /** Tokenizing of a single line when quote handling is enabled, for two escape characters. */
    @Test
    public void testLineWithQuotas() {
        // backslash as escape character
        checkLine(lineParser(true, true).withEscapeChar('\\'),
                " \"ana", new String[]{"ana"});
        checkLine(lineParser(true, true).withEscapeChar('\\'),
                " \"ana\", \"ana again\"", new String[]{"ana", "ana again"});
        checkLine(lineParser(true, true).withEscapeChar('\\'),
                " \"ana\", \"ana,again\"", new String[]{"ana", "ana,again"});
        checkLine(lineParser(true, true).withEscapeChar('\\'),
                " \"ana\", \"ana\\\"again\"", new String[]{"ana", "ana\"again"});
        // double quote as escape character (doubled quote inside a quoted field)
        checkLine(lineParser(true, true).withEscapeChar('\"'),
                " \"ana\", \"ana\"\"again\"", new String[]{"ana", "ana\"again"});
    }

    /**
     * Same checks as {@link #testHeader()}, but with quote handling switched on.
     *
     * @throws IOException if the resource cannot be read
     */
    @Test
    public void testFullFrame() throws IOException {
        persistence.withQuotes(true);
        Frame df = persistence.read(getClass(), "csv-test.csv");
        assertNotNull(df);
        assertEquals(5, df.varCount());
        assertArrayEquals(CSV_TEST_VAR_NAMES, df.varNames());
    }

    /** Builds a comma-separated line parser with the given quote and trim settings. */
    private static Csv lineParser(boolean quotes, boolean trimSpaces) {
        return new Csv().withSeparatorChar(',').withQuotes(quotes).withTrimSpaces(trimSpaces);
    }

    /** Parses {@code line} with {@code csv} and compares the tokens with {@code matches}. */
    private void checkLine(Csv csv, String line, String[] matches) {
        List<String> tokens = csv.parseLine(line);
        assertEqualTokens(tokens, matches);
    }

    /**
     * Asserts that the parsed tokens equal the expected ones, position by position.
     *
     * @param tokens  actual tokens produced by the parser
     * @param matches expected tokens
     */
    private void assertEqualTokens(List<String> tokens, String[] matches) {
        // expected value goes first so that failure messages read correctly
        assertEquals("token count", matches.length, tokens.size());
        for (int i = 0; i < matches.length; i++) {
            assertEquals("token at position " + i, matches[i], tokens.get(i));
        }
    }

    /**
     * Reads {@code defaults-test.csv} with the default type chain
     * BINARY, INDEX, NUMERIC, NOMINAL and checks the type and the values
     * of the variables x1..x4.
     */
    @Test
    public void testDefaults() throws IOException {
        Frame df = new Csv()
                .withQuotes(true)
                .withHeader(true)
                .withDefaultTypes(VarType.BINARY, VarType.INDEX, VarType.NUMERIC, VarType.NOMINAL)
                .read(this.getClass().getResourceAsStream("defaults-test.csv"));

        assertEquals(7, df.rowCount());

        // x1 is binary
        assertEquals(VarType.BINARY, df.var("x1").type());
        assertFalse(df.binary(0, "x1"));
        assertTrue(df.binary(1, "x1"));
        assertFalse(df.binary(2, "x1"));
        assertTrue(df.binary(3, "x1"));
        assertTrue(df.missing(4, "x1"));
        assertFalse(df.binary(5, "x1"));
        assertTrue(df.binary(6, "x1"));

        // x2 is index
        assertEquals(VarType.INDEX, df.var("x2").type());
        assertEquals(0, df.index(0, "x2"));
        assertEquals(1, df.index(1, "x2"));
        assertEquals(0, df.index(2, "x2"));
        assertEquals(1, df.index(3, "x2"));
        assertTrue(df.missing(4, "x2"));
        assertEquals(2, df.index(5, "x2"));
        assertEquals(3, df.index(6, "x2"));

        // x3 is numeric
        assertEquals(VarType.NUMERIC, df.var("x3").type());
        assertEquals(0.0, df.value(0, "x3"), TOL);
        assertEquals(1.0, df.value(1, "x3"), TOL);
        assertEquals(0.0, df.value(2, "x3"), TOL);
        assertEquals(1.0, df.value(3, "x3"), TOL);
        assertEquals(Double.NaN, df.value(4, "x3"), TOL);
        assertEquals(2.0, df.value(5, "x3"), TOL);
        assertEquals(3.0, df.value(6, "x3"), TOL);

        // x4 is nominal
        assertEquals(VarType.NOMINAL, df.var("x4").type());
        assertEquals("0", df.label(0, "x4"));
        assertEquals("1", df.label(1, "x4"));
        assertEquals("false", df.label(2, "x4"));
        assertEquals("other", df.label(3, "x4"));
        assertEquals("?", df.label(4, "x4"));
        assertEquals("2", df.label(5, "x4"));
        assertEquals("3", df.label(6, "x4"));
    }

    /**
     * Checks that the different ways of selecting rows and columns
     * (explicit indexes, predicates, start/end row) produce equal frames.
     */
    @Test
    public void testSkipRows() throws IOException {
        String[] allVarNames =
                {"sepal-length", "sepal-width", "petal-length", "petal-width", "class"};

        // test no skip
        Frame full = new Csv().read(Datasets.class, IRIS);
        assertEquals(5, full.varCount());
        assertArrayEquals(allVarNames, full.varNames());

        // test skip first 10 rows
        Frame r1 = new Csv().withSkipRows(0, 1, 2, 3, 4, 5, 6, 7, 8, 9).read(Datasets.class, IRIS);
        Frame r2 = new Csv().withSkipRows(row -> row < 10).read(Datasets.class, IRIS);
        Frame r3 = new Csv().withRows(IntStream.range(10, 150).toArray()).read(Datasets.class, IRIS);
        Frame r4 = new Csv().withRows(row -> row >= 10).read(Datasets.class, IRIS);
        assertTrue(r1.deepEquals(r2));
        assertTrue(r1.deepEquals(r3));
        assertTrue(r1.deepEquals(r4));

        // test skip row % 2 == 0 and between 50 and 100
        Frame r5 = new Csv()
                .withStartRow(50)
                .withEndRow(100)
                .withSkipRows(row -> row % 2 == 0)
                .read(Datasets.class, IRIS);
        assertEquals(25, r5.rowCount());
        assertArrayEquals(new String[]{"?", "virginica"}, r5.var("class").levels());

        // test skip vars 0 and 2
        Frame v1 = new Csv().withSkipCols(0, 2).read(Datasets.class, IRIS);
        Frame v2 = new Csv().withSkipCols(col -> col == 0 || col == 2).read(Datasets.class, IRIS);
        Frame v3 = new Csv().withCols(1, 3, 4).read(Datasets.class, IRIS);
        Frame v4 = new Csv().withCols(col -> (col != 0) && (col != 2)).read(Datasets.class, IRIS);
        assertEquals(3, v1.varCount());
        assertTrue(v1.deepEquals(v2));
        assertTrue(v1.deepEquals(v3));
        assertTrue(v1.deepEquals(v4));

        // test mixed row and column selection
        Frame m1 = new Csv()
                .withRows(row -> row >= 20 && row < 30)
                .withCols(col -> col >= 2)
                .read(Datasets.class, IRIS);
        assertEquals(10, m1.rowCount());
        assertEquals(3, m1.varCount());
    }

    /**
     * Checks explicit per-variable types (the second {@code withTypes} call names
     * "sepal-length" again and the assertion expects NOMINAL for it) and that a
     * frame used as a template yields an equal frame.
     */
    @Test
    public void testTypes() throws IOException {
        Frame t1 = new Csv()
                .withTypes(VarType.NUMERIC, "sepal-length")
                .withTypes(VarType.NOMINAL, "petal-width", "sepal-length")
                .read(Datasets.class, IRIS);
        t1.printSummary();

        VarType[] types = new VarType[]{
                VarType.NOMINAL, VarType.NUMERIC, VarType.NUMERIC, VarType.NOMINAL, VarType.NOMINAL};
        assertArrayEquals(types, t1.varStream().map(Var::type).toArray());

        Frame t2 = new Csv().withTemplate(t1).read(Datasets.class, IRIS);
        assertTrue(t1.deepEquals(t2));
    }

    /** Counts complete rows under different NA value configurations. */
    @Test
    public void testNAValues() throws IOException {
        // no NA values
        Frame na1 = new Csv().read(Datasets.class, IRIS);
        assertEquals(150, na1.stream().complete().count());

        // NA values which do not appear in the data
        Frame na2 = new Csv().withNAValues("", "xxxx").read(Datasets.class, IRIS);
        assertEquals(150, na2.stream().complete().count());

        // a class label declared as NA
        Frame na3 = new Csv()
                .withNAValues("virginica")
                .withTypes(VarType.NOMINAL, "sepal-length")
                .read(Datasets.class, IRIS);
        assertEquals(100, na3.stream().complete().count());

        // a class label and a sepal-length label declared as NA
        Frame na4 = new Csv()
                .withNAValues("virginica", "5")
                .withTypes(VarType.NOMINAL, "sepal-length")
                .read(Datasets.class, IRIS);
        assertEquals(89, na4.stream().complete().count());
    }
}