/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.studio.io.data.internal.file.csv;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Locale;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import com.rapidminer.core.io.data.ColumnMetaData;
import com.rapidminer.core.io.data.ColumnMetaData.ColumnType;
import com.rapidminer.core.io.data.DataSet;
import com.rapidminer.core.io.data.DataSetException;
import com.rapidminer.core.io.data.DataSetRow;
import com.rapidminer.core.io.data.ParseException;
import com.rapidminer.studio.io.data.HeaderRowBehindStartRowException;
import com.rapidminer.studio.io.data.HeaderRowNotFoundException;
import com.rapidminer.studio.io.data.StartRowNotFoundException;
import com.rapidminer.studio.io.data.internal.ResultSetAdapter;
import com.rapidminer.studio.io.data.internal.file.FileDataSourceTestUtils;
import com.rapidminer.tools.Tools;
/**
* Unit tests for the {@link CSVDataSource#getData()} method.
*
* @author Nils Woehler, Gisa Schaefer
*
*/
public class CSVDataSourceDataTest {
// CSV fixture files; each one is resolved from the test class path in setup()
// iris1.csv
private static File simpleTestFile;
// iris3.csv
private static File simpleTestFileCommentsQuotesAndEscape;
// iris2.csv
private static File simpleTestFileSeparatorAndDecimalCharacter;
// iris4.csv
private static File simpleTestFileSpaceAsSeparator;
// missingInHeaderRow.csv
private static File missingInHeaderRow;
// nominal_dates_1.csv
private static File nominalDateTestFile;
// remember system locale so that tearDown() can restore it
private static Locale systemLocale = Locale.getDefault();
/**
 * Resolves all CSV fixture files relative to this test class and switches the default locale to
 * {@link Locale#ENGLISH}.
 *
 * @throws URISyntaxException
 *             if the URL of a fixture cannot be converted into a URI
 * @throws IOException
 *             declared by the signature
 */
@BeforeClass
public static void setup() throws URISyntaxException, IOException {
    final Class<?> fixtureOwner = CSVDataSourceDataTest.class;

    simpleTestFile = new File(fixtureOwner.getResource("iris1.csv").toURI());
    simpleTestFileSeparatorAndDecimalCharacter = new File(fixtureOwner.getResource("iris2.csv").toURI());
    simpleTestFileCommentsQuotesAndEscape = new File(fixtureOwner.getResource("iris3.csv").toURI());
    simpleTestFileSpaceAsSeparator = new File(fixtureOwner.getResource("iris4.csv").toURI());
    missingInHeaderRow = new File(fixtureOwner.getResource("missingInHeaderRow.csv").toURI());
    nominalDateTestFile = new File(fixtureOwner.getResource("nominal_dates_1.csv").toURI());

    // pin the locale: otherwise test results might differ depending on the locale of the
    // system running the tests
    Locale.setDefault(Locale.ENGLISH);
}
/**
 * Puts back the default locale that was stored in {@code systemLocale} when this class was
 * initialized.
 */
@AfterClass
public static void tearDown() {
    final Locale rememberedLocale = systemLocale;
    Locale.setDefault(rememberedLocale);
}
/**
 * Reads {@code iris1.csv} with the header in row 0 and UTF-8 encoding. Verifies the guessed meta
 * data, the content of the first, 10th and last data row, and that after a reset the data set
 * reports index -1 and serves the first row again.
 */
@Test
public void defaultMetaDataTest() throws DataSetException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(0);
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);

        // use default guessed meta data
        dataSource.createMetaData();

        assertFalse(dataSource.getMetadata().isFaultTolerant());
        assertEquals(Tools.DATE_TIME_FORMAT.get(), dataSource.getMetadata().getDateFormat());
        assertEquals(6, dataSource.getMetadata().getColumnMetaData().size());

        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(0), "a1", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(1), "a2", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(2), "a3133333333333333331311313",
                ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(3), "a4", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(4), "id", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(5), FileDataSourceTestUtils.getUtf8Label(),
                ColumnType.CATEGORICAL);

        try (DataSet data = dataSource.getData()) {
            assertEquals(-1, data.getNumberOfRows());
            assertTrue(data.hasNext());

            int index = -1;
            assertEquals(index, data.getCurrentRowIndex());

            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data content of the first, 10th and last row
                if (index == 0) {
                    assertFirstSheetRowContent(row);
                } else if (index == 9) {
                    assertEquals(4.9, row.getDouble(0), 1e-10);
                    assertEquals(3.1, row.getDouble(1), 1e-10);
                    assertEquals(1.5, row.getDouble(2), 1e-10);
                    assertEquals(.1, row.getDouble(3), 1e-10);
                    assertEquals("id_10", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 149) {
                    assertEquals(5.9, row.getDouble(0), 1e-10);
                    assertEquals(3.0, row.getDouble(1), 1e-10);
                    assertEquals(5.1, row.getDouble(2), 1e-10);
                    assertEquals(1.8, row.getDouble(3), 1e-10);
                    assertEquals("id_150", row.getString(4));
                    assertEquals("Iris-virginica", row.getString(5));
                }
            }
            assertEquals(149, data.getCurrentRowIndex());

            // check reset (expected value first, so that a failure message reads correctly)
            data.reset();
            assertEquals(-1, data.getCurrentRowIndex());
            assertTrue(data.hasNext());
            assertFirstSheetRowContent(data.nextRow());
        }
    }
}
/**
 * Reads {@code iris2.csv} using {@code |} as column separator and {@code ,} as decimal character.
 * Verifies the guessed meta data, the content of the first, 10th and last data row, and that
 * after a reset the data set reports index -1 and serves the first row again.
 */
@Test
public void defaultTestWithChangedSeparatorAndDecimalCharacter() throws DataSetException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFileSeparatorAndDecimalCharacter.toPath());
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(0);
        dataSource.getResultSetConfiguration().setColumnSeparators("|");
        dataSource.getResultSetConfiguration().setDecimalCharacter(',');

        // use default guessed meta data
        dataSource.createMetaData();

        assertEquals(6, dataSource.getMetadata().getColumnMetaData().size());

        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(0), "a1", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(1), "a2", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(2), "a3133333333333333331311313",
                ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(3), "a4", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(4), "id", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(5), FileDataSourceTestUtils.getUtf8Label(),
                ColumnType.CATEGORICAL);

        try (DataSet data = dataSource.getData()) {
            assertEquals(-1, data.getNumberOfRows());
            assertTrue(data.hasNext());

            int index = -1;
            assertEquals(index, data.getCurrentRowIndex());

            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data content of the first, 10th and last row
                if (index == 0) {
                    assertFirstSheetRowContent(row);
                } else if (index == 9) {
                    assertEquals(4.9, row.getDouble(0), 1e-10);
                    assertEquals(3.1, row.getDouble(1), 1e-10);
                    assertEquals(1.5, row.getDouble(2), 1e-10);
                    assertEquals(.1, row.getDouble(3), 1e-10);
                    assertEquals("id_10", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 149) {
                    assertEquals(5.9, row.getDouble(0), 1e-10);
                    assertEquals(3.0, row.getDouble(1), 1e-10);
                    assertEquals(5.1, row.getDouble(2), 1e-10);
                    assertEquals(1.8, row.getDouble(3), 1e-10);
                    assertEquals("id_150", row.getString(4));
                    assertEquals("Iris-virginica", row.getString(5));
                }
            }
            assertEquals(149, data.getCurrentRowIndex());

            // check reset (expected value first, so that a failure message reads correctly)
            data.reset();
            assertEquals(-1, data.getCurrentRowIndex());
            assertTrue(data.hasNext());
            assertFirstSheetRowContent(data.nextRow());
        }
    }
}
/**
 * Passes only if a {@link HeaderRowBehindStartRowException} is thrown while {@code iris1.csv} is
 * configured with header row 1 (start row left untouched) and its meta data is created.
 */
@Test(expected = HeaderRowBehindStartRowException.class)
public void headerRowBehindStartRow() throws DataSetException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFile.toPath());

        // move the header row behind the start row
        csvSource.getResultSetConfiguration().setHasHeaderRow(true);
        csvSource.getResultSetConfiguration().setHeaderRow(1);

        // trigger the meta data configuration
        csvSource.createMetaData();
    }
}
/**
 * Passes only if a {@link HeaderRowBehindStartRowException} is thrown while {@code iris1.csv} is
 * configured with starting row 10 and header row 15 and its meta data is created.
 */
@Test(expected = HeaderRowBehindStartRowException.class)
public void headerRowBehindStartRow2() throws DataSetException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFile.toPath());

        // header row (15) lies behind the data start row (10)
        csvSource.getResultSetConfiguration().setStartingRow(10);
        csvSource.getResultSetConfiguration().setHasHeaderRow(true);
        csvSource.getResultSetConfiguration().setHeaderRow(15);

        // trigger the meta data configuration
        csvSource.createMetaData();
    }
}
/**
 * Passes only if a {@link StartRowNotFoundException} is thrown while {@code iris1.csv} is
 * configured with starting row 151 and header row 150 and its meta data is created.
 */
@Test(expected = StartRowNotFoundException.class)
public void startRowNotAvailable() throws DataSetException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFile.toPath());

        // the start row is placed behind the actual data content
        csvSource.getResultSetConfiguration().setStartingRow(151);
        csvSource.getResultSetConfiguration().setHasHeaderRow(true);
        csvSource.getResultSetConfiguration().setHeaderRow(150);

        // trigger the meta data configuration
        csvSource.createMetaData();
    }
}
/**
 * Passes only if a {@link HeaderRowNotFoundException} is thrown while {@code iris1.csv} is
 * configured with both starting row and header row set to 155 and its meta data is created.
 */
@Test(expected = HeaderRowNotFoundException.class)
public void headerRowNotFound() throws DataSetException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFile.toPath());

        // start row and header row both point to row 155
        csvSource.getResultSetConfiguration().setStartingRow(155);
        csvSource.getResultSetConfiguration().setHasHeaderRow(true);
        csvSource.getResultSetConfiguration().setHeaderRow(155);

        csvSource.createMetaData();
    }
}
/**
 * Reads {@code iris1.csv} with the header in row 0 and the data starting at row 4. Verifies the
 * guessed column meta data, the content of the first, 10th and last returned row, and that the
 * data set reports index -1 and has rows again after a reset.
 */
@Test
public void dataContentStartsAtFithRow() throws DataSetException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        dataSource.getResultSetConfiguration().setStartingRow(4);
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(0);
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);

        // use default guessed meta data
        dataSource.createMetaData();

        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(0), "a1", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(1), "a2", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(2), "a3133333333333333331311313",
                ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(3), "a4", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(4), "id", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(5), FileDataSourceTestUtils.getUtf8Label(),
                ColumnType.CATEGORICAL);

        try (DataSet data = dataSource.getData()) {
            assertTrue(data.hasNext());

            int index = -1;
            assertEquals(index, data.getCurrentRowIndex());

            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data content of first, 10th, last row
                if (index == 0) {
                    assertEquals(4.6, row.getDouble(0), 1e-10);
                    assertEquals(3.1, row.getDouble(1), 1e-10);
                    assertEquals(1.5, row.getDouble(2), 1e-10);
                    assertEquals(.2, row.getDouble(3), 1e-10);
                    assertEquals("id_4", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 9) {
                    assertEquals(4.8, row.getDouble(0), 1e-10);
                    assertEquals(3.0, row.getDouble(1), 1e-10);
                    assertEquals(1.4, row.getDouble(2), 1e-10);
                    assertEquals(.1, row.getDouble(3), 1e-10);
                    assertEquals("id_13", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 146) {
                    assertEquals(5.9, row.getDouble(0), 1e-10);
                    assertEquals(3.0, row.getDouble(1), 1e-10);
                    assertEquals(5.1, row.getDouble(2), 1e-10);
                    assertEquals(1.8, row.getDouble(3), 1e-10);
                    assertEquals("id_150", row.getString(4));
                    assertEquals("Iris-virginica", row.getString(5));
                }
            }
            assertEquals(146, data.getCurrentRowIndex());

            // check reset (expected value first, so that a failure message reads correctly)
            data.reset();
            assertEquals(-1, data.getCurrentRowIndex());
            assertTrue(data.hasNext());
        }
    }
}
/**
 * Reads {@code iris1.csv} without a header row. Verifies that all six columns get generated
 * names ({@code att1} .. {@code att6}) and are guessed as categorical, that the first returned
 * row holds the original header cells, and that the data set reports index -1 and has rows again
 * after a reset.
 */
@Test
public void noHeaderRowDefined() throws DataSetException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        dataSource.getResultSetConfiguration().setHasHeaderRow(false);
        dataSource.getResultSetConfiguration().setHeaderRow(ResultSetAdapter.NO_HEADER_ROW);
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);

        // use default guessed meta data
        dataSource.createMetaData();

        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(0), "att1", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(1), "att2", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(2), "att3", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(3), "att4", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(4), "att5", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(5), "att6", ColumnType.CATEGORICAL);

        try (DataSet data = dataSource.getData()) {
            assertEquals(-1, data.getNumberOfRows());
            assertTrue(data.hasNext());

            int index = -1;
            assertEquals(index, data.getCurrentRowIndex());

            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data content of first, 10th, last row
                if (index == 0) {
                    // without a header row the header cells are returned as ordinary data
                    assertEquals("a1", row.getString(0));
                    assertEquals("a2", row.getString(1));
                    assertEquals("a3133333333333333331311313", row.getString(2));
                    assertEquals("a4", row.getString(3));
                    assertEquals("id", row.getString(4));
                    assertEquals(FileDataSourceTestUtils.getUtf8Label(), row.getString(5));
                } else if (index == 9) {
                    assertEquals("2.9", row.getString(1));
                    assertEquals("1.4", row.getString(2));
                    assertEquals("0.2", row.getString(3));
                    assertEquals("id_9", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 150) {
                    assertEquals("5.9", row.getString(0));
                    assertEquals("3.0", row.getString(1));
                    assertEquals("1.8", row.getString(3));
                    assertEquals("id_150", row.getString(4));
                    assertEquals("Iris-virginica", row.getString(5));
                }
            }
            assertEquals(150, data.getCurrentRowIndex());

            // check reset (expected value first, so that a failure message reads correctly)
            data.reset();
            assertEquals(-1, data.getCurrentRowIndex());
            assertTrue(data.hasNext());
        }
    }
}
/**
 * Reads {@code missingInHeaderRow.csv} with the header in row 0 and verifies the names and types
 * guessed for the first four columns; the third column is expected to be named {@code att3}.
 */
@Test
public void missingInHeaderRow() throws DataSetException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(missingInHeaderRow.toPath());
        csvSource.getResultSetConfiguration().setHasHeaderRow(true);
        csvSource.getResultSetConfiguration().setHeaderRow(0);

        // rely on the guessed default meta data
        csvSource.createMetaData();

        checkColumnMetaData(csvSource.getMetadata().getColumnMetaData(0), "a1", ColumnType.REAL);
        checkColumnMetaData(csvSource.getMetadata().getColumnMetaData(1), "id", ColumnType.CATEGORICAL);
        checkColumnMetaData(csvSource.getMetadata().getColumnMetaData(2), "att3", ColumnType.CATEGORICAL);
        checkColumnMetaData(csvSource.getMetadata().getColumnMetaData(3), "date", ColumnType.CATEGORICAL);
    }
}
/**
 * Reads {@code iris1.csv} with the header taken from row 1 and the data starting at row 4.
 * Verifies that the column names equal the cells of row 1, the content of the first, 10th and
 * last returned row, and that the data set reports index -1 and has rows again after a reset.
 */
@Test
public void dataContentStartsAtFithRowHeaderRowAsSecondRow() throws DataSetException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());

        // set content start row to 5th row and header row to second row
        dataSource.getResultSetConfiguration().setStartingRow(4);
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(1);

        // use default guessed meta data
        dataSource.createMetaData();

        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(0), "5.1", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(1), "3.5", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(2), "1.4", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(3), "0.2", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(4), "id_1", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(5), "Iris-setosa", ColumnType.CATEGORICAL);

        try (DataSet data = dataSource.getData()) {
            assertEquals(-1, data.getNumberOfRows());
            assertTrue(data.hasNext());

            int index = -1;
            assertEquals(index, data.getCurrentRowIndex());

            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data content of first, 10th and last row
                if (index == 0) {
                    assertEquals(4.6, row.getDouble(0), 1e-10);
                    assertEquals(3.1, row.getDouble(1), 1e-10);
                    assertEquals(1.5, row.getDouble(2), 1e-10);
                    assertEquals(.2, row.getDouble(3), 1e-10);
                    assertEquals("id_4", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 9) {
                    assertEquals(4.8, row.getDouble(0), 1e-10);
                    assertEquals(3.0, row.getDouble(1), 1e-10);
                    assertEquals(1.4, row.getDouble(2), 1e-10);
                    assertEquals(.1, row.getDouble(3), 1e-10);
                    assertEquals("id_13", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 146) {
                    assertEquals(5.9, row.getDouble(0), 1e-10);
                    assertEquals(3.0, row.getDouble(1), 1e-10);
                    assertEquals(5.1, row.getDouble(2), 1e-10);
                    assertEquals(1.8, row.getDouble(3), 1e-10);
                    assertEquals("id_150", row.getString(4));
                    assertEquals("Iris-virginica", row.getString(5));
                }
            }
            assertEquals(146, data.getCurrentRowIndex());

            // check reset (expected value first, so that a failure message reads correctly)
            data.reset();
            assertEquals(-1, data.getCurrentRowIndex());
            assertTrue(data.hasNext());
        }
    }
}
/**
 * Reads {@code iris1.csv} with the header in row 0 and the data starting at row 50. Verifies the
 * guessed meta data, the content of the first and last returned row, and that the data set
 * reports index -1 and has rows again after a reset.
 */
@Test
public void firstDataRowDefined() throws DataSetException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(0);
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);

        // start with 50th data row
        dataSource.getResultSetConfiguration().setStartingRow(50);

        // use default guessed meta data
        dataSource.createMetaData();

        assertFalse(dataSource.getMetadata().isFaultTolerant());
        assertEquals(Tools.DATE_TIME_FORMAT.get(), dataSource.getMetadata().getDateFormat());
        assertEquals(6, dataSource.getMetadata().getColumnMetaData().size());

        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(0), "a1", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(1), "a2", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(2), "a3133333333333333331311313",
                ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(3), "a4", ColumnType.REAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(4), "id", ColumnType.CATEGORICAL);
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(5), FileDataSourceTestUtils.getUtf8Label(),
                ColumnType.CATEGORICAL);

        try (DataSet data = dataSource.getData()) {
            assertTrue(data.hasNext());

            int index = -1;
            assertEquals(index, data.getCurrentRowIndex());

            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data content of first and last row
                if (index == 0) {
                    assertEquals(5.0, row.getDouble(0), 1e-10);
                    assertEquals(3.3, row.getDouble(1), 1e-10);
                    assertEquals(1.4, row.getDouble(2), 1e-10);
                    assertEquals(.2, row.getDouble(3), 1e-10);
                    assertEquals("id_50", row.getString(4));
                    assertEquals("Iris-setosa", row.getString(5));
                } else if (index == 100) {
                    // check row 150 = 50 + 100, i.e. the 100th row 0-based
                    assertEquals(5.9, row.getDouble(0), 1e-10);
                    assertEquals(3.0, row.getDouble(1), 1e-10);
                    assertEquals(5.1, row.getDouble(2), 1e-10);
                    assertEquals(1.8, row.getDouble(3), 1e-10);
                    assertEquals("id_150", row.getString(4));
                    assertEquals("Iris-virginica", row.getString(5));
                }
            }
            assertEquals(100, data.getCurrentRowIndex());

            // check reset (expected value first, so that a failure message reads correctly)
            data.reset();
            assertEquals(-1, data.getCurrentRowIndex());
            assertTrue(data.hasNext());
        }
    }
}
/**
 * Passes only if a {@link StartRowNotFoundException} is thrown while {@code iris1.csv} is
 * configured with a tab as column separator and its meta data is created.
 */
@Test(expected = StartRowNotFoundException.class)
public void wrongColumnSeparator() throws DataSetException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFile.toPath());
        csvSource.getResultSetConfiguration().setColumnSeparators("\t");

        // attempt to guess the meta data although nothing can be parsed
        csvSource.createMetaData();
    }
}
/**
 * Reads {@code iris1.csv} as UTF-8 and verifies the id and the label of the 34th returned row.
 * The test fails if the data set ends before that row is reached.
 */
@Test
public void encodingTestUtf8() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        // same charset as Charset.forName("UTF-8"), without the by-name lookup
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);

        try (DataSet data = dataSource.getData()) {
            boolean reachedRow34 = false;
            int index = -1;
            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data 34th row
                if (index == 33) {
                    assertEquals("id_34", row.getString(4));
                    assertEquals(FileDataSourceTestUtils.getUtf8Entry(), row.getString(5));
                    reachedRow34 = true;
                    break;
                }
            }
            // guard against a vacuous pass when fewer than 34 rows are returned
            assertTrue("data set ended before the 34th row was reached", reachedRow34);
        }
    }
}
/**
 * Reads {@code iris1.csv} with the {@code windows-1250} charset and verifies the id and the label
 * of the 34th returned row. The test fails if the data set ends before that row is reached.
 */
@Test
public void encodingTest() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        dataSource.getResultSetConfiguration().setEncoding(Charset.forName("windows-1250"));

        try (DataSet data = dataSource.getData()) {
            boolean reachedRow34 = false;
            int index = -1;
            while (data.hasNext()) {
                DataSetRow row = data.nextRow();
                ++index;
                assertEquals(index, data.getCurrentRowIndex());

                // check data 34th row
                if (index == 33) {
                    assertEquals("id_34", row.getString(4));
                    assertEquals(FileDataSourceTestUtils.getWindowsEntry(), row.getString(5));
                    reachedRow34 = true;
                    break;
                }
            }
            // guard against a vacuous pass when fewer than 34 rows are returned
            assertTrue("data set ended before the 34th row was reached", reachedRow34);
        }
    }
}
/**
 * Reads {@code iris4.csv} with a blank as column separator and line trimming switched on, then
 * verifies the content of the second returned row.
 */
@Test
public void trimLinesEnabled() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileSpaceAsSeparator.toPath());
        csvSource.getResultSetConfiguration().setTrimLines(true);
        csvSource.getResultSetConfiguration().setColumnSeparators(" ");

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // only the second row is inspected in detail
                if (rowIndex != 1) {
                    continue;
                }
                assertEquals("id_2", current.getString(4));
                assertEquals(4.9, current.getDouble(0), 1e-10);
                assertEquals(3.0, current.getDouble(1), 1e-10);
                assertEquals(1.4, current.getDouble(2), 1e-10);
                assertEquals(.2, current.getDouble(3), 1e-10);
                assertEquals("Iris-setosa", current.getString(5));
            }
        }
    }
}
/**
 * Reads {@code iris4.csv} with a blank as column separator and line trimming switched off, then
 * verifies that the second returned row has two missing leading cells and that the values follow
 * from column 2 onwards.
 */
@Test
public void trimLinesDisabled() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileSpaceAsSeparator.toPath());
        csvSource.getResultSetConfiguration().setColumnSeparators(" ");
        csvSource.getResultSetConfiguration().setTrimLines(false);

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // only the second row is inspected in detail
                if (rowIndex != 1) {
                    continue;
                }
                assertEquals("id_2", current.getString(6));
                assertTrue(current.isMissing(0));
                assertTrue(current.isMissing(1));
                assertEquals(4.9, current.getDouble(2), 1e-10);
                assertEquals(3.0, current.getDouble(3), 1e-10);
                assertEquals(1.4, current.getDouble(4), 1e-10);
                assertEquals(.2, current.getDouble(5), 1e-10);
                assertEquals("Iris-setosa", current.getString(7));
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with comment skipping switched on and verifies the id of the 42nd and
 * 43rd returned row ({@code id_42} followed by {@code id_44}).
 */
@Test
public void ignoreCommentsEnabled() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setSkipComments(true);

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the 42nd and 43rd row
                switch (rowIndex) {
                    case 41:
                        assertEquals("id_42", current.getString(4));
                        break;
                    case 42:
                        assertEquals("id_44", current.getString(4));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with comment skipping switched off and verifies that the 43rd and 44th
 * returned row keep their leading {@code #} and {@code %} characters in the first column.
 */
@Test
public void ignoreCommentsDisabled() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setSkipComments(false);

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the 42nd, 43rd and 44th row
                switch (rowIndex) {
                    case 41:
                        assertEquals("id_42", current.getString(4));
                        break;
                    case 42:
                        assertEquals("id_43", current.getString(4));
                        assertEquals("#4.4", current.getString(0));
                        break;
                    case 43:
                        assertEquals("id_44", current.getString(4));
                        assertEquals("%5.0", current.getString(0));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with comment skipping switched on and {@code %} as the comment
 * character. Verifies that the 43rd returned row still starts with {@code #} and that the 44th
 * returned row is {@code id_45}.
 */
@Test
public void ignoreCommentsEnabledOtherCharacter() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setSkipComments(true);
        csvSource.getResultSetConfiguration().setCommentCharacters("%");

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the 42nd, 43rd and 44th row
                switch (rowIndex) {
                    case 41:
                        assertEquals("id_42", current.getString(4));
                        break;
                    case 42:
                        assertEquals("id_43", current.getString(4));
                        assertEquals("#4.4", current.getString(0));
                        break;
                    case 43:
                        assertEquals("id_45", current.getString(4));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with quoting switched on (quote and escape characters left untouched)
 * and verifies columns 4 and 5 of the first three returned rows.
 */
@Test
public void usingQuotesAndStandardCharacters() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setUseQuotes(true);

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the first three rows
                switch (rowIndex) {
                    case 0:
                        assertEquals("id_1", current.getString(4));
                        assertEquals("5.1;\"3.5;1.4", current.getString(5));
                        break;
                    case 1:
                        assertEquals("id_2", current.getString(4));
                        assertEquals("Iris-setosa;", current.getString(5));
                        break;
                    case 2:
                        assertEquals("id_3", current.getString(4));
                        assertEquals("5.1;3.5;1.4", current.getString(5));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with quoting switched off and verifies, for the first three returned
 * rows, the cells from column 4 onwards.
 */
@Test
public void notUsingQuotesAndStandardCharacters() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setUseQuotes(false);

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the first three rows
                switch (rowIndex) {
                    case 0:
                        assertEquals("\"id_1\"", current.getString(4));
                        assertEquals("\"5.1", current.getString(5));
                        assertEquals("\"3.5", current.getString(6));
                        assertEquals("1.4\"", current.getString(7));
                        break;
                    case 1:
                        assertEquals("\"id_2\"", current.getString(4));
                        assertEquals("\"Iris-setosa;\"", current.getString(5));
                        break;
                    case 2:
                        assertEquals("\"id_3\"", current.getString(4));
                        assertEquals("5.1;3.5;1.4", current.getString(5));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with quoting switched off, a backtick as quote character and
 * {@code %} as escape character, then verifies the cells from column 4 onwards of the first
 * three returned rows.
 */
@Test
public void notUsingQuotesAndOtherCharacters() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setUseQuotes(false);
        csvSource.getResultSetConfiguration().setQuoteCharacter('`');
        csvSource.getResultSetConfiguration().setEscapeCharacter('%');

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the first three rows
                switch (rowIndex) {
                    case 0:
                        assertEquals("\"id_1\"", current.getString(4));
                        assertEquals("\"5.1", current.getString(5));
                        assertEquals("\\\"3.5", current.getString(6));
                        assertEquals("1.4\"", current.getString(7));
                        break;
                    case 1:
                        assertEquals("\"id_2\"", current.getString(4));
                        assertEquals("\"Iris-setosa\\", current.getString(5));
                        assertEquals("\"", current.getString(6));
                        break;
                    case 2:
                        assertEquals("\"id_3\"", current.getString(4));
                        assertEquals("5.1\\", current.getString(5));
                        assertEquals("3.5\\", current.getString(6));
                        assertEquals("1.4", current.getString(7));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with quoting switched on and a backtick as quote character, then
 * verifies the cells from column 4 onwards of the first four returned rows.
 */
@Test
public void usingOtherQuotes() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setUseQuotes(true);
        csvSource.getResultSetConfiguration().setQuoteCharacter('`');

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the first four rows
                switch (rowIndex) {
                    case 0:
                        assertEquals("\"id_1\"", current.getString(4));
                        assertEquals("\"5.1", current.getString(5));
                        assertEquals("\"3.5", current.getString(6));
                        assertEquals("1.4\"", current.getString(7));
                        break;
                    case 1:
                        assertEquals("\"id_2\"", current.getString(4));
                        assertEquals("\"Iris-setosa;\"", current.getString(5));
                        break;
                    case 2:
                        assertEquals("\"id_3\"", current.getString(4));
                        assertEquals("5.1;3.5;1.4", current.getString(5));
                        break;
                    case 3:
                        assertEquals("id_4", current.getString(4));
                        assertEquals("5.1;`3.5;1.4;", current.getString(5));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Reads {@code iris3.csv} with quoting switched on and {@code %} as escape character, then
 * verifies the cells from column 4 onwards of the first four returned rows.
 */
@Test
public void usingOtherEscapeCharacter() throws DataSetException, IndexOutOfBoundsException, ParseException {
    try (CSVDataSource csvSource = new CSVDataSource()) {
        csvSource.setLocation(simpleTestFileCommentsQuotesAndEscape.toPath());
        csvSource.getResultSetConfiguration().setUseQuotes(true);
        csvSource.getResultSetConfiguration().setEscapeCharacter('%');

        try (DataSet rows = csvSource.getData()) {
            assertTrue(rows.hasNext());
            assertEquals(-1, rows.getCurrentRowIndex());

            for (int rowIndex = 0; rows.hasNext(); rowIndex++) {
                DataSetRow current = rows.nextRow();
                assertEquals(rowIndex, rows.getCurrentRowIndex());

                // inspect the first four rows
                switch (rowIndex) {
                    case 0:
                        assertEquals("id_2", current.getString(4));
                        assertEquals("Iris-setosa\\;", current.getString(5));
                        break;
                    case 1:
                        assertEquals("id_3", current.getString(4));
                        assertEquals("5.1\\", current.getString(5));
                        assertEquals("3.5\\", current.getString(6));
                        assertEquals("1.4", current.getString(7));
                        break;
                    case 2:
                        assertEquals("`id_4`", current.getString(4));
                        assertEquals("`5.1", current.getString(5));
                        assertEquals("\\`3.5", current.getString(6));
                        assertEquals("1.4\\", current.getString(7));
                        assertEquals("`", current.getString(8));
                        break;
                    case 3:
                        assertEquals("id_5", current.getString(4));
                        assertEquals("5.3;5.3", current.getString(5));
                        break;
                    default:
                        break;
                }
            }
        }
    }
}
/**
 * Verifies the caching of the data sets handed out by the data source: as long as the
 * configuration is untouched, repeated calls of {@code getPreview(10)} and {@code getData()} have
 * to return the very same instances. After the column separators have been changed, new instances
 * are expected.
 *
 * @throws DataSetException
 *             if the preview or the data cannot be created
 */
@Test
public void cachingTest() throws DataSetException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());

        DataSet firstPreviewSet = dataSource.getPreview(10);
        DataSet firstDataSet = dataSource.getData();
        DataSet secondPreviewSet = dataSource.getPreview(10);
        DataSet secondDataSet = dataSource.getData();

        // identity (not equality) is checked on purpose: the same instance has to be returned
        assertTrue("Preview data set should be reused while the configuration is unchanged",
                firstPreviewSet == secondPreviewSet);
        assertTrue("Data set should be reused while the configuration is unchanged",
                firstDataSet == secondDataSet);

        // changing the configuration is expected to make the previously returned instances obsolete
        dataSource.getResultSetConfiguration().setColumnSeparators("\t");
        DataSet thirdPreviewSet = dataSource.getPreview(10);
        DataSet thirdDataSet = dataSource.getData();

        assertFalse("Preview data set should be recreated after the column separators changed",
                thirdPreviewSet == firstPreviewSet);
        assertFalse("Data set should be recreated after the column separators changed",
                thirdDataSet == firstDataSet);
    }
}
/**
 * Checks the number of rows delivered by the preview (limited to 10 rows) and by the full data
 * set (150 rows) of the simple test file, and that a second preview requested after reading the
 * full data still delivers 10 rows.
 */
@Test
public void lengthTest() throws DataSetException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        assertLastRowIndexAfterFullIteration(dataSource.getPreview(10), 9);
        assertLastRowIndexAfterFullIteration(dataSource.getData(), 149);
        assertLastRowIndexAfterFullIteration(dataSource.getPreview(10), 9);
    }
}

/**
 * Resets the given data set, checks that its row index is -1 afterwards, reads all rows and
 * finally checks the row index of the last row that was read.
 *
 * @param dataSet
 *            the data set to iterate over
 * @param expectedLastRowIndex
 *            the row index expected once no more rows are available
 */
private void assertLastRowIndexAfterFullIteration(DataSet dataSet, int expectedLastRowIndex)
        throws DataSetException {
    dataSet.reset();
    assertEquals(-1, dataSet.getCurrentRowIndex());
    while (dataSet.hasNext()) {
        dataSet.nextRow();
    }
    assertEquals(expectedLastRowIndex, dataSet.getCurrentRowIndex());
}
/**
 * Reads the nominal date test file, checks that the column at index 6 ("date") is guessed as
 * categorical, switches it to date-time with a matching date format and verifies the parsed
 * dates: every row except the one with row index 64 has to provide a date, and the rows with
 * index 20 and 50 have to match known values.
 */
@Test
public void simpleNominalToDateTest()
        throws DataSetException, ParseException, IndexOutOfBoundsException, java.text.ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(nominalDateTestFile.toPath());
        dataSource.getResultSetConfiguration().setColumnSeparators(";");
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(0);
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);

        // use default guessed meta data
        dataSource.createMetaData();

        int dateColumnIndex = 6;
        // The pattern contains the AM/PM marker 'a'. Use a fixed locale so that the English markers
        // used in this test are parsed independently of the platform default locale.
        SimpleDateFormat dateFormat = new SimpleDateFormat("M/d/yy h:mm a", Locale.ENGLISH);

        // check meta data and set to date
        assertEquals(7, dataSource.getMetadata().getColumnMetaData().size());
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(dateColumnIndex), "date", ColumnType.CATEGORICAL);
        dataSource.getMetadata().getColumnMetaData().get(dateColumnIndex).setType(ColumnType.DATETIME);

        // set correct date format
        dataSource.getMetadata().setDateFormat(dateFormat);

        // close the data set explicitly, like the other tests of this class that read data
        try (DataSet ds = dataSource.getData()) {
            while (ds.hasNext()) {
                DataSetRow row = ds.nextRow();

                // the row with index 64 is the only one expected to have no date
                if (ds.getCurrentRowIndex() != 64) {
                    assertFalse(row.isMissing(dateColumnIndex));
                } else {
                    assertTrue(row.isMissing(dateColumnIndex));
                }

                // spot check of two parsed values
                if (ds.getCurrentRowIndex() == 20) {
                    assertEquals(dateFormat.parse("2/2/17 8:24 AM"), row.getDate(dateColumnIndex));
                } else if (ds.getCurrentRowIndex() == 50) {
                    assertEquals(dateFormat.parse("6/11/16 12:24 PM"), row.getDate(dateColumnIndex));
                }
            }
        }
    }
}
/**
 * Reads the nominal date test file and switches the column at index 6 ("date") from categorical
 * to date-time without setting a date format on the meta data. Reading the dates is expected to
 * end with a {@link ParseException}.
 */
@Test(expected = ParseException.class)
public void wrongDateFormatTest()
        throws DataSetException, ParseException, IndexOutOfBoundsException, java.text.ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(nominalDateTestFile.toPath());
        dataSource.getResultSetConfiguration().setColumnSeparators(";");
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(0);
        dataSource.getResultSetConfiguration().setEncoding(StandardCharsets.UTF_8);

        // use default guessed meta data
        dataSource.createMetaData();

        final int dateColumnIndex = 6;

        // check meta data and set to date
        checkColumnMetaData(dataSource.getMetadata().getColumnMetaData(dateColumnIndex), "date", ColumnType.CATEGORICAL);
        dataSource.getMetadata().getColumnMetaData().get(dateColumnIndex).setType(ColumnType.DATETIME);

        DataSet dataSet = dataSource.getData();
        while (dataSet.hasNext()) {
            // will throw a parse exception
            dataSet.nextRow().getDate(dateColumnIndex);
        }
    }
}
/**
 * Configures row 150 of the simple test file as both header row and starting row (the last row
 * of the file, going by the test name) and checks that the resulting data set has no rows.
 */
@Test
public void testLastRowAsStartAndHeaderRow()
        throws DataSetException, ParseException, IndexOutOfBoundsException, java.text.ParseException {
    try (CSVDataSource dataSource = new CSVDataSource()) {
        dataSource.setLocation(simpleTestFile.toPath());
        dataSource.getResultSetConfiguration().setHasHeaderRow(true);
        dataSource.getResultSetConfiguration().setHeaderRow(150);
        dataSource.getResultSetConfiguration().setStartingRow(150);

        // header and start row are identical, so no data row is expected
        DataSet emptyDataSet = dataSource.getData();
        assertFalse(emptyDataSet.hasNext());
    }
}
/**
 * Asserts the expected content of the given row, starting the check at column 0.
 *
 * @param row
 *            the row to check
 * @throws ParseException
 *             if a value of the row cannot be read
 */
private void assertFirstSheetRowContent(DataSetRow row) throws ParseException {
    // no leading columns are skipped
    final int startAtFirstColumn = 0;
    assertFirstSheetRowContent(row, startAtFirstColumn);
}
/**
 * Asserts that the given row holds the values 5.1, 3.5, 1.4, 0.2, "id_1" and "Iris-setosa".
 * Of the first three values, those whose original index is smaller than {@code firstColumn} are
 * not checked, and all remaining values are looked up at their original index minus
 * {@code firstColumn}.
 *
 * @param row
 *            the row to check
 * @param firstColumn
 *            the offset that is subtracted from the original column indices
 * @throws ParseException
 *             if a value of the row cannot be read
 */
private void assertFirstSheetRowContent(DataSetRow row, int firstColumn) throws ParseException {
    final double tolerance = 1e-10;

    // the first three numeric values are only checked if they are not cut off by the offset
    if (firstColumn <= 0) {
        assertEquals(5.1, row.getDouble(0), tolerance);
    }
    if (firstColumn <= 1) {
        assertEquals(3.5, row.getDouble(1 - firstColumn), tolerance);
    }
    if (firstColumn <= 2) {
        assertEquals(1.4, row.getDouble(2 - firstColumn), tolerance);
    }

    // the remaining values are always checked
    assertEquals(0.2, row.getDouble(3 - firstColumn), tolerance);
    assertEquals("id_1", row.getString(4 - firstColumn));
    assertEquals("Iris-setosa", row.getString(5 - firstColumn));
}
/**
 * Asserts name and type of the given column meta data and that its role is {@code null}.
 *
 * @param columnMetaData
 *            the column meta data to check
 * @param name
 *            the expected column name
 * @param type
 *            the expected column type
 */
private void checkColumnMetaData(ColumnMetaData columnMetaData, String name, ColumnType type) {
    final String noRole = null;
    checkColumnMetaData(columnMetaData, name, noRole, type);
}
/**
 * Asserts name, role and type of the given column meta data, in that order.
 *
 * @param columnMetaData
 *            the column meta data to check
 * @param name
 *            the expected column name
 * @param role
 *            the expected column role, may be {@code null}
 * @param type
 *            the expected column type
 */
private void checkColumnMetaData(ColumnMetaData columnMetaData, String name, String role, ColumnType type) {
    final String actualName = columnMetaData.getName();
    assertEquals(name, actualName);

    final String actualRole = columnMetaData.getRole();
    assertEquals(role, actualRole);

    final ColumnType actualType = columnMetaData.getType();
    assertEquals(type, actualType);
}
}