/*
 * Copyright 2013 Cloudera Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.kitesdk.data.spi.filesystem;

import org.kitesdk.data.DatasetDescriptor;
import org.kitesdk.data.DatasetReader;
import org.kitesdk.data.DatasetRecordException;
import org.kitesdk.data.LocalFileSystem;
import org.kitesdk.data.TestDatasetReaders;
import org.kitesdk.data.TestHelpers;
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;

import java.io.IOException;
import org.kitesdk.data.spi.DataModelUtil;

/**
 * Tests for {@link CSVFileReader}.
 *
 * <p>The fixture files are written once per class by {@link #createCSVFiles()}.
 * Every test that initializes a reader closes it in a {@code finally} block so
 * that a failing assertion does not leave the underlying file open.
 */
public class TestCSVFileReader extends TestDatasetReaders<GenericData.Record> {

  /*
   * OpenCSV notes:
   * - An empty unquoted field is passed as an empty string
   */

  /** Header-less CSV fixture; mixes CRLF and LF line endings and quoted fields. */
  public static final String CSV_CONTENT = (
      "str,34,2.11,false\r\n" +
      "\"str,2\",,4,true\n" +
      "str3,\"\",null\n" +
      "str4,,,");

  /** Same data as {@link #CSV_CONTENT} with a header row and a different column order. */
  public static final String REORDERED_CSV_CONTENT = (
      "myBool,myFloat,myInt,myStr,ignored\n" +
      "false,2.11,34,str,68\r\n" +
      "true,4,,\"str,2\"\n" +
      "true,null,\"\",str3\n" +
      ",,,str4\n");

  /** Fixture read by the inherited reader tests; three data rows after the header. */
  public static final String VALIDATOR_CSV_CONTENT =
      "id,string,even\n" +
      "0,a,true\n" +
      "1,b\n" +
      "2,c,true\n";

  /** Tab-delimited fixture whose first line is a header terminated by a bare CR. */
  public static final String TSV_CONTENT = (
      "string\tinteger\tfloat\tbool\r" +
      "str\t34\t2.11\tfalse\r\n" +
      "\"str\t2\"\t\t4\ttrue\n" +
      "str3\t\"\"\tnull");

  // Assigned in createCSVFiles(), so these cannot be final.
  public static FileSystem localfs = null;
  public static Path csvFile = null;
  public static Path reorderedFile = null;
  public static Path validatorFile = null;
  public static Path tsvFile = null;

  /** Four string columns; only the last one has a default. */
  public static final Schema STRINGS = SchemaBuilder.record("Strings")
      .fields()
      .name("string1").type().stringType().noDefault()
      .name("string2").type().stringType().noDefault()
      .name("string3").type().stringType().noDefault()
      .name("string4").type().stringType().stringDefault("missing value")
      .endRecord();

  /** Schema matching the header of {@link #VALIDATOR_CSV_CONTENT}. */
  public static final Schema VALIDATOR_SCHEMA = SchemaBuilder.record("Validator")
      .fields()
      .name("id").type().intType().noDefault()
      .name("string").type().stringType().noDefault()
      .name("even").type().booleanType().booleanDefault(false)
      .endRecord();

  /** Same fields as {@link #SCHEMA}, but named after {@code TestBean}. */
  public static final Schema BEAN_SCHEMA = SchemaBuilder.record(TestBean.class.getName())
      .fields()
      .name("myStr").type().stringType().noDefault()
      .name("myInt").type().intType().intDefault(0)
      .name("myFloat").type().floatType().noDefault()
      .name("myBool").type().booleanType().booleanDefault(false)
      .endRecord();

  /** Typed schema for the CSV fixtures: string, int, float, boolean. */
  public static final Schema SCHEMA = SchemaBuilder.record("Normal")
      .fields()
      .name("myStr").type().stringType().noDefault()
      .name("myInt").type().intType().intDefault(0)
      .name("myFloat").type().floatType().noDefault()
      .name("myBool").type().booleanType().booleanDefault(false)
      .endRecord();

  /** Like {@link #SCHEMA}, but declares the third column as int so "2.11" cannot be read. */
  public static final Schema TYPE_ERROR_SCHEMA = SchemaBuilder.record("Normal")
      .fields()
      .name("myString").type().stringType().noDefault()
      .name("myInt").type().intType().intDefault(0)
      .name("myFloat").type().intType().intDefault(34)
      .name("myBool").type().booleanType().booleanDefault(false)
      .endRecord();

  /**
   * Writes the four fixture files under {@code target/}, overwriting any
   * existing copies.
   *
   * @throws IOException if a fixture file cannot be created or written
   */
  @BeforeClass
  public static void createCSVFiles() throws IOException {
    localfs = LocalFileSystem.getInstance();
    csvFile = new Path("target/temp.csv");
    reorderedFile = new Path("target/reordered.csv");
    tsvFile = new Path("target/temp.tsv");
    validatorFile = new Path("target/validator.csv");

    writeFile(csvFile, CSV_CONTENT);
    writeFile(reorderedFile, REORDERED_CSV_CONTENT);
    writeFile(validatorFile, VALIDATOR_CSV_CONTENT);
    writeFile(tsvFile, TSV_CONTENT);
  }

  /** Writes {@code content} to {@code path} on {@link #localfs}, always closing the stream. */
  private static void writeFile(Path path, String content) throws IOException {
    final FSDataOutputStream out = localfs.create(path, true);
    try {
      out.writeBytes(content);
    } finally {
      // Close even when the write fails so the file handle is not leaked.
      out.close();
    }
  }

  /** Builds (but does not initialize) a reader over {@code path} producing {@code type}. */
  private static <E> CSVFileReader<E> newCsvReader(
      Path path, DatasetDescriptor desc, Class<E> type) {
    return new CSVFileReader<E>(localfs, path, desc,
        DataModelUtil.accessor(type, desc.getSchema()));
  }

  /**
   * Asserts that the reader has another record and that its fields, by
   * position, equal {@code expected}.
   */
  private static void assertNextRecord(
      DatasetReader<GenericData.Record> reader, Object... expected) {
    Assert.assertTrue(reader.hasNext());
    final GenericData.Record rec = reader.next();
    for (int i = 0; i < expected.length; i++) {
      Assert.assertEquals(expected[i], rec.get(i));
    }
  }

  /**
   * Asserts that the reader has another record and that reading it throws
   * {@link DatasetRecordException}.
   */
  private static void assertNextFails(String message, final DatasetReader<?> reader) {
    Assert.assertTrue(reader.hasNext());
    TestHelpers.assertThrows(message,
        DatasetRecordException.class, new Runnable() {
          @Override
          public void run() {
            reader.next();
          }
        });
  }

  /** Returns an uninitialized reader over the validator fixture, using its header row. */
  @Override
  public DatasetReader<GenericData.Record> newReader() throws IOException {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("kite.csv.has-header", "true")
        .schema(VALIDATOR_SCHEMA)
        .build();
    return newCsvReader(validatorFile, desc, GenericData.Record.class);
  }

  /** Number of data rows in {@link #VALIDATOR_CSV_CONTENT}. */
  @Override
  public int getTotalRecords() {
    return 3;
  }

  /**
   * Returns a validator that checks each record's {@code id}, {@code string}
   * and {@code even} fields against the record's position.
   */
  @Override
  public DatasetTestUtilities.RecordValidator<GenericData.Record> getValidator() {
    return new DatasetTestUtilities.RecordValidator<GenericData.Record>() {
      private static final String chars = "abcdef";

      @Override
      public void validate(GenericData.Record record, int recordNum) {
        Assert.assertEquals(recordNum, record.get("id"));
        Assert.assertEquals(Character.toString(chars.charAt(recordNum)), record.get("string"));
        // Row "1,b" has no "even" column; the schema default for it is false.
        Assert.assertEquals((recordNum % 2) == 0, record.get("even"));
      }
    };
  }

  /** A non-record (array) schema must be rejected before any data is read. */
  @Test(expected = IllegalArgumentException.class)
  public void testRejectsNonRecordSchemas() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(SchemaBuilder.array().items().stringType())
        .build();
    newCsvReader(csvFile, desc, GenericData.Record.class);
  }

  /** With an all-string schema every field is returned as its raw text. */
  @Test
  public void testStringSchema() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(STRINGS)
        .build();
    final CSVFileReader<GenericData.Record> reader =
        newCsvReader(csvFile, desc, GenericData.Record.class);

    try {
      reader.initialize();

      assertNextRecord(reader, "str", "34", "2.11", "false");
      assertNextRecord(reader, "str,2", "", "4", "true");
      // Third row has only three fields, so the last one takes the schema default.
      assertNextRecord(reader, "str3", "", "null", "missing value");
      assertNextRecord(reader, "str4", "", "", "");
      Assert.assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  /** Reads the TSV fixture using the {@code kite.csv.*} property names. */
  @Test
  public void testTSV() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("kite.csv.delimiter", "\t")
        .property("kite.csv.lines-to-skip", "1")
        .schema(STRINGS)
        .build();
    assertTsvContent(desc);
  }

  /** Reads the TSV fixture using the legacy {@code cdk.csv.*} property names. */
  @Test
  public void testTSVWithDeprecatedProperties() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("cdk.csv.delimiter", "\t")
        .property("cdk.csv.lines-to-skip", "1")
        .schema(STRINGS)
        .build();
    assertTsvContent(desc);
  }

  /** Opens the TSV fixture with {@code desc} and checks its three data rows. */
  private static void assertTsvContent(DatasetDescriptor desc) {
    final CSVFileReader<GenericData.Record> reader =
        newCsvReader(tsvFile, desc, GenericData.Record.class);

    try {
      reader.initialize();

      assertNextRecord(reader, "str", "34", "2.11", "false");
      assertNextRecord(reader, "str\t2", "", "4", "true");
      assertNextRecord(reader, "str3", "", "null", "missing value");
      Assert.assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  /** Typed schema over the header-less CSV fixture. */
  @Test
  public void testNormalSchema() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(SCHEMA)
        .build();
    assertNormalRecords(newCsvReader(csvFile, desc, GenericData.Record.class));
  }

  /** A float value in a column declared as int must fail on read. */
  @Test
  public void testBadNumericSchema() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(TYPE_ERROR_SCHEMA)
        .build();
    final CSVFileReader<GenericData.Record> reader =
        newCsvReader(csvFile, desc, GenericData.Record.class);

    try {
      reader.initialize();

      assertNextFails("Should reject float value for integer schema", reader);
    } finally {
      reader.close();
    }
  }

  /** Typed schema over the reordered fixture, with columns matched via the header row. */
  @Test
  public void testNormalSchemaWithReorderedContent() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("kite.csv.has-header", "true")
        .schema(SCHEMA)
        .build();
    assertNormalRecords(newCsvReader(reorderedFile, desc, GenericData.Record.class));
  }

  /**
   * Initializes {@code reader}, checks the two readable rows and the two
   * failing rows expected under {@link #SCHEMA}, then closes the reader.
   */
  private static void assertNormalRecords(CSVFileReader<GenericData.Record> reader) {
    try {
      reader.initialize();

      assertNextRecord(reader, "str", 34, 2.11f, false);
      // Empty int field falls back to the schema default of 0.
      assertNextRecord(reader, "str,2", 0, 4.0f, true);
      assertNextFails("Should complain about null as a number", reader);
      assertNextFails("Should complain about missing default", reader);
      Assert.assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  /** Reflect ({@code TestBean}) records over the reordered fixture with a header row. */
  @Test
  public void testReflectNormalSchemaWithReorderedContent() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .property("kite.csv.has-header", "true")
        .schema(BEAN_SCHEMA)
        .build();
    assertBeanRecords(newCsvReader(reorderedFile, desc, TestBean.class));
  }

  /** Reflect ({@code TestBean}) records over the header-less CSV fixture. */
  @Test
  public void testReflectedRecords() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(BEAN_SCHEMA)
        .build();
    assertBeanRecords(newCsvReader(csvFile, desc, TestBean.class));
  }

  /**
   * Initializes {@code reader}, checks the rows expected when reading into
   * {@code TestBean}, then closes the reader.
   */
  private static void assertBeanRecords(CSVFileReader<TestBean> reader) {
    try {
      reader.initialize();

      Assert.assertTrue(reader.hasNext());
      TestBean bean = reader.next();
      Assert.assertEquals("str", bean.myStr);
      Assert.assertEquals((Integer) 34, bean.myInt);
      Assert.assertEquals((Float) 2.11f, bean.myFloat);
      Assert.assertEquals(false, bean.myBool);

      Assert.assertTrue(reader.hasNext());
      bean = reader.next();
      Assert.assertEquals("str,2", bean.myStr);
      Assert.assertEquals(null, bean.myInt);
      Assert.assertEquals((Float) 4.0f, bean.myFloat);
      Assert.assertEquals(true, bean.myBool);

      assertNextFails("Should complain about null as a number", reader);

      Assert.assertTrue(reader.hasNext());
      bean = reader.next();
      Assert.assertEquals("str4", bean.myStr);
      Assert.assertEquals(null, bean.myInt);
      // null because the read schema from ReflectData.AllowNull permits it
      Assert.assertEquals(null, bean.myFloat);
      // not null because there is a value present in the data, which is false
      // when converted with Boolean.valueOf
      Assert.assertEquals(false, bean.myBool);

      Assert.assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }

  /** Typed schema over the CSV fixture, materialized as {@code TestGenericRecord}. */
  @Test
  public void testCustomGenericRecords() {
    final DatasetDescriptor desc = new DatasetDescriptor.Builder()
        .schema(SCHEMA)
        .build();
    final CSVFileReader<TestGenericRecord> reader =
        newCsvReader(csvFile, desc, TestGenericRecord.class);

    try {
      reader.initialize();

      Assert.assertTrue(reader.hasNext());
      TestGenericRecord record = reader.next();
      Assert.assertEquals("str", record.get(0));
      Assert.assertEquals((Integer) 34, record.get(1));
      Assert.assertEquals((Float) 2.11f, record.get(2));
      Assert.assertEquals(false, record.get(3));

      Assert.assertTrue(reader.hasNext());
      record = reader.next();
      Assert.assertEquals("str,2", record.get(0));
      Assert.assertEquals((Integer) 0, record.get(1));
      Assert.assertEquals((Float) 4.0f, record.get(2));
      Assert.assertEquals(true, record.get(3));

      assertNextFails("Should complain about null as a number", reader);
      assertNextFails("Should complain about missing default", reader);
      Assert.assertFalse(reader.hasNext());
    } finally {
      reader.close();
    }
  }
}