/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.java.io;
import org.apache.flink.api.common.io.ParseException;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.SqlTimeTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.typeutils.RowTypeInfo;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.Path;
import org.apache.flink.types.Row;
import org.apache.flink.types.parser.FieldParser;
import org.apache.flink.types.parser.StringParser;
import org.junit.Ignore;
import org.junit.Test;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
import java.sql.Time;
import java.sql.Timestamp;
import java.util.HashMap;
import java.util.Map;
import static junit.framework.TestCase.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.fail;
/**
 * Tests for {@link RowCsvInputFormat}.
 *
 * <p>Each test writes its CSV content to a temporary file, opens the format on that file and
 * checks the rows returned by {@code nextRecord} field by field.
 */
public class RowCsvInputFormatTest {

    /**
     * Placeholder path handed to the format constructor. The tests that use it always open an
     * explicit {@link FileInputSplit} pointing at a temporary file.
     */
    private static final Path PATH = new Path("an/ignored/file/");

    // constants for testing the removal of \r\n to \n
    private static final String FIRST_PART = "That is the first part";
    private static final String SECOND_PART = "That is the second part";

    /**
     * Reads a file containing comment-like and malformed lines, first in strict mode (every
     * malformed line must raise a {@link ParseException}) and then in lenient mode.
     */
    @Test
    public void ignoreInvalidLines() throws Exception {
        String fileContent =
            "#description of the data\n" +
            "header1|header2|header3|\n" +
            "this is|1|2.0|\n" +
            "//a comment\n" +
            "a test|3|4.0|\n" +
            "#next|5|6.0|\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.setLenient(false);
        // the same configuration object is used for both passes
        Configuration parameters = new Configuration();
        format.configure(parameters);
        format.open(split);

        Row result = new Row(3);

        // strict mode: the two leading lines cannot be parsed
        expectParseException(format, result, "Parse Exception was not thrown! (Row too short)");
        expectParseException(format, result, "Parse Exception was not thrown! (Invalid int value)");

        result = format.nextRecord(result);
        assertRow(result, "this is", 1, 2.0);

        expectParseException(format, result, "Parse Exception was not thrown! (Row too short)");

        result = format.nextRecord(result);
        assertRow(result, "a test", 3, 4.0);

        result = format.nextRecord(result);
        assertRow(result, "#next", 5, 6.0);

        result = format.nextRecord(result);
        assertNull(result);

        // re-open with lenient = true
        format.setLenient(true);
        format.configure(parameters);
        format.open(split);

        result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, "header1", null, null);

        result = format.nextRecord(result);
        assertRow(result, "this is", 1, 2.0);

        result = format.nextRecord(result);
        assertRow(result, "a test", 3, 4.0);

        result = format.nextRecord(result);
        assertRow(result, "#next", 5, 6.0);

        result = format.nextRecord(result);
        assertNull(result);
    }

    /** Lines starting with the single-character comment prefix {@code #} must be skipped. */
    @Test
    public void ignoreSingleCharPrefixComments() throws Exception {
        String fileContent =
            "#description of the data\n" +
            "#successive commented line\n" +
            "this is|1|2.0|\n" +
            "a test|3|4.0|\n" +
            "#next|5|6.0|\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.setCommentPrefix("#");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, "this is", 1, 2.0);

        result = format.nextRecord(result);
        assertRow(result, "a test", 3, 4.0);

        result = format.nextRecord(result);
        assertNull(result);
    }

    /** Lines starting with the multi-character comment prefix {@code //} must be skipped. */
    @Test
    public void ignoreMultiCharPrefixComments() throws Exception {
        String fileContent =
            "//description of the data\n" +
            "//successive commented line\n" +
            "this is|1|2.0|\n" +
            "a test|3|4.0|\n" +
            "//next|5|6.0|\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.setCommentPrefix("//");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, "this is", 1, 2.0);

        result = format.nextRecord(result);
        assertRow(result, "a test", 3, 4.0);

        result = format.nextRecord(result);
        assertNull(result);
    }

    /** Plain string columns, including empty ones and a last line without a line delimiter. */
    @Test
    public void readStringFields() throws Exception {
        String fileContent = "abc|def|ghijk\nabc||hhg\n|||\n||";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, "abc", "def", "ghijk");

        result = format.nextRecord(result);
        assertRow(result, "abc", "", "hhg");

        result = format.nextRecord(result);
        assertRow(result, "", "", "");

        result = format.nextRecord(result);
        assertRow(result, "", "", "");

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** A mix of quoted ({@code @...@}) and unquoted string columns; quotes may hide the field delimiter. */
    @Test
    public void readMixedQuotedStringFields() throws Exception {
        String fileContent = "@a|b|c@|def|@ghijk@\nabc||@|hhg@\n|||\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.configure(new Configuration());
        format.enableQuotedStringParsing('@');
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, "a|b|c", "def", "ghijk");

        result = format.nextRecord(result);
        assertRow(result, "abc", "", "|hhg");

        result = format.nextRecord(result);
        assertRow(result, "", "", "");

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** String columns separated by the two-character delimiter {@code |-}. */
    @Test
    public void readStringFieldsWithTrailingDelimiters() throws Exception {
        String fileContent = "abc|-def|-ghijk\nabc|-|-hhg\n|-|-|-\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.setFieldDelimiter("|-");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, "abc", "def", "ghijk");

        result = format.nextRecord(result);
        assertRow(result, "abc", "", "hhg");

        result = format.nextRecord(result);
        assertRow(result, "", "", "");

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Trailing empty string columns are read as empty strings; a line with too few columns fails. */
    @Test
    public void testTailingEmptyFields() throws Exception {
        String fileContent =
            "abc|-def|-ghijk\n" +
            "abc|-def|-\n" +
            "abc|-|-\n" +
            "|-|-|-\n" +
            "|-|-\n" +
            "abc|-def\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.setFieldDelimiter("|-");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, "abc", "def", "ghijk");

        result = format.nextRecord(result);
        assertRow(result, "abc", "def", "");

        result = format.nextRecord(result);
        assertRow(result, "abc", "", "");

        result = format.nextRecord(result);
        assertRow(result, "", "", "");

        result = format.nextRecord(result);
        assertRow(result, "", "", "");

        // the last line only has two columns
        expectParseException(format, result, "Parse Exception was not thrown! (Row too short)");
    }

    /** Integer columns, with and without a trailing field delimiter at the end of the line. */
    @Test
    public void testIntegerFields() throws Exception {
        String fileContent = "111|222|333|444|555\n666|777|888|999|000|\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, "\n", "|");
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(5);

        result = format.nextRecord(result);
        assertRow(result, 111, 222, 333, 444, 555);

        result = format.nextRecord(result);
        assertRow(result, 666, 777, 888, 999, 0);

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Empty columns of every basic type, read with the boolean constructor flag set to {@code true}. */
    @Test
    public void testEmptyFields() throws Exception {
        String fileContent =
            ",,,,,,,,\n" +
            ",,,,,,,\n" +
            ",,,,,,,,\n" +
            ",,,,,,,\n" +
            ",,,,,,,,\n" +
            ",,,,,,,,\n" +
            ",,,,,,,\n" +
            ",,,,,,,,\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.BOOLEAN_TYPE_INFO,
            BasicTypeInfo.BYTE_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO,
            BasicTypeInfo.FLOAT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.LONG_TYPE_INFO,
            BasicTypeInfo.SHORT_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes, true);
        format.setFieldDelimiter(",");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(8);
        // There are as many lines as field types; line i is used to check field i.
        int linesCnt = fileContent.split("\n").length;
        for (int i = 0; i < linesCnt; i++) {
            result = format.nextRecord(result);
            assertNotNull("Missing row " + i, result);
            assertNull(result.getField(i));
        }

        // ensure no more rows
        assertNull(format.nextRecord(result));
        assertTrue(format.reachedEnd());
    }

    /** Double columns, with and without a trailing field delimiter at the end of the line. */
    @Test
    public void testDoubleFields() throws Exception {
        String fileContent = "11.1|22.2|33.3|44.4|55.5\n66.6|77.7|88.8|99.9|00.0|\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.DOUBLE_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes);
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(5);

        result = format.nextRecord(result);
        assertRow(result, 11.1, 22.2, 33.3, 44.4, 55.5);

        result = format.nextRecord(result);
        assertRow(result, 66.6, 77.7, 88.8, 99.9, 0.0);

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Only the first two of five columns are declared; the remaining columns are not read. */
    @Test
    public void testReadFirstN() throws Exception {
        String fileContent = "111|222|333|444|555|\n666|777|888|999|000|\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes);
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(2);

        result = format.nextRecord(result);
        assertRow(result, 111, 222);

        result = format.nextRecord(result);
        assertRow(result, 666, 777);

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Selects columns 0, 3 and 7 from lines that use the multi-character delimiter {@code |x|}. */
    @Test
    public void testReadSparseWithNullFieldsForTypes() throws Exception {
        String fileContent =
            "111|x|222|x|333|x|444|x|555|x|666|x|777|x|888|x|999|x|000|x|\n" +
            "000|x|999|x|888|x|777|x|666|x|555|x|444|x|333|x|222|x|111|x|";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(
            PATH,
            fieldTypes,
            new int[]{0, 3, 7});
        format.setFieldDelimiter("|x|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, 111, 444, 888);

        result = format.nextRecord(result);
        assertRow(result, 0, 777, 333);

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Selects columns 0, 3 and 7 from lines that use the single-character delimiter {@code |}. */
    @Test
    public void testReadSparseWithPositionSetter() throws Exception {
        String fileContent =
            "111|222|333|444|555|666|777|888|999|000|\n" +
            "000|999|888|777|666|555|444|333|222|111|";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(
            PATH,
            fieldTypes,
            new int[]{0, 3, 7});
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, 111, 444, 888);

        result = format.nextRecord(result);
        assertRow(result, 0, 777, 333);

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Selects columns 0, 3 and 7 from lines that use the delimiter {@code &&}. */
    @Test
    public void testReadSparseWithMask() throws Exception {
        String fileContent =
            "111&&222&&333&&444&&555&&666&&777&&888&&999&&000&&\n" +
            "000&&999&&888&&777&&666&&555&&444&&333&&222&&111&&";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(
            PATH,
            fieldTypes,
            new int[]{0, 3, 7});
        format.setFieldDelimiter("&&");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        result = format.nextRecord(result);
        assertRow(result, 111, 444, 888);

        result = format.nextRecord(result);
        assertRow(result, 0, 777, 333);

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Malformed quoted strings must make {@link StringParser} return -1 and report the matching error state. */
    @Test
    public void testParseStringErrors() throws Exception {
        StringParser stringParser = new StringParser();
        stringParser.enableQuotedStringParsing((byte) '"');

        Map<String, FieldParser.ParseErrorState> failures = new HashMap<>();
        failures.put("\"string\" trailing", FieldParser.ParseErrorState.UNQUOTED_CHARS_AFTER_QUOTED_STRING);
        failures.put("\"unterminated ", FieldParser.ParseErrorState.UNTERMINATED_QUOTED_STRING);

        for (Map.Entry<String, FieldParser.ParseErrorState> failure : failures.entrySet()) {
            // The parse limit is a byte offset, so it is taken from the encoded array
            // rather than from the character count of the string.
            byte[] input = failure.getKey().getBytes(ConfigConstants.DEFAULT_CHARSET);

            int result = stringParser.parseField(
                input,
                0,
                input.length,
                new byte[]{(byte) '|'},
                null);

            assertEquals(-1, result);
            assertEquals(failure.getValue(), stringParser.getErrorState());
        }
    }

    /** RFC 4180 style content with double-quote escaped quotes. */
    @Test
    @Ignore("Test disabled because we do not support double-quote escaped quotes right now.")
    public void testParserCorrectness() throws Exception {
        // RFC 4180 Compliance Test content
        // Taken from http://en.wikipedia.org/wiki/Comma-separated_values#Example
        String fileContent =
            "Year,Make,Model,Description,Price\n" +
            "1997,Ford,E350,\"ac, abs, moon\",3000.00\n" +
            "1999,Chevy,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00\n" +
            "1996,Jeep,Grand Cherokee,\"MUST SELL! air, moon roof, loaded\",4799.00\n" +
            "1999,Chevy,\"Venture \"\"Extended Edition, Very Large\"\"\",,5000.00\n" +
            ",,\"Venture \"\"Extended Edition\"\"\",\"\",4900.00";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.DOUBLE_TYPE_INFO};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes);
        format.setSkipFirstLineAsHeader(true);
        format.setFieldDelimiter(",");
        format.configure(new Configuration());
        format.open(split);

        Row[] expectedLines = new Row[]{
            rowOf(1997, "Ford", "E350", "ac, abs, moon", 3000.0),
            rowOf(1999, "Chevy", "Venture \"Extended Edition\"", "", 4900.0),
            rowOf(1996, "Jeep", "Grand Cherokee", "MUST SELL! air, moon roof, loaded", 4799.0),
            rowOf(1999, "Chevy", "Venture \"Extended Edition, Very Large\"", "", 5000.0),
            rowOf(0, "", "Venture \"Extended Edition\"", "", 4900.0)};

        Row result = new Row(5);
        for (Row expected : expectedLines) {
            result = format.nextRecord(result);
            assertEquals(expected, result);
        }

        assertNull(format.nextRecord(result));
        assertTrue(format.reachedEnd());
    }

    /** Checks reading of files for several combinations of file line ending and configured delimiter. */
    @Test
    public void testWindowsLineEndRemoval() throws Exception {
        // check typical use case -- linux file is correct and it is set up to linux(\n)
        testRemovingTrailingCR("\n", "\n");

        // check typical windows case -- windows file endings and file has windows file endings set up
        testRemovingTrailingCR("\r\n", "\r\n");

        // check problematic case windows file -- windows file endings(\r\n)
        // but linux line endings (\n) set up
        testRemovingTrailingCR("\r\n", "\n");

        // The remaining combination -- linux file endings (\n) but windows line endings (\r\n)
        // set up -- is not exercised here: a specific setup for windows line endings will
        // expect \r\n because it has to be set up and is not standard.
    }

    /** Quoted string parsing combined with a selection of columns 0 and 2. */
    @Test
    public void testQuotedStringParsingWithIncludeFields() throws Exception {
        String fileContent =
            "\"20:41:52-1-3-2015\"|\"Re: Taskmanager memory error in Eclipse\"|" +
            "\"Blahblah <blah@blahblah.org>\"|\"blaaa|\"blubb\"";

        File tempFile = writeTempFile("CsvReaderQuotedString", fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat inputFormat = new RowCsvInputFormat(
            new Path(tempFile.toURI().toString()),
            fieldTypes,
            new int[]{0, 2});
        inputFormat.enableQuotedStringParsing('"');
        inputFormat.setFieldDelimiter("|");
        inputFormat.setDelimiter('\n');
        inputFormat.configure(new Configuration());

        FileInputSplit[] splits = inputFormat.createInputSplits(1);
        inputFormat.open(splits[0]);

        Row record = inputFormat.nextRecord(new Row(2));
        assertEquals("20:41:52-1-3-2015", record.getField(0));
        assertEquals("Blahblah <blah@blahblah.org>", record.getField(1));
    }

    /** Backslash-escaped quotes inside quoted strings are kept verbatim in the parsed value. */
    @Test
    public void testQuotedStringParsingWithEscapedQuotes() throws Exception {
        String fileContent = "\"\\\"Hello\\\" World\"|\"We are\\\" young\"";

        File tempFile = writeTempFile("CsvReaderQuotedString", fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.STRING_TYPE_INFO,
            BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat inputFormat = new RowCsvInputFormat(new Path(tempFile.toURI().toString()), fieldTypes);
        inputFormat.enableQuotedStringParsing('"');
        inputFormat.setFieldDelimiter("|");
        inputFormat.setDelimiter('\n');
        inputFormat.configure(new Configuration());

        FileInputSplit[] splits = inputFormat.createInputSplits(1);
        inputFormat.open(splits[0]);

        Row record = inputFormat.nextRecord(new Row(2));
        assertEquals("\\\"Hello\\\" World", record.getField(0));
        assertEquals("We are\\\" young", record.getField(1));
    }

    /** {@code java.sql} date, time and timestamp columns, including non-zero-padded timestamps. */
    @Test
    public void testSqlTimeFields() throws Exception {
        String fileContent =
            "1990-10-14|02:42:25|1990-10-14 02:42:25.123|1990-1-4 2:2:5\n" +
            "1990-10-14|02:42:25|1990-10-14 02:42:25.123|1990-1-4 2:2:5.3\n";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            SqlTimeTypeInfo.DATE,
            SqlTimeTypeInfo.TIME,
            SqlTimeTypeInfo.TIMESTAMP,
            SqlTimeTypeInfo.TIMESTAMP};

        RowCsvInputFormat format = new RowCsvInputFormat(PATH, fieldTypes);
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(4);

        result = format.nextRecord(result);
        assertRow(
            result,
            Date.valueOf("1990-10-14"),
            Time.valueOf("02:42:25"),
            Timestamp.valueOf("1990-10-14 02:42:25.123"),
            Timestamp.valueOf("1990-01-04 02:02:05"));

        result = format.nextRecord(result);
        assertRow(
            result,
            Date.valueOf("1990-10-14"),
            Time.valueOf("02:42:25"),
            Timestamp.valueOf("1990-10-14 02:42:25.123"),
            Timestamp.valueOf("1990-01-04 02:02:05.3"));

        result = format.nextRecord(result);
        assertNull(result);
        assertTrue(format.reachedEnd());
    }

    /** Selected columns given in descending order (7, 3, 0) must appear in that order in the row. */
    @Test
    public void testScanOrder() throws Exception {
        String fileContent =
            // first row
            "111|222|333|444|555|666|777|888|999|000|\n" +
            // second row
            "000|999|888|777|666|555|444|333|222|111|";

        FileInputSplit split = createTempFile(fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO,
            BasicTypeInfo.INT_TYPE_INFO};

        int[] order = new int[]{7, 3, 0};
        RowCsvInputFormat format = new RowCsvInputFormat(
            PATH,
            fieldTypes,
            order);
        format.setFieldDelimiter("|");
        format.configure(new Configuration());
        format.open(split);

        Row result = new Row(3);

        // check first row
        result = format.nextRecord(result);
        assertRow(result, 888, 444, 111);

        // check second row
        result = format.nextRecord(result);
        assertRow(result, 333, 777, 0);
    }

    // ------------------------------------------------------------------------
    //  helpers
    // ------------------------------------------------------------------------

    /**
     * Asserts that {@code actual} is not null and that its leading fields equal {@code expected},
     * position by position. A {@code null} expectation requires the field to be null.
     *
     * @param actual the row returned by the format
     * @param expected the expected field values, starting at field 0
     */
    private static void assertRow(Row actual, Object... expected) {
        assertNotNull(actual);
        for (int i = 0; i < expected.length; i++) {
            assertEquals("Unexpected value in field " + i, expected[i], actual.getField(i));
        }
    }

    /**
     * Reads the next record and fails the test unless a {@link ParseException} is thrown.
     *
     * @param format the opened format to read from
     * @param reuse the row instance passed to {@code nextRecord}
     * @param failureMessage the message reported if no exception is thrown
     * @throws IOException if reading fails for a reason other than a parse error
     */
    private static void expectParseException(
            RowCsvInputFormat format, Row reuse, String failureMessage) throws IOException {
        try {
            format.nextRecord(reuse);
            fail(failureMessage);
        } catch (ParseException ignored) {
            // expected: the current line cannot be parsed
        }
    }

    /**
     * Builds a row whose fields are the given values, in order.
     *
     * @param values the field values
     * @return a new row with one field per value
     */
    private static Row rowOf(Object... values) {
        Row row = new Row(values.length);
        for (int i = 0; i < values.length; i++) {
            row.setField(i, values[i]);
        }
        return row;
    }

    /**
     * Writes {@code content} as UTF-8 to a new temporary file that is deleted when the JVM exits.
     *
     * @param prefix the temp file name prefix
     * @param content the text to write
     * @return the written file
     * @throws IOException if the file cannot be created or written
     */
    private static File writeTempFile(String prefix, String content) throws IOException {
        File tempFile = File.createTempFile(prefix, "tmp");
        tempFile.deleteOnExit();

        // try-with-resources closes the stream even if the write fails
        try (OutputStreamWriter writer =
                new OutputStreamWriter(new FileOutputStream(tempFile), StandardCharsets.UTF_8)) {
            writer.write(content);
        }
        return tempFile;
    }

    /**
     * Writes {@code content} to a temporary file and returns a split covering the whole file.
     *
     * @param content the text to write
     * @return a split starting at offset 0 and spanning the file's length
     * @throws IOException if the file cannot be created or written
     */
    private static FileInputSplit createTempFile(String content) throws IOException {
        File tempFile = writeTempFile("test_contents", content);
        return new FileInputSplit(
            0,
            new Path(tempFile.toURI().toString()),
            0,
            tempFile.length(),
            new String[]{"localhost"});
    }

    /**
     * Writes two lines terminated by {@code lineBreakerInFile}, reads them back with the line
     * delimiter set to {@code lineBreakerSetup}, and checks that both lines come back without
     * any line-ending characters.
     *
     * @param lineBreakerInFile the line ending written to the file
     * @param lineBreakerSetup the line delimiter configured on the format
     * @throws IOException if the file cannot be written or read
     */
    private static void testRemovingTrailingCR(String lineBreakerInFile, String lineBreakerSetup) throws IOException {
        String fileContent = FIRST_PART + lineBreakerInFile + SECOND_PART + lineBreakerInFile;

        // create input file
        File tempFile = writeTempFile("CsvInputFormatTest", fileContent);

        TypeInformation[] fieldTypes = new TypeInformation[]{BasicTypeInfo.STRING_TYPE_INFO};

        RowCsvInputFormat inputFormat = new RowCsvInputFormat(new Path(tempFile.toURI().toString()), fieldTypes);
        inputFormat.configure(new Configuration());
        inputFormat.setDelimiter(lineBreakerSetup);

        FileInputSplit[] splits = inputFormat.createInputSplits(1);
        inputFormat.open(splits[0]);

        Row result = inputFormat.nextRecord(new Row(1));
        assertNotNull("Expecting to not return null", result);
        assertEquals(FIRST_PART, result.getField(0));

        result = inputFormat.nextRecord(result);
        assertNotNull("Expecting to not return null", result);
        assertEquals(SECOND_PART, result.getField(0));
    }
}