/***********************************************************************************************************************
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 **********************************************************************************************************************/

package eu.stratosphere.api.java.record.io;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;

import junit.framework.Assert;

import org.apache.log4j.Level;
import org.junit.After;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

import eu.stratosphere.configuration.Configuration;
import eu.stratosphere.configuration.IllegalConfigurationException;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.Path;
import eu.stratosphere.types.IntValue;
import eu.stratosphere.types.Record;
import eu.stratosphere.types.StringValue;
import eu.stratosphere.util.LogUtils;

/**
 * Tests for {@link CsvInputFormat}: configuration validation, reading all / the first n / a sparse selection of
 * columns into {@link Record}s, and handling of line delimiters with a carriage return.
 */
public class CsvInputFormatTest {

	/** The temporary input file written by {@link #createTempFile(String)}; deleted again in {@link #setdown()}. */
	protected File tempFile;

	/** The format instance shared by all tests that read from {@link #tempFile}. */
	private final CsvInputFormat format = new CsvInputFormat();

	// Static variables for testing the removal of \r\n to \n
	private static final String FIRST_PART = "That is the first part";

	private static final String SECOND_PART = "That is the second part";

	// --------------------------------------------------------------------------------------------
	//  Fixture
	// --------------------------------------------------------------------------------------------

	/**
	 * Initializes the default console logger at level WARN once for the whole test class.
	 */
	@BeforeClass
	public static void initialize() {
		LogUtils.initializeDefaultConsoleLogger(Level.WARN);
	}

	/**
	 * Sets a placeholder file path on the shared format. The tests that actually read data open an explicit
	 * {@link FileInputSplit} instead, so this path is never read.
	 */
	@Before
	public void setup() {
		format.setFilePath("file:///some/file/that/will/not/be/read");
	}

	/**
	 * Closes the shared format and removes the temporary input file, if one was created.
	 *
	 * @throws Exception Forwarded from closing the format.
	 */
	@After
	public void setdown() throws Exception {
		try {
			this.format.close();
		} finally {
			// delete the file even when closing the format failed
			if (this.tempFile != null) {
				this.tempFile.delete();
			}
		}
	}

	// --------------------------------------------------------------------------------------------
	//  Tests
	// --------------------------------------------------------------------------------------------

	/**
	 * An empty configuration, plus no fields set on the format itself, is not valid and must be rejected with an
	 * {@link IllegalConfigurationException}.
	 */
	@Test
	public void testConfigureEmptyConfig() {
		try {
			format.configure(new Configuration());
		} catch (IllegalConfigurationException expected) {
			return; // okay
		} catch (Exception ex) {
			throw failure(ex);
		}

		fail("Configuring the format without any fields should have raised an IllegalConfigurationException");
	}

	/**
	 * Reads records containing empty fields, with the field types set directly on the format instance.
	 */
	@SuppressWarnings("unchecked")
	@Test
	public void readWithEmptyFieldInstanceParameters() {
		try {
			final String fileContent = "abc|def|ghijk\nabc||hhg\n|||";
			final FileInputSplit split = createTempFile(fileContent);

			final Configuration parameters = new Configuration();

			format.setFieldDelimiter('|');
			format.setFieldTypes(StringValue.class, StringValue.class, StringValue.class);

			format.configure(parameters);
			format.open(split);

			Record record = new Record();

			assertNotNull(format.nextRecord(record));
			assertStringFields(record, "abc", "def", "ghijk");

			assertNotNull(format.nextRecord(record));
			assertStringFields(record, "abc", "", "hhg");

			assertNotNull(format.nextRecord(record));
			assertStringFields(record, "", "", "");
		} catch (Exception ex) {
			throw failure(ex);
		}
	}

	/**
	 * Reads records containing empty fields, with the field types supplied through the configuration.
	 */
	@Test
	public void readWithEmptyFieldConfigParameters() {
		try {
			final String fileContent = "abc|def|ghijk\nabc||hhg\n|||";
			final FileInputSplit split = createTempFile(fileContent);

			final Configuration parameters = new Configuration();
			new CsvInputFormat.ConfigBuilder(null, parameters)
				.field(StringValue.class, 0).field(StringValue.class, 1).field(StringValue.class, 2);

			format.setFieldDelimiter('|');

			format.configure(parameters);
			format.open(split);

			Record record = new Record();

			assertNotNull(format.nextRecord(record));
			assertStringFields(record, "abc", "def", "ghijk");

			assertNotNull(format.nextRecord(record));
			assertStringFields(record, "abc", "", "hhg");

			assertNotNull(format.nextRecord(record));
			assertStringFields(record, "", "", "");
		} catch (Exception ex) {
			throw failure(ex);
		}
	}

	/**
	 * Reads all five integer columns of a two-line file and checks that the end of the split is reported afterwards.
	 */
	@Test
	public void testReadAll() throws IOException {
		try {
			final String fileContent = "111|222|333|444|555\n666|777|888|999|000|";
			final FileInputSplit split = createTempFile(fileContent);

			final Configuration parameters = new Configuration();
			new CsvInputFormat.ConfigBuilder(null, parameters)
				.fieldDelimiter('|')
				.field(IntValue.class, 0).field(IntValue.class, 1).field(IntValue.class, 2)
				.field(IntValue.class, 3).field(IntValue.class, 4);

			format.configure(parameters);
			format.open(split);

			Record record = new Record();

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 111, 222, 333, 444, 555);

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 666, 777, 888, 999, 0);

			assertNull(format.nextRecord(record));
			assertTrue(format.reachedEnd());
		} catch (Exception ex) {
			throw failure(ex);
		}
	}

	/**
	 * Configures only the first two columns and checks that the third position of the record cannot be read.
	 */
	@Test
	public void testReadFirstN() throws IOException {
		try {
			final String fileContent = "111|222|333|444|555|\n666|777|888|999|000|";
			final FileInputSplit split = createTempFile(fileContent);

			final Configuration parameters = new Configuration();
			new CsvInputFormat.ConfigBuilder(null, parameters)
				.fieldDelimiter('|')
				.field(IntValue.class, 0).field(IntValue.class, 1);

			format.configure(parameters);
			format.open(split);

			Record record = new Record();

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 111, 222);
			assertFieldNotParsed(record, 2);

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 666, 777);
			assertFieldNotParsed(record, 2);

			assertNull(format.nextRecord(record));
			assertTrue(format.reachedEnd());
		} catch (Exception ex) {
			throw failure(ex);
		}
	}

	/**
	 * Configures the file columns 0, 3 and 7 and checks that they end up in record positions 0, 1 and 2.
	 */
	@Test
	public void testReadSparse() throws IOException {
		try {
			final String fileContent =
				"111|222|333|444|555|666|777|888|999|000|\n000|999|888|777|666|555|444|333|222|111|";
			final FileInputSplit split = createTempFile(fileContent);

			final Configuration parameters = new Configuration();
			new CsvInputFormat.ConfigBuilder(null, parameters)
				.fieldDelimiter('|')
				.field(IntValue.class, 0).field(IntValue.class, 3).field(IntValue.class, 7);

			format.configure(parameters);
			format.open(split);

			Record record = new Record();

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 111, 444, 888);

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 0, 777, 333);

			assertNull(format.nextRecord(record));
			assertTrue(format.reachedEnd());
		} catch (Exception ex) {
			throw failure(ex);
		}
	}

	/**
	 * Configures the file columns 8, 1 and 3 (in that order) and checks that the record positions follow the order
	 * in which the columns were declared rather than their order in the file.
	 */
	@Test
	public void testReadSparseShufflePosition() throws IOException {
		try {
			final String fileContent =
				"111|222|333|444|555|666|777|888|999|000|\n000|999|888|777|666|555|444|333|222|111|";
			final FileInputSplit split = createTempFile(fileContent);

			final Configuration parameters = new Configuration();
			new CsvInputFormat.ConfigBuilder(null, parameters)
				.fieldDelimiter('|')
				.field(IntValue.class, 8).field(IntValue.class, 1).field(IntValue.class, 3);

			format.configure(parameters);
			format.open(split);

			Record record = new Record();

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 999, 222, 444);

			assertNotNull(format.nextRecord(record));
			assertIntFields(record, 222, 999, 777);

			assertNull(format.nextRecord(record));
			assertTrue(format.reachedEnd());
		} catch (Exception ex) {
			throw failure(ex);
		}
	}

	/**
	 * Writes the given content to a fresh temporary file, remembers it in {@link #tempFile} and returns a split
	 * that spans the whole file.
	 *
	 * @param content The text to write; only the low-order byte of each character is written.
	 * @return An input split covering the complete temporary file.
	 * @throws IOException Thrown, if the file could not be created or written.
	 */
	private FileInputSplit createTempFile(String content) throws IOException {
		this.tempFile = File.createTempFile("test_contents", "tmp");
		this.tempFile.deleteOnExit();

		DataOutputStream dos = new DataOutputStream(new FileOutputStream(tempFile));
		try {
			dos.writeBytes(content);
		} finally {
			// release the file handle even when writing failed
			dos.close();
		}

		return new FileInputSplit(0, new Path(this.tempFile.toURI().toString()), 0, this.tempFile.length(),
			new String[] {"localhost"});
	}

	/**
	 * Checks that both lines of a two-line file are returned without line ending characters, for several
	 * combinations of the line ending used in the file and the delimiter configured on the format.
	 */
	@Test
	public void testWindowsLineEndRemoval() {

		// Check typical use case -- linux file is correct and it is set up to linux (\n)
		this.testRemovingTrailingCR("\n", "\n");

		// Check typical windows case -- windows file endings and file has windows file endings set up
		this.testRemovingTrailingCR("\r\n", "\r\n");

		// Check problematic case windows file -- windows file endings (\r\n) but linux line endings (\n) set up
		this.testRemovingTrailingCR("\r\n", "\n");

		// The remaining combination -- linux file endings (\n) but windows file endings (\r\n) set up -- is
		// deliberately not checked: a specific setup for windows line endings will expect \r\n, because it has to
		// be set up explicitly and is not the standard.
	}

	/**
	 * Writes {@link #FIRST_PART} and {@link #SECOND_PART} as two lines terminated by {@code lineBreakerInFile},
	 * reads them back through a fresh {@link CsvInputFormat} whose delimiter is {@code lineBreakerSetup}, and
	 * asserts that both parts are returned exactly, i.e. without any trailing line ending characters.
	 *
	 * <p>Assertion failures propagate unchanged; any other exception fails the test with the exception attached
	 * as cause. The format and the temporary file are cleaned up in either case.
	 *
	 * @param lineBreakerInFile The line ending written after each line of the file.
	 * @param lineBreakerSetup The record delimiter configured on the input format.
	 */
	private void testRemovingTrailingCR(String lineBreakerInFile, String lineBreakerSetup) {
		final String fileContent = FIRST_PART + lineBreakerInFile + SECOND_PART + lineBreakerInFile;

		File inputFile = null;
		CsvInputFormat inputFormat = null;

		try {
			// create input file
			inputFile = File.createTempFile("CsvInputFormatTest", "tmp");
			inputFile.deleteOnExit();
			inputFile.setWritable(true);

			OutputStreamWriter wrt = new OutputStreamWriter(new FileOutputStream(inputFile));
			try {
				wrt.write(fileContent);
			} finally {
				wrt.close();
			}

			// instantiate input format
			inputFormat = new CsvInputFormat();
			Configuration parameters = new Configuration();
			new CsvInputFormat.ConfigBuilder(null, parameters)
				.field(StringValue.class, 0).filePath(inputFile.toURI().toString());

			inputFormat.configure(parameters);
			inputFormat.setDelimiter(lineBreakerSetup);

			FileInputSplit[] splits = inputFormat.createInputSplits(1);
			inputFormat.open(splits[0]);

			Record record = new Record();

			Record result = inputFormat.nextRecord(record);
			assertNotNull("Expecting to not return null", result);
			assertEquals(FIRST_PART, result.getField(0, StringValue.class).getValue());

			result = inputFormat.nextRecord(record);
			assertNotNull("Expecting to not return null", result);
			assertEquals(SECOND_PART, result.getField(0, StringValue.class).getValue());
		} catch (Exception ex) {
			// only unexpected exceptions are wrapped; assertion errors keep their own message
			throw failure(ex);
		} finally {
			if (inputFormat != null) {
				try {
					inputFormat.close();
				} catch (Exception ignored) {
					// best-effort cleanup, must not mask the actual test outcome
				}
			}
			if (inputFile != null) {
				inputFile.delete();
			}
		}
	}

	// --------------------------------------------------------------------------------------------
	//  Assertion utilities
	// --------------------------------------------------------------------------------------------

	/**
	 * Asserts that the leading positions of the record hold the given string values, in order.
	 *
	 * @param record The record to inspect.
	 * @param expected The expected values of the positions {@code 0 .. expected.length - 1}.
	 */
	private static void assertStringFields(Record record, String... expected) {
		for (int i = 0; i < expected.length; i++) {
			assertEquals("field " + i, expected[i], record.getField(i, StringValue.class).getValue());
		}
	}

	/**
	 * Asserts that the leading positions of the record hold the given integer values, in order.
	 *
	 * @param record The record to inspect.
	 * @param expected The expected values of the positions {@code 0 .. expected.length - 1}.
	 */
	private static void assertIntFields(Record record, int... expected) {
		for (int i = 0; i < expected.length; i++) {
			assertEquals("field " + i, expected[i], record.getField(i, IntValue.class).getValue());
		}
	}

	/**
	 * Asserts that accessing the given position of the record raises an {@link IndexOutOfBoundsException}.
	 *
	 * @param record The record to inspect.
	 * @param position The position that must not be readable.
	 */
	private static void assertFieldNotParsed(Record record, int position) {
		boolean notParsed = false;
		try {
			record.getField(position, IntValue.class);
		} catch (IndexOutOfBoundsException expected) {
			notParsed = true;
		}
		assertTrue(notParsed);
	}

	/**
	 * Creates the error that fails a test because of an unexpected exception. The message has the same form the
	 * tests always reported; in addition the exception is attached as cause so that its stack trace is not lost.
	 *
	 * @param cause The unexpected exception.
	 * @return The assertion error to throw.
	 */
	private static AssertionError failure(Exception cause) {
		AssertionError error = new AssertionError(
			"Test failed due to a " + cause.getClass().getSimpleName() + ": " + cause.getMessage());
		error.initCause(cause);
		return error;
	}
}