/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.api.common.io; import org.apache.commons.lang3.StringUtils; import org.apache.flink.configuration.ConfigConstants; import org.apache.flink.configuration.Configuration; import org.apache.flink.core.fs.FileInputSplit; import org.apache.flink.core.fs.Path; import org.junit.After; import org.junit.Before; import org.junit.Test; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; public class DelimitedInputFormatTest { private DelimitedInputFormat<String> format; // -------------------------------------------------------------------------------------------- @Before public void setup() { format = new MyTextInputFormat(); this.format.setFilePath(new Path("file:///some/file/that/will/not/be/read")); } @After public void shutdown() throws Exception { if (this.format != null) { this.format.close(); } } // -------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------- @Test public void testConfigure() { Configuration cfg = new Configuration(); cfg.setString("delimited-format.delimiter", "\n"); format.configure(cfg); assertEquals("\n", new String(format.getDelimiter(), format.getCharset())); cfg.setString("delimited-format.delimiter", "&-&"); format.configure(cfg); assertEquals("&-&", new String(format.getDelimiter(), format.getCharset())); } @Test public void testSerialization() throws Exception { final byte[] DELIMITER = new byte[] {1, 2, 3, 4}; final int NUM_LINE_SAMPLES = 7; final int LINE_LENGTH_LIMIT = 12345; final int BUFFER_SIZE = 178; DelimitedInputFormat<String> format = new MyTextInputFormat(); format.setDelimiter(DELIMITER); format.setNumLineSamples(NUM_LINE_SAMPLES); format.setLineLengthLimit(LINE_LENGTH_LIMIT); format.setBufferSize(BUFFER_SIZE); ByteArrayOutputStream baos = new ByteArrayOutputStream(4096); ObjectOutputStream oos = new ObjectOutputStream(baos); oos.writeObject(format); oos.flush(); oos.close(); ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(baos.toByteArray())); @SuppressWarnings("unchecked") DelimitedInputFormat<String> deserialized = (DelimitedInputFormat<String>) ois.readObject(); assertEquals(NUM_LINE_SAMPLES, deserialized.getNumLineSamples()); assertEquals(LINE_LENGTH_LIMIT, deserialized.getLineLengthLimit()); assertEquals(BUFFER_SIZE, deserialized.getBufferSize()); assertArrayEquals(DELIMITER, deserialized.getDelimiter()); } @Test public void testOpen() throws IOException { final String myString = "my mocked line 1\nmy mocked line 2\n"; final FileInputSplit split = createTempFile(myString); int bufferSize = 5; format.setBufferSize(bufferSize); format.open(split); assertEquals(0, format.splitStart); assertEquals(myString.length() - bufferSize, format.splitLength); assertEquals(bufferSize, format.getBufferSize()); } @Test public void testReadWithoutTrailingDelimiter() throws IOException { // 2. test case final String myString = "my key|my val$$$my key2\n$$ctd.$$|my value2"; final FileInputSplit split = createTempFile(myString); final Configuration parameters = new Configuration(); // default delimiter = '\n' format.configure(parameters); format.open(split); String first = format.nextRecord(null); String second = format.nextRecord(null); assertNotNull(first); assertNotNull(second); assertEquals("my key|my val$$$my key2", first); assertEquals("$$ctd.$$|my value2", second); assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); } @Test public void testReadWithTrailingDelimiter() throws IOException { // 2. test case final String myString = "my key|my val$$$my key2\n$$ctd.$$|my value2\n"; final FileInputSplit split = createTempFile(myString); final Configuration parameters = new Configuration(); // default delimiter = '\n' format.configure(parameters); format.open(split); String first = format.nextRecord(null); String second = format.nextRecord(null); assertNotNull(first); assertNotNull(second); assertEquals("my key|my val$$$my key2", first); assertEquals("$$ctd.$$|my value2", second); assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); } @Test public void testReadCustomDelimiter() throws IOException { final String myString = "my key|my val$$$my key2\n$$ctd.$$|my value2"; final FileInputSplit split = createTempFile(myString); final Configuration parameters = new Configuration(); format.setDelimiter("$$$"); format.configure(parameters); format.open(split); String first = format.nextRecord(null); assertNotNull(first); assertEquals("my key|my val", first); String second = format.nextRecord(null); assertNotNull(second); assertEquals("my key2\n$$ctd.$$|my value2", second); assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); } @Test public void testMultiCharDelimiter() throws IOException { final String myString = "www112xx1123yyy11123zzzzz1123"; final FileInputSplit split = createTempFile(myString); final Configuration parameters = new Configuration(); format.setDelimiter("1123"); format.configure(parameters); format.open(split); String first = format.nextRecord(null); assertNotNull(first); assertEquals("www112xx", first); String second = format.nextRecord(null); assertNotNull(second); assertEquals("yyy1", second); String third = format.nextRecord(null); assertNotNull(third); assertEquals("zzzzz", third); assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); } @Test public void testReadCustomDelimiterWithCharset() throws IOException { // Unicode row fragments String[] records = new String[]{"\u020e\u021f\u05c0\u020b\u020f", "Apache", "\nFlink", "\u0000", "\u05c0"}; // Unicode delimiter String delimiter = "\u05c0\u05c0"; String fileContent = StringUtils.join(records, delimiter); for (final String charset : new String[]{ "UTF-8", "UTF-16BE", "UTF-16LE" }) { // use charset when instantiating the record String DelimitedInputFormat<String> format = new DelimitedInputFormat<String>() { @Override public String readRecord(String reuse, byte[] bytes, int offset, int numBytes) throws IOException { return new String(bytes, offset, numBytes, charset); } }; format.setFilePath("file:///some/file/that/will/not/be/read"); final FileInputSplit split = createTempFile(fileContent, charset); format.setDelimiter(delimiter); // use the same encoding to parse the file as used to read the file; // the delimiter is reinterpreted when the charset is set format.setCharset(charset); format.configure(new Configuration()); format.open(split); for (String record : records) { String value = format.nextRecord(null); assertEquals(record, value); } assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); } } /** * Tests that the records are read correctly when the split boundary is in the middle of a record. */ @Test public void testReadOverSplitBoundariesUnaligned() throws IOException { final String myString = "value1\nvalue2\nvalue3"; final FileInputSplit split = createTempFile(myString); FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, split.getLength() / 2, split.getHostnames()); FileInputSplit split2 = new FileInputSplit(1, split.getPath(), split1.getLength(), split.getLength(), split.getHostnames()); final Configuration parameters = new Configuration(); format.configure(parameters); format.open(split1); assertEquals("value1", format.nextRecord(null)); assertEquals("value2", format.nextRecord(null)); assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); format.close(); format.open(split2); assertEquals("value3", format.nextRecord(null)); assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); format.close(); } /** * Tests that the correct number of records is read when the split boundary is exact at the record boundary. */ @Test public void testReadWithBufferSizeIsMultiple() throws IOException { final String myString = "aaaaaaa\nbbbbbbb\nccccccc\nddddddd\n"; final FileInputSplit split = createTempFile(myString); FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, split.getLength() / 2, split.getHostnames()); FileInputSplit split2 = new FileInputSplit(1, split.getPath(), split1.getLength(), split.getLength(), split.getHostnames()); final Configuration parameters = new Configuration(); format.setBufferSize(2 * ((int) split1.getLength())); format.configure(parameters); String next; int count = 0; // read split 1 format.open(split1); while ((next = format.nextRecord(null)) != null) { assertEquals(7, next.length()); count++; } assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); format.close(); // this one must have read one too many, because the next split will skipp the trailing remainder // which happens to be one full record assertEquals(3, count); // read split 2 format.open(split2); while ((next = format.nextRecord(null)) != null) { assertEquals(7, next.length()); count++; } format.close(); assertEquals(4, count); } @Test public void testReadExactlyBufferSize() throws IOException { final String myString = "aaaaaaa\nbbbbbbb\nccccccc\nddddddd\n"; final FileInputSplit split = createTempFile(myString); final Configuration parameters = new Configuration(); format.setBufferSize((int) split.getLength()); format.configure(parameters); format.open(split); String next; int count = 0; while ((next = format.nextRecord(null)) != null) { assertEquals(7, next.length()); count++; } assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); format.close(); assertEquals(4, count); } @Test public void testReadRecordsLargerThanBuffer() throws IOException { final String myString = "aaaaaaaaaaaaaaaaaaaaa\n" + "bbbbbbbbbbbbbbbbbbbbbbbbb\n" + "ccccccccccccccccccc\n" + "ddddddddddddddddddddddddddddddddddd\n"; final FileInputSplit split = createTempFile(myString); FileInputSplit split1 = new FileInputSplit(0, split.getPath(), 0, split.getLength() / 2, split.getHostnames()); FileInputSplit split2 = new FileInputSplit(1, split.getPath(), split1.getLength(), split.getLength(), split.getHostnames()); final Configuration parameters = new Configuration(); format.setBufferSize(8); format.configure(parameters); String next; List<String> result = new ArrayList<String>(); format.open(split1); while ((next = format.nextRecord(null)) != null) { result.add(next); } assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); format.close(); format.open(split2); while ((next = format.nextRecord(null)) != null) { result.add(next); } assertNull(format.nextRecord(null)); assertTrue(format.reachedEnd()); format.close(); assertEquals(4, result.size()); assertEquals(Arrays.asList(myString.split("\n")), result); } static FileInputSplit createTempFile(String contents) throws IOException { File tempFile = File.createTempFile("test_contents", "tmp"); tempFile.deleteOnExit(); try (Writer out = new OutputStreamWriter(new FileOutputStream(tempFile))) { out.write(contents); } return new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(), new String[] {"localhost"}); } static FileInputSplit createTempFile(String contents, String charset) throws IOException { File tempFile = File.createTempFile("test_contents", "tmp"); tempFile.deleteOnExit(); try (Writer out = new OutputStreamWriter(new FileOutputStream(tempFile), charset)) { out.write(contents); } return new FileInputSplit(0, new Path(tempFile.toURI().toString()), 0, tempFile.length(), new String[] {"localhost"}); } protected static final class MyTextInputFormat extends DelimitedInputFormat<String> { private static final long serialVersionUID = 1L; @Override public String readRecord(String reuse, byte[] bytes, int offset, int numBytes) { return new String(bytes, offset, numBytes, ConfigConstants.DEFAULT_CHARSET); } } }