/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.api.common.io;
import org.apache.flink.api.common.io.FileInputFormat.FileBaseStatistics;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.testutils.TestFileUtils;
import org.apache.flink.types.IntValue;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.util.Arrays;
import java.util.Collections;
/**
 * Tests for {@link FileInputFormat}.
 */
public class FileInputFormatTest {
// Scratch folder used by the tests below that create their own input files
// (through newFile) and then point the input format at the folder's root.
@Rule
public TemporaryFolder temporaryFolder = new TemporaryFolder();
// ------------------------------------------------------------------------
// Statistics
// ------------------------------------------------------------------------
/**
 * Requests statistics for a path that does not exist and expects
 * {@code null} to be returned instead of an exception being thrown.
 *
 * @throws Exception if the format fails unexpectedly; JUnit then reports the
 *         original exception together with its stack trace
 */
@Test
public void testGetStatisticsNonExistingFile() throws Exception {
    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath("file:///some/none/existing/directory/");
    format.configure(new Configuration());

    // no cached statistics are handed in
    BaseStatistics stats = format.getStatistics(null);
    Assert.assertNull("The file statistics should be null.", stats);
}
/**
 * Gathers statistics for a single file without a cached version and checks
 * that the reported total input size equals the size of the created file.
 *
 * @throws Exception if creating the file or gathering the statistics fails
 */
@Test
public void testGetStatisticsOneFileNoCachedVersion() throws Exception {
    final long SIZE = 1024 * 500;
    String tempFile = TestFileUtils.createTempFile(SIZE);

    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(tempFile);
    format.configure(new Configuration());

    BaseStatistics stats = format.getStatistics(null);
    // fail with a clear message instead of a NullPointerException on the next line
    Assert.assertNotNull("The file statistics should not be null.", stats);
    Assert.assertEquals("The file size from the statistics is wrong.", SIZE, stats.getTotalInputSize());
}
/**
 * Gathers statistics for a directory holding three files without a cached
 * version and checks that the reported total input size is the sum of the
 * three file sizes.
 *
 * @throws Exception if creating the files or gathering the statistics fails
 */
@Test
public void testGetStatisticsMultipleFilesNoCachedVersion() throws Exception {
    final long SIZE1 = 2077;
    final long SIZE2 = 31909;
    final long SIZE3 = 10;
    final long TOTAL = SIZE1 + SIZE2 + SIZE3;
    String tempDir = TestFileUtils.createTempFileDir(SIZE1, SIZE2, SIZE3);

    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(tempDir);
    format.configure(new Configuration());

    BaseStatistics stats = format.getStatistics(null);
    // fail with a clear message instead of a NullPointerException on the next line
    Assert.assertNotNull("The file statistics should not be null.", stats);
    Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, stats.getTotalInputSize());
}
/**
 * Checks how cached statistics are treated for a single input file:
 * <ol>
 *   <li>statistics gathered by one format instance are returned unchanged (same
 *       object) when handed to a fresh instance for the same file,</li>
 *   <li>cached statistics carrying the file's modification time are returned
 *       as they are, even though their size is fake,</li>
 *   <li>cached statistics carrying an older modification time are discarded
 *       and the real size is reported again.</li>
 * </ol>
 *
 * @throws Exception if creating the file or gathering the statistics fails
 */
@Test
public void testGetStatisticsOneFileWithCachedVersion() throws Exception {
    final long SIZE = 50873;
    final long FAKE_SIZE = 10065;
    String tempFile = TestFileUtils.createTempFile(SIZE);

    DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(tempFile);
    format.configure(new Configuration());
    FileBaseStatistics stats = format.getStatistics(null);
    Assert.assertEquals("The file size from the statistics is wrong.", SIZE, stats.getTotalInputSize());

    // hand the gathered stats to a fresh format. the very same object should come back
    format = new DummyFileInputFormat();
    format.setFilePath(tempFile);
    format.configure(new Configuration());
    FileBaseStatistics newStats = format.getStatistics(stats);
    Assert.assertSame("Statistics object was changed", stats, newStats);

    // insert fake stats with the correct modification time. the call should return the fake stats
    format = new DummyFileInputFormat();
    format.setFilePath(tempFile);
    format.configure(new Configuration());
    FileBaseStatistics fakeStats = new FileBaseStatistics(
            stats.getLastModificationTime(), FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics latest = format.getStatistics(fakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", FAKE_SIZE, latest.getTotalInputSize());

    // insert fake stats with the expired modification time. the call should return new accurate stats
    format = new DummyFileInputFormat();
    format.setFilePath(tempFile);
    format.configure(new Configuration());
    FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(
            stats.getLastModificationTime() - 1, FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", SIZE, reGathered.getTotalInputSize());
}
/**
 * Checks how cached statistics are treated for a directory with three files:
 * <ol>
 *   <li>statistics gathered by one format instance are returned unchanged (same
 *       object) when handed to a fresh instance for the same directory,</li>
 *   <li>cached statistics carrying the gathered modification time are returned
 *       as they are, even though their size is fake,</li>
 *   <li>cached statistics carrying an older modification time are discarded
 *       and the real total size is reported again.</li>
 * </ol>
 *
 * @throws Exception if creating the files or gathering the statistics fails
 */
@Test
public void testGetStatisticsMultipleFilesWithCachedVersion() throws Exception {
    final long SIZE1 = 2077;
    final long SIZE2 = 31909;
    final long SIZE3 = 10;
    final long TOTAL = SIZE1 + SIZE2 + SIZE3;
    final long FAKE_SIZE = 10065;
    String tempDir = TestFileUtils.createTempFileDir(SIZE1, SIZE2, SIZE3);

    DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(tempDir);
    format.configure(new Configuration());
    FileBaseStatistics stats = format.getStatistics(null);
    Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, stats.getTotalInputSize());

    // hand the gathered stats to a fresh format. the very same object should come back
    format = new DummyFileInputFormat();
    format.setFilePath(tempDir);
    format.configure(new Configuration());
    FileBaseStatistics newStats = format.getStatistics(stats);
    Assert.assertSame("Statistics object was changed", stats, newStats);

    // insert fake stats with the correct modification time. the call should return the fake stats
    format = new DummyFileInputFormat();
    format.setFilePath(tempDir);
    format.configure(new Configuration());
    FileBaseStatistics fakeStats = new FileBaseStatistics(
            stats.getLastModificationTime(), FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics latest = format.getStatistics(fakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", FAKE_SIZE, latest.getTotalInputSize());

    // insert fake stats with the expired modification time. the call should return new accurate stats
    format = new DummyFileInputFormat();
    format.setFilePath(tempDir);
    format.configure(new Configuration());
    FileBaseStatistics outDatedFakeStats = new FileBaseStatistics(
            stats.getLastModificationTime() - 1, FAKE_SIZE, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    BaseStatistics reGathered = format.getStatistics(outDatedFakeStats);
    Assert.assertEquals("The file size from the statistics is wrong.", TOTAL, reGathered.getTotalInputSize());
}
// ------------------------------------------------------------------------
// Unsplittable input files
// ------------------------------------------------------------------------
// ---- Tests for .deflate ---------
/**
 * Creates a directory with files carrying the .deflate extension and checks that
 * one split is created per file, each starting at offset 0 with length -1.
 * Afterwards a file with a different extension is added to the same directory and
 * the splits are checked again: the .deflate files must be unchanged, the extra
 * file must yield a split that starts at 0 and has a positive length.
 *
 * @throws Exception if creating the files or the input splits fails
 */
@Test
public void testFileInputSplit() throws Exception {
    String tempDir = TestFileUtils.createTempFileDirExtension(".deflate", "some", "stupid", "meaningless", "files");

    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(tempDir);
    format.configure(new Configuration());
    FileInputSplit[] splits = format.createInputSplits(2);
    Assert.assertEquals(4, splits.length);
    for (FileInputSplit split : splits) {
        Assert.assertEquals(-1L, split.getLength()); // unsplittable deflate files have this size as a flag for "read whole file"
        Assert.assertEquals(0L, split.getStart()); // always read from the beginning.
    }

    // test if this also works for "mixed" directories
    TestFileUtils.createTempFileInDirectory(
            tempDir.replace("file:", ""),
            "this creates a test file with a random extension (at least not .deflate)");

    final DummyFileInputFormat formatMixed = new DummyFileInputFormat();
    formatMixed.setFilePath(tempDir);
    formatMixed.configure(new Configuration());
    FileInputSplit[] splitsMixed = formatMixed.createInputSplits(2);
    Assert.assertEquals(5, splitsMixed.length);
    for (FileInputSplit split : splitsMixed) {
        if (split.getPath().getName().endsWith(".deflate")) {
            Assert.assertEquals(-1L, split.getLength()); // unsplittable deflate files have this size as a flag for "read whole file"
            Assert.assertEquals(0L, split.getStart()); // always read from the beginning.
        } else {
            Assert.assertEquals(0L, split.getStart());
            Assert.assertTrue("split size not correct", split.getLength() > 0);
        }
    }
}
// ------------------------------------------------------------------------
// Ignored Files
// ------------------------------------------------------------------------
/**
 * Puts two regular files and two files whose names start with an underscore
 * into the temporary folder and checks that input splits are created for the
 * two regular files only (in either order).
 *
 * @throws Exception if creating the files or the input splits fails
 */
@Test
public void testIgnoredUnderscoreFiles() throws Exception {
    final String contents = "CONTENTS";

    // create some accepted, some ignored files
    File child1 = temporaryFolder.newFile("dataFile1.txt");
    File child2 = temporaryFolder.newFile("another_file.bin");
    File luigiFile = temporaryFolder.newFile("_luigi");
    File success = temporaryFolder.newFile("_SUCCESS");
    createTempFiles(contents.getBytes(ConfigConstants.DEFAULT_CHARSET), child1, child2, luigiFile, success);

    // test that only the valid files are accepted
    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(temporaryFolder.getRoot().toURI().toString());
    format.configure(new Configuration());
    FileInputSplit[] splits = format.createInputSplits(1);
    Assert.assertEquals(2, splits.length);

    final URI uri1 = splits[0].getPath().toUri();
    final URI uri2 = splits[1].getPath().toUri();
    final URI childUri1 = child1.toURI();
    final URI childUri2 = child2.toURI();

    // the order of the two splits is not fixed, so accept both permutations
    Assert.assertTrue(
            "The splits do not cover exactly the two accepted files.",
            (uri1.equals(childUri1) && uri2.equals(childUri2))
                    || (uri1.equals(childUri2) && uri2.equals(childUri1)));
}
/**
 * Installs a {@link GlobFilePathFilter} that includes everything but excludes
 * {@code another_file.bin} and checks that the only input split created is the
 * one for {@code dataFile1.txt}.
 *
 * @throws Exception if creating the files or the input splits fails
 */
@Test
public void testExcludeFiles() throws Exception {
    final String contents = "CONTENTS";

    // create some accepted, some ignored files
    File child1 = temporaryFolder.newFile("dataFile1.txt");
    File child2 = temporaryFolder.newFile("another_file.bin");
    File[] files = { child1, child2 };
    createTempFiles(contents.getBytes(ConfigConstants.DEFAULT_CHARSET), files);

    // test that only the valid files are accepted
    Configuration configuration = new Configuration();
    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(temporaryFolder.getRoot().toURI().toString());
    format.configure(configuration);
    format.setFilesFilter(new GlobFilePathFilter(
            Collections.singletonList("**"),
            Collections.singletonList("**/another_file.bin")));
    FileInputSplit[] splits = format.createInputSplits(1);
    Assert.assertEquals(1, splits.length);

    final URI uri1 = splits[0].getPath().toUri();
    final URI childUri1 = child1.toURI();
    // expected value first, so a failure message reads the right way round
    Assert.assertEquals(childUri1, uri1);
}
/**
 * Installs a {@link GlobFilePathFilter} whose exclude list names both files in
 * the temporary folder and checks that no input split is created at all.
 *
 * @throws Exception if creating the files or the input splits fails
 */
@Test
public void testReadMultiplePatterns() throws Exception {
    // both files are named by one of the exclude patterns further down
    final File textFile = temporaryFolder.newFile("dataFile1.txt");
    final File binaryFile = temporaryFolder.newFile("another_file.bin");
    final byte[] payload = "CONTENTS".getBytes(ConfigConstants.DEFAULT_CHARSET);
    createTempFiles(payload, textFile, binaryFile);

    final DummyFileInputFormat inputFormat = new DummyFileInputFormat();
    final String folderUri = temporaryFolder.getRoot().toURI().toString();
    inputFormat.setFilePath(folderUri);
    inputFormat.configure(new Configuration());

    // include everything, then exclude each of the two files by name
    final GlobFilePathFilter excludeBoth = new GlobFilePathFilter(
            Collections.singletonList("**"),
            Arrays.asList("**/another_file.bin", "**/dataFile1.txt"));
    inputFormat.setFilesFilter(excludeBoth);

    final int numSplits = inputFormat.createInputSplits(1).length;
    Assert.assertEquals(0, numSplits);
}
/**
 * Puts two regular files and two files whose names start with an underscore,
 * all of the same size, into the temporary folder and checks that the total
 * input size reported by the statistics covers the two regular files only.
 *
 * @throws Exception if creating the files or gathering the statistics fails
 */
@Test
public void testGetStatsIgnoredUnderscoreFiles() throws Exception {
    final int SIZE = 2048;
    final long TOTAL = 2 * SIZE;

    // create two accepted and two ignored files
    File child1 = temporaryFolder.newFile("dataFile1.txt");
    File child2 = temporaryFolder.newFile("another_file.bin");
    File luigiFile = temporaryFolder.newFile("_luigi");
    File success = temporaryFolder.newFile("_SUCCESS");
    createTempFiles(new byte[SIZE], child1, child2, luigiFile, success);

    final DummyFileInputFormat format = new DummyFileInputFormat();
    format.setFilePath(temporaryFolder.getRoot().toURI().toString());
    format.configure(new Configuration());

    // check that only valid files are used for statistics computation
    BaseStatistics stats = format.getStatistics(null);
    Assert.assertEquals(TOTAL, stats.getTotalInputSize());
}
// ------------------------------------------------------------------------
// Stream Decoration
// ------------------------------------------------------------------------
/**
 * Writes the byte values 0..23 to a file, reads the file back split by split
 * through {@code MyDecoratedInputFormat} (which wraps the stream in an
 * {@code InvertedInputStream}) and checks that every record is the bitwise
 * inversion of the byte that was written at that position.
 *
 * <p>The running expectation {@code --prev} starts at 0, so the first expected
 * record is {@code (byte) -1}, i.e. the inversion of 0, and it continues
 * across splits. The test therefore relies on the splits being returned in
 * file order.
 *
 * @throws IOException if writing the file or reading it back fails
 */
@Test
public void testDecorateInputStream() throws IOException {
    // create temporary file with 3 blocks
    final File tempFile = File.createTempFile("input-stream-decoration-test", "tmp");
    tempFile.deleteOnExit();
    final int blockSize = 8;
    final int numBlocks = 3;

    final FileOutputStream fileOutputStream = new FileOutputStream(tempFile);
    try {
        for (int i = 0; i < blockSize * numBlocks; i++) {
            fileOutputStream.write(new byte[]{(byte) i});
        }
    } finally {
        // release the file handle even if a write fails
        fileOutputStream.close();
    }

    final Configuration config = new Configuration();
    final FileInputFormat<byte[]> inputFormat = new MyDecoratedInputFormat();
    inputFormat.setFilePath(tempFile.toURI().toString());
    inputFormat.configure(config);
    inputFormat.openInputFormat();
    try {
        FileInputSplit[] inputSplits = inputFormat.createInputSplits(3);
        byte[] bytes = null;
        byte prev = 0;
        for (FileInputSplit inputSplit : inputSplits) {
            inputFormat.open(inputSplit);
            try {
                while (!inputFormat.reachedEnd()) {
                    if ((bytes = inputFormat.nextRecord(bytes)) != null) {
                        Assert.assertArrayEquals(new byte[]{--prev}, bytes);
                    }
                }
            } finally {
                // every open(split) gets a matching close() before the next split is opened
                inputFormat.close();
            }
        }
    } finally {
        // also runs when an assertion above fails
        inputFormat.closeInputFormat();
    }
}
// ------------------------------------------------------------------------
/**
 * Writes {@code contents} into each of the given files and registers every
 * file for deletion when the JVM exits.
 *
 * @param contents the bytes written to every file
 * @param files the files to fill
 * @throws IOException if a file cannot be opened or written
 */
private void createTempFiles(byte[] contents, File... files) throws IOException {
    for (int i = 0; i < files.length; i++) {
        final File target = files[i];
        target.deleteOnExit();

        final FileOutputStream rawSink = new FileOutputStream(target);
        final BufferedOutputStream sink = new BufferedOutputStream(rawSink);
        try {
            sink.write(contents);
        } finally {
            // closing the buffered stream flushes it and closes the file stream
            sink.close();
        }
    }
}
private class DummyFileInputFormat extends FileInputFormat<IntValue> {
private static final long serialVersionUID = 1L;
@Override
public boolean reachedEnd() throws IOException {
return true;
}
@Override
public IntValue nextRecord(IntValue record) throws IOException {
return null;
}
}
/**
 * {@link FileInputFormat} that emits one single-byte record per byte of the
 * current split and decorates the opened stream with an
 * {@code InvertedInputStream}.
 */
private static final class MyDecoratedInputFormat extends FileInputFormat<byte[]> {

    private static final long serialVersionUID = 1L;

    /** The split is exhausted once the stream position reaches the split's end offset. */
    @Override
    public boolean reachedEnd() throws IOException {
        final long position = this.stream.getPos();
        return position >= this.splitStart + this.splitLength;
    }

    /**
     * Reads the next byte and returns it in a newly allocated one-element array;
     * the {@code reuse} argument is not used.
     *
     * @throws IllegalStateException if the stream is already at its end
     */
    @Override
    public byte[] nextRecord(byte[] reuse) throws IOException {
        final int next = this.stream.read();
        if (next != -1) {
            return new byte[]{(byte) next};
        }
        throw new IllegalStateException();
    }

    /** Applies the parent's decoration first and then wraps the result in the inverting stream. */
    @Override
    protected FSDataInputStream decorateInputStream(FSDataInputStream inputStream, FileInputSplit fileSplit) throws Throwable {
        final FSDataInputStream decoratedByParent = super.decorateInputStream(inputStream, fileSplit);
        final InputStream inverting = new InvertedInputStream(decoratedByParent);
        return new InputStreamFSInputWrapper(inverting);
    }
}
/**
 * Stream wrapper that returns every byte of the wrapped stream bitwise
 * inverted. End of stream ({@code -1}) is passed through unchanged.
 */
private static final class InvertedInputStream extends InputStream {

    private final InputStream originalStream;

    private InvertedInputStream(InputStream originalStream) {
        this.originalStream = originalStream;
    }

    @Override
    public int read() throws IOException {
        int read = this.originalStream.read();
        // invert the bits and mask to 0..255 so a data byte is never mistaken for end of stream
        return read == -1 ? -1 : (~read & 0xFF);
    }

    @Override
    public int available() throws IOException {
        return this.originalStream.available();
    }

    /**
     * Closes the wrapped stream. Without this override the inherited
     * {@link InputStream#close()} does nothing and the wrapped stream stays open.
     */
    @Override
    public void close() throws IOException {
        this.originalStream.close();
    }
}
}