/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
 */

package org.apache.flink.api.common.io;

import org.apache.flink.annotation.Public;
import org.apache.flink.annotation.PublicEvolving;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.BlockLocation;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.core.memory.DataInputView;
import org.apache.flink.core.memory.DataInputViewStreamWrapper;
import org.apache.flink.util.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FilterInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * Base class for all input formats that read blocks of a fixed size. The input splits are aligned
 * to these blocks, meaning that each split consists of exactly one block. Unless configured
 * otherwise, the block size equals the native block size of the underlying file system (e.g. HDFS).
 *
 * <p>Each block carries a {@link BlockInfo} at its end. There the reader finds statistics about
 * the split currently being read, which help to parse the contents of the block correctly.
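 *
 * <p>A concrete subclass only needs to implement {@link #deserialize(Object, DataInputView)}.
 * A minimal sketch (the {@code LongInputFormat} below is hypothetical, not part of Flink):
 *
 * <pre>{@code
 * public class LongInputFormat extends BinaryInputFormat<Long> {
 *     private static final long serialVersionUID = 1L;
 *
 *     protected Long deserialize(Long reuse, DataInputView dataInput) throws IOException {
 *         // each record in the block is one fixed-width long
 *         return dataInput.readLong();
 *     }
 * }
 * }</pre>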
*/
@Public
public abstract class BinaryInputFormat<T> extends FileInputFormat<T>
implements CheckpointableInputFormat<FileInputSplit, Tuple2<Long, Long>> {
private static final long serialVersionUID = 1L;
/** The log. */
private static final Logger LOG = LoggerFactory.getLogger(BinaryInputFormat.class);
    /** The config parameter which defines the block size. */
    public static final String BLOCK_SIZE_PARAMETER_KEY = "input.block_size";

    /** Sentinel value signaling that the native block size of the file system should be used. */
    public static final long NATIVE_BLOCK_SIZE = Long.MIN_VALUE;

/** The block size to use. */
private long blockSize = NATIVE_BLOCK_SIZE;
private transient DataInputViewStreamWrapper dataInputStream;
/** The BlockInfo for the Block corresponding to the split currently being read. */
private transient BlockInfo blockInfo;
/** A wrapper around the block currently being read. */
private transient BlockBasedInput blockBasedInput = null;
    /**
     * The number of records already read from the block. This is used to decide if the end of the
     * block has been reached.
     */
private long readRecords = 0;
@Override
public void configure(Configuration parameters) {
super.configure(parameters);
        // This guard prevents configure() from overwriting a block size that was
        // already set via setBlockSize().
if (this.blockSize == NATIVE_BLOCK_SIZE) {
long blockSize = parameters.getLong(BLOCK_SIZE_PARAMETER_KEY, NATIVE_BLOCK_SIZE);
setBlockSize(blockSize);
}
}
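
    /**
     * Sets the size of the blocks that this format reads and aligns its splits to. A sketch of the
     * two equivalent ways to set it (the subclass {@code MyBinaryFormat} is hypothetical, used only
     * for illustration):
     *
     * <pre>{@code
     * MyBinaryFormat format = new MyBinaryFormat();
     * format.setBlockSize(64 * 1024);                                  // programmatically, or ...
     *
     * Configuration conf = new Configuration();
     * conf.setLong(BinaryInputFormat.BLOCK_SIZE_PARAMETER_KEY, 64 * 1024);
     * format.configure(conf);                                          // ... via the configuration
     * }</pre>
     *
     * @param blockSize the block size in bytes, or {@link #NATIVE_BLOCK_SIZE} to use the file
     *     system's native block size
     */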
public void setBlockSize(long blockSize) {
if (blockSize < 1 && blockSize != NATIVE_BLOCK_SIZE) {
throw new IllegalArgumentException("The block size parameter must be set and larger than 0.");
}
if (blockSize > Integer.MAX_VALUE) {
throw new UnsupportedOperationException("Currently only block sizes up to Integer.MAX_VALUE are supported");
}
this.blockSize = blockSize;
}
public long getBlockSize() {
return this.blockSize;
}
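
    /**
     * Creates one input split per block of each input file. As a worked example (illustrative
     * numbers only): a 200 MiB file read with a 64 MiB block size yields four splits with the
     * (offset, length) pairs (0, 64 MiB), (64 MiB, 64 MiB), (128 MiB, 64 MiB), and (192 MiB, 8 MiB).
     * If fewer than {@code minNumSplits} splits result, empty splits are appended.
     */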
@Override
public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
List<FileStatus> files = this.getFiles();
final FileSystem fs = this.filePath.getFileSystem();
final long blockSize = this.blockSize == NATIVE_BLOCK_SIZE ? fs.getDefaultBlockSize() : this.blockSize;
final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);
for (FileStatus file : files) {
for (long pos = 0, length = file.getLen(); pos < length; pos += blockSize) {
long remainingLength = Math.min(pos + blockSize, length) - pos;
// get the block locations and make sure they are in order with respect to their offset
final BlockLocation[] blocks = fs.getFileBlockLocations(file, pos, remainingLength);
Arrays.sort(blocks);
inputSplits.add(new FileInputSplit(inputSplits.size(), file.getPath(), pos, remainingLength,
blocks[0].getHosts()));
}
}
if (inputSplits.size() < minNumSplits) {
LOG.warn(String.format(
"With the given block size %d, the file %s cannot be split into %d blocks. Filling up with empty splits...",
blockSize, this.filePath, minNumSplits));
FileStatus last = files.get(files.size() - 1);
final BlockLocation[] blocks = fs.getFileBlockLocations(last, 0, last.getLen());
            // continue numbering after the already created splits so that exactly minNumSplits exist
            for (int index = inputSplits.size(); index < minNumSplits; index++) {
inputSplits.add(new FileInputSplit(index, last.getPath(), last.getLen(), 0, blocks[0].getHosts()));
}
}
return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
protected List<FileStatus> getFiles() throws IOException {
// get all the files that are involved in the splits
List<FileStatus> files = new ArrayList<FileStatus>();
final FileSystem fs = this.filePath.getFileSystem();
final FileStatus pathFile = fs.getFileStatus(this.filePath);
if (pathFile.isDir()) {
// input is directory. list all contained files
final FileStatus[] partials = fs.listStatus(this.filePath);
for (FileStatus partial : partials) {
if (!partial.isDir()) {
files.add(partial);
}
}
} else {
files.add(pathFile);
}
return files;
}
@Override
public SequentialStatistics getStatistics(BaseStatistics cachedStats) {
        final FileBaseStatistics cachedFileStats =
                cachedStats instanceof FileBaseStatistics ? (FileBaseStatistics) cachedStats : null;
try {
final Path filePath = this.filePath;
// get the filesystem
final FileSystem fs = FileSystem.get(filePath.toUri());
final ArrayList<FileStatus> allFiles = new ArrayList<FileStatus>(1);
// let the file input format deal with the up-to-date check and the basic size
final FileBaseStatistics stats = getFileStats(cachedFileStats, filePath, fs, allFiles);
if (stats == null) {
return null;
}
// check whether the file stats are still sequential stats (in that case they are still valid)
if (stats instanceof SequentialStatistics) {
return (SequentialStatistics) stats;
}
return createStatistics(allFiles, stats);
} catch (IOException ioex) {
if (LOG.isWarnEnabled()) {
LOG.warn(
String.format("Could not determine complete statistics for file '%s' due to an I/O error",
this.filePath),
ioex);
}
} catch (Throwable t) {
if (LOG.isErrorEnabled()) {
LOG.error(
String.format("Unexpected problem while getting the file statistics for file '%s'",
this.filePath),
t);
}
}
// no stats available
return null;
}
protected FileInputSplit[] getInputSplits() throws IOException {
return this.createInputSplits(0);
}
public BlockInfo createBlockInfo() {
return new BlockInfo();
}
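
    /*
     * On-disk layout of a single block. The trailing BlockInfo currently holds three longs:
     * recordCount, accumulatedRecordCount, and firstRecordStart.
     *
     *   | tail of a record spilled from the previous block | records ... | BlockInfo |
     *   ^ block start                                      ^ firstRecordStart
     */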
private BlockInfo createAndReadBlockInfo() throws IOException {
BlockInfo blockInfo = new BlockInfo();
if (this.splitLength > blockInfo.getInfoSize()) {
            // First read the BlockInfo, which holds the recordCount, the accumulatedRecordCount,
            // and the firstRecordStart offset of the current block. It is written at the end of
            // the block and has a fixed size, currently three longs.
            // TODO: seek() is not supported by compressed streams and will throw an exception there.
this.stream.seek(this.splitStart + this.splitLength - blockInfo.getInfoSize());
blockInfo.read(new DataInputViewStreamWrapper(this.stream));
}
return blockInfo;
}
    /**
     * Fill in the statistics. The last modification time and the total input size are pre-filled.
     *
     * @param files The files that are associated with this block input format.
     * @param stats The pre-filled statistics.
     */
protected SequentialStatistics createStatistics(List<FileStatus> files, FileBaseStatistics stats)
throws IOException {
if (files.isEmpty()) {
return null;
}
BlockInfo blockInfo = new BlockInfo();
long totalCount = 0;
for (FileStatus file : files) {
// invalid file
if (file.getLen() < blockInfo.getInfoSize()) {
continue;
}
FileSystem fs = file.getPath().getFileSystem();
try (FSDataInputStream fdis = fs.open(file.getPath(), blockInfo.getInfoSize())) {
fdis.seek(file.getLen() - blockInfo.getInfoSize());
blockInfo.read(new DataInputViewStreamWrapper(fdis));
totalCount += blockInfo.getAccumulatedRecordCount();
}
}
final float avgWidth = totalCount == 0 ? 0 : ((float) stats.getTotalInputSize() / totalCount);
return new SequentialStatistics(stats.getLastModificationTime(), stats.getTotalInputSize(), avgWidth,
totalCount);
}
private static class SequentialStatistics extends FileBaseStatistics {
private final long numberOfRecords;
public SequentialStatistics(long fileModTime, long fileSize, float avgBytesPerRecord, long numberOfRecords) {
super(fileModTime, fileSize, avgBytesPerRecord);
this.numberOfRecords = numberOfRecords;
}
@Override
public long getNumberOfRecords() {
return this.numberOfRecords;
}
}
@Override
public void open(FileInputSplit split) throws IOException {
super.open(split);
this.blockInfo = this.createAndReadBlockInfo();
// We set the size of the BlockBasedInput to splitLength as each split contains one block.
// After reading the block info, we seek in the file to the correct position.
this.readRecords = 0;
this.stream.seek(this.splitStart + this.blockInfo.getFirstRecordStart());
this.blockBasedInput = new BlockBasedInput(this.stream,
(int) blockInfo.getFirstRecordStart(), this.splitLength);
this.dataInputStream = new DataInputViewStreamWrapper(blockBasedInput);
}
@Override
public boolean reachedEnd() throws IOException {
return this.readRecords >= this.blockInfo.getRecordCount();
}
@Override
public T nextRecord(T record) throws IOException {
if (this.reachedEnd()) {
return null;
}
record = this.deserialize(record, this.dataInputStream);
this.readRecords++;
return record;
}
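
    /**
     * Deserializes a single record from the given input view, optionally reusing the given object.
     * See the class-level Javadoc for a sketch of a typical implementation.
     */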
protected abstract T deserialize(T reuse, DataInputView dataInput) throws IOException;
/**
* Reads the content of a block of data. The block contains its {@link BlockInfo}
* at the end, and this method takes this into account when reading the data.
*/
protected class BlockBasedInput extends FilterInputStream {
private final int maxPayloadSize;
private int blockPos;
public BlockBasedInput(FSDataInputStream in, int blockSize) {
super(in);
this.blockPos = (int) BinaryInputFormat.this.blockInfo.getFirstRecordStart();
this.maxPayloadSize = blockSize - BinaryInputFormat.this.blockInfo.getInfoSize();
}
public BlockBasedInput(FSDataInputStream in, int startPos, long length) {
super(in);
this.blockPos = startPos;
this.maxPayloadSize = (int) (length - BinaryInputFormat.this.blockInfo.getInfoSize());
}
@Override
public int read() throws IOException {
if (this.blockPos++ >= this.maxPayloadSize) {
this.skipHeader();
}
return this.in.read();
}
private long getCurrBlockPos() {
return this.blockPos;
}
        private void skipHeader() throws IOException {
            // Skip the BlockInfo at the end of the block; loop, since read() may return fewer bytes than requested.
            byte[] dummy = new byte[BinaryInputFormat.this.blockInfo.getInfoSize()];
            for (int skipped = 0; skipped < dummy.length; ) {
                int read = this.in.read(dummy, skipped, dummy.length - skipped);
                if (read < 0) {
                    break;
                }
                skipped += read;
            }
            // blockPos is reset to 0 for the case of remote reads, i.e. when the last
            // record of a block spills over into the next block.
            this.blockPos = 0;
        }
@Override
public int read(byte[] b) throws IOException {
return this.read(b, 0, b.length);
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
int totalRead = 0;
for (int remainingLength = len, offset = off; remainingLength > 0;) {
int blockLen = Math.min(remainingLength, this.maxPayloadSize - this.blockPos);
int read = this.in.read(b, offset, blockLen);
if (read < 0) {
return read;
}
totalRead += read;
this.blockPos += read;
offset += read;
if (this.blockPos >= this.maxPayloadSize) {
this.skipHeader();
}
remainingLength -= read;
}
return totalRead;
}
}
// --------------------------------------------------------------------------------------------
// Checkpointing
// --------------------------------------------------------------------------------------------
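
    /**
     * Returns the current reading position as the pair {@code (position within the block, number
     * of records read so far)}. A sketch of the checkpoint/restore round trip (the {@code format},
     * {@code split}, and {@code reuse} variables below are hypothetical):
     *
     * <pre>{@code
     * format.open(split);
     * T record = format.nextRecord(reuse);                  // read some records
     * Tuple2<Long, Long> state = format.getCurrentState();  // checkpoint the position
     * // ... later, e.g. after a failure:
     * format.reopen(split, state);                          // resume where the checkpoint was taken
     * }</pre>
     */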
@PublicEvolving
@Override
public Tuple2<Long, Long> getCurrentState() throws IOException {
        if (this.blockBasedInput == null) {
            throw new IllegalStateException("You must call open() on the input format before requesting its current state.");
        }
return new Tuple2<>(
this.blockBasedInput.getCurrBlockPos(), // the last read index in the block
this.readRecords // the number of records read
);
}
@PublicEvolving
@Override
public void reopen(FileInputSplit split, Tuple2<Long, Long> state) throws IOException {
Preconditions.checkNotNull(split, "reopen() cannot be called on a null split.");
Preconditions.checkNotNull(state, "reopen() cannot be called with a null initial state.");
try {
this.open(split);
} finally {
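            // Restore the checkpointed position: re-read the block info, seek to the stored
            // offset within the split, and recreate the input wrappers around the stream.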
this.blockInfo = this.createAndReadBlockInfo();
long blockPos = state.f0;
this.readRecords = state.f1;
this.stream.seek(this.splitStart + blockPos);
this.blockBasedInput = new BlockBasedInput(this.stream, (int) blockPos, this.splitLength);
this.dataInputStream = new DataInputViewStreamWrapper(blockBasedInput);
}
}
}