/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.api.common.io;

import org.apache.flink.annotation.Public;
import org.apache.flink.api.common.io.compression.Bzip2InputStreamFactory;
import org.apache.flink.api.common.io.compression.DeflateInflaterInputStreamFactory;
import org.apache.flink.api.common.io.compression.GzipInflaterInputStreamFactory;
import org.apache.flink.api.common.io.compression.InflaterInputStreamFactory;
import org.apache.flink.api.common.io.compression.XZInputStreamFactory;
import org.apache.flink.api.common.io.statistics.BaseStatistics;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.GlobalConfiguration;
import org.apache.flink.core.fs.BlockLocation;
import org.apache.flink.core.fs.FSDataInputStream;
import org.apache.flink.core.fs.FileInputSplit;
import org.apache.flink.core.fs.FileStatus;
import org.apache.flink.core.fs.FileSystem;
import org.apache.flink.core.fs.Path;
import org.apache.flink.util.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * The base class for {@link RichInputFormat}s that read from files. For specific input types, the
 * {@link #nextRecord(Object)} and {@link #reachedEnd()} methods need to be implemented.
 * Additionally, one may override {@link #open(FileInputSplit)} and {@link #close()} to
 * change the life cycle behavior.
 *
 * <p>After the {@link #open(FileInputSplit)} method has completed, the file input data is available
 * from the {@link #stream} field.</p>
 */
@Public
public abstract class FileInputFormat<OT> extends RichInputFormat<OT, FileInputSplit> {

    // -------------------------------------- Constants -------------------------------------------

    private static final Logger LOG = LoggerFactory.getLogger(FileInputFormat.class);

    private static final long serialVersionUID = 1L;

    /**
     * The fraction by which the last split may be larger than the others.
     */
    private static final float MAX_SPLIT_SIZE_DISCREPANCY = 1.1f;

    /**
     * The timeout (in milliseconds) to wait for a filesystem stream to respond.
     */
    private static long DEFAULT_OPENING_TIMEOUT;

    /**
     * A mapping of file extensions to decompression algorithms. Decompressed streams cannot be
     * split, so files with these extensions are always read as a whole.
     */
    protected static final Map<String, InflaterInputStreamFactory<?>> INFLATER_INPUT_STREAM_FACTORIES =
        new HashMap<String, InflaterInputStreamFactory<?>>();

    /**
     * The splitLength is set to -1L for reading the whole split.
     */
    protected static final long READ_WHOLE_SPLIT_FLAG = -1L;
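    // Additional formats can be registered alongside the built-in ones. A minimal sketch,
    // assuming a hypothetical SnappyInputStreamFactory that implements
    // InflaterInputStreamFactory (only registerInflaterInputStreamFactory is real API
    // from this class):
    //
    //   FileInputFormat.registerInflaterInputStreamFactory(
    //       "snappy", SnappyInputStreamFactory.getInstance());
    //
    // Afterwards, files ending in ".snappy" would be decompressed transparently and
    // treated as unsplittable, like the default formats registered below.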
    static {
        initDefaultsFromConfiguration(GlobalConfiguration.loadConfiguration());
        initDefaultInflaterInputStreamFactories();
    }

    /**
     * Initialize defaults for input format. Needs to be a static method because it is configured
     * for local cluster execution, see LocalFlinkMiniCluster.
     *
     * @param configuration The configuration to load defaults from
     */
    private static void initDefaultsFromConfiguration(Configuration configuration) {
        final long to = configuration.getLong(ConfigConstants.FS_STREAM_OPENING_TIMEOUT_KEY,
            ConfigConstants.DEFAULT_FS_STREAM_OPENING_TIMEOUT);

        if (to < 0) {
            LOG.error("Invalid timeout value for filesystem stream opening: " + to +
                ". Using default value of " + ConfigConstants.DEFAULT_FS_STREAM_OPENING_TIMEOUT);
            DEFAULT_OPENING_TIMEOUT = ConfigConstants.DEFAULT_FS_STREAM_OPENING_TIMEOUT;
        } else if (to == 0) {
            DEFAULT_OPENING_TIMEOUT = 300000; // 5 minutes
        } else {
            DEFAULT_OPENING_TIMEOUT = to;
        }
    }

    private static void initDefaultInflaterInputStreamFactories() {
        InflaterInputStreamFactory<?>[] defaultFactories = {
            DeflateInflaterInputStreamFactory.getInstance(),
            GzipInflaterInputStreamFactory.getInstance(),
            Bzip2InputStreamFactory.getInstance(),
            XZInputStreamFactory.getInstance(),
        };
        for (InflaterInputStreamFactory<?> inputStreamFactory : defaultFactories) {
            for (String fileExtension : inputStreamFactory.getCommonFileExtensions()) {
                registerInflaterInputStreamFactory(fileExtension, inputStreamFactory);
            }
        }
    }

    /**
     * Registers a decompression algorithm through a
     * {@link org.apache.flink.api.common.io.compression.InflaterInputStreamFactory}
     * with a file extension for transparent decompression.
     *
     * @param fileExtension of the compressed files
     * @param factory to create an {@link java.util.zip.InflaterInputStream} that handles the decompression format
     */
    public static void registerInflaterInputStreamFactory(String fileExtension, InflaterInputStreamFactory<?> factory) {
        synchronized (INFLATER_INPUT_STREAM_FACTORIES) {
            if (INFLATER_INPUT_STREAM_FACTORIES.put(fileExtension, factory) != null) {
                LOG.warn("Overwriting an existing decompression algorithm for \"{}\" files.", fileExtension);
            }
        }
    }

    protected static InflaterInputStreamFactory<?> getInflaterInputStreamFactory(String fileExtension) {
        synchronized (INFLATER_INPUT_STREAM_FACTORIES) {
            return INFLATER_INPUT_STREAM_FACTORIES.get(fileExtension);
        }
    }

    /**
     * Returns the extension of a file name (!= a path).
     *
     * @return the extension of the file name or {@code null} if there is no extension.
     */
    protected static String extractFileExtension(String fileName) {
        checkNotNull(fileName);
        int lastPeriodIndex = fileName.lastIndexOf('.');
        if (lastPeriodIndex < 0) {
            return null;
        } else {
            return fileName.substring(lastPeriodIndex + 1);
        }
    }

    // --------------------------------------------------------------------------------------------
    //  Variables for internal operation.
    //  They are all transient, because we do not want them to be serialized.
    // --------------------------------------------------------------------------------------------

    /**
     * The input stream reading from the input file.
     */
    protected transient FSDataInputStream stream;

    /**
     * The start of the split that this parallel instance must consume.
     */
    protected transient long splitStart;

    /**
     * The length of the split that this parallel instance must consume.
     */
    protected transient long splitLength;

    /**
     * The current split that this parallel instance must consume.
     */
    protected transient FileInputSplit currentSplit;
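    // Subclasses typically consume the range [splitStart, splitStart + splitLength) from
    // the stream. A minimal sketch (illustrative only) of an open() override that positions
    // a subclass at the first complete record after the split start:
    //
    //   @Override
    //   public void open(FileInputSplit split) throws IOException {
    //       super.open(split); // opens the stream and seeks to splitStart
    //       if (splitStart > 0) {
    //           skipToFirstRecordDelimiter(); // hypothetical helper of the subclass
    //       }
    //   }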
    // --------------------------------------------------------------------------------------------
    //  The configuration parameters. Configured on the instance and serialized to be shipped.
    // --------------------------------------------------------------------------------------------

    /**
     * The path to the file that contains the input.
     */
    protected Path filePath;

    /**
     * The minimal split size, set by the configure() method.
     */
    protected long minSplitSize = 0;

    /**
     * The desired number of splits, as set by the configure() method.
     */
    protected int numSplits = -1;

    /**
     * Stream opening timeout.
     */
    protected long openTimeout = DEFAULT_OPENING_TIMEOUT;

    /**
     * Some file input formats are not splittable on a block level (avro, deflate).
     * Therefore, the FileInputFormat can only read whole files.
     */
    protected boolean unsplittable = false;

    /**
     * The flag to specify whether recursive traversal of the input directory
     * structure is enabled.
     */
    protected boolean enumerateNestedFiles = false;

    /**
     * Files filter for determining what files/directories should be included.
     */
    private FilePathFilter filesFilter = new GlobFilePathFilter();

    // --------------------------------------------------------------------------------------------
    //  Constructors
    // --------------------------------------------------------------------------------------------

    public FileInputFormat() {}

    protected FileInputFormat(Path filePath) {
        this.filePath = filePath;
    }

    // --------------------------------------------------------------------------------------------
    //  Getters/setters for the configurable parameters
    // --------------------------------------------------------------------------------------------

    public Path getFilePath() {
        return filePath;
    }

    public void setFilePath(String filePath) {
        if (filePath == null) {
            throw new IllegalArgumentException("File path cannot be null.");
        }

        // TODO The job-submission web interface passes empty args (and thus empty
        // paths) to compute the preview graph. The following is a workaround for
        // this situation and we should fix this.

        // comment (Stephan Ewen): this should no longer be relevant with the current Java/Scala APIs.
        if (filePath.isEmpty()) {
            setFilePath(new Path());
            return;
        }

        try {
            this.filePath = new Path(filePath);
        } catch (RuntimeException rex) {
            throw new RuntimeException("Could not create a valid URI from the given file path name: " + rex.getMessage());
        }
    }

    public void setFilePath(Path filePath) {
        if (filePath == null) {
            throw new IllegalArgumentException("File path must not be null.");
        }

        this.filePath = filePath;
    }

    public long getMinSplitSize() {
        return minSplitSize;
    }

    public void setMinSplitSize(long minSplitSize) {
        if (minSplitSize < 0) {
            throw new IllegalArgumentException("The minimum split size cannot be negative.");
        }

        this.minSplitSize = minSplitSize;
    }

    public int getNumSplits() {
        return numSplits;
    }

    public void setNumSplits(int numSplits) {
        if (numSplits < -1 || numSplits == 0) {
            throw new IllegalArgumentException("The desired number of splits must be positive or -1 (= don't care).");
        }

        this.numSplits = numSplits;
    }

    public long getOpenTimeout() {
        return openTimeout;
    }

    public void setOpenTimeout(long openTimeout) {
        if (openTimeout < 0) {
            throw new IllegalArgumentException("The timeout for opening the input splits must be positive or zero (= infinite).");
        }

        this.openTimeout = openTimeout;
    }

    public void setNestedFileEnumeration(boolean enable) {
        this.enumerateNestedFiles = enable;
    }

    public boolean getNestedFileEnumeration() {
        return this.enumerateNestedFiles;
    }

    // --------------------------------------------------------------------------------------------
    //  Getting information about the split that is currently open
    // --------------------------------------------------------------------------------------------

    /**
     * Gets the start of the current split.
     *
     * @return The start of the split.
     */
    public long getSplitStart() {
        return splitStart;
    }

    /**
     * Gets the length or remaining length of the current split.
     *
     * @return The length or remaining length of the current split.
     */
    public long getSplitLength() {
        return splitLength;
    }

    public void setFilesFilter(FilePathFilter filesFilter) {
        this.filesFilter = Preconditions.checkNotNull(filesFilter, "Files filter should not be null");
    }

    // --------------------------------------------------------------------------------------------
    //  Pre-flight: Configuration, Splits, Sampling
    // --------------------------------------------------------------------------------------------

    /**
     * Configures the file input format by reading the file path from the configuration.
     *
     * @see org.apache.flink.api.common.io.InputFormat#configure(org.apache.flink.configuration.Configuration)
     */
    @Override
    public void configure(Configuration parameters) {
        // the if() clauses are to prevent the configure() method from
        // overwriting the values set by the setters

        if (filePath == null) {
            String filePath = parameters.getString(FILE_PARAMETER_KEY, null);
            setFilePath(filePath);
        }

        if (!this.enumerateNestedFiles) {
            this.enumerateNestedFiles = parameters.getBoolean(ENUMERATE_NESTED_FILES_FLAG, false);
        }
    }
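    // The same settings can also be supplied through a Configuration instead of the
    // setters. A minimal sketch (illustrative; the keys are the FILE_PARAMETER_KEY and
    // ENUMERATE_NESTED_FILES_FLAG constants defined at the bottom of this class, and
    // "format" stands for any concrete FileInputFormat instance):
    //
    //   Configuration parameters = new Configuration();
    //   parameters.setString("input.file.path", "hdfs:///data/input");
    //   parameters.setBoolean("recursive.file.enumeration", true);
    //   format.configure(parameters);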
    /**
     * Obtains basic file statistics containing only file size. If the input is a directory,
     * then the size is the sum of all contained files.
     *
     * @see org.apache.flink.api.common.io.InputFormat#getStatistics(org.apache.flink.api.common.io.statistics.BaseStatistics)
     */
    @Override
    public FileBaseStatistics getStatistics(BaseStatistics cachedStats) throws IOException {

        final FileBaseStatistics cachedFileStats = (cachedStats != null && cachedStats instanceof FileBaseStatistics) ?
            (FileBaseStatistics) cachedStats : null;

        try {
            final Path path = this.filePath;
            final FileSystem fs = FileSystem.get(path.toUri());

            return getFileStats(cachedFileStats, path, fs, new ArrayList<FileStatus>(1));
        } catch (IOException ioex) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Could not determine statistics for file '" + this.filePath +
                    "' due to an I/O error: " + ioex.getMessage());
            }
        } catch (Throwable t) {
            if (LOG.isErrorEnabled()) {
                LOG.error("Unexpected problem while getting the file statistics for file '" +
                    this.filePath + "': " + t.getMessage(), t);
            }
        }

        // no statistics available
        return null;
    }

    protected FileBaseStatistics getFileStats(FileBaseStatistics cachedStats, Path filePath, FileSystem fs,
            ArrayList<FileStatus> files) throws IOException {

        // get the file info and check whether the cached statistics are still valid.
        final FileStatus file = fs.getFileStatus(filePath);
        long totalLength = 0;

        // enumerate all files
        if (file.isDir()) {
            totalLength += addFilesInDir(file.getPath(), files, false);
        } else {
            files.add(file);
            testForUnsplittable(file);
            totalLength += file.getLen();
        }

        // check the modification time stamp
        long latestModTime = 0;
        for (FileStatus f : files) {
            latestModTime = Math.max(f.getModificationTime(), latestModTime);
        }

        // check whether the cached statistics are still valid, if we have any
        if (cachedStats != null && latestModTime <= cachedStats.getLastModificationTime()) {
            return cachedStats;
        }

        // sanity check
        if (totalLength <= 0) {
            totalLength = BaseStatistics.SIZE_UNKNOWN;
        }

        return new FileBaseStatistics(latestModTime, totalLength, BaseStatistics.AVG_RECORD_BYTES_UNKNOWN);
    }

    @Override
    public LocatableInputSplitAssigner getInputSplitAssigner(FileInputSplit[] splits) {
        return new LocatableInputSplitAssigner(splits);
    }
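    // A worked example of the split sizing performed by createInputSplits() below
    // (numbers are illustrative): a single 132 MB file with a 64 MB block size and
    // minNumSplits = 2 gives
    //
    //   maxSplitSize = 132 / 2 = 66 MB (rounded up if not evenly divisible)
    //   splitSize    = max(minSplitSize, min(maxSplitSize, blockSize)) = max(0, min(66, 64)) = 64 MB
    //
    // The loop emits 64 MB splits while more than MAX_SPLIT_SIZE_DISCREPANCY * splitSize
    // = 1.1 * 64 = 70.4 MB remain. After one 64 MB split, 68 MB remain, which is below
    // that threshold, so the remaining 68 MB become the (slightly oversized) last split.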
    /**
     * Computes the input splits for the file. By default, one file block is one split. If more
     * splits are requested than blocks are available, then a split may be a fraction of a block
     * and splits may cross block boundaries.
     *
     * @param minNumSplits The minimum desired number of file splits.
     * @return The computed file splits.
     *
     * @see org.apache.flink.api.common.io.InputFormat#createInputSplits(int)
     */
    @Override
    public FileInputSplit[] createInputSplits(int minNumSplits) throws IOException {
        if (minNumSplits < 1) {
            throw new IllegalArgumentException("Number of input splits has to be at least 1.");
        }

        // take the desired number of splits into account
        minNumSplits = Math.max(minNumSplits, this.numSplits);

        final Path path = this.filePath;
        final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>(minNumSplits);

        // get all the files that are involved in the splits
        List<FileStatus> files = new ArrayList<FileStatus>();
        long totalLength = 0;

        final FileSystem fs = path.getFileSystem();
        final FileStatus pathFile = fs.getFileStatus(path);

        if (pathFile.isDir()) {
            totalLength += addFilesInDir(path, files, true);
        } else {
            testForUnsplittable(pathFile);

            files.add(pathFile);
            totalLength += pathFile.getLen();
        }

        // if unsplittable, return one split per file, covering the whole file
        if (unsplittable) {
            int splitNum = 0;
            for (final FileStatus file : files) {
                final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, file.getLen());
                Set<String> hosts = new HashSet<String>();
                for (BlockLocation block : blocks) {
                    hosts.addAll(Arrays.asList(block.getHosts()));
                }
                long len = file.getLen();
                if (testForUnsplittable(file)) {
                    len = READ_WHOLE_SPLIT_FLAG;
                }
                FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, len,
                    hosts.toArray(new String[hosts.size()]));
                inputSplits.add(fis);
            }
            return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
        }

        final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE :
            (totalLength / minNumSplits + (totalLength % minNumSplits == 0 ? 0 : 1));

        // now that we have the files, generate the splits
        int splitNum = 0;
        for (final FileStatus file : files) {

            final long len = file.getLen();
            final long blockSize = file.getBlockSize();
            final long minSplitSize;
            if (this.minSplitSize <= blockSize) {
                minSplitSize = this.minSplitSize;
            } else {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Minimal split size of " + this.minSplitSize + " is larger than the block size of " +
                        blockSize + ". Decreasing minimal split size to block size.");
                }
                minSplitSize = blockSize;
            }

            final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
            final long halfSplit = splitSize >>> 1;

            final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);

            if (len > 0) {

                // get the block locations and make sure they are in order with respect to their offset
                final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
                Arrays.sort(blocks);

                long bytesUnassigned = len;
                long position = 0;

                int blockIndex = 0;

                while (bytesUnassigned > maxBytesForLastSplit) {
                    // get the block containing the majority of the data
                    blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                    // create a new split
                    FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize,
                        blocks[blockIndex].getHosts());
                    inputSplits.add(fis);

                    // adjust the positions
                    position += splitSize;
                    bytesUnassigned -= splitSize;
                }

                // assign the last split
                if (bytesUnassigned > 0) {
                    blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                    final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position,
                        bytesUnassigned, blocks[blockIndex].getHosts());
                    inputSplits.add(fis);
                }
            } else {
                // special case with a file of zero bytes size
                final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
                String[] hosts;
                if (blocks.length > 0) {
                    hosts = blocks[0].getHosts();
                } else {
                    hosts = new String[0];
                }
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
                inputSplits.add(fis);
            }
        }

        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }

    /**
     * Enumerates all files in the directory, recursively if enumerateNestedFiles is true.
     *
     * @return the total length of accepted files.
     */
    private long addFilesInDir(Path path, List<FileStatus> files, boolean logExcludedFiles)
            throws IOException {
        final FileSystem fs = path.getFileSystem();

        long length = 0;

        for (FileStatus dir : fs.listStatus(path)) {
            if (dir.isDir()) {
                if (acceptFile(dir) && enumerateNestedFiles) {
                    length += addFilesInDir(dir.getPath(), files, logExcludedFiles);
                } else {
                    if (logExcludedFiles && LOG.isDebugEnabled()) {
                        LOG.debug("Directory " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                    }
                }
            } else {
                if (acceptFile(dir)) {
                    files.add(dir);
                    length += dir.getLen();
                    testForUnsplittable(dir);
                } else {
                    if (logExcludedFiles && LOG.isDebugEnabled()) {
                        LOG.debug("File " + dir.getPath().toString() + " did not pass the file-filter and is excluded.");
                    }
                }
            }
        }
        return length;
    }

    protected boolean testForUnsplittable(FileStatus pathFile) {
        if (getInflaterInputStreamFactory(pathFile.getPath()) != null) {
            unsplittable = true;
            return true;
        }
        return false;
    }

    private InflaterInputStreamFactory<?> getInflaterInputStreamFactory(Path path) {
        String fileExtension = extractFileExtension(path.getName());
        if (fileExtension != null) {
            return getInflaterInputStreamFactory(fileExtension);
        } else {
            return null;
        }
    }
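    // The acceptFile(FileStatus) hook below may be overridden to restrict the input
    // further. A minimal sketch, assuming a hypothetical subclass that only accepts
    // ".csv" files while keeping the default filtering of hidden files:
    //
    //   @Override
    //   public boolean acceptFile(FileStatus fileStatus) {
    //       return super.acceptFile(fileStatus)
    //           && fileStatus.getPath().getName().endsWith(".csv");
    //   }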
    /**
     * A simple hook to filter files and directories from the input.
     * The method may be overridden. Hadoop's FileInputFormat has a similar mechanism and applies the
     * same filters by default.
     *
     * @param fileStatus The file status to check.
     * @return true, if the given file or directory is accepted
     */
    public boolean acceptFile(FileStatus fileStatus) {
        final String name = fileStatus.getPath().getName();
        return !name.startsWith("_")
            && !name.startsWith(".")
            && !filesFilter.filterPath(fileStatus.getPath());
    }

    /**
     * Retrieves the index of the <tt>BlockLocation</tt> that contains the part of the file described by the given
     * offset.
     *
     * @param blocks The different blocks of the file. Must be ordered by their offset.
     * @param offset The offset of the position in the file.
     * @param startIndex The earliest index to look at.
     * @return The index of the block containing the given position.
     */
    private int getBlockIndexForPosition(BlockLocation[] blocks, long offset, long halfSplitSize, int startIndex) {
        // go over all indexes after the startIndex
        for (int i = startIndex; i < blocks.length; i++) {
            long blockStart = blocks[i].getOffset();
            long blockEnd = blockStart + blocks[i].getLength();

            if (offset >= blockStart && offset < blockEnd) {
                // got the block where the split starts
                // check if the next block contains more than this one does
                if (i < blocks.length - 1 && blockEnd - offset < halfSplitSize) {
                    return i + 1;
                } else {
                    return i;
                }
            }
        }
        throw new IllegalArgumentException("The given offset is not contained in any block.");
    }

    // --------------------------------------------------------------------------------------------

    /**
     * Opens an input stream to the file defined in the input format.
     * The stream is positioned at the beginning of the given split.
     *
     * <p>The stream is actually opened in an asynchronous thread to make sure any interruptions to the thread
     * working on the input format do not reach the file system.</p>
     */
    @Override
    public void open(FileInputSplit fileSplit) throws IOException {

        this.currentSplit = fileSplit;
        this.splitStart = fileSplit.getStart();
        this.splitLength = fileSplit.getLength();

        if (LOG.isDebugEnabled()) {
            LOG.debug("Opening input split " + fileSplit.getPath() + " [" + this.splitStart + "," + this.splitLength + "]");
        }

        // open the split in an asynchronous thread
        final InputSplitOpenThread isot = new InputSplitOpenThread(fileSplit, this.openTimeout);
        isot.start();

        try {
            this.stream = isot.waitForCompletion();
            this.stream = decorateInputStream(this.stream, fileSplit);
        } catch (Throwable t) {
            throw new IOException("Error opening the Input Split " + fileSplit.getPath() +
                " [" + splitStart + "," + splitLength + "]: " + t.getMessage(), t);
        }

        // move the stream to the beginning of the split
        if (this.splitStart != 0) {
            this.stream.seek(this.splitStart);
        }
    }
    /**
     * This method allows to wrap/decorate the raw {@link FSDataInputStream} for a certain file split, e.g., for decoding.
     * When overriding this method, also consider adapting {@link FileInputFormat#testForUnsplittable} if your
     * stream decoration renders the input file unsplittable. Also consider calling existing superclass implementations.
     *
     * @param inputStream is the input stream to be decorated
     * @param fileSplit is the file split for which the input stream shall be decorated
     * @return the decorated input stream
     * @throws Throwable if the decoration fails
     * @see org.apache.flink.api.common.io.InputStreamFSInputWrapper
     */
    protected FSDataInputStream decorateInputStream(FSDataInputStream inputStream, FileInputSplit fileSplit) throws Throwable {
        // Wrap the stream in an extracting (decompressing) stream if the file ends with a known compression file extension.
        InflaterInputStreamFactory<?> inflaterInputStreamFactory = getInflaterInputStreamFactory(fileSplit.getPath());
        if (inflaterInputStreamFactory != null) {
            return new InputStreamFSInputWrapper(inflaterInputStreamFactory.create(inputStream));
        }

        return inputStream;
    }

    /**
     * Closes the file input stream of the input format.
     */
    @Override
    public void close() throws IOException {
        if (this.stream != null) {
            // close input stream
            this.stream.close();
            stream = null;
        }
    }

    @Override
    public String toString() {
        return this.filePath == null ?
            "File Input (unknown file)" :
            "File Input (" + this.filePath.toString() + ')';
    }

    // ============================================================================================

    /**
     * Encapsulation of the basic statistics the optimizer obtains about a file. Contained are the size of the file
     * and the average bytes of a single record. The statistics also have a time-stamp that records the modification
     * time of the file and indicates the time for which the statistics were valid.
     */
    public static class FileBaseStatistics implements BaseStatistics {

        protected final long fileModTime; // timestamp of the last modification

        protected final long fileSize; // size of the file(s) in bytes

        protected final float avgBytesPerRecord; // the average number of bytes for a record

        /**
         * Creates a new statistics object.
         *
         * @param fileModTime
         *        The timestamp of the latest modification of any of the involved files.
         * @param fileSize
         *        The size of the file, in bytes. <code>-1</code>, if unknown.
         * @param avgBytesPerRecord
         *        The average number of bytes in a record, or <code>-1.0f</code>, if unknown.
         */
        public FileBaseStatistics(long fileModTime, long fileSize, float avgBytesPerRecord) {
            this.fileModTime = fileModTime;
            this.fileSize = fileSize;
            this.avgBytesPerRecord = avgBytesPerRecord;
        }

        /**
         * Gets the timestamp of the last modification.
         *
         * @return The timestamp of the last modification.
         */
        public long getLastModificationTime() {
            return fileModTime;
        }

        /**
         * Gets the file size.
         *
         * @return The fileSize.
         * @see org.apache.flink.api.common.io.statistics.BaseStatistics#getTotalInputSize()
         */
        @Override
        public long getTotalInputSize() {
            return this.fileSize;
        }

        /**
         * Gets the estimated number of records in the file, computed as the file size divided by the
         * average record width, rounded up.
         *
         * @return The estimated number of records in the file.
         * @see org.apache.flink.api.common.io.statistics.BaseStatistics#getNumberOfRecords()
         */
        @Override
        public long getNumberOfRecords() {
            return (this.fileSize == SIZE_UNKNOWN || this.avgBytesPerRecord == AVG_RECORD_BYTES_UNKNOWN) ?
                NUM_RECORDS_UNKNOWN :
                (long) Math.ceil(this.fileSize / this.avgBytesPerRecord);
        }

        /**
         * Gets the estimated average number of bytes per record.
         *
         * @return The average number of bytes per record.
         * @see org.apache.flink.api.common.io.statistics.BaseStatistics#getAverageRecordWidth()
         */
        @Override
        public float getAverageRecordWidth() {
            return this.avgBytesPerRecord;
        }

        @Override
        public String toString() {
            return "size=" + this.fileSize + ", recWidth=" + this.avgBytesPerRecord + ", modAt=" + this.fileModTime;
        }
    }
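    // A quick arithmetic check of getNumberOfRecords() above (numbers are illustrative):
    // a FileBaseStatistics with fileSize = 1_000_000 bytes and avgBytesPerRecord = 99.0f
    // estimates ceil(1_000_000 / 99.0) = 10_102 records; if either value is unknown (-1),
    // NUM_RECORDS_UNKNOWN is returned instead.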
    // ============================================================================================

    /**
     * Obtains a DataInputStream in a thread that is not interrupted.
     * This is a necessary hack around the problem that the HDFS client is very sensitive to InterruptedExceptions.
     */
    public static class InputSplitOpenThread extends Thread {

        private final FileInputSplit split;

        private final long timeout;

        private volatile FSDataInputStream fdis;

        private volatile Throwable error;

        private volatile boolean aborted;

        public InputSplitOpenThread(FileInputSplit split, long timeout) {
            super("Transient InputSplit Opener");
            setDaemon(true);

            this.split = split;
            this.timeout = timeout;
        }

        @Override
        public void run() {
            try {
                final FileSystem fs = FileSystem.get(this.split.getPath().toUri());
                this.fdis = fs.open(this.split.getPath());

                // check for canceling and close the stream in that case, because no one will obtain it
                if (this.aborted) {
                    final FSDataInputStream f = this.fdis;
                    this.fdis = null;
                    f.close();
                }
            } catch (Throwable t) {
                this.error = t;
            }
        }

        public FSDataInputStream waitForCompletion() throws Throwable {
            final long start = System.currentTimeMillis();
            long remaining = this.timeout;

            do {
                try {
                    // wait for the task completion
                    this.join(remaining);
                } catch (InterruptedException iex) {
                    // we were canceled, so abort the procedure
                    abortWait();
                    throw iex;
                }
            } while (this.error == null && this.fdis == null &&
                (remaining = this.timeout + start - System.currentTimeMillis()) > 0);

            if (this.error != null) {
                throw this.error;
            }
            if (this.fdis != null) {
                return this.fdis;
            } else {
                // double-check that the stream has not been set by now. we don't know here whether
                // a) the opener thread recognized the canceling and closed the stream
                // b) the flag was set such that the stream did not see it and we have a valid stream
                // In any case, close the stream and throw an exception.
                abortWait();

                final boolean stillAlive = this.isAlive();
                final StringBuilder bld = new StringBuilder(256);
                for (StackTraceElement e : this.getStackTrace()) {
                    bld.append("\tat ").append(e.toString()).append('\n');
                }
                throw new IOException("Input opening request timed out. Opener was " +
                    (stillAlive ? "" : "NOT ") + "alive. Stack of split open thread:\n" + bld.toString());
            }
        }

        /**
         * Double checked procedure setting the abort flag and closing the stream.
         */
        private void abortWait() {
            this.aborted = true;
            final FSDataInputStream inStream = this.fdis;
            this.fdis = null;
            if (inStream != null) {
                try {
                    inStream.close();
                } catch (Throwable t) {
                    // ignore; we are aborting anyway and the stream is of no further use
                }
            }
        }
    }

    // ============================================================================================
    //  Parameterization via configuration
    // ============================================================================================

    // ------------------------------------- Config Keys ------------------------------------------

    /**
     * The config parameter which defines the input file path.
     */
    private static final String FILE_PARAMETER_KEY = "input.file.path";

    /**
     * The config parameter which defines whether input directories are recursively traversed.
     */
    public static final String ENUMERATE_NESTED_FILES_FLAG = "recursive.file.enumeration";
}
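// A minimal sketch of a concrete subclass, as described in the class Javadoc above
// (illustrative only; a real implementation would also handle records that cross
// split borders, which is why splits are aligned to block locations):
//
//   public class SingleByteInputFormat extends FileInputFormat<Byte> {
//
//       @Override
//       public boolean reachedEnd() throws IOException {
//           return splitLength != READ_WHOLE_SPLIT_FLAG
//               && stream.getPos() >= splitStart + splitLength;
//       }
//
//       @Override
//       public Byte nextRecord(Byte reuse) throws IOException {
//           int b = stream.read();
//           return b < 0 ? null : (byte) b;
//       }
//   }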