/***********************************************************************************************************************
 *
 * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu)
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 *
 **********************************************************************************************************************/

package eu.stratosphere.nephele.template;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import eu.stratosphere.core.fs.BlockLocation;
import eu.stratosphere.core.fs.FileInputSplit;
import eu.stratosphere.core.fs.FileStatus;
import eu.stratosphere.core.fs.FileSystem;
import eu.stratosphere.core.fs.Path;

/**
 * Specialized subtype of {@link AbstractInputTask} for tasks which are supposed to generate input from
 * a file. In addition to {@link AbstractInputTask} this class includes a method to query file splits
 * which should be read during the task's execution.
 */
public abstract class AbstractFileInputTask extends AbstractInputTask<FileInputSplit> {

    /**
     * The key under which the input path is looked up in the task configuration.
     */
    public static final String INPUT_PATH_CONFIG_KEY = "input.path";

    /**
     * The factor by which the last split of a file may exceed the regular split size. A remainder of up to
     * this many times the split size is emitted as a single (last) split instead of being cut once more.
     */
    private static final float MAX_SPLIT_SIZE_DISCREPANCY = 1.1f;

    // --------------------------------------------------------------------------------------------

    /**
     * Returns an iterator to a (possibly empty) list of file input splits which is expected to be consumed by this
     * instance of the {@link AbstractFileInputTask}.
     *
     * @return an iterator to a (possibly empty) list of file input splits.
     */
    public Iterator<FileInputSplit> getFileInputSplits() {
        return new InputSplitIterator<FileInputSplit>(getEnvironment().getInputSplitProvider());
    }

    /**
     * Computes the file input splits for the path stored under {@link #INPUT_PATH_CONFIG_KEY} in the task
     * configuration.
     * <p>
     * If the path denotes a directory, the files directly contained in it are used (nested directories are
     * skipped); otherwise the path itself is the only input file. Every file is cut into splits of
     * <code>max(1, min(maxSplitSize, blockSize))</code> bytes, where <code>maxSplitSize</code> is the total input
     * length divided by <code>minNumSplits</code> (rounded up). The last split of a file may be up to
     * {@link #MAX_SPLIT_SIZE_DISCREPANCY} times that size. Files of length zero yield one empty split.
     *
     * @param minNumSplits
     *        The desired minimum number of splits. For values smaller than one, the split size is bounded by the
     *        block size of the file only.
     * @return The computed splits, numbered consecutively starting at zero.
     * @throws IOException
     *         Thrown, if the configuration contains no input path, if the path specifier is invalid, or if an
     *         I/O error is propagated from the file system calls.
     */
    @Override
    public FileInputSplit[] computeInputSplits(final int minNumSplits) throws IOException {

        final String pathURI = getTaskConfiguration().getString(INPUT_PATH_CONFIG_KEY, null);
        if (pathURI == null) {
            throw new IOException("The path to the file was not found in the runtime configuration.");
        }

        final Path path;
        try {
            path = new Path(pathURI);
        } catch (Exception iaex) {
            // include the offending value, the message is useless for diagnosis without it
            throw new IOException("Invalid file path specifier: " + pathURI, iaex);
        }

        final List<FileInputSplit> inputSplits = new ArrayList<FileInputSplit>();

        // get all the files that are involved in the splits
        final List<FileStatus> files = new ArrayList<FileStatus>();
        long totalLength = 0;

        final FileSystem fs = path.getFileSystem();
        final FileStatus pathFile = fs.getFileStatus(path);

        if (pathFile.isDir()) {
            // input is directory. list all contained files, nested directories are not descended into
            final FileStatus[] dir = fs.listStatus(path);
            for (final FileStatus entry : dir) {
                if (!entry.isDir()) {
                    files.add(entry);
                    totalLength += entry.getLen();
                }
            }
        } else {
            files.add(pathFile);
            totalLength += pathFile.getLen();
        }

        final long minSplitSize = 1;
        // ceil(totalLength / minNumSplits); unbounded if no minimum number of splits was requested
        final long maxSplitSize = (minNumSplits < 1) ? Long.MAX_VALUE : (totalLength / minNumSplits +
            (totalLength % minNumSplits == 0 ? 0 : 1));

        // now that we have the files, generate the splits
        int splitNum = 0;
        for (final FileStatus file : files) {

            final long len = file.getLen();
            final long blockSize = file.getBlockSize();

            final long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize));
            final long halfSplit = splitSize >>> 1;

            final long maxBytesForLastSplit = (long) (splitSize * MAX_SPLIT_SIZE_DISCREPANCY);

            if (len > 0) {

                // get the block locations and make sure they are in order with respect to their offset
                final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, len);
                Arrays.sort(blocks);

                long bytesUnassigned = len;
                long position = 0;

                // the block index only ever moves forward, since the split positions are ascending
                int blockIndex = 0;

                while (bytesUnassigned > maxBytesForLastSplit) {
                    // get the block containing the majority of the data
                    blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                    // create a new split
                    final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position, splitSize,
                        blocks[blockIndex].getHosts());
                    inputSplits.add(fis);

                    // adjust the positions
                    position += splitSize;
                    bytesUnassigned -= splitSize;
                }

                // assign the last split, which takes all remaining bytes
                if (bytesUnassigned > 0) {
                    blockIndex = getBlockIndexForPosition(blocks, position, halfSplit, blockIndex);
                    final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), position,
                        bytesUnassigned, blocks[blockIndex].getHosts());
                    inputSplits.add(fis);
                }
            } else {
                // special case with a file of zero bytes size: still emit one (empty) split for it
                final BlockLocation[] blocks = fs.getFileBlockLocations(file, 0, 0);
                String[] hosts;
                if (blocks.length > 0) {
                    hosts = blocks[0].getHosts();
                } else {
                    hosts = new String[0];
                }
                final FileInputSplit fis = new FileInputSplit(splitNum++, file.getPath(), 0, 0, hosts);
                inputSplits.add(fis);
            }
        }

        return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
    }

    /**
     * Retrieves the index of the <tt>BlockLocation</tt> that contains the part of the file described by the given
     * offset. If fewer than <code>halfSplitSize</code> bytes of that block remain after the offset and a further
     * block follows, the index of the following block is returned instead.
     *
     * @param blocks
     *        The different blocks of the file. Must be ordered by their offset.
     * @param offset
     *        The offset of the position in the file.
     * @param halfSplitSize
     *        Half of the size of the split that starts at the given offset.
     * @param startIndex
     *        The earliest index to look at.
     * @return The index of the block containing the given position, or of its successor as described above.
     * @throws IllegalArgumentException
     *         Thrown, if none of the blocks from <code>startIndex</code> onwards contains the offset.
     */
    private int getBlockIndexForPosition(final BlockLocation[] blocks, final long offset,
            final long halfSplitSize, final int startIndex) {

        // go over all indexes after the startIndex
        for (int i = startIndex; i < blocks.length; i++) {
            long blockStart = blocks[i].getOffset();
            long blockEnd = blockStart + blocks[i].getLength();

            if (offset >= blockStart && offset < blockEnd) {
                // got the block where the split starts
                // check if the next block contains more than this one does
                if (i < blocks.length - 1 && blockEnd - offset < halfSplitSize) {
                    return i + 1;
                } else {
                    return i;
                }
            }
        }
        throw new IllegalArgumentException("The given offset is not contained in the any block.");
    }

    /**
     * Returns the type of input split produced by this task.
     *
     * @return The class object of {@link FileInputSplit}.
     */
    @Override
    public Class<FileInputSplit> getInputSplitType() {
        return FileInputSplit.class;
    }
}