/* * Copyright © 2014 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.data.stream; import org.apache.hadoop.fs.BlockLocation; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.InputSplit; import java.io.IOException; import java.net.URI; import java.util.List; /** * Helper class for computing {@link InputSplit} for a stream data file. * * It splits a stream event file into equal size blocks (except the last block). The split size is computed by * * <br/><br/> * {@code Math.min(minSplitSize, Math.max(maxSplitSize, fileBlockSize)) } * <br/><br/> * * Each split produced will also carries {@code startTime} and {@code endTime} so that only stream events within * the given time range will get processed. */ final class StreamDataFileSplitter { private final FileStatus eventFileStatus; StreamDataFileSplitter(FileStatus eventFileStatus) { this.eventFileStatus = eventFileStatus; } /** * Computes splits for the event file. */ <T> void computeSplits(FileSystem fs, long minSplitSize, long maxSplitSize, long startTime, long endTime, List<T> splits, StreamInputSplitFactory<T> splitFactory) throws IOException { // Compute the splits based on the min/max size Path eventFile = eventFileStatus.getPath(); Path indexFile = getIndexFile(eventFile); BlockLocation[] blockLocations = fs.getFileBlockLocations(eventFile, 0, eventFileStatus.getLen()); long length = eventFileStatus.getLen(); long offset = 0; int blockIndex = 0; while (offset < length) { blockIndex = getBlockIndex(blockLocations, offset, blockIndex); String[] hosts = null; if (blockIndex >= 0) { hosts = blockLocations[blockIndex].getHosts(); } else { blockIndex = 0; } long splitSize = computeSplitSize(eventFileStatus, offset, minSplitSize, maxSplitSize); splits.add(splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, splitSize, hosts)); offset += splitSize; } // One extra split for the tail of the file. splits.add(splitFactory.createSplit(eventFile, indexFile, startTime, endTime, offset, Long.MAX_VALUE, null)); } /** * Returns the array index of the given blockLocations that contains the given offset. * * @param blockLocations Array of {@link BlockLocation} to search for. * @param offset File offset. * @param startIdx Starting index for the search in the array. * @return The array index of the {@link BlockLocation} that contains the given offset. */ private int getBlockIndex(BlockLocation[] blockLocations, long offset, int startIdx) { if (blockLocations == null) { return -1; } for (int i = startIdx; i < blockLocations.length; i++) { BlockLocation blockLocation = blockLocations[i]; long endOffset = blockLocation.getOffset() + blockLocation.getLength(); if (blockLocation.getOffset() <= offset && offset < endOffset) { return i; } } return -1; } /** * Compute the actual split size. The split size compute would be no larger than the given max split size. * The split size would be no smaller than the given min split size, except if number of bytes between * offset and file length is smaller than min split size. * * @param fileStatus The FileStatus of the file to split on. * @param offset Starting offset for the split. * @param minSplitSize Minimum size for the split. * @param maxSplitSize Maximum size for the split. * @return */ private long computeSplitSize(FileStatus fileStatus, long offset, long minSplitSize, long maxSplitSize) { long blockSize = fileStatus.getBlockSize(); long splitSize = Math.max(minSplitSize, Math.min(maxSplitSize, blockSize)); return Math.min(splitSize, fileStatus.getLen() - offset); } private Path getIndexFile(Path eventFile) { String eventPath = eventFile.toUri().toString(); int extLength = StreamFileType.EVENT.getSuffix().length(); return new Path(URI.create(String.format("%s%s", eventPath.substring(0, eventPath.length() - extLength), StreamFileType.INDEX.getSuffix()))); } }