/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapred.lib;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.EnumMap;
import java.util.LinkedList;
import java.util.HashSet;
import java.util.List;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RecordReader;
import org.mortbay.util.ajax.JSON;
/**
* An abstract {@link org.apache.hadoop.mapred.InputFormat} that returns {@link CombineFileSplit}'s
* in {@link org.apache.hadoop.mapred.InputFormat#getSplits(JobConf, int)} method.
* Splits are constructed from the files under the input paths.
* A split cannot have files from different pools.
* Each split returned may contain blocks from different files.
* If a maxSplitSize is specified, then blocks on the same node are
* combined to form a single split. Blocks that are left over are
* then combined with other blocks in the same rack.
* If maxSplitSize is not specified, then blocks from the same rack
* are combined in a single split; no attempt is made to create
* node-local splits.
* If the maxSplitSize is equal to the block size, then this class
* is similar to the default spliting behaviour in Hadoop: each
* block is a locally processed split.
* Subclasses implement {@link org.apache.hadoop.mapred.InputFormat#getRecordReader(InputSplit, JobConf, Reporter)}
* to construct <code>RecordReader</code>'s for <code>CombineFileSplit</code>'s.
* @see CombineFileSplit
*/
public abstract class CombineFileInputFormat<K, V>
extends FileInputFormat<K, V> {
// ability to limit the size of a single split
private long maxSplitSize = 0;
private long minSplitSizeNode = 0;
private long minSplitSizeRack = 0;
private long maxNumBlocksPerSplit = 0;
// A pool of input paths filters. A split cannot have blocks from files
// across multiple pools.
private ArrayList<MultiPathFilter> pools = new ArrayList<MultiPathFilter>();
// mapping from a rack name to the set of Nodes in the rack
private HashMap<String, Set<String>> rackToNodes =
new HashMap<String, Set<String>>();
// Whether to pass only the path component of the URI to the pool filters
private boolean poolFilterPathOnly = true;
// Special log for json metrics (all split stats sent here for easier
// parsing)
private static final Log JSON_METRICS_LOG = LogFactory.getLog("JsonMetrics");
// Map of the stat type to actual stats
private final EnumMap<SplitType, SplitTypeStats> splitTypeStatsMap =
new EnumMap<SplitType, SplitTypeStats>(SplitType.class);
// Split statistics types
private enum SplitType {
SINGLE_BLOCK_LOCAL,
NODE_LOCAL, NODE_LOCAL_LEFTOVER,
RACK_LOCAL, RACK_LOCAL_LEFTOVER,
OVERFLOW, OVERFLOW_LEFTOVER, ALL
}
/** Are the split types stats valid? */
private boolean isSplitTypeStatsValid = true;
/**
* Get whether the type stats are valid. Used for testing.
*
* @return true if the type stats are valid, false otherwise
*/
public boolean isTypeStatsValid() {
return isSplitTypeStatsValid;
}
/**
* Stats associated with a split type
*/
private class SplitTypeStats {
private int totalSplitCount = 0;
private long totalSize = 0;
private long totalBlockCount = 0;
private long totalHostCount = 0;
/**
* Add a split for this type
* @param splitSize Size of the split
* @param hostCount Hosts listed for this split
* @param blockCount Blocks in this split
*/
public void addSplit(long splitSize, long hostCount, long blockCount) {
++totalSplitCount;
totalSize += splitSize;
totalBlockCount += blockCount;
totalHostCount += hostCount;
}
public int getTotalSplitCount() {
return totalSplitCount;
}
public long getTotalSize() {
return totalSize;
}
public long getTotalHostCount() {
return totalHostCount;
}
public long getTotalBlockCount() {
return totalBlockCount;
}
}
/**
* Add stats for a split type (i.e node local splits,
* rack local splits, etc.) and keep a total count.
* @param splitSize Size of the split
* @param hostCount Hosts listed for this split
* @param blockCount Blocks in this split
*/
private void addStatsForSplitType(
SplitType splitType, long splitSize, long hostCount, long blockCount) {
SplitTypeStats splitTypeStats = splitTypeStatsMap.get(splitType);
if (splitTypeStats == null) {
splitTypeStats = new SplitTypeStats();
splitTypeStatsMap.put(splitType, splitTypeStats);
}
splitTypeStats.addSplit(splitSize, hostCount, blockCount);
// Add all splits to the ALL split type
if (splitType != SplitType.ALL) {
addStatsForSplitType(SplitType.ALL, splitSize, hostCount, blockCount);
}
}
/**
* Get stats for every split type as a string
* @return String of all split type stats
*/
private String getStatsString() {
SplitTypeStats allTypeStats = splitTypeStatsMap.get(SplitType.ALL);
Map<String, Map<String, Number>> statsMapMap =
new HashMap<String, Map<String, Number>>();
for (Map.Entry<SplitType, SplitTypeStats> entry :
splitTypeStatsMap.entrySet()) {
Map<String, Number> statsMap = new HashMap<String, Number>();
statsMapMap.put(entry.getKey().toString(), statsMap);
float percentTotalSplitCount =
(100f * entry.getValue().getTotalSplitCount()) /
allTypeStats.getTotalSplitCount();
float percentTotalSize =
(100f * entry.getValue().getTotalSize()) /
allTypeStats.getTotalSize();
float percentTotalBlockCount =
(100f * entry.getValue().getTotalBlockCount()) /
allTypeStats.getTotalBlockCount();
float averageSizePerSplit =
((float) entry.getValue().getTotalSize()) /
entry.getValue().getTotalSplitCount();
float averageHostCountPerSplit =
((float) entry.getValue().getTotalHostCount()) /
entry.getValue().getTotalSplitCount();
float averageBlockCountPerSplit =
((float) entry.getValue().getTotalBlockCount()) /
entry.getValue().getTotalSplitCount();
statsMap.put("totalSplitCount", entry.getValue().getTotalSplitCount());
statsMap.put("percentTotalSplitCount", percentTotalSplitCount);
statsMap.put("totalSize", entry.getValue().getTotalSize());
statsMap.put("percentTotalSize", percentTotalSize);
statsMap.put("averageSizePerSplit", averageSizePerSplit);
statsMap.put("totalHostCount", entry.getValue().getTotalHostCount());
statsMap.put("averageHostCountPerSplit", averageHostCountPerSplit);
statsMap.put("totalBlockCount", entry.getValue().getTotalBlockCount());
statsMap.put("percentTotalBlockCount", percentTotalBlockCount);
statsMap.put("averageBlockCountPerSplit", averageBlockCountPerSplit);
}
return JSON.toString(statsMapMap);
}
/**
* Specify the maximum size (in bytes) of each split. Each split is
* approximately equal to the specified size.
*/
protected void setMaxSplitSize(long maxSplitSize) {
this.maxSplitSize = maxSplitSize;
}
/**
* Specify the maximum number of blocks in each split.
*/
protected void setMaxNumBlocksPerSplit(long maxNumBlocksPerSplit) {
this.maxNumBlocksPerSplit = maxNumBlocksPerSplit;
}
/**
* Specify the minimum size (in bytes) of each split per node.
* This applies to data that is left over after combining data on a single
* node into splits that are of maximum size specified by maxSplitSize.
* This leftover data will be combined into its own split if its size
* exceeds minSplitSizeNode.
*/
protected void setMinSplitSizeNode(long minSplitSizeNode) {
this.minSplitSizeNode = minSplitSizeNode;
}
/**
* Specify the minimum size (in bytes) of each split per rack.
* This applies to data that is left over after combining data on a single
* rack into splits that are of maximum size specified by maxSplitSize.
* This leftover data will be combined into its own split if its size
* exceeds minSplitSizeRack.
*/
protected void setMinSplitSizeRack(long minSplitSizeRack) {
this.minSplitSizeRack = minSplitSizeRack;
}
/**
* Create a new pool and add the filters to it.
* A split cannot have files from different pools.
*/
protected void createPool(JobConf conf, List<PathFilter> filters) {
pools.add(new MultiPathFilter(filters));
}
/**
* Create a new pool and add the filters to it.
* A pathname can satisfy any one of the specified filters.
* A split cannot have files from different pools.
*/
protected void createPool(JobConf conf, PathFilter... filters) {
MultiPathFilter multi = new MultiPathFilter();
for (PathFilter f: filters) {
multi.add(f);
}
pools.add(multi);
}
private CompressionCodecFactory compressionCodecs =
new CompressionCodecFactory(new JobConf());
@Override
protected boolean isSplitable(FileSystem ignored, Path file) {
return compressionCodecs.getCodec(file) == null;
}
/**
* default constructor
*/
public CombineFileInputFormat() {
// Add the all stats, in case of no splits
splitTypeStatsMap.put(SplitType.ALL, new SplitTypeStats());
}
/**
*
* @param pathOnly If true, pass only the path component of input paths (i.e.
* strip out the scheme and authority) to the pool filters
*/
protected void setPoolFilterPathOnly(boolean pathOnly) {
poolFilterPathOnly = pathOnly;
}
protected boolean getPoolFilterPathOnly() {
return poolFilterPathOnly;
}
@Override
public InputSplit[] getSplits(JobConf job, int numSplits)
throws IOException {
long minSizeNode = 0;
long minSizeRack = 0;
long maxSize = 0;
long maxNumBlocks = 0;
// the values specified by setxxxSplitSize() takes precedence over the
// values that might have been specified in the config
if (minSplitSizeNode != 0) {
minSizeNode = minSplitSizeNode;
} else {
minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
}
if (maxNumBlocksPerSplit != 0) {
maxNumBlocks = maxNumBlocksPerSplit;
} else {
maxNumBlocks = job.getLong("mapred.max.num.blocks.per.split", 0);
}
if (minSplitSizeRack != 0) {
minSizeRack = minSplitSizeRack;
} else {
minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
}
if (maxSplitSize != 0) {
maxSize = maxSplitSize;
} else {
maxSize = job.getLong("mapred.max.split.size", 0);
}
if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
throw new IOException("Minimum split size pernode " + minSizeNode +
" cannot be larger than maximum split size " +
maxSize);
}
if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
throw new IOException("Minimum split size per rack " + minSizeRack +
" cannot be larger than maximum split size " +
maxSize);
}
if (minSizeRack != 0 && minSizeNode > minSizeRack) {
throw new IOException("Minimum split size per rack " + minSizeRack +
" cannot be smaller than minimum split size per node " +
minSizeNode);
}
// all the files in input set
LocatedFileStatus[] stats = listLocatedStatus(job);
long totalLen = 0;
for (LocatedFileStatus stat: stats) {
totalLen += stat.getLen();
}
List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
if (stats.length == 0) {
return splits.toArray(new CombineFileSplit[splits.size()]);
}
// Put them into a list for easier removal during iteration
Collection<LocatedFileStatus> newstats = new LinkedList<LocatedFileStatus>();
Collections.addAll(newstats, stats);
stats = null;
// In one single iteration, process all the paths in a single pool.
// Processing one pool at a time ensures that a split contains paths
// from a single pool only.
for (MultiPathFilter onepool : pools) {
ArrayList<LocatedFileStatus> myStats = new ArrayList<LocatedFileStatus>();
// pick one input path. If it matches all the filters in a pool,
// add it to the output set
for (Iterator<LocatedFileStatus> iter = newstats.iterator();
iter.hasNext();) {
LocatedFileStatus stat = iter.next();
if (onepool.accept(stat.getPath(), poolFilterPathOnly)) {
myStats.add(stat); // add it to my output set
iter.remove();
}
}
// create splits for all files in this pool.
getMoreSplits(job, myStats,
maxSize, minSizeNode, minSizeRack, maxNumBlocks, splits);
}
// create splits for all files that are not in any pool.
getMoreSplits(job, newstats,
maxSize, minSizeNode, minSizeRack, maxNumBlocks, splits);
// free up rackToNodes map
rackToNodes.clear();
verifySplits(job, totalLen, splits);
// Print the stats of the splits to the special json metrics log for easier
// parsing. Also, clean up the stats after each getSplits() call since
// others may call it multiple times (i.e. CombineHiveInputFormat)
JSON_METRICS_LOG.info(getStatsString());
splitTypeStatsMap.clear();
return splits.toArray(new CombineFileSplit[splits.size()]);
}
private void verifySplits(
JobConf conf, long totalLen, List<CombineFileSplit> splits)
throws IOException {
if (!conf.getBoolean("mapred.fileinputformat.verifysplits", true)) {
return;
}
long totalSplitLen = 0;
for (CombineFileSplit split: splits) {
totalSplitLen += split.getLength();
}
if (totalLen != totalSplitLen) {
throw new IOException("Total length expected is " + totalLen +
", but total split length is " + totalSplitLen);
}
if (splitTypeStatsMap.get(SplitType.ALL).getTotalSize() != totalSplitLen) {
LOG.error("Total length expected is " + totalLen +
", but total split length according to stats is " +
splitTypeStatsMap.get(SplitType.ALL).getTotalSize() +
", previous isSplitTypeStatsValid = " + isSplitTypeStatsValid);
isSplitTypeStatsValid = false;
}
}
/**
* Comparator to be used with sortBlocksBySize to sort from largest to
* smallest.
*/
private class OneBlockInfoSizeComparator implements Comparator<OneBlockInfo> {
@Override
public int compare(OneBlockInfo left, OneBlockInfo right) {
return (int) (right.length - left.length);
}
}
/**
* Sort the blocks on each node by size, largest to smallest
*
* @param nodeToBlocks Map of nodes to all blocks on that node
*/
private void sortBlocksBySize(Map<String, List<OneBlockInfo>> nodeToBlocks) {
OneBlockInfoSizeComparator comparator = new OneBlockInfoSizeComparator();
for (Entry<String, List<OneBlockInfo>> entry : nodeToBlocks.entrySet()) {
Collections.sort(entry.getValue(), comparator);
}
}
/**
* Return all the splits in the specified set of paths
*/
private void getMoreSplits(JobConf job, Collection<LocatedFileStatus> stats,
long maxSize, long minSizeNode,
long minSizeRack, long maxNumBlocksPerSplit,
List<CombineFileSplit> splits)
throws IOException {
// all blocks for all the files in input set
OneFileInfo[] files;
// mapping from a rack name to the list of blocks it has
HashMap<String, List<OneBlockInfo>> rackToBlocks =
new HashMap<String, List<OneBlockInfo>>();
// mapping from a block to the nodes on which it has replicas
HashMap<OneBlockInfo, String[]> blockToNodes =
new HashMap<OneBlockInfo, String[]>();
// mapping from a node to the list of blocks that it contains
HashMap<String, List<OneBlockInfo>> nodeToBlocks =
new HashMap<String, List<OneBlockInfo>>();
if (stats.isEmpty()) {
return;
}
files = new OneFileInfo[stats.size()];
// populate all the blocks for all files
long totLength = 0;
int fileIndex = 0;
for (LocatedFileStatus oneStatus : stats) {
files[fileIndex] = new OneFileInfo(oneStatus, job,
isSplitable(FileSystem.get(job), oneStatus.getPath()),
rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes, maxSize);
totLength += files[fileIndex].getLength();
fileIndex++;
}
// Sort the blocks on each node from biggest to smallest by size to
// encourage more node-local single block splits
sortBlocksBySize(nodeToBlocks);
ArrayList<OneBlockInfo> validBlocks = new ArrayList<OneBlockInfo>();
Set<String> nodes = new HashSet<String>();
long curSplitSize = 0;
// process all nodes and create splits that are local
// to a node.
for (Iterator<Map.Entry<String,
List<OneBlockInfo>>> iter = nodeToBlocks.entrySet().iterator();
iter.hasNext();) {
Map.Entry<String, List<OneBlockInfo>> one = iter.next();
nodes.add(one.getKey());
List<OneBlockInfo> blocksInNode = one.getValue();
// for each block, copy it into validBlocks. Delete it from
// blockToNodes so that the same block does not appear in
// two different splits.
for (OneBlockInfo oneblock : blocksInNode) {
if (blockToNodes.containsKey(oneblock)) {
validBlocks.add(oneblock);
blockToNodes.remove(oneblock);
curSplitSize += oneblock.length;
// if the accumulated split size exceeds the maximum, then
// create this split.
if ((maxSize != 0 && curSplitSize >= maxSize) ||
(maxNumBlocksPerSplit > 0 && validBlocks.size() >= maxNumBlocksPerSplit)) {
// create an input split and add it to the splits array
// if only one block, add all the node replicas
if (validBlocks.size() == 1) {
Set<String> blockLocalNodes =
new HashSet<String>(Arrays.asList(validBlocks.get(0).hosts));
addCreatedSplit(job, splits, blockLocalNodes, validBlocks);
addStatsForSplitType(SplitType.SINGLE_BLOCK_LOCAL, curSplitSize,
blockLocalNodes.size(), validBlocks.size());
} else {
addCreatedSplit(job, splits, nodes, validBlocks);
addStatsForSplitType(SplitType.NODE_LOCAL, curSplitSize,
nodes.size(), validBlocks.size());
}
curSplitSize = 0;
validBlocks.clear();
}
}
}
// if there were any blocks left over and their combined size is
// larger than minSplitNode, then combine them into one split.
// Otherwise add them back to the unprocessed pool. It is likely
// that they will be combined with other blocks from the same rack later on.
if (minSizeNode != 0 && curSplitSize >= minSizeNode) {
// create an input split and add it to the splits array
addCreatedSplit(job, splits, nodes, validBlocks);
addStatsForSplitType(SplitType.NODE_LOCAL_LEFTOVER, curSplitSize,
nodes.size(), validBlocks.size());
} else {
for (OneBlockInfo oneblock : validBlocks) {
blockToNodes.put(oneblock, oneblock.hosts);
}
}
validBlocks.clear();
nodes.clear();
curSplitSize = 0;
}
// if blocks in a rack are below the specified minimum size, then keep them
// in 'overflow'. After the processing of all racks is complete, these overflow
// blocks will be combined into splits.
ArrayList<OneBlockInfo> overflowBlocks = new ArrayList<OneBlockInfo>();
Set<String> racks = new HashSet<String>();
// Process all racks over and over again until there is no more work to do.
boolean noRacksMadeSplit = false;
while (blockToNodes.size() > 0) {
// Create one split for this rack before moving over to the next rack.
// Come back to this rack after creating a single split for each of the
// remaining racks.
// Process one rack location at a time, Combine all possible blocks that
// reside on this rack as one split. (constrained by minimum and maximum
// split size).
// Iterate over all racks. Add to the overflow blocks only if at least
// one pass over all the racks was completed without adding any splits
long splitsAddedOnAllRacks = 0;
for (Iterator<Map.Entry<String, List<OneBlockInfo>>> iter =
rackToBlocks.entrySet().iterator(); iter.hasNext();) {
Map.Entry<String, List<OneBlockInfo>> one = iter.next();
racks.add(one.getKey());
List<OneBlockInfo> blocks = one.getValue();
// for each block, copy it into validBlocks. Delete it from
// blockToNodes so that the same block does not appear in
// two different splits.
boolean createdSplit = false;
for (OneBlockInfo oneblock : blocks) {
if (blockToNodes.containsKey(oneblock)) {
validBlocks.add(oneblock);
blockToNodes.remove(oneblock);
curSplitSize += oneblock.length;
// if the accumulated split size exceeds the maximum, then
// create this split.
if ((maxSize != 0 && curSplitSize >= maxSize) ||
(maxNumBlocksPerSplit > 0 && validBlocks.size() >= maxNumBlocksPerSplit)) {
// create an input split and add it to the splits array
addCreatedSplit(job, splits, getHosts(racks), validBlocks);
addStatsForSplitType(SplitType.RACK_LOCAL, curSplitSize,
getHosts(racks).size(), validBlocks.size());
createdSplit = true;
++splitsAddedOnAllRacks;
break;
}
}
}
// if we created a split, then just go to the next rack
if (createdSplit) {
curSplitSize = 0;
validBlocks.clear();
racks.clear();
continue;
}
if (!validBlocks.isEmpty()) {
if (minSizeRack != 0 && curSplitSize >= minSizeRack) {
// if there is a mimimum size specified, then create a single split
// otherwise, store these blocks into overflow data structure
addCreatedSplit(job, splits, getHosts(racks), validBlocks);
addStatsForSplitType(SplitType.RACK_LOCAL_LEFTOVER, curSplitSize,
getHosts(racks).size(), validBlocks.size());
++splitsAddedOnAllRacks;
} else if (!noRacksMadeSplit) {
// Add the blocks back if a pass on all rack found at least one
// split or this is the first pass
for (OneBlockInfo oneblock : validBlocks) {
blockToNodes.put(oneblock, oneblock.hosts);
}
} else {
// There were a few blocks in this rack that remained to be processed.
// Keep them in 'overflow' block list. These will be combined later.
overflowBlocks.addAll(validBlocks);
}
}
curSplitSize = 0;
validBlocks.clear();
racks.clear();
}
if (splitsAddedOnAllRacks == 0) {
noRacksMadeSplit = true;
}
}
assert blockToNodes.isEmpty();
assert curSplitSize == 0;
assert validBlocks.isEmpty();
assert racks.isEmpty();
// Process all overflow blocks
for (OneBlockInfo oneblock : overflowBlocks) {
validBlocks.add(oneblock);
curSplitSize += oneblock.length;
// This might cause an exiting rack location to be re-added,
// but it should be OK because racks is a Set.
for (int i = 0; i < oneblock.racks.length; i++) {
racks.add(oneblock.racks[i]);
}
// if the accumulated split size exceeds the maximum, then
// create this split.
if ((maxSize != 0 && curSplitSize >= maxSize) ||
(maxNumBlocksPerSplit > 0 && validBlocks.size() >= maxNumBlocksPerSplit)) {
// create an input split and add it to the splits array
addCreatedSplit(job, splits, getHosts(racks), validBlocks);
addStatsForSplitType(SplitType.OVERFLOW, curSplitSize,
getHosts(racks).size(), validBlocks.size());
curSplitSize = 0;
validBlocks.clear();
racks.clear();
}
}
// Process any remaining blocks, if any.
if (!validBlocks.isEmpty()) {
addCreatedSplit(job, splits, getHosts(racks), validBlocks);
addStatsForSplitType(SplitType.OVERFLOW_LEFTOVER, curSplitSize,
getHosts(racks).size(), validBlocks.size());
}
}
/**
* Create a single split from the list of blocks specified in validBlocks
* Add this new split into splitList.
*/
private void addCreatedSplit(JobConf job,
List<CombineFileSplit> splitList,
Collection<String> locations,
ArrayList<OneBlockInfo> validBlocks) {
// create an input split
Path[] fl = new Path[validBlocks.size()];
long[] offset = new long[validBlocks.size()];
long[] length = new long[validBlocks.size()];
for (int i = 0; i < validBlocks.size(); i++) {
fl[i] = validBlocks.get(i).onepath;
offset[i] = validBlocks.get(i).offset;
length[i] = validBlocks.get(i).length;
}
// add this split to the list that is returned
CombineFileSplit thissplit = new CombineFileSplit(job, fl, offset,
length, locations.toArray(new String[locations.size()]));
splitList.add(thissplit);
}
/**
* This is not implemented yet.
*/
public abstract RecordReader<K, V> getRecordReader(InputSplit split,
JobConf job, Reporter reporter)
throws IOException;
/**
* information about one file from the File System
*/
private static class OneFileInfo {
private long fileSize; // size of the file
private OneBlockInfo[] blocks; // all blocks in this file
OneFileInfo(LocatedFileStatus stat, JobConf job,
boolean isSplitable,
HashMap<String, List<OneBlockInfo>> rackToBlocks,
HashMap<OneBlockInfo, String[]> blockToNodes,
HashMap<String, List<OneBlockInfo>> nodeToBlocks,
HashMap<String, Set<String>> rackToNodes,
long maxSize)
throws IOException {
this.fileSize = 0;
// get block locations from file system
BlockLocation[] locations = stat.getBlockLocations();
// create a list of all block and their locations
if (locations == null || locations.length == 0) {
blocks = new OneBlockInfo[0];
} else {
if (!isSplitable) {
// if the file is not splitable, just create the one block with
// full file length
blocks = new OneBlockInfo[1];
fileSize = stat.getLen();
blocks[0] = new OneBlockInfo(stat.getPath(), 0, fileSize, locations[0]
.getHosts(), locations[0].getTopologyPaths());
} else {
ArrayList<OneBlockInfo> blocksList = new ArrayList<OneBlockInfo>(locations.length);
for (int i = 0; i < locations.length; i++) {
fileSize += locations[i].getLength();
// each split can be a maximum of maxSize
long left = locations[i].getLength();
long myOffset = locations[i].getOffset();
long myLength = 0;
while (left > 0) {
if (maxSize == 0) {
myLength = left;
} else {
if (left > maxSize && left < 2 * maxSize) {
// if remainder is between max and 2*max - then
// instead of creating splits of size max, left-max we
// create splits of size left/2 and left/2.
myLength = left / 2;
} else {
myLength = Math.min(maxSize, left);
}
}
OneBlockInfo oneblock = new OneBlockInfo(stat.getPath(),
myOffset,
myLength,
locations[i].getHosts(),
locations[i].getTopologyPaths());
left -= myLength;
myOffset += myLength;
blocksList.add(oneblock);
}
}
blocks = blocksList.toArray(new OneBlockInfo[blocksList.size()]);
}
for (OneBlockInfo oneblock : blocks) {
// add this block to the block --> node locations map
blockToNodes.put(oneblock, oneblock.hosts);
// For blocks that do not have host/rack information,
// assign to default rack.
String[] racks = null;
if (oneblock.hosts.length == 0) {
racks = new String[]{NetworkTopology.DEFAULT_RACK};
} else {
racks = oneblock.racks;
}
// add this block to the rack --> block map
for (int j = 0; j < racks.length; j++) {
String rack = racks[j];
List<OneBlockInfo> blklist = rackToBlocks.get(rack);
if (blklist == null) {
blklist = new ArrayList<OneBlockInfo>();
rackToBlocks.put(rack, blklist);
}
blklist.add(oneblock);
if (!racks[j].equals(NetworkTopology.DEFAULT_RACK)) {
// Add this host to rackToNodes map
addHostToRack(rackToNodes, racks[j], oneblock.hosts[j]);
}
}
// add this block to the node --> block map
for (int j = 0; j < oneblock.hosts.length; j++) {
String node = oneblock.hosts[j];
List<OneBlockInfo> blklist = nodeToBlocks.get(node);
if (blklist == null) {
blklist = new ArrayList<OneBlockInfo>();
nodeToBlocks.put(node, blklist);
}
blklist.add(oneblock);
}
}
}
}
long getLength() {
return fileSize;
}
OneBlockInfo[] getBlocks() {
return blocks;
}
}
/**
* information about one block from the File System
*/
private static class OneBlockInfo {
Path onepath; // name of this file
long offset; // offset in file
long length; // length of this block
String[] hosts; // nodes on whch this block resides
String[] racks; // network topology of hosts
OneBlockInfo(Path path, long offset, long len,
String[] hosts, String[] topologyPaths) {
this.onepath = path;
this.offset = offset;
this.hosts = hosts;
this.length = len;
assert (hosts.length == topologyPaths.length ||
topologyPaths.length == 0);
// if the file ystem does not have any rack information, then
// use dummy rack location.
if (topologyPaths.length == 0) {
topologyPaths = new String[hosts.length];
for (int i = 0; i < topologyPaths.length; i++) {
topologyPaths[i] = (new NodeBase(hosts[i], NetworkTopology.DEFAULT_RACK)).
toString();
}
}
// The topology paths have the host name included as the last
// component. Strip it.
this.racks = new String[topologyPaths.length];
for (int i = 0; i < topologyPaths.length; i++) {
this.racks[i] = (new NodeBase(topologyPaths[i])).getNetworkLocation();
}
}
}
private static void addHostToRack(HashMap<String, Set<String>> rackToNodes,
String rack, String host) {
Set<String> hosts = rackToNodes.get(rack);
if (hosts == null) {
hosts = new HashSet<String>();
rackToNodes.put(rack, hosts);
}
hosts.add(host);
}
private Set<String> getHosts(Set<String> racks) {
Set<String> hosts = new HashSet<String>();
for (String rack : racks) {
if (rackToNodes.containsKey(rack)) {
hosts.addAll(rackToNodes.get(rack));
}
}
return hosts;
}
/**
* Accept a path only if any one of filters given in the
* constructor do.
*/
private static class MultiPathFilter implements PathFilter {
private List<PathFilter> filters;
public MultiPathFilter() {
this.filters = new ArrayList<PathFilter>();
}
public MultiPathFilter(List<PathFilter> filters) {
this.filters = filters;
}
public void add(PathFilter one) {
filters.add(one);
}
public boolean accept(Path path) {
for (PathFilter filter : filters) {
if (filter.accept(path)) {
return true;
}
}
return false;
}
/**
*
* @param path
* @param pathOnly whether to strip out the scheme/authority before passing
* to the constituent filters
* @return whether the path matches all of the filters
*/
public boolean accept(Path path, boolean pathOnly) {
Path pathToCheck = path;
if (pathOnly) {
pathToCheck = new Path(path.toUri().getPath());
}
return accept(pathToCheck);
}
public String toString() {
StringBuffer buf = new StringBuffer();
buf.append("[");
for (PathFilter f: filters) {
buf.append(f);
buf.append(",");
}
buf.append("]");
return buf.toString();
}
}
}