/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode;
import org.apache.commons.logging.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.net.DNSToSwitchMapping;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.HostsFileReader;
import org.apache.hadoop.raid.Codec;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicyDefault;
import java.io.IOException;
import java.util.*;
import org.apache.commons.lang.ArrayUtils;
/**
* This block placement policy tries (best effort) to the following:
*
* If the file is under the staging directory (a specially named directory)
* then all blocks of the file is kept on the same host. Additionally, all
* the raid blocks (if any) for the same file is also kept on the same host.
*
* If the file is not under the staging directory then blocks are put in such
* a way that all blocks within the same stripe end up on random hosts in
* different racks. For example, the 10 data blocks and 4 parity blocks in a
* stripe should end up in different racks.
*/
public class FaultTolerantBlockPlacementPolicy extends BlockPlacementPolicyRaid {
  // Maximum stripe length across all accepted codecs; set by initParityConfigs.
  private int stripeLen;
  // Root of the staging directory ("dfs.f4.staging", default "/staging").
  private String stagingDir;
  // Root of the local directory ("dfs.f4.local", default "/local").
  private String localDir;
  private FSNamesystem namesystem = null;
  // Whether datanode load is considered when judging a target
  // ("dfs.replication.considerLoad", default true).
  private boolean considerLoad;
  // Codecs accepted via "dfs.f4.accepted.codecs" (default "rs,xor").
  private List<Codec> acceptedCodecs = new ArrayList<Codec>();
  // Racks/hosts forced to be treated as bad; used by unit tests only
  // (see setBadHostsAndRacks).
  private static Set<String> badRacks = new HashSet<String>();
  private static Set<String> badHosts = new HashSet<String>();
  /**
   * Constructs and initializes the policy without hosts-file, DNS mapping,
   * or namesystem references (they are passed as null).
   */
  FaultTolerantBlockPlacementPolicy(Configuration conf,
                                    FSClusterStats stats,
                                    NetworkTopology clusterMap) {
    initialize(conf, stats, clusterMap, null, null, null);
  }
  /** Default constructor; initialize() must be called before use. */
  FaultTolerantBlockPlacementPolicy() {
  }
/** A function to be used by unit tests only */
public static void setBadHostsAndRacks(Set<String> racks,
Set<String> hosts) {
badRacks = racks;
badHosts = hosts;
}
/** {@inheritDoc} */
public void initialize(Configuration conf,
FSClusterStats stats,
NetworkTopology clusterMap,
HostsFileReader hostsReader,
DNSToSwitchMapping dnsToSwitchMapping,
FSNamesystem ns) {
super.initialize(
conf, stats, clusterMap, hostsReader, dnsToSwitchMapping, ns);
this.namesystem = ns;
// Default
this.stripeLen = 0;
this.considerLoad = conf.getBoolean("dfs.replication.considerLoad", true);
FSNamesystem.LOG.info("F4: Block placement will consider load: "
+ this.considerLoad);
initParityConfigs();
this.stagingDir = conf.get("dfs.f4.staging", "/staging");
this.localDir = conf.get("dfs.f4.local", "/local");
}
/**
* This function initializes configuration for the supported parities.
*
* Currently, we support RS and XOR. Those two can have different
* configurations individually. Respective configurations will be used when
* placing the parity files. There is one exception. The stripe length is
* calculated based on the maximum of the stripe lengths of the individual
* parities.
*/
private void initParityConfigs() {
Set<String> acceptedCodecIds = new HashSet<String>();
for (String s : conf.get("dfs.f4.accepted.codecs", "rs,xor").split(",")) {
acceptedCodecIds.add(s);
}
for (Codec c : Codec.getCodecs()) {
if (acceptedCodecIds.contains(c.id)) {
FSNamesystem.LOG.info("F4: Parity info."
+ " Id: " + c.id
+ " Parity Length: " + c.parityLength
+ " Parity Stripe Length: " + c.stripeLength
+ " Parity directory: " + c.parityDirectory
+ " Parity temp directory: " + c.tmpParityDirectory);
acceptedCodecs.add(c);
if (c.stripeLength > this.stripeLen) {
// Use the max stripe length
this.stripeLen = c.stripeLength;
}
}
}
FSNamesystem.LOG.info("F4: Initialized stripe len to: " + this.stripeLen);
}
private Codec getCodec(String fileName) {
for (Codec c : this.acceptedCodecs) {
// This should be "/raidrs/" or /"raid/". If any of these two is
// is present in the file path, we will assume that is the parity type.
String uniqueSubtringId = c.parityDirectory + "/";
if (fileName.contains(uniqueSubtringId)) {
return c;
}
}
Codec c = this.acceptedCodecs.get(0);
FSNamesystem.LOG.error("F4: Could not find any valid codec for the file: "
+ fileName + ", hence returning the first one: " + c.id);
return c;
}
private String getParityStagingDir(String parityFileName) {
Codec c = getCodec(parityFileName);
return c.parityDirectory + this.stagingDir;
}
private boolean isStaging(String fileName) {
return fileName.startsWith(this.stagingDir) ||
fileName.startsWith(this.getParityStagingDir(fileName));
}
  /** True if the file lives under the "local" directory (dfs.f4.local). */
  private boolean isLocal(String fileName) {
    return fileName.startsWith(this.localDir);
  }
  /**
   * {@inheritDoc}
   *
   * Delegates to chooseTargetF4 with no excluded nodes.
   */
  @Override
  public DatanodeDescriptor[] chooseTarget(
      String srcPath,
      int numOfReplicas,
      DatanodeDescriptor writer,
      List<DatanodeDescriptor> chosenNodes,
      long blocksize) {
    return chooseTargetF4(
        srcPath, numOfReplicas, writer, chosenNodes, null, blocksize);
  }
/**
* This function finds a node where to place a block of a file under the
* "local" directory. The basic idea is to have as few locations (preferably
* one, and preferably on the writer node)
*
* 1) Choose a node that contains one of the blocks in the blocks argument.
* 2) If there are multiple such nodes, choose one of them (in some order).
* 3) If this is the first block, then choose the the writer node.
* 4) If the writer node is not good, choose a random node within the same
* rack as the writer node.
* 5) If the writer node is null or if all of the above tries fail, then
* just choose based on the the parent class's policy.
*
* @param fileName The name of the file for which the block is to be
* placed.
* @param writer The writer node.
* @param blocks The block locations that are to be used as reference
* for placing the current block. For a data file, it
* is the blocks for that file itself. For a raid file,
* it is the blocks of the source file.
* @param chosenNodes @see chooseTarget
* @param excludedNodes @see chooseTarget
* @param blocksize @see chooseTarget
*/
private DatanodeDescriptor[] chooseLocalTarget(
String fileName,
DatanodeDescriptor writer,
LocatedBlocks blocks,
List<Node> excludedNodes,
List<DatanodeDescriptor> chosenNodes,
long blocksize) throws IOException, NotEnoughReplicasException {
// First try the same node as the one where other blocks reside.
HashMap<String, DatanodeInfo> hostMap =
new HashMap<String, DatanodeInfo>();
for (LocatedBlock b : blocks.getLocatedBlocks()) {
for (DatanodeInfo i : b.getLocations()) {
hostMap.put(i.getNetworkLocation() + "/" + i.getName(), i);
}
}
for (Map.Entry<String, DatanodeInfo> entry : hostMap.entrySet()) {
DatanodeDescriptor result = null;
DatanodeInfo i = entry.getValue();
result = new DatanodeDescriptor(i,
i.getNetworkLocation(),
i.getHostName(),
i.getCapacity(),
i.getDfsUsed(),
i.getRemaining(),
i.getNamespaceUsed(),
i.getXceiverCount());
if (this.isGoodTarget(result,
blocksize,
Integer.MAX_VALUE,
this.considerLoad,
new ArrayList<DatanodeDescriptor>())) {
// I dont care about per rack load.
DatanodeDescriptor[] r = {result};
return r;
}
}
// Try something in the same rack as the writer.
if (writer == null) {
return super.chooseTarget(
fileName, 1, writer, chosenNodes, excludedNodes, blocksize);
} else if (this.isGoodTarget(writer,
blocksize,
Integer.MAX_VALUE,
this.considerLoad,
new ArrayList<DatanodeDescriptor>())) {
DatanodeDescriptor[] r = {writer};
return r;
}
HashMap<Node, Node> exclNodes = new HashMap<Node, Node>();
for (Node n : excludedNodes) {
exclNodes.put(n, n);
}
List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>();
chooseRandom(
1, writer.getNetworkLocation(), exclNodes, blocksize, 1, results);
return results.toArray(new DatanodeDescriptor[results.size()]);
}
/// A helper function that says some hosts are bad based on test config.
@Override
protected boolean isGoodTarget(DatanodeDescriptor node,
long blockSize,
int maxPerRack,
boolean considerLoad,
List<DatanodeDescriptor> results) {
if (badRacks.contains(node.getNetworkLocation()) ||
badHosts.contains(node.getName())) {
return false;
}
return super.isGoodTarget(
node, blockSize, maxPerRack, considerLoad, results);
}
  /**
   * {@inheritDoc}
   *
   * Delegates to chooseTargetF4, forwarding the excluded-node list.
   */
  @Override
  public DatanodeDescriptor[] chooseTarget(
      String srcInode,
      int numOfReplicas,
      DatanodeDescriptor writer,
      List<DatanodeDescriptor> chosenNodes,
      List<Node> excludesNodes,
      long blocksize) {
    return chooseTargetF4(
        srcInode, numOfReplicas, writer, chosenNodes, excludesNodes, blocksize);
  }
private String getSourceFileFromParity(String fileName,
FileInfo info)
throws IOException {
NameWithINode nameWithINode;
switch (info.type) {
case PARITY:
// We need to support the following cases
// parity = /raidrs/staging/X, source = /X
// parity = /raidrs/X, source = /X
nameWithINode = null;
if (isStaging(fileName)) {
nameWithINode = getSourceFile(fileName,
getParityStagingDir(fileName));
}
if (nameWithINode == null) {
Codec c = getCodec(fileName);
nameWithINode = getSourceFile(fileName, c.parityDirectory);
}
return ((nameWithINode == null) ? null : nameWithINode.name);
case TEMP_PARITY:
Codec c = getCodec(fileName);
nameWithINode = getSourceFile(fileName, c.tmpParityDirectory);
return ((nameWithINode == null) ? null : nameWithINode.name);
default:
FSNamesystem.LOG.error("file type bad");
return null;
}
}
/**
* This is the main driver function that dictates block placement.
*
* This function figures out the kind of file (staging or not, raid or not)
* and invokes the appropriate functions
*/
private DatanodeDescriptor[] chooseTargetF4(
String fileName,
int numOfReplicas,
DatanodeDescriptor writer,
List<DatanodeDescriptor> chosenNodes,
List<Node> exclNodes,
long blocksize) {
FSNamesystem.LOG.info("F4: F4 policy invoked for file: " + fileName +
", with replica count: " + numOfReplicas);
// If replica>1 then just default back to RAID
if (numOfReplicas > 1) {
return super.chooseTarget(
numOfReplicas, writer, chosenNodes, exclNodes, blocksize);
}
FileInfo info;
LocatedBlocks blocks;
int blockIndex = -1;
try {
blocks = this.namesystem.getBlockLocations(fileName, 0, Long.MAX_VALUE);
info = getFileInfo(null, fileName);
blockIndex = blocks.getLocatedBlocks().size();
} catch (IOException e) {
FSNamesystem.LOG.error(
"F4: Error happened when calling getFileInfo/getBlockLocations");
return super.chooseTarget(
fileName, numOfReplicas, writer, chosenNodes, exclNodes, blocksize);
}
FSNamesystem.LOG.info(
"F4: The file: " + fileName + " has a type: " + info.type);
HashMap<String, HashSet<Node>> rackToHosts =
new HashMap<String, HashSet<Node>>();
try {
// First handle the "localdir" case
if (isLocal(fileName)) {
return chooseLocalTarget(fileName,
writer,
blocks,
exclNodes,
chosenNodes,
blocksize);
}
// For a data file, the locations of its own blocks as the reference
int stripeIndex = -1;
String srcFileName = null;
String parityFileName = null;
int parityLength = 0;
int stripeLength = 0;
switch (info.type) {
case NOT_RAID:
case SOURCE:
srcFileName = fileName;
parityFileName = null;
stripeLength = this.stripeLen;
stripeIndex = blockIndex / stripeLength;
break;
case TEMP_PARITY:
case PARITY:
srcFileName = getSourceFileFromParity(fileName, info);
parityFileName = fileName;
if (srcFileName == null ||
this.namesystem.getHdfsFileInfo(srcFileName) == null) {
srcFileName = null;
FSNamesystem.LOG.error("F4: " + srcFileName + " does not exist");
}
Codec c = getCodec(fileName);
parityLength = c.parityLength;
stripeLength = c.stripeLength;
stripeIndex = blockIndex / parityLength;
break;
default:
return super.chooseTarget(
numOfReplicas, writer, chosenNodes, exclNodes, blocksize);
}
rackToHosts = getRackToHostsMapForStripe(srcFileName,
parityFileName,
stripeLength,
parityLength,
stripeIndex);
} catch (IOException e) {
FSNamesystem.LOG.error("F4: Error happened when calling "
+ "getParityFile/getSourceFileFromParity");
return super.chooseTarget(
numOfReplicas, writer, chosenNodes, exclNodes, blocksize);
} catch (NotEnoughReplicasException e) {
FSNamesystem.LOG.error("F4: Error happend when calling "
+ "getCompanionSourceNodes/getSourceFile");
return super.chooseTarget(
numOfReplicas, writer, chosenNodes, exclNodes, blocksize);
}
return chooseTargetOnNewFailureDomain(fileName,
writer,
chosenNodes,
exclNodes,
rackToHosts,
blocksize);
}
// Given a stripe index returns all racks in which the blocks of the stripe
// reside and the hosts within those racks that host those blocks
private HashMap<String, HashSet<Node>> getRackToHostsMapForStripe(
String srcFileName,
String parityFileName,
int stripeLen,
int parityLen,
int stripeIndex) throws IOException {
HashMap<String, HashSet<Node>> rackToHosts =
new HashMap<String, HashSet<Node>>();
if (srcFileName != null) {
rackToHosts = getRackToHostsMapForStripe(srcFileName,
stripeIndex,
stripeLen);
}
if (parityFileName != null) {
HashMap<String, HashSet<Node>> rackToHostsForParity =
getRackToHostsMapForStripe(parityFileName,
stripeIndex,
parityLen);
for (Map.Entry<String, HashSet<Node>> e :
rackToHostsForParity.entrySet()) {
HashSet<Node> nodes = rackToHosts.get(e.getKey());
if (nodes == null) {
nodes = new HashSet<Node>();
rackToHosts.put(e.getKey(), nodes);
}
for (Node n : e.getValue()) {
nodes.add(n);
}
}
}
for (Map.Entry<String, HashSet<Node>> e : rackToHosts.entrySet()) {
if (e.getValue().size() > 1) {
FSNamesystem.LOG.warn("F4: Rack " + e.getKey() +
" being overused for stripe: " + stripeIndex);
}
}
return rackToHosts;
}
private HashMap<String, HashSet<Node>> getRackToHostsMapForStripe(
String src,
int stripeIndex,
int stripeLen) throws IOException {
int sourceStart = stripeIndex * stripeLen;
int sourceEnd = sourceStart + stripeLen;
LocatedBlocks blocks = this.namesystem.getBlockLocations(src,
0,
Long.MAX_VALUE);
List<LocatedBlock> sourceBlocks = blocks.getLocatedBlocks();
sourceEnd = Math.min(sourceEnd, sourceBlocks.size());
HashMap<String, HashSet<Node>> rackNodes =
new HashMap<String, HashSet<Node>>();
if (sourceStart < sourceBlocks.size()) {
for (LocatedBlock b : sourceBlocks.subList(sourceStart, sourceEnd)) {
for (Node n : b.getLocations()) {
String rack = n.getNetworkLocation();
FSNamesystem.LOG.info("F4: Block info for file: " + src
+ ", offset: " + b.getStartOffset() + ", rack: " + rack);
HashSet<Node> nodes = rackNodes.get(rack);
if (nodes == null) {
nodes = new HashSet<Node>();
rackNodes.put(rack, nodes);
}
nodes.add(n);
}
}
}
return rackNodes;
}
  /**
   * This function uses the rackToHosts map (that contains the rack and the
   * corresponding nodes in those racks that contain the relevant blocks).
   *
   * The definition of "relevant blocks" is flexible. It can be used in a
   * variety of contexts. In the F4 placement policy, the relevant blocks
   * are all the peer blocks of the block to be placed. The peer blocks would
   * be all blocks in the raid stripe (data and parity included).
   *
   * It gets the racks that contain the least number of blocks for the stripe.
   * It gets the nodes within those racks and tries one-by-one all such
   * hosts as potential locations for the blocks. The check is based on
   * the host:
   * 1) The host passing the isGoodTarget check.
   * 2) If 1) fails and "considerLoad" is true, then the same check is
   *    done with considerLoad = false.
   * 3) If 2) fails, then a node is chosen randomly while excluding any hosts
   *    that contain a block in the same stripe as the block to be placed.
   */
  private DatanodeDescriptor[] chooseTargetOnNewFailureDomain(
      String fileName,
      DatanodeDescriptor writer,
      List<DatanodeDescriptor> chosenNodes,
      List<Node> exclNodes,
      HashMap<String, HashSet<Node>> rackToHosts,
      long blockSize) {
    HashMap<Node, Node> excludedNodes = new HashMap<Node, Node>();
    // Ensure every rack in the cluster has an entry so that racks with no
    // stripe blocks are candidates with occupancy 0.
    for (String rack : this.clusterMap.getAllRacks()) {
      if (rackToHosts.get(rack) == null) {
        rackToHosts.put(rack, new HashSet<Node>());
      }
    }
    // Get the min occupancy in the racks.
    int minCount = Integer.MAX_VALUE;
    for (Map.Entry<String, HashSet<Node>> entry : rackToHosts.entrySet()) {
      if (entry.getValue().size() < minCount) {
        minCount = entry.getValue().size();
      }
      // DO NOT choose a host that has already been chosen for this stripe.
      for (Node n : entry.getValue()) {
        excludedNodes.put(n, n);
      }
    }
    // Also honor caller-supplied exclusions.
    if (exclNodes != null) {
      for (Node node:exclNodes) {
        excludedNodes.put(node, node);
      }
    }
    // Candidates: all non-excluded datanodes in the least-occupied racks,
    // grouped by rack.
    HashMap<String, HashSet<Node>> candidateNodesByRacks =
        new HashMap<String, HashSet<Node>>();
    for (Map.Entry<String, HashSet<Node>> entry : rackToHosts.entrySet()) {
      if (entry.getValue().size() == minCount) {
        for (Node n : this.clusterMap.getDatanodesInRack(entry.getKey())) {
          if (excludedNodes.get(n) == null) {
            HashSet<Node> candidateNodes =
                candidateNodesByRacks.get(entry.getKey());
            if (candidateNodes == null) {
              candidateNodes = new HashSet<Node>();
              candidateNodesByRacks.put(entry.getKey(), candidateNodes);
            }
            candidateNodes.add(n);
          }
        }
      }
    }
    List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>();
    // Pass 1: honor the configured load setting.
    if (getGoodNode(candidateNodesByRacks,
                    this.considerLoad,
                    blockSize,
                    results)) {
      return results.toArray(new DatanodeDescriptor[results.size()]);
    }
    // Pass 2: retry while ignoring load.
    if (this.considerLoad) {
      FSNamesystem.LOG.info("F4: Retrying without considering load for file: "
          + fileName);
      if (getGoodNode(candidateNodesByRacks, false, blockSize, results)) {
        return results.toArray(new DatanodeDescriptor[results.size()]);
      }
    }
    FSNamesystem.LOG.error("F4: No datanode in a non-overlapping rack for file:"
        + fileName);
    // Final effort to get something. But it will always try to get something
    // that is not a host that contains a peer block (block in the same stripe).
    // We assume that this step should succeed. In this step all nodes in the
    // cluster are available except for at most 13 hosts for placement. So it is
    // highly unlikely that this step would fail.
    try {
      super.chooseRandom(
          1, NodeBase.ROOT, excludedNodes, blockSize, 1, results);
      return results.toArray(new DatanodeDescriptor[results.size()]);
    } catch (Exception e) {
      FSNamesystem.LOG.error("F4: Could not find a data node using "
          + "the normal F4 policy. Switching to default of parent");
      return super.chooseTarget(fileName, 1, writer,
                                chosenNodes, null, blockSize);
    }
  }
private class RackComparator
implements Comparator<Map.Entry<String, HashSet<Node>>> {
public RackComparator(long blockSize) {
this.blockSize = blockSize;
}
public int compare(Map.Entry<String, HashSet<Node>> o1,
Map.Entry<String, HashSet<Node>> o2) {
long ret = 0;
for (Node node : o1.getValue()) {
DatanodeDescriptor n = (DatanodeDescriptor)node;
ret += (n.getRemaining() - (n.getBlocksScheduled() * this.blockSize));
}
for (Node node : o2.getValue()) {
DatanodeDescriptor n = (DatanodeDescriptor)node;
ret -= (n.getRemaining() - (n.getBlocksScheduled() * this.blockSize));
}
return ret == 0 ? 0 : (ret > 0) ? -1 : 1;
}
private long blockSize;
}
// Helper function to choose less occupied racks first.
private boolean getGoodNode(
HashMap<String, HashSet<Node>> candidateNodesByRacks,
boolean considerLoad,
long blockSize,
List<DatanodeDescriptor> results) {
List<Map.Entry<String, HashSet<Node>>> sorted =
new ArrayList<Map.Entry<String, HashSet<Node>>>();
for (Map.Entry<String, HashSet<Node>> entry :
candidateNodesByRacks.entrySet()) {
sorted.add(entry);
}
Collections.sort(sorted, new RackComparator(blockSize));
int count = sorted.size() / 4;
Collections.shuffle(sorted.subList(0, count));
for (Map.Entry<String, HashSet<Node>> e : sorted) {
if (getGoodNode(e.getValue(), considerLoad, blockSize, results)) {
return true;
}
}
return false;
}
// Helper function to find a good node. Returns true if found.
private boolean getGoodNode(Set<Node> candidateNodes,
boolean considerLoad,
long blockSize,
List<DatanodeDescriptor> results) {
List<DatanodeDescriptor> sorted = new ArrayList<DatanodeDescriptor>();
for (Node n : candidateNodes) {
sorted.add((DatanodeDescriptor)n);
}
final long blocksize = blockSize;
Collections.sort(sorted, new Comparator<DatanodeDescriptor>() {
public int compare(DatanodeDescriptor n1, DatanodeDescriptor n2) {
long ret = (n2.getRemaining() - (n2.getBlocksScheduled() * blocksize)) -
(n1.getRemaining() - (n1.getBlocksScheduled() * blocksize));
return ret == 0 ? 0 : (ret > 0) ? 1 : -1;
}
});
// Also, add some randomness. We are doing so because it seems
// that if there are many copies scheduled at the same time, namenode
// does not have the uptodate information. So, we need to add some
// randomness so that there is not a lot of copies targeted to
// the same node, which will overload the hosts and may lead to
// timeouts.
int count = sorted.size() / 2;
Collections.shuffle(sorted.subList(0, count));
for (DatanodeDescriptor n : sorted) {
if (this.isGoodTarget((DatanodeDescriptor)n,
blocksize,
1, // MaxTargerPerLoc (per rack)
considerLoad,
results)) {
results.add((DatanodeDescriptor)n);
return true;
}
}
return false;
}
}