/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.server.namenode;

import org.apache.commons.logging.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.namenode.BlockPlacementPolicy.NotEnoughReplicasException;
import org.apache.hadoop.net.DNSToSwitchMapping;
import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;
import org.apache.hadoop.util.HostsFileReader;

import java.util.*;

/** This class is responsible for choosing the desired number of targets
 * for placing block replicas.
 * The replica placement strategy is that if the writer is on a datanode,
 * the 1st replica is placed on the local machine,
 * otherwise on a random datanode. The 2nd replica is placed on a datanode
 * that is on a different rack. The 3rd replica is placed on a datanode
 * that is on the same rack as the second replica, but on a different node.
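 *
 * As a rough usage illustration only (the conf, stats, clusterMap, and
 * writerNode values are assumed here and are not defined in this file),
 * a caller might obtain three targets for a new block like this:
 * <pre>
 *   BlockPlacementPolicyDefault policy =
 *       new BlockPlacementPolicyDefault(conf, stats, clusterMap);
 *   DatanodeDescriptor[] targets =
 *       policy.chooseTarget("/some/file", 3, writerNode,
 *           new ArrayList&lt;DatanodeDescriptor&gt;(), 64L * 1024 * 1024);
 * </pre>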
 */
public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {

  private boolean considerLoad;
  protected NetworkTopology clusterMap;
  private FSClusterStats stats;
  private int attemptMultiplier = 0;
  private int minBlocksToWrite = FSConstants.MIN_BLOCKS_FOR_WRITE;

  BlockPlacementPolicyDefault(Configuration conf, FSClusterStats stats,
                              NetworkTopology clusterMap) {
    initialize(conf, stats, clusterMap, null, null, null);
  }

  BlockPlacementPolicyDefault() {
  }

  /** {@inheritDoc} */
  public void initialize(Configuration conf, FSClusterStats stats,
                         NetworkTopology clusterMap,
                         HostsFileReader hostsReader,
                         DNSToSwitchMapping dnsToSwitchMapping,
                         FSNamesystem ns) {
    this.considerLoad = conf.getBoolean("dfs.replication.considerLoad", true);
    this.minBlocksToWrite = conf.getInt("dfs.replication.minBlocksToWrite",
                                        FSConstants.MIN_BLOCKS_FOR_WRITE);
    this.stats = stats;
    this.clusterMap = clusterMap;
    // Note: this value is read from a freshly loaded Configuration (default
    // resources), not from the conf instance passed in above.
    Configuration newConf = new Configuration();
    this.attemptMultiplier =
        newConf.getInt("dfs.replication.attemptMultiplier", 200);
    FSNamesystem.LOG.info("Value for min blocks to write "
        + this.minBlocksToWrite);
  }

  @Override
  public void hostsUpdated() {
    // Do nothing in this case
  }

  /** {@inheritDoc} */
  public DatanodeDescriptor[] chooseTarget(String srcPath,
                                           int numOfReplicas,
                                           DatanodeDescriptor writer,
                                           List<DatanodeDescriptor> chosenNodes,
                                           long blocksize) {
    return chooseTarget(numOfReplicas, writer, chosenNodes, null, blocksize);
  }

  /** {@inheritDoc} */
  @Override
  public DatanodeDescriptor[] chooseTarget(String srcInode,
                                           int numOfReplicas,
                                           DatanodeDescriptor writer,
                                           List<DatanodeDescriptor> chosenNodes,
                                           List<Node> excludedNodes,
                                           long blocksize) {
    return chooseTarget(numOfReplicas, writer, chosenNodes, excludedNodes,
                        blocksize);
  }

  /** {@inheritDoc} */
  @Override
  public DatanodeDescriptor[] chooseTarget(FSInodeInfo srcInode,
                                           int numOfReplicas,
                                           DatanodeDescriptor writer,
                                           List<DatanodeDescriptor> chosenNodes,
                                           List<Node> excludedNodes,
                                           long blocksize) {
    // forward the exclusion list rather than silently dropping it
    return chooseTarget(numOfReplicas, writer, chosenNodes, excludedNodes,
                        blocksize);
  }

  final protected int[] getActualReplicas(int numOfReplicas,
                                          List<DatanodeDescriptor> chosenNodes) {
    int clusterSize = clusterMap.getNumOfLeaves();
    int totalNumOfReplicas = chosenNodes.size() + numOfReplicas;
    if (totalNumOfReplicas > clusterSize) {
      numOfReplicas -= (totalNumOfReplicas - clusterSize);
      totalNumOfReplicas = clusterSize;
    }
    // e.g. 3 total replicas on a 2-rack cluster allows at most
    // (3 - 1) / 2 + 2 = 3 nodes per rack
    int maxNodesPerRack =
        (totalNumOfReplicas - 1) / clusterMap.getNumOfRacks() + 2;
    return new int[] { numOfReplicas, maxNodesPerRack };
  }

  final protected void updateExcludedAndChosen(List<Node> exclNodes,
                                               HashMap<Node, Node> excludedNodes,
                                               List<DatanodeDescriptor> results,
                                               List<DatanodeDescriptor> chosenNodes) {
    if (exclNodes != null) {
      for (Node node : exclNodes) {
        excludedNodes.put(node, node);
      }
    }
    for (DatanodeDescriptor node : chosenNodes) {
      excludedNodes.put(node, node);
      if ((!node.isDecommissionInProgress()) && (!node.isDecommissioned())) {
        results.add(node);
      }
    }
  }

  final protected DatanodeDescriptor[] finalizeTargets(
      List<DatanodeDescriptor> results,
      List<DatanodeDescriptor> chosenNodes,
      DatanodeDescriptor writer, DatanodeDescriptor localNode) {
    results.removeAll(chosenNodes);
    // sorting nodes to form a pipeline
    DatanodeDescriptor[] pipeline =
        results.toArray(new DatanodeDescriptor[results.size()]);
    clusterMap.getPipeline((writer == null) ? localNode : writer, pipeline);
    return pipeline;
  }
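  // Illustration only (the policy variable and the currentReplicaLocations
  // helper are assumed, not part of this file): re-replication of an
  // under-replicated block typically enters through the package-private
  // chooseTarget below with the replicas that already exist, e.g.
  //
  //   List<DatanodeDescriptor> chosen = currentReplicaLocations(block);
  //   DatanodeDescriptor[] extra =
  //       policy.chooseTarget(1, null, chosen, null, block.getNumBytes());
  //
  // Existing replicas seed both the excluded set and the results list, so
  // the new target lands on a node (and usually a rack) not already used.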
  /**
   * This is not part of the public API but is used by the unit tests.
   */
  DatanodeDescriptor[] chooseTarget(int numOfReplicas,
                                    DatanodeDescriptor writer,
                                    List<DatanodeDescriptor> chosenNodes,
                                    List<Node> exclNodes,
                                    long blocksize) {
    if (numOfReplicas == 0 || clusterMap.getNumOfLeaves() == 0) {
      return new DatanodeDescriptor[0];
    }
    int[] result = getActualReplicas(numOfReplicas, chosenNodes);
    numOfReplicas = result[0];
    int maxNodesPerRack = result[1];

    HashMap<Node, Node> excludedNodes = new HashMap<Node, Node>();
    List<DatanodeDescriptor> results =
        new ArrayList<DatanodeDescriptor>(chosenNodes.size() + numOfReplicas);

    updateExcludedAndChosen(exclNodes, excludedNodes, results, chosenNodes);

    if (!clusterMap.contains(writer)) {
      writer = null;
    }

    DatanodeDescriptor localNode = chooseTarget(numOfReplicas, writer,
        excludedNodes, blocksize, maxNodesPerRack, results,
        chosenNodes.isEmpty());

    return this.finalizeTargets(results, chosenNodes, writer, localNode);
  }

  /**
   * All the chosen nodes are on the same rack; choose a node on a new rack
   * for the next replica, according to where the writer is.
   */
  private void choose2ndRack(DatanodeDescriptor writer,
                             HashMap<Node, Node> excludedNodes,
                             long blocksize,
                             int maxNodesPerRack,
                             List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
    if (!clusterMap.isOnSameRack(writer, results.get(0))) {
      DatanodeDescriptor localNode = chooseLocalNode(writer, excludedNodes,
          blocksize, maxNodesPerRack, results);
      if (clusterMap.isOnSameRack(localNode, results.get(0))) {
        // should not put the 2nd replica on the same rack as the first
        results.remove(localNode);
      } else {
        return;
      }
    }
    chooseRemoteRack(1, results.get(0), excludedNodes, blocksize,
                     maxNodesPerRack, results);
  }

  /* choose <i>numOfReplicas</i> from all data nodes */
  protected DatanodeDescriptor chooseTarget(int numOfReplicas,
                                            DatanodeDescriptor writer,
                                            HashMap<Node, Node> excludedNodes,
                                            long blocksize,
                                            int maxNodesPerRack,
                                            List<DatanodeDescriptor> results,
                                            boolean newBlock) {
    if (numOfReplicas == 0 || clusterMap.getNumOfLeaves() == 0) {
      return writer;
    }
    int numOfResults = results.size();
    boolean inClusterWriter = writer != null;
    if (writer == null && !newBlock) {
      writer = results.get(0);
    }
    try {
      if (numOfResults == 0) {
        chooseLocalNode(writer, excludedNodes, blocksize, maxNodesPerRack,
                        results);
        if (newBlock && writer == null) {
          writer = results.get(0);
        }
        if (--numOfReplicas == 0) {
          return writer;
        }
      }
      if (numOfResults <= 1) {
        choose2ndRack(writer, excludedNodes, blocksize, maxNodesPerRack,
                      results);
        if (--numOfReplicas == 0) {
          return writer;
        }
      }
      if (numOfResults <= 2) {
        if (clusterMap.isOnSameRack(results.get(0), results.get(1))) {
          choose2ndRack(writer, excludedNodes, blocksize, maxNodesPerRack,
                        results);
        } else if (newBlock) {
          if (inClusterWriter) {
            place3rdReplicaForInClusterWriter(excludedNodes, blocksize,
                                              maxNodesPerRack, results);
          } else {
            chooseLocalRack(results.get(1), excludedNodes, blocksize,
                            maxNodesPerRack, results);
          }
        } else {
          chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
                          results);
        }
        if (--numOfReplicas == 0) {
          return writer;
        }
      }
      chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
                   maxNodesPerRack, results);
    } catch (NotEnoughReplicasException e) {
      FSNamesystem.LOG.warn("Not able to place enough replicas, still in need of "
          + numOfReplicas);
    }
    return writer;
  }
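  // Sketch of the resulting layout for a fresh 3-replica block (assuming a
  // multi-rack cluster with capacity; an illustrative summary, not code):
  //
  //   replica 1 -> the writer's node (or a random node if the writer is
  //                not a datanode), via chooseLocalNode
  //   replica 2 -> a node on a different rack, via choose2ndRack
  //   replica 3 -> a node on the same rack as replica 2, via
  //                place3rdReplicaForInClusterWriter / chooseLocalRack
  //
  // Any remaining replicas fall through to chooseRandom over the whole
  // cluster (NodeBase.ROOT).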
  /**
   * Place the third replica for a new block when the writer is
   * in the HDFS cluster and the first two replicas are on the same rack.
   * The default policy places the third replica on the same rack
   * as the 2nd replica.
   *
   * @param excludedNodes excluded nodes
   * @param blocksize blocksize
   * @param maxNodesPerRack max number of nodes per rack
   * @param results chosen nodes
   * @throws NotEnoughReplicasException
   */
  protected void place3rdReplicaForInClusterWriter(
      HashMap<Node, Node> excludedNodes, long blocksize,
      int maxNodesPerRack, List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
    chooseLocalRack(results.get(1), excludedNodes, blocksize,
                    maxNodesPerRack, results);
  }

  /* choose <i>localMachine</i> as the target.
   * if <i>localMachine</i> is not available,
   * choose a node on the same rack
   * @return the chosen node
   */
  protected DatanodeDescriptor chooseLocalNode(DatanodeDescriptor localMachine,
                                               HashMap<Node, Node> excludedNodes,
                                               long blocksize,
                                               int maxNodesPerRack,
                                               List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
    // if no local machine, randomly choose one node
    if (localMachine == null)
      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
                          maxNodesPerRack, results);

    // otherwise try the local machine first
    Node oldNode = excludedNodes.put(localMachine, localMachine);
    if (oldNode == null) { // was not in the excluded list
      if (isGoodTarget(localMachine, blocksize, maxNodesPerRack, false,
                       results)) {
        results.add(localMachine);
        return localMachine;
      }
    }

    // try a node on the local rack
    return chooseLocalRack(localMachine, excludedNodes, blocksize,
                           maxNodesPerRack, results);
  }

  /* choose one node from the rack that <i>localMachine</i> is on.
   * if no such node is available, choose one node from the rack where
   * a second replica is on.
   * if still no such node is available, choose a random node
   * in the cluster.
   * @return the chosen node
   */
  protected DatanodeDescriptor chooseLocalRack(DatanodeDescriptor localMachine,
                                               HashMap<Node, Node> excludedNodes,
                                               long blocksize,
                                               int maxNodesPerRack,
                                               List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
    // no local machine, so choose a random machine
    if (localMachine == null) {
      return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
                          maxNodesPerRack, results);
    }

    // choose one from the local rack
    try {
      return chooseRandom(localMachine.getNetworkLocation(), excludedNodes,
                          blocksize, maxNodesPerRack, results);
    } catch (NotEnoughReplicasException e1) {
      // find the second replica
      DatanodeDescriptor newLocal = null;
      for (Iterator<DatanodeDescriptor> iter = results.iterator();
           iter.hasNext();) {
        DatanodeDescriptor nextNode = iter.next();
        if (nextNode != localMachine) {
          newLocal = nextNode;
          break;
        }
      }
      if (newLocal != null) {
        try {
          return chooseRandom(newLocal.getNetworkLocation(), excludedNodes,
                              blocksize, maxNodesPerRack, results);
        } catch (NotEnoughReplicasException e2) {
          // otherwise randomly choose one from the whole cluster
          return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
                              maxNodesPerRack, results);
        }
      } else {
        // otherwise randomly choose one from the whole cluster
        return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
                            maxNodesPerRack, results);
      }
    }
  }
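  // The fallback chain in chooseLocalRack above, in order (illustrative
  // summary of the code, not a guarantee of the final layout):
  //   1. a random node on localMachine's own rack
  //   2. a random node on the rack of another already-chosen replica
  //   3. a random node anywhere in the cluster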
  /* choose <i>numOfReplicas</i> nodes from the racks
   * that <i>localMachine</i> is NOT on.
   * if not enough nodes are available, choose the remaining ones
   * from the local rack
   */
  protected void chooseRemoteRack(int numOfReplicas,
                                  DatanodeDescriptor localMachine,
                                  HashMap<Node, Node> excludedNodes,
                                  long blocksize,
                                  int maxReplicasPerRack,
                                  List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
    int oldNumOfReplicas = results.size();
    // randomly choose one node from remote racks; the "~" prefix asks the
    // topology to choose from outside the given scope
    try {
      chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(),
                   excludedNodes, blocksize, maxReplicasPerRack, results);
    } catch (NotEnoughReplicasException e) {
      chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
                   localMachine.getNetworkLocation(), excludedNodes,
                   blocksize, maxReplicasPerRack, results);
    }
  }

  /* Randomly choose one target from <i>nodes</i>.
   * @return the chosen node
   */
  private DatanodeDescriptor chooseRandom(String nodes,
                                          HashMap<Node, Node> excludedNodes,
                                          long blocksize,
                                          int maxNodesPerRack,
                                          List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
    int numOfAvailableNodes =
        clusterMap.countNumOfAvailableNodes(nodes, excludedNodes.keySet());
    while (numOfAvailableNodes > 0) {
      DatanodeDescriptor chosenNode =
          (DatanodeDescriptor) (clusterMap.chooseRandom(nodes));
      if (chosenNode == null) {
        break; // no more nodes to choose; the cluster topology must have changed
      }
      Node oldNode = excludedNodes.put(chosenNode, chosenNode);
      if (oldNode == null) { // chosenNode was not in the excluded list
        numOfAvailableNodes--;
        if (isGoodTarget(chosenNode, blocksize, maxNodesPerRack, results)) {
          results.add(chosenNode);
          return chosenNode;
        }
      }
    }
    throw new NotEnoughReplicasException("Not able to place enough replicas");
  }

  /* Randomly choose <i>numOfReplicas</i> targets from <i>nodes</i>. */
  void chooseRandom(int numOfReplicas,
                    String nodes,
                    HashMap<Node, Node> excludedNodes,
                    long blocksize,
                    int maxNodesPerRack,
                    List<DatanodeDescriptor> results)
      throws NotEnoughReplicasException {
    int numOfAvailableNodes =
        clusterMap.countNumOfAvailableNodes(nodes, excludedNodes.keySet());
    // cap the number of attempts so a scope full of bad targets cannot
    // spin this loop indefinitely
    int numAttempts = numOfAvailableNodes * this.attemptMultiplier;
    while (numOfReplicas > 0 && numOfAvailableNodes > 0 && --numAttempts > 0) {
      DatanodeDescriptor chosenNode =
          (DatanodeDescriptor) (clusterMap.chooseRandom(nodes));
      if (chosenNode == null) {
        break; // no more nodes to choose; the cluster topology must have changed
      }
      Node oldNode = excludedNodes.put(chosenNode, chosenNode);
      if (oldNode == null) { // chosenNode was not in the excluded list
        numOfAvailableNodes--;
        if (isGoodTarget(chosenNode, blocksize, maxNodesPerRack, results)) {
          numOfReplicas--;
          results.add(chosenNode);
        }
      }
    }
    if (numOfReplicas > 0) {
      throw new NotEnoughReplicasException("Not able to place enough replicas");
    }
  }
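  // Scope strings accepted by the two chooseRandom variants above
  // (the rack paths are illustrative):
  //   "/rack1"      -> choose only from nodes under /rack1
  //   "~/rack1"     -> choose only from nodes NOT under /rack1
  //   NodeBase.ROOT -> choose from the entire cluster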
  /* judge if a node is a good target.
   * return true if <i>node</i> has enough space,
   * does not have too much load, and the rack does not have too many nodes
   */
  protected boolean isGoodTarget(DatanodeDescriptor node,
                                 long blockSize,
                                 int maxTargetPerLoc,
                                 List<DatanodeDescriptor> results) {
    return isGoodTarget(node, blockSize, maxTargetPerLoc,
                        this.considerLoad, results);
  }

  protected boolean isGoodTarget(DatanodeDescriptor node,
                                 long blockSize,
                                 int maxTargetPerLoc,
                                 boolean considerLoad,
                                 List<DatanodeDescriptor> results) {
    Log logr = FSNamesystem.LOG;

    // check if the node is (being) decommissioned
    if (node.isDecommissionInProgress() || node.isDecommissioned()) {
      if (logr.isDebugEnabled()) {
        logr.debug("Node " + NodeBase.getPath(node)
            + " is not chosen because the node is (being) decommissioned");
      }
      return false;
    }

    long remaining =
        node.getRemaining() - (node.getBlocksScheduled() * blockSize);
    // check the remaining capacity of the target machine
    if (blockSize * this.minBlocksToWrite > remaining) {
      if (logr.isDebugEnabled()) {
        logr.debug("Node " + NodeBase.getPath(node)
            + " is not chosen because the node does not have enough space"
            + " for block size " + blockSize
            + " with Remaining = " + node.getRemaining()
            + " and Scheduled = " + node.getBlocksScheduled());
      }
      return false;
    }

    // check the communication traffic of the target machine
    if (considerLoad) {
      double avgLoad = 0;
      int size = clusterMap.getNumOfLeaves();
      if (size != 0 && stats != null) {
        avgLoad = (double) stats.getTotalLoad() / size;
      }
      if (node.getXceiverCount() > (2.0 * avgLoad)) {
        if (logr.isDebugEnabled()) {
          logr.debug("Node " + NodeBase.getPath(node)
              + " is not chosen because the node is too busy");
        }
        return false;
      }
    }

    // check if the target rack has chosen too many nodes
    String rackname = node.getNetworkLocation();
    int counter = 1;
    for (Iterator<DatanodeDescriptor> iter = results.iterator();
         iter.hasNext();) {
      Node result = iter.next();
      if (rackname.equals(result.getNetworkLocation())) {
        counter++;
      }
    }
    if (counter > maxTargetPerLoc) {
      if (logr.isDebugEnabled()) {
        logr.debug("Node " + NodeBase.getPath(node)
            + " is not chosen because the rack has too many chosen nodes");
      }
      return false;
    }

    if (DatanodeInfo.shouldSuspectNodes() && node.isSuspectFail()) {
      return false;
    }
    return true;
  }

  /** {@inheritDoc} */
  public int verifyBlockPlacement(String srcPath,
                                  LocatedBlock lBlk,
                                  int minRacks) {
    DatanodeInfo[] locs = lBlk.getLocations();
    if (locs == null)
      locs = new DatanodeInfo[0];
    int numRacks = clusterMap.getNumOfRacks();
    if (numRacks <= 1) // only one rack
      return 0;
    minRacks = Math.min(minRacks, numRacks);
    // 1. Check that all locations are different.
    // 2. Count locations on different racks.
    Set<String> racks = new TreeSet<String>();
    for (DatanodeInfo dn : locs)
      racks.add(dn.getNetworkLocation());
    return minRacks - racks.size();
  }
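  // Worked example for verifyBlockPlacement (rack names illustrative):
  // with minRacks = 2 and replicas on racks {/rack1, /rack1, /rack2},
  // racks.size() == 2, so the method returns 2 - 2 = 0; a positive return
  // value is the number of racks the placement is short by.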
  /**
   * The algorithm first picks the node with the least free space from the
   * nodes that are on a rack holding more than one replica of the block,
   * so that removing such a replica does not reduce the number of racks.
   * If no such node is available, it picks the node with the least free
   * space overall.
   * {@inheritDoc}
   */
  public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode,
                                                  Block block,
                                                  short replicationFactor,
                                                  Collection<DatanodeDescriptor> first,
                                                  Collection<DatanodeDescriptor> second) {
    long minSpace = Long.MAX_VALUE;
    DatanodeDescriptor cur = null;

    // pick a replica from the first set; if first is empty, then pick
    // replicas from the second set
    Iterator<DatanodeDescriptor> iter =
        first.isEmpty() ? second.iterator() : first.iterator();

    // pick the node with the least free space
    while (iter.hasNext()) {
      DatanodeDescriptor node = iter.next();
      long free = node.getRemaining();
      if (minSpace > free) {
        minSpace = free;
        cur = node;
      }
    }
    return cur;
  }
}