/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.blockmanagement; import java.util.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.StorageType; import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.NetworkTopologyWithNodeGroup; import org.apache.hadoop.net.Node; import org.apache.hadoop.net.NodeBase; /** The class is responsible for choosing the desired number of targets * for placing block replicas on environment with node-group layer. * The replica placement strategy is adjusted to: * If the writer is on a datanode, the 1st replica is placed on the local * node (or local node-group), otherwise a random datanode. * The 2nd replica is placed on a datanode that is on a different rack with 1st * replica node. * The 3rd replica is placed on a datanode which is on a different node-group * but the same rack as the second replica node. */ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefault { protected BlockPlacementPolicyWithNodeGroup(Configuration conf, FSClusterStats stats, NetworkTopology clusterMap, DatanodeManager datanodeManager) { initialize(conf, stats, clusterMap, host2datanodeMap); } protected BlockPlacementPolicyWithNodeGroup() { } public void initialize(Configuration conf, FSClusterStats stats, NetworkTopology clusterMap, Host2NodesMap host2datanodeMap) { super.initialize(conf, stats, clusterMap, host2datanodeMap); } /** choose local node of localMachine as the target. * if localMachine is not available, choose a node on the same nodegroup or * rack instead. * @return the chosen node */ @Override protected DatanodeStorageInfo chooseLocalStorage(Node localMachine, Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, List<DatanodeStorageInfo> results, boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack) throws NotEnoughReplicasException { // if no local machine, randomly choose one node if (localMachine == null) return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); // otherwise try local machine first if (localMachine instanceof DatanodeDescriptor) { DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine; if (excludedNodes.add(localMachine)) { // was not in the excluded list for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes .entrySet().iterator(); iter.hasNext(); ) { Map.Entry<StorageType, Integer> entry = iter.next(); for (DatanodeStorageInfo localStorage : DFSUtil.shuffle( localDataNode.getStorageInfos())) { StorageType type = entry.getKey(); if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize, maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) { int num = entry.getValue(); if (num == 1) { iter.remove(); } else { entry.setValue(num - 1); } return localStorage; } } } } } // try a node on local node group DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup( (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); if (chosenStorage != null) { return chosenStorage; } if (!fallbackToLocalRack) { return null; } // try a node on local rack return chooseLocalRack(localMachine, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } /** @return the node of the second replica */ private static DatanodeDescriptor secondNode(Node localMachine, List<DatanodeStorageInfo> results) { // find the second replica for(DatanodeStorageInfo nextStorage : results) { DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor(); if (nextNode != localMachine) { return nextNode; } } return null; } @Override protected DatanodeStorageInfo chooseLocalRack(Node localMachine, Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, List<DatanodeStorageInfo> results, boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes) throws NotEnoughReplicasException { // no local machine, so choose a random machine if (localMachine == null) { return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } // choose one from the local rack, but off-nodegroup try { final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation()); return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } catch (NotEnoughReplicasException e1) { // find the second replica final DatanodeDescriptor newLocal = secondNode(localMachine, results); if (newLocal != null) { try { return chooseRandom( clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } catch(NotEnoughReplicasException e2) { //otherwise randomly choose one from the network return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } } else { //otherwise randomly choose one from the network return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } } } /** * {@inheritDoc} */ @Override protected void chooseRemoteRack(int numOfReplicas, DatanodeDescriptor localMachine, Set<Node> excludedNodes, long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results, boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes) throws NotEnoughReplicasException { int oldNumOfReplicas = results.size(); final String rackLocation = NetworkTopology.getFirstHalf( localMachine.getNetworkLocation()); try { // randomly choose from remote racks chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize, maxReplicasPerRack, results, avoidStaleNodes, storageTypes); } catch (NotEnoughReplicasException e) { // fall back to the local rack chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas), rackLocation, excludedNodes, blocksize, maxReplicasPerRack, results, avoidStaleNodes, storageTypes); } } /* choose one node from the nodegroup that <i>localMachine</i> is on. * if no such node is available, choose one node from the nodegroup where * a second replica is on. * if still no such node is available, choose a random node in the cluster. * @return the chosen node */ private DatanodeStorageInfo chooseLocalNodeGroup( NetworkTopologyWithNodeGroup clusterMap, Node localMachine, Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, List<DatanodeStorageInfo> results, boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes) throws NotEnoughReplicasException { // no local machine, so choose a random machine if (localMachine == null) { return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } // choose one from the local node group try { return chooseRandom( clusterMap.getNodeGroup(localMachine.getNetworkLocation()), excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } catch (NotEnoughReplicasException e1) { final DatanodeDescriptor newLocal = secondNode(localMachine, results); if (newLocal != null) { try { return chooseRandom( clusterMap.getNodeGroup(newLocal.getNetworkLocation()), excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } catch(NotEnoughReplicasException e2) { //otherwise randomly choose one from the network return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } } else { //otherwise randomly choose one from the network return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } } } @Override protected String getRack(final DatanodeInfo cur) { String nodeGroupString = cur.getNetworkLocation(); return NetworkTopology.getFirstHalf(nodeGroupString); } /** * Find other nodes in the same nodegroup of <i>localMachine</i> and add them * into <i>excludeNodes</i> as replica should not be duplicated for nodes * within the same nodegroup * @return number of new excluded nodes */ @Override protected int addToExcludedNodes(DatanodeDescriptor chosenNode, Set<Node> excludedNodes) { int countOfExcludedNodes = 0; String nodeGroupScope = chosenNode.getNetworkLocation(); List<Node> leafNodes = clusterMap.getLeaves(nodeGroupScope); for (Node leafNode : leafNodes) { if (excludedNodes.add(leafNode)) { // not a existing node in excludedNodes countOfExcludedNodes++; } } countOfExcludedNodes += addDependentNodesToExcludedNodes( chosenNode, excludedNodes); return countOfExcludedNodes; } /** * Add all nodes from a dependent nodes list to excludedNodes. * @return number of new excluded nodes */ private int addDependentNodesToExcludedNodes(DatanodeDescriptor chosenNode, Set<Node> excludedNodes) { if (this.host2datanodeMap == null) { return 0; } int countOfExcludedNodes = 0; for(String hostname : chosenNode.getDependentHostNames()) { DatanodeDescriptor node = this.host2datanodeMap.getDataNodeByHostName(hostname); if(node!=null) { if (excludedNodes.add(node)) { countOfExcludedNodes++; } } else { LOG.warn("Not able to find datanode " + hostname + " which has dependency with datanode " + chosenNode.getHostName()); } } return countOfExcludedNodes; } /** * Pick up replica node set for deleting replica as over-replicated. * First set contains replica nodes on rack with more than one * replica while second set contains remaining replica nodes. * If first is not empty, divide first set into two subsets: * moreThanOne contains nodes on nodegroup with more than one replica * exactlyOne contains the remaining nodes in first set * then pickup priSet if not empty. * If first is empty, then pick second. */ @Override public Collection<DatanodeStorageInfo> pickupReplicaSet( Collection<DatanodeStorageInfo> first, Collection<DatanodeStorageInfo> second) { // If no replica within same rack, return directly. if (first.isEmpty()) { return second; } // Split data nodes in the first set into two sets, // moreThanOne contains nodes on nodegroup with more than one replica // exactlyOne contains the remaining nodes Map<String, List<DatanodeStorageInfo>> nodeGroupMap = new HashMap<String, List<DatanodeStorageInfo>>(); for(DatanodeStorageInfo storage : first) { final String nodeGroupName = NetworkTopology.getLastHalf( storage.getDatanodeDescriptor().getNetworkLocation()); List<DatanodeStorageInfo> storageList = nodeGroupMap.get(nodeGroupName); if (storageList == null) { storageList = new ArrayList<DatanodeStorageInfo>(); nodeGroupMap.put(nodeGroupName, storageList); } storageList.add(storage); } final List<DatanodeStorageInfo> moreThanOne = new ArrayList<DatanodeStorageInfo>(); final List<DatanodeStorageInfo> exactlyOne = new ArrayList<DatanodeStorageInfo>(); // split nodes into two sets for(List<DatanodeStorageInfo> datanodeList : nodeGroupMap.values()) { if (datanodeList.size() == 1 ) { // exactlyOne contains nodes on nodegroup with exactly one replica exactlyOne.add(datanodeList.get(0)); } else { // moreThanOne contains nodes on nodegroup with more than one replica moreThanOne.addAll(datanodeList); } } return moreThanOne.isEmpty()? exactlyOne : moreThanOne; } }