/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.net; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; /** * The class extends NetworkTopology to represents a cluster of computer with * a 4-layers hierarchical network topology. * In this network topology, leaves represent data nodes (computers) and inner * nodes represent switches/routers that manage traffic in/out of data centers, * racks or physical host (with virtual switch). * * @see NetworkTopology */ @InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) @InterfaceStability.Unstable public class NetworkTopologyWithNodeGroup extends NetworkTopology { public final static String DEFAULT_NODEGROUP = "/default-nodegroup"; public NetworkTopologyWithNodeGroup() { clusterMap = new InnerNodeWithNodeGroup(InnerNode.ROOT); } @Override protected Node getNodeForNetworkLocation(Node node) { // if node only with default rack info, here we need to add default // nodegroup info if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) { node.setNetworkLocation(node.getNetworkLocation() + DEFAULT_NODEGROUP); } Node nodeGroup = getNode(node.getNetworkLocation()); if (nodeGroup == null) { nodeGroup = new InnerNodeWithNodeGroup(node.getNetworkLocation()); } return getNode(nodeGroup.getNetworkLocation()); } @Override public String getRack(String loc) { netlock.readLock().lock(); try { loc = InnerNode.normalize(loc); Node locNode = getNode(loc); if (locNode instanceof InnerNodeWithNodeGroup) { InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode; if (node.isRack()) { return loc; } else if (node.isNodeGroup()) { return node.getNetworkLocation(); } else { // may be a data center return null; } } else { // not in cluster map, don't handle it return loc; } } finally { netlock.readLock().unlock(); } } /** * Given a string representation of a node group for a specific network * location * * @param loc * a path-like string representation of a network location * @return a node group string */ public String getNodeGroup(String loc) { netlock.readLock().lock(); try { loc = InnerNode.normalize(loc); Node locNode = getNode(loc); if (locNode instanceof InnerNodeWithNodeGroup) { InnerNodeWithNodeGroup node = (InnerNodeWithNodeGroup) locNode; if (node.isNodeGroup()) { return loc; } else if (node.isRack()) { // not sure the node group for a rack return null; } else { // may be a leaf node return getNodeGroup(node.getNetworkLocation()); } } else { // not in cluster map, don't handle it return loc; } } finally { netlock.readLock().unlock(); } } @Override public boolean isOnSameRack( Node node1, Node node2) { if (node1 == null || node2 == null || node1.getParent() == null || node2.getParent() == null) { return false; } netlock.readLock().lock(); try { return isSameParents(node1.getParent(), node2.getParent()); } finally { netlock.readLock().unlock(); } } /** * Check if two nodes are on the same node group (hypervisor) The * assumption here is: each nodes are leaf nodes. * * @param node1 * one node (can be null) * @param node2 * another node (can be null) * @return true if node1 and node2 are on the same node group; false * otherwise * @exception IllegalArgumentException * when either node1 or node2 is null, or node1 or node2 do * not belong to the cluster */ @Override public boolean isOnSameNodeGroup(Node node1, Node node2) { if (node1 == null || node2 == null) { return false; } netlock.readLock().lock(); try { return isSameParents(node1, node2); } finally { netlock.readLock().unlock(); } } /** * Check if network topology is aware of NodeGroup */ @Override public boolean isNodeGroupAware() { return true; } /** Add a leaf node * Update node counter & rack counter if necessary * @param node node to be added; can be null * @exception IllegalArgumentException if add a node to a leave * or node to be added is not a leaf */ @Override public void add(Node node) { if (node==null) return; if( node instanceof InnerNode ) { throw new IllegalArgumentException( "Not allow to add an inner node: "+NodeBase.getPath(node)); } netlock.writeLock().lock(); try { Node rack = null; // if node only with default rack info, here we need to add default // nodegroup info if (NetworkTopology.DEFAULT_RACK.equals(node.getNetworkLocation())) { node.setNetworkLocation(node.getNetworkLocation() + NetworkTopologyWithNodeGroup.DEFAULT_NODEGROUP); } Node nodeGroup = getNode(node.getNetworkLocation()); if (nodeGroup == null) { nodeGroup = new InnerNodeWithNodeGroup(node.getNetworkLocation()); } rack = getNode(nodeGroup.getNetworkLocation()); // rack should be an innerNode and with parent. // note: rack's null parent case is: node's topology only has one layer, // so rack is recognized as "/" and no parent. // This will be recognized as a node with fault topology. if (rack != null && (!(rack instanceof InnerNode) || rack.getParent() == null)) { throw new IllegalArgumentException("Unexpected data node " + node.toString() + " at an illegal network location"); } if (clusterMap.add(node)) { LOG.info("Adding a new node: " + NodeBase.getPath(node)); if (rack == null) { // We only track rack number here numOfRacks++; } } if(LOG.isDebugEnabled()) { LOG.debug("NetworkTopology became:\n" + this.toString()); } } finally { netlock.writeLock().unlock(); } } /** Remove a node * Update node counter and rack counter if necessary * @param node node to be removed; can be null */ @Override public void remove(Node node) { if (node==null) return; if( node instanceof InnerNode ) { throw new IllegalArgumentException( "Not allow to remove an inner node: "+NodeBase.getPath(node)); } LOG.info("Removing a node: "+NodeBase.getPath(node)); netlock.writeLock().lock(); try { if (clusterMap.remove(node)) { Node nodeGroup = getNode(node.getNetworkLocation()); if (nodeGroup == null) { nodeGroup = new InnerNode(node.getNetworkLocation()); } InnerNode rack = (InnerNode)getNode(nodeGroup.getNetworkLocation()); if (rack == null) { numOfRacks--; } } if(LOG.isDebugEnabled()) { LOG.debug("NetworkTopology became:\n" + this.toString()); } } finally { netlock.writeLock().unlock(); } } /** Sort nodes array by their distances to <i>reader</i> * It linearly scans the array, if a local node is found, swap it with * the first element of the array. * If a local node group node is found, swap it with the first element * following the local node. * If a local rack node is found, swap it with the first element following * the local node group node. * If neither local node, node group node or local rack node is found, put a * random replica location at position 0. * It leaves the rest nodes untouched. * @param reader the node that wishes to read a block from one of the nodes * @param nodes the list of nodes containing data for the reader */ @Override public void pseudoSortByDistance( Node reader, Node[] nodes ) { if (reader != null && !this.contains(reader)) { // if reader is not a datanode (not in NetworkTopology tree), we will // replace this reader with a sibling leaf node in tree. Node nodeGroup = getNode(reader.getNetworkLocation()); if (nodeGroup != null && nodeGroup instanceof InnerNode) { InnerNode parentNode = (InnerNode) nodeGroup; // replace reader with the first children of its parent in tree reader = parentNode.getLeaf(0, null); } else { return; } } int tempIndex = 0; int localRackNode = -1; int localNodeGroupNode = -1; if (reader != null) { //scan the array to find the local node & local rack node for (int i = 0; i < nodes.length; i++) { if (tempIndex == 0 && reader == nodes[i]) { //local node //swap the local node and the node at position 0 if (i != 0) { swap(nodes, tempIndex, i); } tempIndex=1; if (localRackNode != -1 && (localNodeGroupNode !=-1)) { if (localRackNode == 0) { localRackNode = i; } if (localNodeGroupNode == 0) { localNodeGroupNode = i; } break; } } else if (localNodeGroupNode == -1 && isOnSameNodeGroup(reader, nodes[i])) { //local node group localNodeGroupNode = i; // node local and rack local are already found if(tempIndex != 0 && localRackNode != -1) break; } else if (localRackNode == -1 && isOnSameRack(reader, nodes[i])) { localRackNode = i; if (tempIndex != 0 && localNodeGroupNode != -1) break; } } // swap the local nodegroup node and the node at position tempIndex if(localNodeGroupNode != -1 && localNodeGroupNode != tempIndex) { swap(nodes, tempIndex, localNodeGroupNode); if (localRackNode == tempIndex) { localRackNode = localNodeGroupNode; } tempIndex++; } // swap the local rack node and the node at position tempIndex if(localRackNode != -1 && localRackNode != tempIndex) { swap(nodes, tempIndex, localRackNode); tempIndex++; } } // put a random node at position 0 if there is not a local/local-nodegroup/ // local-rack node if (tempIndex == 0 && localNodeGroupNode == -1 && localRackNode == -1 && nodes.length != 0) { swap(nodes, 0, r.nextInt(nodes.length)); } } /** InnerNodeWithNodeGroup represents a switch/router of a data center, rack * or physical host. Different from a leaf node, it has non-null children. */ static class InnerNodeWithNodeGroup extends InnerNode { public InnerNodeWithNodeGroup(String name, String location, InnerNode parent, int level) { super(name, location, parent, level); } public InnerNodeWithNodeGroup(String name, String location) { super(name, location); } public InnerNodeWithNodeGroup(String path) { super(path); } @Override boolean isRack() { // it is node group if (getChildren().isEmpty()) { return false; } Node firstChild = children.get(0); if (firstChild instanceof InnerNode) { Node firstGrandChild = (((InnerNode) firstChild).children).get(0); if (firstGrandChild instanceof InnerNode) { // it is datacenter return false; } else { return true; } } return false; } /** * Judge if this node represents a node group * * @return true if it has no child or its children are not InnerNodes */ boolean isNodeGroup() { if (children.isEmpty()) { return true; } Node firstChild = children.get(0); if (firstChild instanceof InnerNode) { // it is rack or datacenter return false; } return true; } @Override protected boolean isLeafParent() { return isNodeGroup(); } @Override protected InnerNode createParentNode(String parentName) { return new InnerNodeWithNodeGroup(parentName, getPath(this), this, this.getLevel() + 1); } @Override protected boolean areChildrenLeaves() { return isNodeGroup(); } } }