package org.apache.hadoop.corona;

import java.util.ArrayList;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.util.StringUtils;

public class NodeManager implements Configurable {
  public static final Log LOG = LogFactory.getLog(NodeManager.class);

  protected CoronaConf conf;
  protected ClusterManager clusterManager;

  private volatile int nodeReservedMemoryMB;
  private volatile int nodeReservedDiskGB;

  /**
   * Secondary indices maintained for each resource type: runnable nodes
   * looked up by host and by rack.
   */
  public class RunnableIndices {
    private static final int RACK_SHUFFLE_PERIOD = 100;
    private int getRunnableNodeForRackCounter = 0;
    protected ConcurrentMap<String, List<ClusterNode>> hostToRunnableNode =
      new ConcurrentHashMap<String, List<ClusterNode>>();
    protected ConcurrentMap<Node, List<ClusterNode>> rackToRunnableNode =
      new ConcurrentHashMap<Node, List<ClusterNode>>();
    String type;

    public RunnableIndices(String type) {
      this.type = type;
    }

    public ClusterNode getRunnableNodeForAny(Set<String> excluded) {
      for (Map.Entry<String, List<ClusterNode>> e : hostToRunnableNode.entrySet()) {
        String host = e.getKey();
        synchronized (topologyCache.getNode(host)) {
          List<ClusterNode> nlist = e.getValue();
          if (nlist == null) {
            return null;
          }
          for (ClusterNode node : nlist) {
            if (excluded == null || !excluded.contains(node.getHost())) {
              if (hasEnoughResource(node)) {
                return node;
              }
            }
          }
        }
      }
      return null;
    }

    public ClusterNode getRunnableNodeForHost(String host) {
      synchronized (topologyCache.getNode(host)) {
        // there should only be one node per host in the common case
        List<ClusterNode> nlist = hostToRunnableNode.get(host);
        if (nlist == null) {
          return null;
        }
        for (ClusterNode node : nlist) {
          if (hasEnoughResource(node)) {
            return node;
          }
        }
        return null;
      }
    }

    public ClusterNode getRunnableNodeForRack(Node rack, Set<String> excluded) {
      synchronized (rack) {
        List<ClusterNode> nlist = rackToRunnableNode.get(rack);
        getRunnableNodeForRackCounter += 1;
        if (nlist == null) {
          return null;
        }
        if (getRunnableNodeForRackCounter % RACK_SHUFFLE_PERIOD == 0) {
          // periodic shuffling balances the load more evenly across the
          // nodes in a rack
          Collections.shuffle(nlist);
        }
        for (ClusterNode node : nlist) {
          if (excluded == null || !excluded.contains(node.getHost())) {
            if (hasEnoughResource(node)) {
              return node;
            }
          }
        }
        return null;
      }
    }

    private boolean hasEnoughResource(ClusterNode node) {
      return hasEnoughMemory(node) && hasEnoughDiskSpace(node);
    }

    private boolean hasEnoughMemory(ClusterNode node) {
      int used = node.clusterNodeInfo.getUsed().memoryMB;
      int total = node.clusterNodeInfo.getTotal().memoryMB;
      int free = total - used;
      if (free < nodeReservedMemoryMB) {
        LOG.info(node.getHost() + " not enough memory." +
          " totalMB:" + total + " used:" + used +
          " free:" + free + " limit:" + nodeReservedMemoryMB);
        return false;
      }
      return true;
    }

    private boolean hasEnoughDiskSpace(ClusterNode node) {
      int used = node.clusterNodeInfo.getUsed().diskGB;
      int total = node.clusterNodeInfo.getTotal().diskGB;
      int free = total - used;
      if (free < nodeReservedDiskGB) {
        LOG.info(node.getHost() + " not enough disk space." +
          " totalGB:" + total + " used:" + used +
          " free:" + free + " limit:" + nodeReservedDiskGB);
        return false;
      }
      return true;
    }
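    // Worked example of the checks above (illustrative numbers, not
    // defaults): a node reporting total 16000 MB with 14500 MB used has
    // 1500 MB free; with nodeReservedMemoryMB = 2048 the node is skipped,
    // since 1500 < 2048. The same subtraction applies to disk space in GB.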
    public boolean existRunnableNodes() {
      return !hostToRunnableNode.isEmpty();
    }

    public void addRunnable(ClusterNode clusterNode) {
      String host = clusterNode.clusterNodeInfo.address.host;
      if (LOG.isDebugEnabled()) {
        LOG.debug(clusterNode.getName() +
          " added to runnable list for type: " + type);
      }
      synchronized (clusterNode.hostNode) {
        List<ClusterNode> nlist = hostToRunnableNode.get(host);
        if (nlist == null) {
          nlist = new ArrayList<ClusterNode>(1);
          hostToRunnableNode.put(host, nlist);
        }
        nlist.add(clusterNode);
      }
      Node rack = clusterNode.hostNode.getParent();
      synchronized (rack) {
        List<ClusterNode> nlist = rackToRunnableNode.get(rack);
        if (nlist == null) {
          nlist = new ArrayList<ClusterNode>(1);
          rackToRunnableNode.put(rack, nlist);
        }
        nlist.add(clusterNode);
      }
    }

    public void deleteRunnable(ClusterNode node) {
      String host = node.getHost();
      if (LOG.isDebugEnabled()) {
        LOG.debug(node.getName() +
          " deleted from runnable list for type: " + type);
      }
      synchronized (node.hostNode) {
        List<ClusterNode> nlist = hostToRunnableNode.get(host);
        if (nlist != null) {
          Utilities.removeReference(nlist, node);
          if (nlist.isEmpty()) {
            hostToRunnableNode.remove(host);
          }
        }
        // a missing list is expected if the host was not runnable
      }
      Node rack = node.hostNode.getParent();
      synchronized (rack) {
        List<ClusterNode> nlist = rackToRunnableNode.get(rack);
        if (nlist != null) {
          Utilities.removeReference(nlist, node);
          if (nlist.isEmpty()) {
            rackToRunnableNode.remove(rack);
          }
        }
        // a missing list is expected if the host was not runnable
      }
    }
  }

  // primary data structure mapping the unique name of the node to the
  // node object
  protected ConcurrentMap<String, ClusterNode> nameToNode =
    new ConcurrentHashMap<String, ClusterNode>();

  // secondary indices maintained for each resource type; keys are interned
  // strings, so an IdentityHashMap is safe here
  protected IdentityHashMap<String, RunnableIndices> typeToIndices =
    new IdentityHashMap<String, RunnableIndices>();

  protected TopologyCache topologyCache;
  protected Map<Integer, Map<String, Integer>> cpuToResourcePartitioning;
  protected volatile boolean shutdown = false;

  // monitoring for node death/hang
  protected static int nodeExpiryInterval;
  protected Thread expireNodesThread = null;
  ExpireNodes expireNodes = new ExpireNodes();

  public boolean existRunnableNodes() {
    for (Map.Entry<String, RunnableIndices> entry : typeToIndices.entrySet()) {
      RunnableIndices r = entry.getValue();
      if (r.existRunnableNodes()) {
        return true;
      }
    }
    return false;
  }

  public boolean existRunnableNodes(String type) {
    RunnableIndices r = typeToIndices.get(type);
    return r.existRunnableNodes();
  }

  /**
   * Find the best matching node for this host, subject to the maxLevel
   * constraint: first host-local, then rack-local, then any node.
   */
  public ClusterNode getRunnableNode(String host, LocalityLevel maxLevel,
      String type, Set<String> excluded) {
    ClusterNode node = null;
    RunnableIndices r = typeToIndices.get(type);

    // find host local
    if (host != null) {
      node = r.getRunnableNodeForHost(host);
    }

    // find rack local if required and allowed
    if (node == null) {
      if ((host != null) && (maxLevel.compareTo(LocalityLevel.NODE) > 0)) {
        Node rack = topologyCache.getNode(host).getParent();
        node = r.getRunnableNodeForRack(rack, excluded);
      }
    }

    // find any node if required and allowed
    if ((node == null) && (maxLevel.compareTo(LocalityLevel.RACK) > 0)) {
      node = r.getRunnableNodeForAny(excluded);
    }

    return node;
  }
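  // Illustrative (hypothetical) call, assuming "MAP" is one of the configured
  // resource types: with maxLevel = LocalityLevel.RACK the lookup may fall
  // back from host-local to rack-local placement but never goes off-rack,
  // and a null excluded set means no hosts are excluded:
  //
  //   ClusterNode target = nodeManager.getRunnableNode(
  //     "host1.example.com", LocalityLevel.RACK, "MAP", null);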
  protected void addNode(ClusterNode node) {
    synchronized (node) {
      // 1: update primary index
      nameToNode.put(node.getName(), node);
      clusterManager.getMetrics().setAliveNodes(nameToNode.size());

      // 2: update runnable indices
      for (Map.Entry<String, RunnableIndices> entry : typeToIndices.entrySet()) {
        String type = entry.getKey();
        if (node.checkForGrant(Utilities.getUnitResourceRequest(type))) {
          RunnableIndices r = entry.getValue();
          r.addRunnable(node);
        }
      }
    }
  }

  public Set<ClusterNode.GrantId> deleteNode(String nodeName) {
    ClusterNode node = nameToNode.get(nodeName);
    if (node == null) {
      LOG.warn("Trying to delete non-existent node: " + nodeName);
      return null;
    }
    return deleteNode(node);
  }

  protected Set<ClusterNode.GrantId> deleteNode(ClusterNode node) {
    synchronized (node) {
      if (node.deleted) {
        return null;
      }
      node.deleted = true;

      // 1: update primary index
      nameToNode.remove(node.getName());
      clusterManager.getMetrics().setAliveNodes(nameToNode.size());

      // 2: update runnable indices
      for (RunnableIndices r : typeToIndices.values()) {
        r.deleteRunnable(node);
      }
      return node.getGrants();
    }
  }

  public void cancelGrant(String nodeName, String sessionId, int requestId) {
    ClusterNode node = nameToNode.get(nodeName);
    if (node == null) {
      LOG.warn("Canceling grant for non-existent node: " + nodeName);
      return;
    }
    synchronized (node) {
      if (node.deleted) {
        LOG.warn("Canceling grant for deleted node: " + nodeName);
        return;
      }
      ResourceRequest req = node.getRequestForGrant(sessionId, requestId);
      if (req != null) {
        ResourceRequest unitReq = Utilities.getUnitResourceRequest(req.type);
        boolean previouslyRunnable = node.checkForGrant(unitReq);
        node.cancelGrant(sessionId, requestId);
        // canceling the grant may have freed enough resources to make the
        // node runnable again for this type
        if (!previouslyRunnable && node.checkForGrant(unitReq)) {
          RunnableIndices r = typeToIndices.get(req.type);
          r.addRunnable(node);
        }
      }
    }
  }

  public boolean addGrant(ClusterNode node, String sessionId,
      ResourceRequest req) {
    synchronized (node) {
      if (node.deleted) {
        return false;
      }
      node.addGrant(sessionId, req);
      // if the node can no longer fit a unit request of this type, it is
      // no longer runnable for the type
      if (!node.checkForGrant(Utilities.getUnitResourceRequest(req.type))) {
        RunnableIndices r = typeToIndices.get(req.type);
        r.deleteRunnable(node);
      }
    }
    return true;
  }

  public NodeManager(ClusterManager clusterManager) {
    this.clusterManager = clusterManager;
    this.expireNodesThread = new Thread(this.expireNodes, "expireNodes");
    this.expireNodesThread.setDaemon(true);
    this.expireNodesThread.start();
  }

  @Override
  public void setConf(Configuration newConf) {
    this.conf = (CoronaConf) newConf;
    nodeExpiryInterval = conf.getNodeExpiryInterval();
    if (this.expireNodesThread != null) {
      this.expireNodesThread.interrupt();
    }
    topologyCache = new TopologyCache(conf);
    cpuToResourcePartitioning = conf.getCpuToResourcePartitioning();
    for (Map.Entry<Integer, Map<String, Integer>> entry :
        cpuToResourcePartitioning.entrySet()) {
      for (String type : entry.getValue().keySet()) {
        // intern the type so that lookups in the IdentityHashMap work
        type = type.intern();
        if (typeToIndices.get(type) == null) {
          typeToIndices.put(type, new RunnableIndices(type));
        }
      }
    }
    nodeReservedMemoryMB = conf.getNodeReservedMemoryMB();
    nodeReservedDiskGB = conf.getNodeReservedDiskGB();
  }

  @Override
  public Configuration getConf() {
    return conf;
  }

  /**
   * Process a node heartbeat. Returns true if a new node has been added,
   * false otherwise.
   */
  public boolean heartbeat(ClusterNodeInfo clusterNodeInfo) {
    boolean newNode = false;
    ClusterNode node = nameToNode.get(clusterNodeInfo.name);
    if (node == null) {
      LOG.info("Adding node with heartbeat: " + clusterNodeInfo.toString());
      node = new ClusterNode(clusterNodeInfo,
        topologyCache.getNode(clusterNodeInfo.address.host),
        cpuToResourcePartitioning);
      addNode(node);
      newNode = true;
    }
    node.heartbeat();
    return newNode;
  }
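  // The thread below implements the liveness check: it wakes up every
  // nodeExpiryInterval/2 ms and times out any node whose last heartbeat is
  // older than nodeExpiryInterval. For example (illustrative value, not a
  // default), with nodeExpiryInterval = 60000 the check runs every 30s and
  // a node silent for more than 60s is reported to the ClusterManager.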
  class ExpireNodes implements Runnable {

    @Override
    public void run() {
      while (!shutdown) {
        try {
          Thread.sleep(nodeExpiryInterval / 2);

          long now = ClusterManager.clock.getTime();
          for (ClusterNode node : nameToNode.values()) {
            if (now - node.lastHeartbeatTime > nodeExpiryInterval) {
              LOG.warn("Timing out node: " + node.getName());
              clusterManager.nodeTimeout(node.getName());
            }
          }
        } catch (InterruptedException iex) {
          // ignore. if shutting down, the while condition will catch it
        } catch (Exception e) {
          LOG.error("Node expiry thread got exception: " +
            StringUtils.stringifyException(e));
        }
      }
    }
  }

  public List<String> getResourceTypes() {
    List<String> ret = new ArrayList<String>();
    ret.addAll(typeToIndices.keySet());
    return ret;
  }

  public int getMaxCpuForType(String type) {
    int total = 0;
    for (ClusterNode node : nameToNode.values()) {
      synchronized (node) {
        if (node.deleted) {
          continue;
        }
        total += node.getMaxCpuForType(type);
      }
    }
    return total;
  }

  public int getAllocatedCpuForType(String type) {
    int total = 0;
    for (ClusterNode node : nameToNode.values()) {
      synchronized (node) {
        if (node.deleted) {
          continue;
        }
        total += node.getAllocatedCpuForType(type);
      }
    }
    return total;
  }

  public int getTotalNodeCount() {
    return nameToNode.size();
  }

  public void setNodeReservedMemoryMB(int mb) {
    LOG.info("nodeReservedMemoryMB changed from " + nodeReservedMemoryMB +
      " to " + mb);
    this.nodeReservedMemoryMB = mb;
  }

  public void setNodeReservedDiskGB(int gb) {
    LOG.info("nodeReservedDiskGB changed from " + nodeReservedDiskGB +
      " to " + gb);
    this.nodeReservedDiskGB = gb;
  }
}
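// Hypothetical usage sketch (caller name assumed, not part of this file):
// because nodeReservedMemoryMB and nodeReservedDiskGB are volatile, they can
// be tuned at runtime and take effect on the next hasEnoughResource() check
// without extra locking, e.g.:
//
//   nodeManager.setNodeReservedMemoryMB(2048); // keep >= 2 GB memory free
//   nodeManager.setNodeReservedDiskGB(10);     // keep >= 10 GB disk free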