/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.corona; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.EnumMap; import java.util.EnumSet; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configurable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.net.Node; import org.apache.hadoop.net.TopologyCache; import org.apache.hadoop.util.CoronaSerializer; import org.apache.hadoop.util.HostsFileReader; import org.codehaus.jackson.JsonGenerator; import org.codehaus.jackson.JsonToken; /** * Manages all the nodes known in the cluster. */ public class NodeManager implements Configurable { /** Class logger */ public static final Log LOG = LogFactory.getLog(NodeManager.class); /** Configuration. */ protected CoronaConf conf; /** The Cluster Manager. */ protected ClusterManager clusterManager; /** * Secondary index on nodes. 
 * This is an index of runnable nodes for a resource
 * type. There is one instance of this for each resource type.
 *
 * Thread-safety: the maps are concurrent; mutations of an individual
 * NodeContainer are guarded by synchronizing on that container instance.
 */
public class RunnableIndices {
  /** Controls how frequently we shuffle the list of rack-runnable nodes. */
  private static final int RACK_SHUFFLE_PERIOD = 100;
  /** The lookup table of requested node for host */
  protected ConcurrentMap<String, RequestedNode> hostToRequestedNode =
      new ConcurrentHashMap<String, RequestedNode>();
  /** The lookup table of runnable nodes on hosts */
  protected ConcurrentMap<String, NodeContainer> hostToRunnableNodes =
      new ConcurrentHashMap<String, NodeContainer>();
  /** The lookup table of runnable nodes in racks */
  protected ConcurrentMap<Node, NodeContainer> rackToRunnableNodes =
      new ConcurrentHashMap<Node, NodeContainer>();
  /** Number of nodes that are still runnable */
  private AtomicInteger hostsWithRunnableNodes = new AtomicInteger(0);
  /** The type of resource this RunnableIndices is tracking */
  private final ResourceType type;
  /**
   * Counter for checking if we need to shuffle the list of rack-runnable
   * nodes.
   */
  private int getRunnableNodeForRackCounter = 0;

  /**
   * Create a runnable indices for a given resource type
   * @param type the type of resource
   */
  public RunnableIndices(ResourceType type) {
    this.type = type;
  }

  /**
   * Get any runnable node that is not one of the excluded nodes
   * @param excluded the list of nodes to ignore
   * @return the runnable node, null if no runnable node can be found
   */
  public ClusterNode getRunnableNodeForAny(Set<String> excluded) {
    double avgLoad = loadManager.getAverageLoad(type);
    // Make two passes over the nodes. In the first pass, try to find a
    // node that has lower than average number of grants on it. If that does
    // not find a node, try looking at all nodes.
    for (int pass = 0; pass < 2; pass++) {
      for (Map.Entry<String, NodeContainer> e :
             hostToRunnableNodes.entrySet()) {
        NodeContainer nodeContainer = e.getValue();
        if (nodeContainer == null) {
          continue;
        }
        synchronized (nodeContainer) {
          if (nodeContainer.isEmpty()) {
            continue;
          }
          for (ClusterNode node : nodeContainer) {
            if (excluded == null || !excluded.contains(node.getHost())) {
              if (resourceLimit.hasEnoughResource(node)) {
                // When pass == 0, try to average out the load.
                if (pass == 0) {
                  if (node.getGrantCount(type) < avgLoad) {
                    return node;
                  }
                } else {
                  return node;
                }
              }
            }
          }
        }
      }
    }
    return null;
  }

  /**
   * Get runnable node local to the given host
   * @param requestedNode the requested node that needs local scheduling
   * @return the node that is local to the host, null if
   *         there are no runnable nodes local to the host
   */
  public ClusterNode getRunnableNodeForHost(RequestedNode requestedNode) {
    // there should only be one node per host in the common case
    NodeContainer nodeContainer = requestedNode.getHostNodes();
    if (nodeContainer == null) {
      return null;
    }
    synchronized (nodeContainer) {
      if (nodeContainer.isEmpty()) {
        return null;
      }
      for (ClusterNode node : nodeContainer) {
        if (resourceLimit.hasEnoughResource(node)) {
          return node;
        }
      }
    }
    return null;
  }

  /**
   * Get a runnable node in the given rack that is not present in the
   * excluded list
   * @param requestedNode the node to look up rack locality for
   * @param excluded the list of nodes to ignore
   * @return the runnable node from the rack satisfying conditions, null if
   *         the node was not found
   */
  public ClusterNode getRunnableNodeForRack(
      RequestedNode requestedNode, Set<String> excluded) {
    NodeContainer nodeContainer = requestedNode.getRackNodes();
    // NOTE(review): the counter is incremented without synchronization;
    // only used to decide when to shuffle, so losing an update is benign.
    getRunnableNodeForRackCounter += 1;
    if (nodeContainer == null) {
      return null;
    }
    synchronized (nodeContainer) {
      if (nodeContainer.isEmpty()) {
        return null;
      }
      if (getRunnableNodeForRackCounter % RACK_SHUFFLE_PERIOD == 0) {
        // This balances more evenly across nodes in a rack
        nodeContainer.shuffle();
      }
      for (ClusterNode node : nodeContainer) {
        if (excluded == null || !excluded.contains(node.getHost())) {
          if (resourceLimit.hasEnoughResource(node)) {
            return node;
          }
        }
      }
    }
    return null;
  }

  /**
   * Check if there are any runnable nodes
   * @return true if there are any runnable nodes, false otherwise
   */
  public boolean existRunnableNodes() {
    return hostsWithRunnableNodes.get() > 0;
  }

  /**
   * Return an existing NodeContainer representing the node or if it
   * does not exist - create a new NodeContainer and return it.
   *
   * @param host the host to get the node container for
   * @return the node container representing this host
   */
  private NodeContainer getOrCreateHostRunnableNode(String host) {
    NodeContainer nodeContainer = hostToRunnableNodes.get(host);
    if (nodeContainer == null) {
      // Lost-race handling: keep whichever container won putIfAbsent.
      nodeContainer = new NodeContainer();
      NodeContainer oldList =
          hostToRunnableNodes.putIfAbsent(host, nodeContainer);
      if (oldList != null) {
        nodeContainer = oldList;
      }
    }
    return nodeContainer;
  }

  /**
   * Return an existing NodeContainer representing the rack or if it
   * does not exist - create a new NodeContainer and return it.
   *
   * @param rack the rack to return the node container for
   * @return the node container representing the rack
   */
  private NodeContainer getOrCreateRackRunnableNode(Node rack) {
    NodeContainer nodeContainer = rackToRunnableNodes.get(rack);
    if (nodeContainer == null) {
      // Lost-race handling: keep whichever container won putIfAbsent.
      nodeContainer = new NodeContainer();
      NodeContainer oldList =
          rackToRunnableNodes.putIfAbsent(rack, nodeContainer);
      if (oldList != null) {
        nodeContainer = oldList;
      }
    }
    return nodeContainer;
  }

  /**
   * Return a RequestedNode for a given host.
   * Returns a RequestedNode representing a given host by either getting
   * an existing RequestedNode or creating a new one.
   *
   * @param host the host to get the RequestedNode for
   * @return the RequestedNode object representing the host
   */
  private RequestedNode getOrCreateRequestedNode(String host) {
    RequestedNode node = hostToRequestedNode.get(host);
    if (node == null) {
      NodeContainer nodeRunnables = getOrCreateHostRunnableNode(host);
      Node rack = topologyCache.getNode(host).getParent();
      NodeContainer rackRunnables = getOrCreateRackRunnableNode(rack);
      node = new RequestedNode(
          type, host, rack, nodeRunnables, rackRunnables);
      RequestedNode oldNode = hostToRequestedNode.putIfAbsent(host, node);
      if (oldNode != null) {
        node = oldNode;
      }
    }
    return node;
  }

  /**
   * Add a node to the runnable indices
   * @param clusterNode the node to add
   */
  public void addRunnable(ClusterNode clusterNode) {
    String host = clusterNode.getHost();
    if (LOG.isDebugEnabled()) {
      LOG.debug(clusterNode.getName() +
          " added to runnable list for type: " + type);
    }
    NodeContainer nodeContainer = getOrCreateHostRunnableNode(host);
    synchronized (nodeContainer) {
      nodeContainer.addNode(clusterNode);
      hostsWithRunnableNodes.incrementAndGet();
    }
    Node rack = clusterNode.hostNode.getParent();
    nodeContainer = getOrCreateRackRunnableNode(rack);
    synchronized (nodeContainer) {
      nodeContainer.addNode(clusterNode);
    }
  }

  /**
   * Remove the node from the runnable indices
   * @param node node to remove
   */
  public void deleteRunnable(ClusterNode node) {
    String host = node.getHost();
    if (LOG.isDebugEnabled()) {
      LOG.debug(node.getName() +
          " deleted from runnable list for type: " + type);
    }
    NodeContainer nodeContainer = hostToRunnableNodes.get(host);
    if (nodeContainer != null) {
      synchronized (nodeContainer) {
        if (nodeContainer.removeNode(node)) {
          // We are not removing the nodeContainer from runnable nodes map
          // since we are synchronizing operations with runnable indices
          // on it
          hostsWithRunnableNodes.decrementAndGet();
        }
      }
    }
    Node rack = node.hostNode.getParent();
    nodeContainer = rackToRunnableNodes.get(rack);
    if (nodeContainer != null) {
      synchronized (nodeContainer) {
        // We are not removing the nodeContainer from runnable nodes map
        // since we are synchronizing operations with runnable indices
        // on it
        nodeContainer.removeNode(node);
      }
    }
  }

  /**
   * Checks if a node is present as runnable in this index. Should be called
   * while holding the node lock.
   * NOTE(review): this actually checks that the node's HOST has any
   * runnable node, not that this specific node is in the container —
   * equivalent only if there is one node per host; confirm that assumption.
   * @param clusterNode The node.
   * @return A boolean indicating if the node is present.
   */
  public boolean hasRunnable(ClusterNode clusterNode) {
    String host = clusterNode.getHost();
    NodeContainer nodeContainer = hostToRunnableNodes.get(host);
    return (nodeContainer != null) && !nodeContainer.isEmpty();
  }

  /**
   * Create a snapshot of runnable nodes.
   * Copies each container under its own lock; nodeCount is accumulated
   * from host containers only.
   * @return The snapshot.
   */
  public NodeSnapshot getNodeSnapshot() {
    int nodeCount = 0;
    Map<String, NodeContainer> hostRunnables =
        new HashMap<String, NodeContainer>();
    for (Map.Entry<String, NodeContainer> entry :
           hostToRunnableNodes.entrySet()) {
      NodeContainer value = entry.getValue();
      synchronized (value) {
        if (!value.isEmpty()) {
          hostRunnables.put(entry.getKey(), value.copy());
          nodeCount += value.size();
        }
      }
    }
    Map<Node, NodeContainer> rackRunnables =
        new HashMap<Node, NodeContainer>();
    for (Map.Entry<Node, NodeContainer> entry :
           rackToRunnableNodes.entrySet()) {
      NodeContainer value = entry.getValue();
      synchronized (value) {
        if (!value.isEmpty()) {
          rackRunnables.put(entry.getKey(), value.copy());
        }
      }
    }
    return new NodeSnapshot(
        topologyCache, hostRunnables, rackRunnables, nodeCount);
  }
}

/**
 * primary data structure mapping the unique name of the
 * node to the node object
 */
protected ConcurrentMap<String, ClusterNode> nameToNode =
    new ConcurrentHashMap<String, ClusterNode>();
/** The registry of sessions running on the nodes */
protected ConcurrentMap<ClusterNode, Set<String>> hostsToSessions =
    new ConcurrentHashMap<ClusterNode, Set<String>>();
/**
 * Tracks the applications active on the node.
 */
protected ConcurrentMap<String, Map<ResourceType, String>> nameToApps =
    new ConcurrentHashMap<String, Map<ResourceType, String>>();
/** Fault manager for the nodes */
protected final FaultManager faultManager;
/** secondary indices maintained for each resource type */
protected Map<ResourceType, RunnableIndices> typeToIndices =
    new EnumMap<ResourceType, RunnableIndices>(ResourceType.class);
/** Track the load on nodes. */
protected LoadManager loadManager;
/** The cache for local node lookups */
protected TopologyCache topologyCache;
/** The configuration of resources based on the CPUs */
protected Map<Integer, Map<ResourceType, Integer>> cpuToResourcePartitioning;
/** Shutdown flag */
protected volatile boolean shutdown = false;
/** The time before the node is declared dead if it doesn't heartbeat */
protected int nodeExpiryInterval;
/** A thread running expireNodes */
protected Thread expireNodesThread = null;
/** A runnable that is responsible for expiring nodes that don't heartbeat */
private ExpireNodes expireNodes = new ExpireNodes();
/** Resource limits. */
private final ResourceLimit resourceLimit = new ResourceLimit();
/** Hosts reader. */
private final HostsFileReader hostsReader;

/**
 * NodeManager constructor given a cluster manager and a
 * {@link HostsFileReader} for includes/excludes lists
 * NOTE(review): the expire-nodes thread is started here, before
 * faultManager is assigned and before setConf() initializes
 * nodeExpiryInterval — confirm the thread tolerates that window.
 * @param clusterManager the cluster manager
 * @param hostsReader the host reader for includes/excludes
 */
public NodeManager(
    ClusterManager clusterManager, HostsFileReader hostsReader) {
  this.hostsReader = hostsReader;
  LOG.info("Included hosts: " + hostsReader.getHostNames().size() +
      " Excluded hosts: " + hostsReader.getExcludedHosts().size());
  this.clusterManager = clusterManager;
  this.expireNodesThread = new Thread(this.expireNodes, "expireNodes");
  this.expireNodesThread.setDaemon(true);
  this.expireNodesThread.start();
  this.faultManager = new FaultManager(this);
}

/**
 * Constructor for the NodeManager, used when reading back the state of
 * NodeManager from disk.
 * @param clusterManager The ClusterManager instance
 * @param hostsReader The HostsReader instance
 * @param coronaSerializer The CoronaSerializer instance, which will be used
 *                         to read JSON from disk
 * @throws IOException
 */
public NodeManager(ClusterManager clusterManager,
                   HostsFileReader hostsReader,
                   CoronaSerializer coronaSerializer)
  throws IOException {
  this(clusterManager, hostsReader);

  // Expecting the START_OBJECT token for nodeManager
  coronaSerializer.readStartObjectToken("nodeManager");
  readNameToNode(coronaSerializer);
  readHostsToSessions(coronaSerializer);
  readNameToApps(coronaSerializer);
  // Expecting the END_OBJECT token for ClusterManager
  coronaSerializer.readEndObjectToken("nodeManager");

  // topologyCache need not be serialized, it will eventually be rebuilt.
  // cpuToResourcePartitioning and resourceLimit need not be serialized,
  // they can be read from the conf.
}

/**
 * Reads the nameToNode map from the JSON stream
 * @param coronaSerializer The CoronaSerializer instance to be used to
 *                         read the JSON
 * @throws IOException
 */
private void readNameToNode(CoronaSerializer coronaSerializer)
  throws IOException {
  coronaSerializer.readField("nameToNode");
  // Expecting the START_OBJECT token for nameToNode
  coronaSerializer.readStartObjectToken("nameToNode");
  JsonToken current = coronaSerializer.nextToken();
  while (current != JsonToken.END_OBJECT) {
    // nodeName is the key, and the ClusterNode is the value here
    String nodeName = coronaSerializer.getFieldName();
    ClusterNode clusterNode = new ClusterNode(coronaSerializer);
    // First-writer-wins: keep an already-restored node for this name.
    if (!nameToNode.containsKey(nodeName)) {
      nameToNode.put(nodeName, clusterNode);
    }
    current = coronaSerializer.nextToken();
  }
  // Done with reading the END_OBJECT token for nameToNode
}

/**
 * Reads the hostsToSessions map from the JSON stream
 * NOTE(review): the JSON key is read as a host but is looked up in
 * nameToNode, which is keyed by node name; a miss would put a null key
 * into the ConcurrentHashMap and throw — confirm keys coincide here.
 * @param coronaSerializer The CoronaSerializer instance to be used to
 *                         read the JSON
 * @throws java.io.IOException
 */
private void readHostsToSessions(CoronaSerializer coronaSerializer)
  throws IOException {
  coronaSerializer.readField("hostsToSessions");
  // Expecting the START_OBJECT token for hostsToSessions
  coronaSerializer.readStartObjectToken("hostsToSessions");
  JsonToken current = coronaSerializer.nextToken();
  while (current != JsonToken.END_OBJECT) {
    String host = coronaSerializer.getFieldName();
    Set<String> sessionsSet = coronaSerializer.readValueAs(Set.class);
    hostsToSessions.put(nameToNode.get(host), sessionsSet);
    current = coronaSerializer.nextToken();
  }
}

/**
 * Reads the nameToApps map from the JSON stream
 * @param coronaSerializer The CoronaSerializer instance to be used to
 *                         read the JSON
 * @throws IOException
 */
private void readNameToApps(CoronaSerializer coronaSerializer)
  throws IOException {
  coronaSerializer.readField("nameToApps");
  // Expecting the START_OBJECT token for nameToApps
  coronaSerializer.readStartObjectToken("nameToApps");
  JsonToken current = coronaSerializer.nextToken();
  while (current != JsonToken.END_OBJECT) {
    String nodeName = coronaSerializer.getFieldName();
    // Expecting the START_OBJECT token for the Apps
    coronaSerializer.readStartObjectToken(nodeName);
    Map<String, String> appMap = coronaSerializer.readValueAs(Map.class);
    // Convert the String keys back into ResourceType enum constants.
    Map<ResourceType, String> appsOnNode =
        new HashMap<ResourceType, String>();
    for (Map.Entry<String, String> entry : appMap.entrySet()) {
      appsOnNode.put(ResourceType.valueOf(entry.getKey()),
          entry.getValue());
    }
    nameToApps.put(nodeName, appsOnNode);
    current = coronaSerializer.nextToken();
  }
}

/**
 * See if there are any runnable nodes of a given type
 * NOTE(review): throws NPE if the type was never registered in
 * typeToIndices — confirm callers only pass configured types.
 * @param type the type to look for
 * @return true if there are runnable nodes for this type, false otherwise
 */
public boolean existRunnableNodes(ResourceType type) {
  RunnableIndices r = typeToIndices.get(type);
  return r.existRunnableNodes();
}

/**
 * Create node snapshot of runnable nodes of a certain type.
 * @param type The resource type
 * @return The snapshot
 */
public NodeSnapshot getNodeSnapshot(ResourceType type) {
  return typeToIndices.get(type).getNodeSnapshot();
}

/**
 * Find the best matching node for this host subject to the maxLevel
 * constraint
 * @param host the host of the request
 * @param maxLevel the max locality level to consider
 * @param type the type of resource needed on the node
 * @param excluded the list of nodes to exclude from consideration
 * @return the runnable node satisfying the constraints
 */
public ClusterNode getRunnableNode(String host, LocalityLevel maxLevel,
    ResourceType type, Set<String> excluded) {
  // With no host preference, fall straight to the "any node" lookup.
  if (host == null) {
    RunnableIndices r = typeToIndices.get(type);
    return r.getRunnableNodeForAny(excluded);
  }
  RequestedNode node = resolve(host, type);
  return getRunnableNode(node, maxLevel, type, excluded);
}

/**
 * Get a runnable node.
 * @param requestedNode The request information.
 * @param maxLevel The maximum locality level that we can go to.
 * @param type The type of resource.
 * @param excluded The excluded nodes.
 * @return The runnable node that can be used.
 */
public ClusterNode getRunnableNode(RequestedNode requestedNode,
    LocalityLevel maxLevel, ResourceType type, Set<String> excluded) {
  ClusterNode node = null;
  RunnableIndices r = typeToIndices.get(type);
  // find host local
  node = r.getRunnableNodeForHost(requestedNode);
  if (maxLevel == LocalityLevel.NODE || node != null) {
    return node;
  }
  // find rack local
  node = r.getRunnableNodeForRack(requestedNode, excluded);
  if (maxLevel == LocalityLevel.RACK || node != null) {
    return node;
  }
  // find any node
  node = r.getRunnableNodeForAny(excluded);
  return node;
}

/**
 * Add a node to be managed.
 *
 * @param node Node to be managed
 * @param resourceInfos Mapping of the resource type to runnable indices
 */
protected void addNode(ClusterNode node,
                       Map<ResourceType, String> resourceInfos) {
  synchronized (node) {
    // 1: primary
    nameToNode.put(node.getName(), node);
    faultManager.addNode(node.getName(), resourceInfos.keySet());
    nameToApps.put(node.getName(), resourceInfos);
    hostsToSessions.put(node, new HashSet<String>());
    clusterManager.getMetrics().restartTaskTracker(1);
    setAliveDeadMetrics();

    // 2: update runnable indices
    for (Map.Entry<ResourceType, RunnableIndices> entry :
           typeToIndices.entrySet()) {
      ResourceType type = entry.getKey();
      if (resourceInfos.containsKey(type)) {
        if (node.checkForGrant(Utilities.getUnitResourceRequest(type),
            resourceLimit)) {
          RunnableIndices r = entry.getValue();
          r.addRunnable(node);
        }
      }
    }
  }
}

/**
 * Update the runnable status of a node based on resources available.
 * This checks both resources and slot availability.
 * @param node The node
 */
private void updateRunnability(ClusterNode node) {
  synchronized (node) {
    for (Map.Entry<ResourceType, RunnableIndices> entry :
           typeToIndices.entrySet()) {
      ResourceType type = entry.getKey();
      RunnableIndices r = entry.getValue();
      ResourceRequest unitReq = Utilities.getUnitResourceRequest(type);
      boolean currentlyRunnable = r.hasRunnable(node);
      boolean shouldBeRunnable = node.checkForGrant(unitReq, resourceLimit);
      // Reconcile index membership with actual grant capacity.
      if (currentlyRunnable && !shouldBeRunnable) {
        LOG.info("Node " + node.getName() + " is no longer " +
            type + " runnable");
        r.deleteRunnable(node);
      } else if (!currentlyRunnable && shouldBeRunnable) {
        LOG.info("Node " + node.getName() + " is now " + type +
            " runnable");
        r.addRunnable(node);
      }
    }
  }
}

/**
 * Register a new application on the node
 * @param node the node to register on
 * @param type the type of an application
 * @param appInfo the appInfo string for the application
 */
protected void addAppToNode(
    ClusterNode node, ResourceType type, String appInfo) {
  synchronized (node) {
    // Update primary index.
    Map<ResourceType, String> apps = nameToApps.get(node.getName());
    apps.put(type, appInfo);

    // Update runnable indices.
    for (Map.Entry<ResourceType, RunnableIndices> entry :
           typeToIndices.entrySet()) {
      if (type.equals(entry.getKey())) {
        if (node.checkForGrant(Utilities.getUnitResourceRequest(type),
            resourceLimit)) {
          RunnableIndices r = entry.getValue();
          r.addRunnable(node);
        }
      }
    }
  }
}

/**
 * Get all the sessions that have grants on the node
 * @param nodeName the name of the node
 * @return the set of session ids that are running on the node
 */
public Set<String> getNodeSessions(String nodeName) {
  ClusterNode node = nameToNode.get(nodeName);
  if (node == null) {
    LOG.warn("Trying to get the sessions for a non-existent node " +
        nodeName);
    return new HashSet<String>();
  }
  synchronized (node) {
    // Defensive copy so callers can iterate without the node lock.
    return new HashSet<String>(hostsToSessions.get(node));
  }
}

/**
 * Remove the references to the session
 * @param session the session to be deleted
 */
public void deleteSession(String session) {
  for (Set<String> sessions : hostsToSessions.values()) {
    sessions.remove(session);
  }
}

/**
 * Delete the node from the cluster. This happens when the node times out
 * or is being decommissioned.
 * @param nodeName the name of the node to remove
 * @return the list of grants that are running on the node
 */
public Set<ClusterNode.GrantId> deleteNode(String nodeName) {
  ClusterNode node = nameToNode.get(nodeName);
  if (node == null) {
    LOG.warn("Trying to delete non-existent node: " + nodeName);
    return null;
  }
  return deleteNode(node);
}

/**
 * Delete the node from the cluster. This happens when the node times out
 * or is being decommissioned.
 * @param node the node to remove
 * @return the list of grants that are running on the node
 */
protected Set<ClusterNode.GrantId> deleteNode(ClusterNode node) {
  synchronized (node) {
    // The deleted flag makes this idempotent under the node lock.
    if (node.deleted) {
      return null;
    }
    node.deleted = true;
    // 1: primary
    nameToNode.remove(node.getName());
    faultManager.deleteNode(node.getName());
    nameToApps.remove(node.getName());
    hostsToSessions.remove(node);
    setAliveDeadMetrics();

    // 2: update runnable index
    for (RunnableIndices r : typeToIndices.values()) {
      r.deleteRunnable(node);
    }
    return node.getGrants();
  }
}

/**
 * Remove one application type from the node. Happens when the daemon
 * responsible for handling this application type on the node goes down
 * @param nodeName the name of the node
 * @param type the type of the resource
 * @return the list of grants that belonged to the application on this node
 */
public Set<ClusterNode.GrantId> deleteAppFromNode(
    String nodeName, ResourceType type) {
  ClusterNode node = nameToNode.get(nodeName);
  if (node == null) {
    LOG.warn("Trying to delete type " + type +
        " from non-existent node: " + nodeName);
    return null;
  }
  return deleteAppFromNode(node, type);
}

/**
 * Remove one application type from the node. Happens when the daemon
 * responsible for handling this application type on the node goes down
 * NOTE(review): this removes the node's ENTIRE app map from nameToApps
 * rather than just the entry for {@code type}; heartbeat() later reads
 * nameToApps.get(name) without a null check — confirm this is intended.
 * @param node the node
 * @param type the type of the resource
 * @return the list of grants that belonged to the application on this node
 */
protected Set<ClusterNode.GrantId> deleteAppFromNode(
    ClusterNode node, ResourceType type) {
  synchronized (node) {
    if (node.deleted) {
      return null;
    }
    nameToApps.remove(node.getName());
    RunnableIndices r = typeToIndices.get(type);
    r.deleteRunnable(node);
    return node.getGrants(type);
  }
}

/**
 * Cancel grant on a node
 * @param nodeName the node the grant is on
 * @param sessionId the session the grant was given to
 * @param requestId the request this grant satisfied
 */
public void cancelGrant(String nodeName, String sessionId, int requestId) {
  ClusterNode node = nameToNode.get(nodeName);
  if (node == null) {
    LOG.warn("Canceling grant for non-existent node: " + nodeName);
    return;
  }
  synchronized (node) {
    if (node.deleted) {
      LOG.warn("Canceling grant for deleted node: " + nodeName);
      return;
    }
    String hoststr = node.getClusterNodeInfo().getAddress().getHost();
    if (!canAllowNode(hoststr)) {
      LOG.warn("Canceling grant for excluded node: " + hoststr);
      return;
    }
    ResourceRequestInfo req = node.getRequestForGrant(sessionId, requestId);
    if (req != null) {
      ResourceRequest unitReq = Utilities.getUnitResourceRequest(
          req.getType());
      // Capture capacity before the cancel so we only re-index the node
      // when the cancel transitions it from full to having headroom.
      boolean previouslyRunnable =
          node.checkForGrant(unitReq, resourceLimit);
      node.cancelGrant(sessionId, requestId);
      loadManager.decrementLoad(req.getType());
      if (!previouslyRunnable &&
          node.checkForGrant(unitReq, resourceLimit)) {
        RunnableIndices r = typeToIndices.get(req.getType());
        if (!faultManager.isBlacklisted(node.getName(), req.getType())) {
          r.addRunnable(node);
        }
      }
    }
  }
}

/**
 * Add a grant to a node
 * @param node the node the grant is on
 * @param sessionId the session the grant is given to
 * @param req the request this grant satisfies
 * @return true if the grant can be added to the node, false otherwise
 */
public boolean addGrant(
    ClusterNode node, String sessionId, ResourceRequestInfo req) {
  synchronized (node) {
    if (node.deleted) {
      return false;
    }
    if (!node.checkForGrant(Utilities.getUnitResourceRequest(
        req.getType()), resourceLimit)) {
      return false;
    }

    node.addGrant(sessionId, req);
    loadManager.incrementLoad(req.getType());
    hostsToSessions.get(node).add(sessionId);
    // If this grant exhausted the node's capacity for the type, take it
    // out of the runnable index.
    if (!node.checkForGrant(Utilities.getUnitResourceRequest(
        req.getType()), resourceLimit)) {
      RunnableIndices r = typeToIndices.get(req.getType());
      r.deleteRunnable(node);
    }
  }
  return true;
}

@Override
public void setConf(Configuration newConf) {
  this.conf = (CoronaConf) newConf;
  nodeExpiryInterval = conf.getNodeExpiryInterval();
  // Interrupt the expiry thread so it picks up the new interval.
  if (this.expireNodesThread != null) {
    this.expireNodesThread.interrupt();
  }
  loadManager = new LoadManager(this);
  topologyCache = new TopologyCache(conf);
  cpuToResourcePartitioning = conf.getCpuToResourcePartitioning();

  // Create a RunnableIndices for every resource type mentioned in the
  // CPU-to-resource partitioning configuration.
  for (Map.Entry<Integer, Map<ResourceType, Integer>> entry :
         cpuToResourcePartitioning.entrySet()) {
    for (ResourceType type : entry.getValue().keySet()) {
      if (!typeToIndices.containsKey(type)) {
        typeToIndices.put(type, new RunnableIndices(type));
      }
    }
  }
  resourceLimit.setConf(conf);
  faultManager.setConf(conf);
}

/**
 * This method rebuilds members related to the NodeManager instance, which
 * were not directly persisted themselves.
 * @throws IOException
 */
public void restoreAfterSafeModeRestart() throws IOException {
  if (!clusterManager.safeMode) {
    throw new IOException("restoreAfterSafeModeRestart() called while the " +
        "Cluster Manager was not in Safe Mode");
  }
  // Restoring all the ClusterNode(s)
  for (ClusterNode clusterNode : nameToNode.values()) {
    restoreClusterNode(clusterNode);
  }
  // Restoring all the RequestedNodes(s)
  for (ClusterNode clusterNode : nameToNode.values()) {
    for (ResourceRequestInfo resourceRequestInfo :
           clusterNode.grants.values()) {
      // Fix the RequestedNode(s)
      restoreResourceRequestInfo(resourceRequestInfo);
      loadManager.incrementLoad(resourceRequestInfo.getType());
    }
  }
}

/**
 * This method rebuilds members related to a ResourceRequestInfo instance,
 * which were not directly persisted themselves.
 * @param resourceRequestInfo The ResourceRequestInfo instance to be restored
 */
public void restoreResourceRequestInfo(ResourceRequestInfo
                                         resourceRequestInfo) {
  List<RequestedNode> requestedNodes = null;
  List<String> hosts = resourceRequestInfo.getHosts();
  if (hosts != null && hosts.size() > 0) {
    requestedNodes = new ArrayList<RequestedNode>(hosts.size());
    for (String host : hosts) {
      requestedNodes.add(resolve(host, resourceRequestInfo.getType()));
    }
  }
  resourceRequestInfo.nodes = requestedNodes;
}

/**
 * Rebuilds the topology reference and runnability of a single restored
 * node.
 * @param clusterNode the node to restore
 */
private void restoreClusterNode(ClusterNode clusterNode) {
  clusterNode.hostNode = topologyCache.getNode(clusterNode.getHost());
  // This will reset the lastHeartbeatTime
  clusterNode.heartbeat(clusterNode.getClusterNodeInfo());
  clusterNode.initResourceTypeToMaxCpuMap(cpuToResourcePartitioning);
  updateRunnability(clusterNode);
}

@Override
public Configuration getConf() {
  return conf;
}

/**
 * return true if a new node has been added - else return false
 * @param clusterNodeInfo the node that is heartbeating
 * @return true if this is a new node that has been added, false otherwise
 * @throws DisallowedNode if an unknown node heartbeats from an
 *         excluded host
 */
public boolean heartbeat(ClusterNodeInfo clusterNodeInfo)
  throws DisallowedNode {
  ClusterNode node = nameToNode.get(clusterNodeInfo.name);
  // Excluded hosts: known nodes still get their timestamp refreshed,
  // unknown ones are rejected outright.
  if (!canAllowNode(clusterNodeInfo.getAddress().getHost())) {
    if (node != null) {
      node.heartbeat(clusterNodeInfo);
    } else {
      throw new DisallowedNode(clusterNodeInfo.getAddress().getHost());
    }
    return false;
  }
  boolean newNode = false;
  Map<ResourceType, String> currentResources =
      clusterNodeInfo.getResourceInfos();
  if (currentResources == null) {
    currentResources = new EnumMap<ResourceType, String>(ResourceType.class);
  }

  if (node == null) {
    LOG.info("Adding node with heartbeat: " + clusterNodeInfo.toString());
    node = new ClusterNode(clusterNodeInfo,
        topologyCache.getNode(clusterNodeInfo.address.host),
        cpuToResourcePartitioning);
    addNode(node, currentResources);
    newNode = true;
  }

  node.heartbeat(clusterNodeInfo);

  // Diff the previously-known apps against the ones reported in this
  // heartbeat to find removed and added/changed application types.
  boolean appsChanged = false;
  Map<ResourceType, String> prevResources =
      nameToApps.get(clusterNodeInfo.name);
  Set<ResourceType> deletedApps = null;
  for (Map.Entry<ResourceType, String> entry : prevResources.entrySet()) {
    String newAppInfo = currentResources.get(entry.getKey());
    String oldAppInfo = entry.getValue();
    if (newAppInfo == null || !newAppInfo.equals(oldAppInfo)) {
      if (deletedApps == null) {
        deletedApps = EnumSet.noneOf(ResourceType.class);
      }
      deletedApps.add(entry.getKey());
      appsChanged = true;
    }
  }
  Map<ResourceType, String> addedApps = null;
  for (Map.Entry<ResourceType, String> entry : currentResources.entrySet()) {
    String newAppInfo = entry.getValue();
    String oldAppInfo = prevResources.get(entry.getKey());
    if (oldAppInfo == null || !oldAppInfo.equals(newAppInfo)) {
      if (addedApps == null) {
        addedApps = new EnumMap<ResourceType, String>(ResourceType.class);
      }
      addedApps.put(entry.getKey(), entry.getValue());
      appsChanged = true;
    }
  }
  if (deletedApps != null) {
    for (ResourceType deleted : deletedApps) {
      clusterManager.nodeAppRemoved(clusterNodeInfo.name, deleted);
    }
  }
  if (addedApps != null) {
    for (Map.Entry<ResourceType, String> added: addedApps.entrySet()) {
      addAppToNode(node, added.getKey(), added.getValue());
    }
  }

  updateRunnability(node);
  return newNode || appsChanged;
}

/**
 * Get information about applications running on a node.
 * @param node The node.
 * @param type The type of resources.
 * @return The application-specific information
 */
public String getAppInfo(ClusterNode node, ResourceType type) {
  Map<ResourceType, String> resourceInfos =
      nameToApps.get(node.getName());
  if (resourceInfos == null) {
    return null;
  } else {
    return resourceInfos.get(type);
  }
}

/**
 * Check if a node has enough resources.
 * @param node The node
 * @return A boolean indicating if it has enough resources.
 */
public boolean hasEnoughResource(ClusterNode node) {
  return resourceLimit.hasEnoughResource(node);
}

/**
 * Expires dead nodes.
 * Polls at half the expiry interval and asks the cluster manager to time
 * out nodes whose last heartbeat is older than nodeExpiryInterval.
 */
class ExpireNodes implements Runnable {

  @Override
  public void run() {
    while (!shutdown) {
      try {
        Thread.sleep(nodeExpiryInterval / 2);

        if (clusterManager.safeMode) {
          // Do nothing but sleep
          continue;
        }

        long now = ClusterManager.clock.getTime();
        for (ClusterNode node : nameToNode.values()) {
          if (now - node.lastHeartbeatTime > nodeExpiryInterval) {
            LOG.warn("Timing out node: " + node.getName());
            clusterManager.nodeTimeout(node.getName());
          }
        }
      } catch (InterruptedException iex) {
        // ignore. if shutting down, while cond. will catch it
        continue;
      }
    }
  }
}

/**
 * Used by the cm.jsp to get the list of resource types.
 *
 * @return Collection of resource types
 */
public Collection<ResourceType> getResourceTypes() {
  return typeToIndices.keySet();
}

/**
 * Find capacity for a resource type.
 * @param type The resource type.
 * @return The capacity.
 */
public int getMaxCpuForType(ResourceType type) {
  int total = 0;
  for (ClusterNode node: nameToNode.values()) {
    synchronized (node) {
      if (node.deleted) {
        continue;
      }
      total += node.getMaxCpuForType(type);
    }
  }
  return total;
}

/**
 * Find allocation for a resource type.
 * @param type The resource type.
 * @return The allocation.
   */
  public int getAllocatedCpuForType(ResourceType type) {
    int total = 0;
    // Sum allocations across live nodes; lock each node so the deleted flag
    // and the counter are read consistently.
    for (ClusterNode node: nameToNode.values()) {
      synchronized (node) {
        if (node.deleted) {
          continue;
        }
        total += node.getAllocatedCpuForType(type);
      }
    }
    return total;
  }

  /**
   * Get a list nodes with free Cpu for a resource type.
   * @param type The resource type.
   * @return Human-readable "name: free-resources" strings, one per node with
   *         spare capacity for the type.
   */
  public List<String> getFreeNodesForType(ResourceType type) {
    ArrayList<String> freeNodes = new ArrayList<String>();
    for (Map.Entry<String, ClusterNode> entry: nameToNode.entrySet()) {
      ClusterNode node = entry.getValue();
      synchronized (node) {
        // A node is "free" when its allocation is strictly below capacity.
        if (!node.deleted &&
            node.getMaxCpuForType(type) > node.getAllocatedCpuForType(type)) {
          freeNodes.add(entry.getKey() + ": " + node.getFree().toString());
        }
      }
    }
    return freeNodes;
  }

  /**
   * @return The total number of configured hosts.
   */
  public int getTotalNodeCount() {
    return hostsReader.getHosts().size();
  }

  /**
   * @return All the configured hosts.
   */
  public Set<String> getAllNodes() {
    return hostsReader.getHostNames();
  }

  /**
   * @return The number of excluded hosts.
   */
  public int getExcludedNodeCount() {
    return hostsReader.getExcludedHosts().size();
  }

  /**
   * @return The excluded hosts.
   */
  public Set<String> getExcludedNodes() {
    return hostsReader.getExcludedHosts();
  }

  /**
   * @return The number of alive nodes.
   */
  public int getAliveNodeCount() {
    return nameToNode.size();
  }

  /**
   * @return The names of the alive nodes (snapshot copy).
   */
  public List<String> getAliveNodes() {
    return new ArrayList<String>(nameToNode.keySet());
  }

  /**
   * @return The alive nodes (snapshot copy).
   */
  public List<ClusterNode> getAliveClusterNodes() {
    return new ArrayList<ClusterNode>(nameToNode.values());
  }

  /**
   * @return The fault manager.
   */
  public FaultManager getFaultManager() {
    return faultManager;
  }

  /**
   * Refresh the includes/excludes information from the hosts files, and make
   * newly excluded (or no-longer-included) nodes non-runnable for every
   * resource type.
   * @throws IOException
   */
  public synchronized void refreshNodes() throws IOException {
    hostsReader.refresh();
    LOG.info("After refresh Included hosts: " +
        hostsReader.getHostNames().size() +
        " Excluded hosts: " + hostsReader.getExcludedHosts().size());
    Set<String> newHosts = hostsReader.getHostNames();
    Set<String> newExcludes = hostsReader.getExcludedHosts();
    // Collect first, then mutate, to avoid changing runnable indices while
    // scanning the node map.
    Set<ClusterNode> hostsToExclude = new HashSet<ClusterNode>();
    for (ClusterNode tmpNode : nameToNode.values()) {
      String host = tmpNode.getHost();
      // Check if not included or explicitly excluded.
      if (!newHosts.contains(host) || newExcludes.contains(host)) {
        hostsToExclude.add(tmpNode);
      }
    }
    for (ClusterNode node: hostsToExclude) {
      synchronized (node) {
        // Remove the node from the runnable index of every resource type it
        // is currently runnable for.
        for (Map.Entry<ResourceType, RunnableIndices> entry :
            typeToIndices.entrySet()) {
          ResourceType type = entry.getKey();
          RunnableIndices r = entry.getValue();
          if (r.hasRunnable(node)) {
            LOG.info("Node " + node.getName() + " is no longer " + type +
                " runnable because it is excluded");
            r.deleteRunnable(node);
          }
        }
      }
    }
  }

  /**
   * Process feedback about nodes.
   * @param handle The session handle.
   * @param resourceTypes The types of resource this feedback is about.
   * @param reportList The list of reports.
   */
  public void nodeFeedback(
      String handle,
      List<ResourceType> resourceTypes,
      List<NodeUsageReport> reportList) {
    // Iterate over each report and forward it to the fault manager, which
    // tracks per-node health.
    for (NodeUsageReport usageReport : reportList) {
      faultManager.nodeFeedback(usageReport.getNodeName(), resourceTypes,
          usageReport);
    }
  }

  /**
   * Blacklist a resource on a node: updates the blacklist metric and removes
   * the resource type's app from the node.
   * @param nodeName The node name
   * @param resourceType The resource type.
   */
  void blacklistNode(String nodeName, ResourceType resourceType) {
    LOG.info("Node " + nodeName + " has been blacklisted for resource " +
        resourceType);
    clusterManager.getMetrics().setBlacklistedNodes(
        faultManager.getBlacklistedNodeCount());
    deleteAppFromNode(nodeName, resourceType);
  }

  /**
   * Checks if a host is allowed to communicate with the cluster manager.
   *
   * @param host
   *          The host
   * @return a boolean indicating if the host is allowed.
   */
  private boolean canAllowNode(String host) {
    return hostsReader.isAllowedHost(host);
  }

  /**
   * Update metrics for alive/dead nodes.
   */
  private void setAliveDeadMetrics() {
    clusterManager.getMetrics().setAliveNodes(nameToNode.size());
    int totalHosts = hostsReader.getHosts().size();
    // Only report dead nodes when a host list is configured; otherwise the
    // subtraction would be meaningless (negative).
    if (totalHosts > 0) {
      clusterManager.getMetrics().setDeadNodes(
          totalHosts - nameToNode.size());
    }
  }

  /**
   * Resolve a host name.
   * @param host The host.
   * @param type The resource type.
   * @return The resolved form (cached per type in its RunnableIndices).
   */
  public RequestedNode resolve(String host, ResourceType type) {
    RunnableIndices indices = typeToIndices.get(type);
    return indices.getOrCreateRequestedNode(host);
  }

  /**
   * @return The resource limit checker.
   */
  public ResourceLimit getResourceLimit() {
    return resourceLimit;
  }

  /**
   * This is required when we come out of safe mode, and we need to reset
   * the lastHeartbeatTime for each node so they are not immediately expired.
   */
  public void resetNodesLastHeartbeatTime() {
    long now = ClusterManager.clock.getTime();
    for (ClusterNode node : nameToNode.values()) {
      node.lastHeartbeatTime = now;
    }
  }

  /**
   * This method writes the state of the NodeManager to disk
   * @param jsonGenerator The instance of JsonGenerator, which will be used to
   *                      write JSON to disk
   * @throws IOException
   */
  public void write(JsonGenerator jsonGenerator) throws IOException {
    jsonGenerator.writeStartObject();

    // nameToNode begins
    jsonGenerator.writeFieldName("nameToNode");
    jsonGenerator.writeStartObject();
    for (Map.Entry<String, ClusterNode> entry : nameToNode.entrySet()) {
      jsonGenerator.writeFieldName(entry.getKey());
      // Each ClusterNode serializes itself.
      entry.getValue().write(jsonGenerator);
    }
    jsonGenerator.writeEndObject();
    // nameToNode ends

    // hostsToSessions begins
    // We create a new Map of type <ClusterNode.name, Set<SessionIds>>.
    // The original hostsToSessions map has the ClusterNode as its key, and
    // we do not need to persist the entire ClusterNode again, since we have
    // already done that with nameToNode.
    Map<String, Set<String>> hostsToSessionsMap =
      new HashMap<String, Set<String>>();
    for (Map.Entry<ClusterNode, Set<String>> entry :
      hostsToSessions.entrySet()) {
      hostsToSessionsMap.put(entry.getKey().getName(),
        entry.getValue());
    }
    jsonGenerator.writeObjectField("hostsToSessions", hostsToSessionsMap);
    // hostsToSessions ends

    jsonGenerator.writeObjectField("nameToApps", nameToApps);

    // faultManager is not required

    // We can rebuild the loadManager

    jsonGenerator.writeEndObject();
  }
}