/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.corona; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; import java.util.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.http.HttpServer; import org.apache.hadoop.mapred.Clock; import org.apache.hadoop.net.NetUtils; import org.apache.hadoop.util.CoronaSerializer; import org.apache.hadoop.util.HostsFileReader; import org.apache.thrift.TApplicationException; import org.apache.thrift.TException; import org.codehaus.jackson.JsonGenerator; /** * Manager of all the resources of the cluster. */ public class ClusterManager implements ClusterManagerService.Iface { /** Class logger */ public static final Log LOG = LogFactory.getLog(ClusterManager.class); /** Clock that is used for any general purpose system times. */ public static Clock clock = new Clock(); /** * The threshold to control if generating a log to say * somebody try to delete a number of active sessions */ public static final int KILL_SESSIONS_THRESHOLD = 2; /** Node manager manages collections of nodes */ protected NodeManager nodeManager; /** Session manager manages collections of sessions */ protected SessionManager sessionManager; /** Sessions history manager */ protected SessionHistoryManager sessionHistoryManager; /** http server */ protected HttpServer infoServer; /** Scheduler service matches free nodes to runnable sessions */ protected Scheduler scheduler; /** * The session notifier asynchronously notifies sessions about * various events */ protected SessionNotifier sessionNotifier; /** Metrics for the cluster manager */ protected ClusterManagerMetrics metrics; /** Configuration */ protected CoronaConf conf; /** Start time to show in UI. */ protected long startTime; /** When was the CM last restarted (either safely or otherwise) */ protected long lastRestartTime; /** Host name to show in UI. */ protected String hostName; /** Legal values for the "type" of a resource request. */ protected Set<ResourceType> legalTypeSet = EnumSet.noneOf(ResourceType.class); /** Is the Cluster Manager in Safe Mode */ protected volatile boolean safeMode; /** the thread to restart all the task trackers */ protected CoronaNodeRestarter nodeRestarter; /** * Simple constructor for testing help. */ public ClusterManager() { } /** * Primary constructor. * * @param conf Configuration to be used * @param recoverFromDisk True if we are restarting after going down while * in Safe Mode * @throws IOException */ public ClusterManager(Configuration conf, boolean recoverFromDisk) throws IOException { this(new CoronaConf(conf), recoverFromDisk); } /** * Constructor for ClusterManager, when it is not specified if we are * restarting after persisting the state. In this case we assume the * recoverFromDisk flag to be false. * * @param conf Configuration to be used * @throws IOException */ public ClusterManager(Configuration conf) throws IOException { this(new CoronaConf(conf), false); } /** * Construct ClusterManager given {@link CoronaConf} * * @param conf the configuration for the ClusterManager * @param recoverFromDisk true if we are restarting after going down while * in Safe Mode * @throws IOException */ public ClusterManager(CoronaConf conf, boolean recoverFromDisk) throws IOException { this.conf = conf; HostsFileReader hostsReader = new HostsFileReader(conf.getHostsFile(), conf.getExcludesFile()); initLegalTypes(); metrics = new ClusterManagerMetrics(getTypes()); if (recoverFromDisk) { recoverClusterManagerFromDisk(hostsReader); } else { File stateFile = new File(conf.getCMStateFile()); if (stateFile.exists()) { throw new IOException("State file " + stateFile.getAbsolutePath() + " exists, but recoverFromDisk is not set, delete the state file first"); } LOG.info("Starting Cluster Manager with clean state"); startTime = clock.getTime(); lastRestartTime = startTime; nodeManager = new NodeManager(this, hostsReader); nodeManager.setConf(conf); sessionManager = new SessionManager(this); sessionNotifier = new SessionNotifier(sessionManager, this, metrics); } sessionManager.setConf(conf); sessionNotifier.setConf(conf); sessionHistoryManager = new SessionHistoryManager(); sessionHistoryManager.setConf(conf); scheduler = new Scheduler(nodeManager, sessionManager, sessionNotifier, getTypes(), metrics, conf); scheduler.start(); metrics.registerUpdater(scheduler, sessionNotifier); InetSocketAddress infoSocAddr = NetUtils.createSocketAddr(conf.getClusterManagerHttpAddress()); infoServer = new HttpServer("cm", infoSocAddr.getHostName(), infoSocAddr.getPort(), infoSocAddr.getPort() == 0, conf); infoServer.setAttribute("cm", this); infoServer.start(); hostName = infoSocAddr.getHostName(); // We have not completely restored the nodeManager, sessionManager and the // sessionNotifier if (recoverFromDisk) { nodeManager.restoreAfterSafeModeRestart(); sessionManager.restoreAfterSafeModeRestart(); sessionNotifier.restoreAfterSafeModeRestart(); } nodeRestarter = new CoronaNodeRestarter(conf, nodeManager); nodeRestarter.start(); setSafeMode(false); } /** * This method is used when the ClusterManager is restarting after going down * while in Safe Mode. It starts the process of recovering the original * CM state by reading back the state in JSON form. * @param hostsReader The HostsReader instance * @throws IOException */ private void recoverClusterManagerFromDisk(HostsFileReader hostsReader) throws IOException { LOG.info("Restoring state from " + new java.io.File(conf.getCMStateFile()).getAbsolutePath()); // This will prevent the expireNodes and expireSessions threads from // expiring the nodes and sessions respectively safeMode = true; LOG.info("Safe mode is now: " + (this.safeMode ? "ON" : "OFF")); CoronaSerializer coronaSerializer = new CoronaSerializer(conf); // Expecting the START_OBJECT token for ClusterManager coronaSerializer.readStartObjectToken("ClusterManager"); coronaSerializer.readField("startTime"); startTime = coronaSerializer.readValueAs(Long.class); coronaSerializer.readField("nodeManager"); nodeManager = new NodeManager(this, hostsReader, coronaSerializer); nodeManager.setConf(conf); coronaSerializer.readField("sessionManager"); sessionManager = new SessionManager(this, coronaSerializer); coronaSerializer.readField("sessionNotifier"); sessionNotifier = new SessionNotifier(sessionManager, this, metrics, coronaSerializer); // Expecting the END_OBJECT token for ClusterManager coronaSerializer.readEndObjectToken("ClusterManager"); lastRestartTime = clock.getTime(); } /** * Prepare the legal types allowed based on the resources available */ protected void initLegalTypes() { Map<Integer, Map<ResourceType, Integer>> cpuToResourcePartitioning = conf.getCpuToResourcePartitioning(); for (Map.Entry<Integer, Map<ResourceType, Integer>> entry : cpuToResourcePartitioning.entrySet()) { for (ResourceType type : entry.getValue().keySet()) { legalTypeSet.add(type); } } legalTypeSet = Collections.unmodifiableSet(legalTypeSet); } public ClusterManagerMetrics getMetrics() { return metrics; } public SessionNotifier getSessionNotifier() { return sessionNotifier; } public SessionManager getSessionManager() { return sessionManager; } public NodeManager getNodeManager() { return nodeManager; } public Scheduler getScheduler() { return scheduler; } public Collection<ResourceType> getTypes() { return Collections.unmodifiableCollection(legalTypeSet); } /** * This is a helper method which simply checks if the safe mode flag is * turned on. If it is, the method which was called, cannot be executed * and, a SafeModeException is thrown. * @param methodName * @throws SafeModeException */ private void checkSafeMode(String methodName) throws SafeModeException { if (safeMode) { LOG.info(methodName + "() called while ClusterManager is in Safe Mode"); throw new SafeModeException(); } } @Override public PoolInfoStrings getActualPoolInfo(ActualPoolInfoArgs actualPoolInfoArgs) throws TException, InvalidPoolInfo, SafeModeException { checkSafeMode("getActualPoolInfo"); PoolInfoStrings userSpecifiedPoolInfo = actualPoolInfoArgs.poolInfoString; long jobSizeInfo = actualPoolInfoArgs.jobInputSize; ConfigManager configManager = getScheduler().getConfigManager(); PoolInfo poolInfo = PoolInfo.createPoolInfo(userSpecifiedPoolInfo); // Get Redirect pool info PoolInfo redirectedPoolInfo = configManager.getRedirect(poolInfo, jobSizeInfo); // Validate redirected pool information try { PoolGroupManager.checkPoolInfoIfStrict(redirectedPoolInfo, configManager, conf); } catch (InvalidSessionHandle ex) { throw new InvalidPoolInfo(ex.getHandle()); } PoolInfoStrings actualPoolInfo = PoolInfo.createPoolInfoStrings(redirectedPoolInfo); return actualPoolInfo; } @Override public ActualPoolInfoResponse getActualPoolInfoV2(ActualPoolInfoArgs actualPoolInfoArgs) throws InvalidPoolInfo, SafeModeException, TException { checkSafeMode("getActualPoolInfoV2"); PoolInfoStrings userSpecifiedPoolInfo = actualPoolInfoArgs.poolInfoString; long jobSizeInfo = actualPoolInfoArgs.jobInputSize; ConfigManager configManager = getScheduler().getConfigManager(); PoolInfo poolInfo = PoolInfo.createPoolInfo(userSpecifiedPoolInfo); // Get Redirect pool info PoolInfo redirectedPoolInfo = configManager.getRedirect(poolInfo, jobSizeInfo); // Validate redirected pool information try { PoolGroupManager.checkPoolInfoIfStrict(redirectedPoolInfo, configManager, conf); } catch (InvalidSessionHandle ex) { throw new InvalidPoolInfo(ex.getHandle()); } PoolInfoStrings actualPoolInfo = PoolInfo.createPoolInfoStrings(redirectedPoolInfo); ActualPoolInfoResponse actualPoolInfoResponse = new ActualPoolInfoResponse(); actualPoolInfoResponse.poolInfoString = actualPoolInfo; actualPoolInfoResponse.whitelist = configManager.getWhitelist(redirectedPoolInfo); return actualPoolInfoResponse; } @Override public String getNextSessionId() throws SafeModeException { checkSafeMode("getNextSessionId"); return sessionManager.getNextSessionId(); } @Override public SessionRegistrationData sessionStart(String handle, SessionInfo info) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("sessionStart"); String sessionLogPath = sessionHistoryManager.getLogPath(handle); Session session = sessionManager.addSession(handle, info); return new SessionRegistrationData( session.getHandle(), new ClusterManagerInfo("", sessionLogPath), PoolInfo.createPoolInfoStrings(session.getPoolInfo())); } @Override public void sessionEnd(String handle, SessionStatus status) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("sessionEnd"); try { Session session = sessionManager.getSession(handle); InetAddress sessionAddr = session.getAddress(); LOG.info("sessionEnd called for session: " + handle + " on " + sessionAddr.getHost() + ":" + sessionAddr.getPort() + " with status: " + status); if (status == SessionStatus.TIMED_OUT) { if (session.getUrl() != null && session.getUrl().indexOf(handle) < 0) { metrics.timeoutRemoteJT(1); } } if (status == SessionStatus.FAILED_JOBTRACKER) { metrics.recordCJTFailure(); } Collection<ResourceGrant> canceledGrants = sessionManager.deleteSession(handle, status); if (canceledGrants == null) { return; } for (ResourceGrant grant: canceledGrants) { nodeManager.cancelGrant(grant.nodeName, handle, grant.id); metrics.releaseResource(grant.type); } scheduler.notifyScheduler(); sessionNotifier.deleteSession(handle); } catch (RuntimeException e) { LOG.error("Error in sessionEnd of " + handle, e); throw new TApplicationException(e.getMessage()); } } @Override public void sessionUpdateInfo(String handle, SessionInfo info) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("sessionUpdateInfo"); try { LOG.info("sessionUpdateInfo called for session: " + handle + " with info: " + info); sessionManager.heartbeat(handle); sessionManager.updateInfo(handle, info); } catch (RuntimeException e) { throw new TApplicationException(e.getMessage()); } } @Override public void sessionHeartbeat(String handle) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("sessionHeartbeat"); try { sessionManager.heartbeat(handle); } catch (RuntimeException e) { throw new TApplicationException(e.getMessage()); } } @Override public void sessionHeartbeatV2(String handle, HeartbeatArgs jtInfo) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("sessionHeartbeatV2"); try { Session session = sessionManager.getSession(handle); if (!session.checkHeartbeatInfo(jtInfo)) { sessionEnd(session.getSessionId(), SessionStatus.FAILED_JOBTRACKER); } sessionManager.heartbeatV2(handle, jtInfo); } catch (RuntimeException e) { throw new TApplicationException(e.getMessage()); } } /** * Check all the resource requests and ensure that they are legal. * * @param requestList List of resource requests to check * @return True if the resources are legal, false otherwise */ protected boolean checkResourceRequestType( List<ResourceRequest> requestList) { for (ResourceRequest req: requestList) { if (!legalTypeSet.contains(req.type)) { return false; } } return true; } protected boolean checkResourceRequestExcluded( List<ResourceRequest> requestList) { Set<String> excluded = new HashSet<String>(); for(ResourceRequest req: requestList) { if (req.getExcludeHosts() == null || req.getHosts() == null) { continue; } excluded.clear(); excluded.addAll(req.getExcludeHosts()); for (String host : req.getHosts()) { if (excluded.contains(host)) { return false; } } } return true; } /** * Count the resources requested and fail the job if they are above the limit * * @param requestList List of resource requests to check */ protected void checkResourceRequestLimit( List<ResourceRequest> requestList, String handle) throws InvalidSessionHandle { ConfigManager configManager = getScheduler().getConfigManager(); Session session = sessionManager.getSession(handle); PoolInfo poolInfo = session.getPoolInfo(); // Only check the resource requests if this pool is configured to not // accept more than a fixed number of requests at the same time if (!configManager.useRequestMax(poolInfo)) { return; } // Count the resources by type ResourceTypeCounter resourceTypeCounter = new ResourceTypeCounter(); for (ResourceRequest req : requestList) { resourceTypeCounter.incr(req.type); } // No resource type request should exceed the maximum for (ResourceType resourceType : ResourceType.values()) { if (configManager.getPoolMaximum(poolInfo, resourceType) < resourceTypeCounter.getCount(resourceType)) { String failureMessage = "Session " + handle + " requested " + resourceTypeCounter.getCount(resourceType) + " resources for resource type " + resourceType + " but was only allowed " + configManager.getPoolMaximum(poolInfo, resourceType) + ", " + "so failing the job"; LOG.error(failureMessage); throw new InvalidSessionHandle(failureMessage); } } } @Override public void requestResource(String handle, List<ResourceRequest> requestList) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("requestResource"); try { LOG.info ("Request " + requestList.size() + " resources from session: " + handle); if (!checkResourceRequestType(requestList)) { LOG.error ("Bad resource type from session: " + handle); throw new TApplicationException("Bad resource type"); } if (!checkResourceRequestExcluded(requestList)) { LOG.error("Bad excluded hosts from session: " + handle); throw new TApplicationException("Requesting excluded hosts"); } checkResourceRequestLimit(requestList, handle); sessionManager.heartbeat(handle); sessionManager.getSession(handle).setResourceRequest(requestList); List<ResourceRequestInfo> reqInfoList = new ArrayList<ResourceRequestInfo>(requestList.size()); for (ResourceRequest request : requestList) { List<String> hosts = request.getHosts(); List<RequestedNode> requestedNodes = null; if (hosts != null && hosts.size() > 0) { requestedNodes = new ArrayList<RequestedNode>(hosts.size()); for (String host : hosts) { requestedNodes.add(nodeManager.resolve(host, request.type)); } } ResourceRequestInfo info = new ResourceRequestInfo(request, requestedNodes); reqInfoList.add(info); } sessionManager.requestResource(handle, reqInfoList); for (ResourceRequest req : requestList) { metrics.requestResource(req.type); } scheduler.notifyScheduler(); } catch (RuntimeException e) { e.printStackTrace(); throw new TApplicationException(e.getMessage()); } } @Override public void releaseResource(String handle, List<Integer> idList) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("releaseResource"); try { LOG.info("Release " + idList.size() + " resources from session: " + handle); sessionManager.heartbeat(handle); Collection<ResourceGrant> canceledGrants = sessionManager.releaseResource(handle, idList); if (canceledGrants == null) { // LOG.info("No canceled grants for session " + handle); return; } for (ResourceGrant grant: canceledGrants) { nodeManager.cancelGrant(grant.nodeName, handle, grant.id); metrics.releaseResource(grant.type); } scheduler.notifyScheduler(); } catch (RuntimeException e) { throw new TApplicationException(e.getMessage()); } } @Override public NodeHeartbeatResponse nodeHeartbeat(ClusterNodeInfo node) throws TException, DisallowedNode, SafeModeException { checkSafeMode("nodeHeartbeat"); //LOG.info("heartbeat from node: " + node.toString()); if (nodeManager.heartbeat(node)) { scheduler.notifyScheduler(); } NodeHeartbeatResponse nodeHeartbeatResponse = new NodeHeartbeatResponse(); if (nodeRestarter != null && nodeRestarter.checkStatus(node)) { nodeHeartbeatResponse.setRestartFlag(true); } else { nodeHeartbeatResponse.setRestartFlag(false); } return nodeHeartbeatResponse; } @Override public void nodeFeedback(String handle, List<ResourceType> resourceTypes, List<NodeUsageReport> reportList) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("nodeFeedback"); LOG.info("Received feedback from session " + handle); nodeManager.nodeFeedback(handle, resourceTypes, reportList); } @Override public void refreshNodes() throws TException, SafeModeException { checkSafeMode("refreshNodes"); try { nodeManager.refreshNodes(); } catch (IOException e) { throw new TException(e); } } @Override public RestartNodesResponse restartNodes( RestartNodesArgs restartNodesArgs) throws TException, SafeModeException { checkSafeMode("restartNode"); LOG.info("Got request to restart all the cluster nodes with batch size " + restartNodesArgs.getBatchSize()); List<ClusterNode> allNodes = nodeManager.getAliveClusterNodes(); if (allNodes.size() > 0 && nodeRestarter != null){ nodeRestarter.add(allNodes, restartNodesArgs.isForce(), restartNodesArgs.getBatchSize()); } else { LOG.info("There is no cluster node to restart"); } RestartNodesResponse restartNodesResponse = new RestartNodesResponse(); return restartNodesResponse; } /** * Sets the Safe Mode flag on the Cluster Manager, and on the ProxyJobTracker. * If we fail to set the flag on the ProxyJobTracker, return false, which * signals that setting the flag on the ProxyJobTracker failed. In that case, * we should run coronaadmin with the -forceSetSafeModeOnPJT or * -forceUnsetSafeModeOnPJT options. * * If we call this function multiple times, it wouldn't matter, because all * operations (apart from resetting of the last heartbeat time) in this * function, and in the setClusterManagerSafeModeFlag function in the * ProxyJobTracker are idempotent. * * @param safeMode The value of Safe Mode flag that we want to be set. * @return true, if setting the Safe Mode flag succeeded, false otherwise. */ @Override public synchronized boolean setSafeMode(boolean safeMode) { /** * If we are switching off the safe mode, so we need to reset the last * heartbeat timestamp for each of the sessions and nodes. */ if (safeMode == false) { LOG.info("Resetting the heartbeat times for all sessions"); sessionManager.resetSessionsLastHeartbeatTime(); LOG.info("Resetting the heartbeat times for all nodes"); nodeManager.resetNodesLastHeartbeatTime(); /** * If we are setting the safe mode to false, we should first set it * in-memory, before we set it at the CPJT. */ this.safeMode = false; } try { ClusterManagerAvailabilityChecker.getPJTClient(conf). setClusterManagerSafeModeFlag(safeMode); } catch (IOException e) { LOG.info("Exception while setting the safe mode flag in ProxyJobTracker: " + e.getMessage()); return false; } catch (TException e) { LOG.info("Exception while setting the safe mode flag in ProxyJobTracker: " + e.getMessage()); return false; } this.safeMode = safeMode; LOG.info("Flag successfully set in ProxyJobTracker"); LOG.info("Safe mode is now: " + (this.safeMode ? "ON" : "OFF")); return true; } /** * Get the current safe mode setting. */ public boolean getSafeMode() { return safeMode; } /** * This function saves the state of the ClusterManager to disk. * @return A boolean. True if saving the state succeeded, false otherwise. */ @Override public boolean persistState() { if (!safeMode) { LOG.info( "Cannot persist state because ClusterManager is not in Safe Mode"); return false; } try { JsonGenerator jsonGenerator = CoronaSerializer.createJsonGenerator(conf); jsonGenerator.writeStartObject(); jsonGenerator.writeFieldName("startTime"); jsonGenerator.writeNumber(startTime); jsonGenerator.writeFieldName("nodeManager"); nodeManager.write(jsonGenerator); jsonGenerator.writeFieldName("sessionManager"); sessionManager.write(jsonGenerator); jsonGenerator.writeFieldName("sessionNotifier"); sessionNotifier.write(jsonGenerator); jsonGenerator.writeEndObject(); jsonGenerator.close(); } catch (IOException e) { LOG.info("Could not persist the state: ", e); return false; } return true; } @Override public List<RunningSession> getSessions() throws TException, SafeModeException { checkSafeMode("getSessions"); List<RunningSession> runningSessions = new LinkedList<RunningSession>(); Set<String> sessions = sessionManager.getSessions(); for (String sessionId : sessions) { try { Session session = sessionManager.getSession(sessionId); synchronized (session) { RunningSession runningSession = new RunningSession(session.getHandle(), session.getName(), session.getUserId(), PoolInfo.createPoolInfoStrings(session.getPoolInfo())); runningSession.setDeadline(session.getDeadline()); runningSession.setPriority(session.getInfo().getPriority()); Map<ResourceType, Integer> runningResources = new EnumMap<ResourceType, Integer>(ResourceType.class); for (ResourceType type : ResourceType.values()) { runningResources.put(type, session.getGrantCountForType(type)); } runningSession.setRunningResources(runningResources); runningSessions.add(runningSession); } } catch (InvalidSessionHandle invalidSessionHandle) { // This is no big deal, just means that the session has finished } } return runningSessions; } @Override public SessionInfo getSessionInfo(String handle) throws TException, InvalidSessionHandle, SafeModeException { checkSafeMode("getSessionInfo"); Session session = sessionManager.getSession(handle); return session.getInfo(); } @Override public void killSession(String sessionId) throws TException, SafeModeException { checkSafeMode("killSession"); try { LOG.info("Killing session " + sessionId); sessionEnd(sessionId, SessionStatus.KILLED); } catch (InvalidSessionHandle e) { throw new TException(e); } } @Override public void killSessions(KillSessionsArgs killSessionsArgs) throws SafeModeException, TException { StringBuilder msg = new StringBuilder(); msg.append(killSessionsArgs.who); msg.append(" killed session"); int killed = 0; for (String id: killSessionsArgs.sessionIds) { try { killSession(id); ++ killed; msg.append(" "); msg.append(id); } finally { if (killed >= KILL_SESSIONS_THRESHOLD) { LOG.info(msg); } } } } /** * This is an internal api called to tell the cluster manager that a * a particular node seems dysfunctional and that it should be removed * from the cluster. * * @param nodeName Node to be removed */ public void nodeTimeout(String nodeName) { if (nodeRestarter != null) { nodeRestarter.delete(nodeName); } Set<String> sessions = nodeManager.getNodeSessions(nodeName); Set<ClusterNode.GrantId> grantsToRevoke = nodeManager.deleteNode(nodeName); if (grantsToRevoke == null) { return; } handleRevokedGrants(nodeName, grantsToRevoke); handleDeadNode(nodeName, sessions); scheduler.notifyScheduler(); } /** * This is an internal api called to tell the cluster manager that a * particular node is excluded from the cluster. * * @param nodeName * Node to be removed */ public void nodeDecommisioned(String nodeName) { LOG.info("Node decommissioned: " + nodeName); // The logic for decommisioning is the same as that for a timeout. nodeTimeout(nodeName); } /** * This is an internal api called to tell the cluster manager that a * particular type of resource is no longer available on a node. * * @param nodeName * Name of the node on which the resource is removed. * @param type * The type of resource to be removed. */ public void nodeAppRemoved(String nodeName, ResourceType type) { Set<String> sessions = nodeManager.getNodeSessions(nodeName); Set<ClusterNode.GrantId> grantsToRevoke = nodeManager.deleteAppFromNode(nodeName, type); if (grantsToRevoke == null) { return; } Set<String> affectedSessions = new HashSet<String>(); for (String sessionHandle : sessions) { try { if (sessionManager.getSession(sessionHandle). getTypes().contains(type)) { affectedSessions.add(sessionHandle); } } catch (InvalidSessionHandle ex) { // ignore LOG.warn("Found invalid session: " + sessionHandle + " while timing out node: " + nodeName); } } handleDeadNode(nodeName, affectedSessions); handleRevokedGrants(nodeName, grantsToRevoke); scheduler.notifyScheduler(); } /** * Process the grants removed from a node. * * @param nodeName * The node name. * @param grantsToRevoke * The grants to revoke. */ private void handleRevokedGrants( String nodeName, Set<ClusterNode.GrantId> grantsToRevoke) { for (ClusterNode.GrantId grantId: grantsToRevoke) { String sessionHandle = grantId.getSessionId(); try { sessionManager.revokeResource(sessionHandle, Collections.singletonList(grantId.getRequestId())); } catch (InvalidSessionHandle e) { // ignore LOG.warn("Found invalid session: " + sessionHandle + " while timing out node: " + nodeName); } } } /** * All the sessions that had grants on this node should get notified * @param nodeName the name of the node that went dead */ private void handleDeadNode(String nodeName, Set<String> sessions) { LOG.info("Notify sessions: " + sessions + " about dead node " + nodeName); for (String session : sessions) { sessionNotifier.notifyDeadNode(session, nodeName); } } public long getStartTime() { return startTime; } /** * Returns the last time the CM was restarted either safely, or otherwise. * @return Milliseconds since last restart */ public long getLastRestartTime() { return lastRestartTime; } public String getHostName() { return hostName; } }