/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.corona;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Aggregates reports from sessions and decides if a node should be
 * blacklisted.
 *
 * Per-node fault counters are kept in {@link #nodeToFaultStats}; updates to a
 * node's counters are made while holding the monitor of that node's list.
 * Blacklisted resource types are kept per node in {@link #blacklistedNodes};
 * each per-node list is guarded by its own monitor.
 */
public class FaultManager {
  /** Logger. */
  private static final Log LOG = LogFactory.getLog(FaultManager.class);

  /**
   * Configuration. Must be provided through {@link #setConf(CoronaConf)}
   * before any node feedback is processed, since the threshold checks
   * dereference it.
   */
  private CoronaConf conf;

  /** Reference to the Node Manager. */
  private final NodeManager nm;

  /** Map of nodeName -> list of fault statistics. */
  private final ConcurrentMap<String, List<FaultStatsForType>>
  nodeToFaultStats =
    new ConcurrentHashMap<String, List<FaultStatsForType>>();

  /** Lookup for blacklisted nodes: Node Name -> Resource Types. */
  private final ConcurrentMap<String, List<ResourceType>> blacklistedNodes =
    new ConcurrentHashMap<String, List<ResourceType>>();

  /** Fault Statistics for a resource type. */
  public static class FaultStatsForType {
    /** Type of resource. */
    private final ResourceType type;
    /** Number of sessions with failed connections to the node. */
    private int numSessionsWithFailedConnections;
    /** Number of sessions that saw too many failures on the node. */
    private int numSessionsWithTooManyFailures;

    /**
     * Constructor.
     *
     * @param type The type of the resource.
     */
    public FaultStatsForType(ResourceType type) {
      this.type = type;
    }

    /**
     * @return The type of resource these statistics describe.
     */
    public ResourceType getType() {
      return type;
    }

    /**
     * @return The number of sessions with failed connections to the node.
     */
    public int getNumSessionsWithFailedConnections() {
      return numSessionsWithFailedConnections;
    }

    /**
     * @param val The new number of sessions with failed connections.
     */
    public void setNumSessionsWithFailedConnections(int val) {
      numSessionsWithFailedConnections = val;
    }

    /**
     * @return The number of sessions that saw too many failures on the node.
     */
    public int getNumSessionsWithTooManyFailures() {
      return numSessionsWithTooManyFailures;
    }

    /**
     * @param val The new number of sessions that saw too many failures.
     */
    public void setNumSessionsWithTooManyFailures(int val) {
      numSessionsWithTooManyFailures = val;
    }
  }

  /**
   * Constructor.
   *
   * @param nm The {@link NodeManager} that is using this FaultManager.
   */
  public FaultManager(NodeManager nm) {
    this.nm = nm;
  }

  /**
   * Sets the configuration.
   *
   * @param conf The configuration.
   */
  public void setConf(CoronaConf conf) {
    this.conf = conf;
  }

  /**
   * Notify the fault manager of a new node. Any statistics previously
   * recorded under the same name are replaced by fresh, zeroed statistics.
   *
   * @param name The node name.
   * @param resourceTypes The types of resource on this node.
   */
  public void addNode(String name, Set<ResourceType> resourceTypes) {
    List<FaultStatsForType> faultStats = new ArrayList<FaultStatsForType>(
        resourceTypes.size());
    for (ResourceType type : resourceTypes) {
      faultStats.add(new FaultStatsForType(type));
    }
    nodeToFaultStats.put(name, faultStats);
  }

  /**
   * Notify the fault manager that a node has been deleted. Both the fault
   * statistics and any blacklist entries for the node are dropped.
   *
   * @param name The node name.
   */
  public void deleteNode(String name) {
    nodeToFaultStats.remove(name);
    blacklistedNodes.remove(name);
  }

  /**
   * Provide the fault manager with new feedback about a node.
   *
   * If the report exceeds a per-session threshold, the matching counter is
   * incremented for every resource type of the node that appears in
   * {@code resourceTypes}, and the node is then re-evaluated for
   * blacklisting. Feedback for a node that is not registered is logged and
   * ignored.
   *
   * @param nodeName The node name.
   * @param resourceTypes The types of resources used on the node.
   * @param usageReport The {@link NodeUsageReport} for this node.
   */
  public void nodeFeedback(String nodeName, List<ResourceType> resourceTypes,
      NodeUsageReport usageReport) {
    List<FaultStatsForType> faultStats = nodeToFaultStats.get(nodeName);
    if (faultStats == null) {
      LOG.info("Received node feedback for deleted node " + nodeName);
      return;
    }

    // The threshold checks only read the report and the configuration, so
    // evaluate them once, before taking the per-node lock.
    boolean failedConnections = tooManyFailedConnectionsInSession(usageReport);
    boolean tooManyFailures = tooManyFailuresInSession(usageReport);
    if (!failedConnections && !tooManyFailures) {
      return;
    }

    boolean statsModified = false;
    synchronized (faultStats) {
      for (FaultStatsForType stat : faultStats) {
        if (!resourceTypes.contains(stat.type)) {
          continue;
        }
        if (failedConnections) {
          stat.numSessionsWithFailedConnections++;
        }
        if (tooManyFailures) {
          stat.numSessionsWithTooManyFailures++;
        }
        statsModified = true;
      }
    }
    if (statsModified) {
      blacklistIfNeeded(nodeName, faultStats);
    }
  }

  /**
   * Gets the fault statistics for a node.
   *
   * The returned list is the internal list held by this manager, not a copy.
   *
   * @param nodeName The node name.
   * @return The list of fault statistics for the node, one element per type,
   *         or {@code null} if the node is not registered.
   */
  public List<FaultStatsForType> getFaultStats(String nodeName) {
    // The backing map is a ConcurrentMap; a single lookup needs no extra lock.
    return nodeToFaultStats.get(nodeName);
  }

  /**
   * Check if a resource on a node is blacklisted.
   *
   * @param nodeName The node name.
   * @param type The type of resource to check for blacklisting.
   * @return A boolean value that is true if blacklisted, false if not.
   */
  public boolean isBlacklisted(String nodeName, ResourceType type) {
    List<ResourceType> blacklistedResourceTypes =
      blacklistedNodes.get(nodeName);
    if (blacklistedResourceTypes == null) {
      return false;
    }
    synchronized (blacklistedResourceTypes) {
      return blacklistedResourceTypes.contains(type);
    }
  }

  /**
   * Return the number of blacklisted nodes.
   *
   * @return The number of blacklisted nodes.
   */
  public int getBlacklistedNodeCount() {
    return blacklistedNodes.size();
  }

  /**
   * Return the list of blacklisted nodes.
   *
   * @return A new list holding the names of the blacklisted nodes.
   */
  public List<String> getBlacklistedNodes() {
    return new ArrayList<String>(blacklistedNodes.keySet());
  }

  /**
   * Checks if a node needs to be blacklisted and blacklists it.
   *
   * Resource types that are already blacklisted on the node are skipped.
   *
   * @param nodeName The node name.
   * @param faultStats The fault statistics for the node.
   */
  private void blacklistIfNeeded(
      String nodeName, List<FaultStatsForType> faultStats) {
    for (FaultStatsForType stat : faultStats) {
      if (isBlacklisted(nodeName, stat.type)) {
        continue;
      }
      if (tooManyFailuresOnNode(stat) ||
          tooManyConnectionFailuresOnNode(stat)) {
        nm.blacklistNode(nodeName, stat.type);
        blacklist(nodeName, stat.type);
      }
    }
  }

  /**
   * Blacklists a resource on a node.
   *
   * @param nodeName The node name.
   * @param type The type of the resource.
   */
  private void blacklist(String nodeName, ResourceType type) {
    List<ResourceType> blacklistedResourceTypes =
      blacklistedNodes.get(nodeName);
    if (blacklistedResourceTypes == null) {
      // Install the list atomically: with a plain get-then-put, two threads
      // blacklisting different types on the same node could each create a
      // list and the later put would discard the other thread's entry.
      List<ResourceType> created = new ArrayList<ResourceType>();
      List<ResourceType> existing =
        blacklistedNodes.putIfAbsent(nodeName, created);
      blacklistedResourceTypes = (existing != null) ? existing : created;
    }
    synchronized (blacklistedResourceTypes) {
      if (!blacklistedResourceTypes.contains(type)) {
        blacklistedResourceTypes.add(type);
      }
    }
  }

  /**
   * Checks if there have been too many failures for a session on a node.
   *
   * @param usageReport The usage report.
   * @return A boolean value indicating if there were too many failures.
   */
  private boolean tooManyFailuresInSession(NodeUsageReport usageReport) {
    return usageReport.getNumFailed() > conf.getMaxFailuresPerSession();
  }

  /**
   * Checks if there have been too many failed connection attempts for a
   * session on a node.
   *
   * @param usageReport The usage report to check.
   * @return A boolean value indicating if there were too many failed
   *         connections.
   */
  private boolean tooManyFailedConnectionsInSession(
      NodeUsageReport usageReport) {
    return usageReport.getNumFailedConnections() >
      conf.getMaxFailedConnectionsPerSession();
  }

  /**
   * Checks if there have been too many failed connection attempts on a node
   * across sessions.
   *
   * @param stat Failure stats for the node.
   * @return A boolean value indicating if there were too many failed
   *         connections.
   */
  private boolean tooManyConnectionFailuresOnNode(FaultStatsForType stat) {
    return stat.numSessionsWithFailedConnections >
      conf.getMaxFailedConnections();
  }

  /**
   * Checks if there have been too many failures on a node across sessions.
   *
   * @param stat Failure stats for the node.
   * @return A boolean value indicating if there were too many failures.
   */
  private boolean tooManyFailuresOnNode(FaultStatsForType stat) {
    return stat.numSessionsWithTooManyFailures > conf.getMaxFailures();
  }
}