/* * Copyright (c) 2011-2014 The original author or authors * ------------------------------------------------------ * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * and Apache License v2.0 which accompanies this distribution. * * The Eclipse Public License is available at * http://www.eclipse.org/legal/epl-v10.html * * The Apache License v2.0 is available at * http://www.opensource.org/licenses/apache2.0.php * * You may elect to redistribute this code under either of these licenses. */ package io.vertx.core.impl; import io.vertx.core.AsyncResult; import io.vertx.core.DeploymentOptions; import io.vertx.core.Handler; import io.vertx.core.VertxException; import io.vertx.core.json.JsonArray; import io.vertx.core.json.JsonObject; import io.vertx.core.logging.Logger; import io.vertx.core.logging.LoggerFactory; import io.vertx.core.spi.cluster.ClusterManager; import io.vertx.core.spi.cluster.NodeListener; import java.util.ArrayList; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Queue; import java.util.Set; import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; import static java.util.concurrent.TimeUnit.*; /** * * Handles HA * * We compute failover and whether there is a quorum synchronously as we receive nodeAdded and nodeRemoved events * from the cluster manager. * * It's vital that this is done synchronously as the cluster manager only guarantees that the set of nodes retrieved * from getNodes() is the same for each node in the cluster when processing the exact same nodeAdded/nodeRemoved * event. * * As HA modules are deployed, if a quorum has been attained they are deployed immediately, otherwise the deployment * information is added to a list. * * Periodically we check the value of attainedQuorum and if true we deploy any HA deploymentIDs waiting for a quorum. * * If false, we check if there are any HA deploymentIDs current deployed, and if so undeploy them, and add them to the list * of deploymentIDs waiting for a quorum. * * By doing this check periodically we can avoid race conditions resulting in modules being deployed after a quorum has * been lost, and without having to resort to exclusive locking which is actually quite tricky here, and prone to * deadlock· * * We maintain a clustered map where the key is the node id and the value is some stringified JSON which describes * the group of the cluster and an array of the HA modules deployed on that node. * * There is an entry in the map for each node of the cluster. * * When a node joins the cluster or an HA module is deployed or undeployed that entry is updated. * * When a node leaves the cluster cleanly, it removes it's own entry before leaving. * * When the cluster manager sends us an event to say a node has left the cluster we check if its entry in the cluster * map is there, and if so we infer a clean close has happened and no failover will occur. * * If the map entry is there it implies the node died suddenly. In that case each node of the cluster must compute * whether it is the failover node for the failed node. * * First each node of the cluster determines whether it is in the same group as the failed node, if not then it will not * be a candidate for the failover node. Nodes in the cluster only failover to other nodes in the same group. * * If the node is in the same group then the node takes the UUID of the failed node, computes the hash-code and chooses * a node from the list of nodes in the cluster by taking the hash-code modulo the number of nodes as an index to the * list of nodes. * * The cluster manager guarantees each node in the cluster sees the same set of nodes for each membership event that is * processed. Therefore it is guaranteed that each node in the cluster will compute the same value. It is critical that * any cluster manager implementation provides this guarantee! * * Once the value has been computed, it is compared to the current node, and if it is the same the current node * assumes failover for the failed node. * * During failover the failover node deploys all the HA modules from the failed node, as described in the JSON with the * same values of config and instances. * * Once failover is complete the failover node removes the cluster map entry for the failed node. * * If the failover node itself fails while it is processing failover for another node, then this is also checked by * other nodes when they detect the failure of the second node. * * @author <a href="http://tfox.org">Tim Fox</a> */ public class HAManager { private static final Logger log = LoggerFactory.getLogger(HAManager.class); private static final String CLUSTER_MAP_NAME = "__vertx.haInfo"; private static final long QUORUM_CHECK_PERIOD = 1000; private final VertxInternal vertx; private final DeploymentManager deploymentManager; private final ClusterManager clusterManager; private final int quorumSize; private final String group; private final JsonObject haInfo; private final Map<String, String> clusterMap; private final String nodeID; private final Queue<Runnable> toDeployOnQuorum = new ConcurrentLinkedQueue<>(); private final boolean enabled; private long quorumTimerID; private volatile boolean attainedQuorum; private volatile FailoverCompleteHandler failoverCompleteHandler; private volatile boolean failDuringFailover; private volatile boolean stopped; private volatile boolean killed; private Consumer<Set<String>> clusterViewChangedHandler; public HAManager(VertxInternal vertx, DeploymentManager deploymentManager, ClusterManager clusterManager, int quorumSize, String group, boolean enabled) { this.vertx = vertx; this.deploymentManager = deploymentManager; this.clusterManager = clusterManager; this.quorumSize = enabled ? quorumSize : 0; this.group = enabled ? group : "__DISABLED__"; this.enabled = enabled; this.haInfo = new JsonObject(); haInfo.put("verticles", new JsonArray()); haInfo.put("group", this.group); this.clusterMap = clusterManager.getSyncMap(CLUSTER_MAP_NAME); this.nodeID = clusterManager.getNodeID(); synchronized (haInfo) { clusterMap.put(nodeID, haInfo.encode()); } clusterManager.nodeListener(new NodeListener() { @Override public void nodeAdded(String nodeID) { HAManager.this.nodeAdded(nodeID); } @Override public void nodeLeft(String leftNodeID) { HAManager.this.nodeLeft(leftNodeID); } }); quorumTimerID = vertx.setPeriodic(QUORUM_CHECK_PERIOD, tid -> checkHADeployments()); // Call check quorum to compute whether we have an initial quorum synchronized (this) { checkQuorum(); } } // Remove the information on the deployment from the cluster - this is called when an HA module is undeployed public void removeFromHA(String depID) { Deployment dep = deploymentManager.getDeployment(depID); if (dep == null || !dep.deploymentOptions().isHa()) { return; } synchronized (haInfo) { JsonArray haMods = haInfo.getJsonArray("verticles"); Iterator<Object> iter = haMods.iterator(); while (iter.hasNext()) { Object obj = iter.next(); JsonObject mod = (JsonObject) obj; if (mod.getString("dep_id").equals(depID)) { iter.remove(); } } clusterMap.put(nodeID, haInfo.encode()); } } public void addDataToAHAInfo(String key, JsonObject value) { synchronized (haInfo) { haInfo.put(key, value); clusterMap.put(nodeID, haInfo.encode()); } } // Deploy an HA verticle public void deployVerticle(final String verticleName, DeploymentOptions deploymentOptions, final Handler<AsyncResult<String>> doneHandler) { if (attainedQuorum) { doDeployVerticle(verticleName, deploymentOptions, doneHandler); } else { log.info("Quorum not attained. Deployment of verticle will be delayed until there's a quorum."); addToHADeployList(verticleName, deploymentOptions, doneHandler); } } public void stop() { if (!stopped) { if (clusterManager.isActive()) { clusterMap.remove(nodeID); } vertx.cancelTimer(quorumTimerID); stopped = true; } } public void simulateKill() { if (!stopped) { killed = true; CountDownLatch latch = new CountDownLatch(1); clusterManager.leave(ar -> { if (ar.failed()) { log.error("Failed to leave cluster", ar.cause()); } latch.countDown(); }); vertx.cancelTimer(quorumTimerID); boolean interrupted = false; try { long remainingNanos = MINUTES.toNanos(1); long end = System.nanoTime() + remainingNanos; while (true) { try { latch.await(remainingNanos, NANOSECONDS); break; } catch (InterruptedException e) { interrupted = true; remainingNanos = end - System.nanoTime(); } } } finally { if (interrupted) { Thread.currentThread().interrupt(); } } stopped = true; } } public void setFailoverCompleteHandler(FailoverCompleteHandler failoverCompleteHandler) { this.failoverCompleteHandler = failoverCompleteHandler; } public void setClusterViewChangedHandler(Consumer<Set<String>> handler) { this.clusterViewChangedHandler = handler; } public boolean isKilled() { return killed; } public boolean isEnabled() { return enabled; } // For testing: public void failDuringFailover(boolean fail) { failDuringFailover = fail; } private void doDeployVerticle(final String verticleName, DeploymentOptions deploymentOptions, final Handler<AsyncResult<String>> doneHandler) { final Handler<AsyncResult<String>> wrappedHandler = asyncResult -> { if (asyncResult.succeeded()) { // Tell the other nodes of the cluster about the verticle for HA purposes addToHA(asyncResult.result(), verticleName, deploymentOptions); } if (doneHandler != null) { doneHandler.handle(asyncResult); } else if (asyncResult.failed()) { log.error("Failed to deploy verticle", asyncResult.cause()); } }; deploymentManager.deployVerticle(verticleName, deploymentOptions, wrappedHandler); } // A node has joined the cluster // synchronize this in case the cluster manager is naughty and calls it concurrently private synchronized void nodeAdded(final String nodeID) { addHaInfoIfLost(); // This is not ideal but we need to wait for the group information to appear - and this will be shortly // after the node has been added checkQuorumWhenAdded(nodeID, System.currentTimeMillis()); } // A node has left the cluster // synchronize this in case the cluster manager is naughty and calls it concurrently private synchronized void nodeLeft(String leftNodeID) { addHaInfoIfLost(); checkQuorum(); if (attainedQuorum) { checkSubs(leftNodeID); // Check for failover String sclusterInfo = clusterMap.get(leftNodeID); if (sclusterInfo == null) { // Clean close - do nothing } else { JsonObject clusterInfo = new JsonObject(sclusterInfo); checkFailover(leftNodeID, clusterInfo); } // We also check for and potentially resume any previous failovers that might have failed // We can determine this if there any ids in the cluster map which aren't in the node list List<String> nodes = clusterManager.getNodes(); for (Map.Entry<String, String> entry: clusterMap.entrySet()) { if (!leftNodeID.equals(entry.getKey()) && !nodes.contains(entry.getKey())) { JsonObject haInfo = new JsonObject(entry.getValue()); checkFailover(entry.getKey(), haInfo); } } } } private void addHaInfoIfLost() { if (clusterManager.getNodes().contains(nodeID) && !clusterMap.containsKey(nodeID)) { synchronized (haInfo) { clusterMap.put(nodeID, haInfo.encode()); } } } private synchronized void checkQuorumWhenAdded(final String nodeID, final long start) { if (clusterMap.containsKey(nodeID)) { checkQuorum(); if (attainedQuorum) { checkSubs(nodeID); } } else { vertx.setTimer(200, tid -> { // This can block on a monitor so it needs to run as a worker vertx.executeBlockingInternal(() -> { if (System.currentTimeMillis() - start > 10000) { log.warn("Timed out waiting for group information to appear"); } else if (!stopped) { ContextImpl context = vertx.getContext(); try { // Remove any context we have here (from the timer) otherwise will screw things up when verticles are deployed ContextImpl.setContext(null); checkQuorumWhenAdded(nodeID, start); } finally { ContextImpl.setContext(context); } } return null; }, null); }); } } // Check if there is a quorum for our group private void checkQuorum() { if (quorumSize == 0) { this.attainedQuorum = true; } else { List<String> nodes = clusterManager.getNodes(); int count = 0; for (String node : nodes) { String json = clusterMap.get(node); if (json != null) { JsonObject clusterInfo = new JsonObject(json); String group = clusterInfo.getString("group"); if (group.equals(this.group)) { count++; } } } boolean attained = count >= quorumSize; if (!attainedQuorum && attained) { // A quorum has been attained so we can deploy any currently undeployed HA deploymentIDs log.info("A quorum has been obtained. Any deploymentIDs waiting on a quorum will now be deployed"); this.attainedQuorum = true; } else if (attainedQuorum && !attained) { // We had a quorum but we lost it - we must undeploy any HA deploymentIDs log.info("There is no longer a quorum. Any HA deploymentIDs will be undeployed until a quorum is re-attained"); this.attainedQuorum = false; } } } // Add some information on a deployment in the cluster so other nodes know about it private void addToHA(String deploymentID, String verticleName, DeploymentOptions deploymentOptions) { String encoded; synchronized (haInfo) { JsonObject verticleConf = new JsonObject().put("dep_id", deploymentID); verticleConf.put("verticle_name", verticleName); verticleConf.put("options", deploymentOptions.toJson()); JsonArray haMods = haInfo.getJsonArray("verticles"); haMods.add(verticleConf); encoded = haInfo.encode(); clusterMap.put(nodeID, encoded); } } // Add the deployment to an internal list of deploymentIDs - these will be executed when a quorum is attained private void addToHADeployList(final String verticleName, final DeploymentOptions deploymentOptions, final Handler<AsyncResult<String>> doneHandler) { toDeployOnQuorum.add(() -> { ContextImpl ctx = vertx.getContext(); try { ContextImpl.setContext(null); deployVerticle(verticleName, deploymentOptions, doneHandler); } finally { ContextImpl.setContext(ctx); } }); } private void checkHADeployments() { try { if (attainedQuorum) { deployHADeployments(); } else { undeployHADeployments(); } } catch (Throwable t) { log.error("Failed when checking HA deploymentIDs", t); } } // Undeploy any HA deploymentIDs now there is no quorum private void undeployHADeployments() { for (String deploymentID: deploymentManager.deployments()) { Deployment dep = deploymentManager.getDeployment(deploymentID); if (dep != null) { if (dep.deploymentOptions().isHa()) { ContextImpl ctx = vertx.getContext(); try { ContextImpl.setContext(null); deploymentManager.undeployVerticle(deploymentID, result -> { if (result.succeeded()) { log.info("Successfully undeployed HA deployment " + deploymentID + "-" + dep.verticleIdentifier() + " as there is no quorum"); addToHADeployList(dep.verticleIdentifier(), dep.deploymentOptions(), result1 -> { if (result1.succeeded()) { log.info("Successfully redeployed verticle " + dep.verticleIdentifier() + " after quorum was re-attained"); } else { log.error("Failed to redeploy verticle " + dep.verticleIdentifier() + " after quorum was re-attained", result1.cause()); } }); } else { log.error("Failed to undeploy deployment on lost quorum", result.cause()); } }); } finally { ContextImpl.setContext(ctx); } } } } } // Deploy any deploymentIDs that are waiting for a quorum private void deployHADeployments() { int size = toDeployOnQuorum.size(); if (size != 0) { log.info("There are " + size + " HA deploymentIDs waiting on a quorum. These will now be deployed"); Runnable task; while ((task = toDeployOnQuorum.poll()) != null) { try { task.run(); } catch (Throwable t) { log.error("Failed to run redeployment task", t); } } } } // Handle failover private void checkFailover(String failedNodeID, JsonObject theHAInfo) { try { JsonArray deployments = theHAInfo.getJsonArray("verticles"); String group = theHAInfo.getString("group"); String chosen = chooseHashedNode(group, failedNodeID.hashCode()); if (chosen != null && chosen.equals(this.nodeID)) { if (deployments != null && deployments.size() != 0) { log.info("node" + nodeID + " says: Node " + failedNodeID + " has failed. This node will deploy " + deployments.size() + " deploymentIDs from that node."); for (Object obj: deployments) { JsonObject app = (JsonObject)obj; processFailover(app); } } // Failover is complete! We can now remove the failed node from the cluster map clusterMap.remove(failedNodeID); runOnContextAndWait(() -> { if (failoverCompleteHandler != null) { failoverCompleteHandler.handle(failedNodeID, theHAInfo, true); } }); } } catch (Throwable t) { log.error("Failed to handle failover", t); runOnContextAndWait(() -> { if (failoverCompleteHandler != null) { failoverCompleteHandler.handle(failedNodeID, theHAInfo, false); } }); } } private void checkSubs(String failedNodeID) { if (clusterViewChangedHandler == null) { return; } String chosen = chooseHashedNode(null, failedNodeID.hashCode()); if (chosen != null && chosen.equals(this.nodeID)) { runOnContextAndWait(() -> clusterViewChangedHandler.accept(new HashSet<>(clusterManager.getNodes()))); } } private void runOnContextAndWait(Runnable runnable) { CountDownLatch latch = new CountDownLatch(1); // The testsuite requires that this is called on a Vert.x thread vertx.runOnContext(v -> { try { runnable.run(); } finally { latch.countDown(); } }); try { latch.await(30, TimeUnit.SECONDS); } catch (InterruptedException ignore) { } } // Process the failover of a deployment private void processFailover(JsonObject failedVerticle) { if (failDuringFailover) { throw new VertxException("Oops!"); } // This method must block until the failover is complete - i.e. the verticle is successfully redeployed final String verticleName = failedVerticle.getString("verticle_name"); final CountDownLatch latch = new CountDownLatch(1); final AtomicReference<Throwable> err = new AtomicReference<>(); // Now deploy this verticle on this node ContextImpl ctx = vertx.getContext(); if (ctx != null) { // We could be on main thread in which case we don't want to overwrite tccl ContextImpl.setContext(null); } JsonObject options = failedVerticle.getJsonObject("options"); try { doDeployVerticle(verticleName, new DeploymentOptions(options), result -> { if (result.succeeded()) { log.info("Successfully redeployed verticle " + verticleName + " after failover"); } else { log.error("Failed to redeploy verticle after failover", result.cause()); err.set(result.cause()); } latch.countDown(); Throwable t = err.get(); if (t != null) { throw new VertxException(t); } }); } finally { if (ctx != null) { ContextImpl.setContext(ctx); } } try { if (!latch.await(120, TimeUnit.SECONDS)) { throw new VertxException("Timed out waiting for redeploy on failover"); } } catch (InterruptedException e) { throw new IllegalStateException(e); } } // Compute the failover node private String chooseHashedNode(String group, int hashCode) { List<String> nodes = clusterManager.getNodes(); ArrayList<String> matchingMembers = new ArrayList<>(); for (String node: nodes) { String sclusterInfo = clusterMap.get(node); if (sclusterInfo != null) { JsonObject clusterInfo = new JsonObject(sclusterInfo); String memberGroup = clusterInfo.getString("group"); if (group == null || group.equals(memberGroup)) { matchingMembers.add(node); } } } if (!matchingMembers.isEmpty()) { // Hashcodes can be -ve so make it positive long absHash = (long)hashCode + Integer.MAX_VALUE; long lpos = absHash % matchingMembers.size(); return matchingMembers.get((int)lpos); } else { return null; } } }