/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.cluster;

import java.io.IOException;
import java.lang.Thread.UncaughtExceptionHandler;
import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import javax.annotation.PostConstruct;
import javax.annotation.PreDestroy;
import javax.inject.Inject;

import org.apache.thrift.TException;
import org.diqube.cluster.ClusterLayoutStateMachine.RemoveNode;
import org.diqube.cluster.ClusterLayoutStateMachine.SetTablesOfNode;
import org.diqube.config.Config;
import org.diqube.config.ConfigKey;
import org.diqube.connection.ClusterNodeStatusDetailListener;
import org.diqube.connection.Connection;
import org.diqube.connection.ConnectionException;
import org.diqube.connection.ConnectionPool;
import org.diqube.connection.NodeAddress;
import org.diqube.connection.OurNodeAddressProvider;
import org.diqube.consensus.ConsensusClient;
import org.diqube.consensus.ConsensusClient.ClosableProvider;
import org.diqube.consensus.ConsensusClient.ConsensusClusterUnavailableException;
import org.diqube.consensus.ConsensusClusterNodeAddressProvider;
import org.diqube.consensus.ConsensusIsLeaderProvider;
import org.diqube.consensus.ConsensusServer;
import org.diqube.consensus.ConsensusStateMachineClientInterruptedException;
import org.diqube.context.AutoInstatiate;
import org.diqube.context.InjectOptional;
import org.diqube.context.shutdown.ContextShutdownListener;
import org.diqube.context.shutdown.ShutdownBefore;
import org.diqube.listeners.ClusterManagerListener;
import org.diqube.listeners.ServingListener;
import org.diqube.listeners.TableLoadListener;
import org.diqube.listeners.providers.LoadedTablesProvider;
import org.diqube.listeners.providers.OurNodeAddressStringProvider;
import org.diqube.remote.cluster.thrift.ClusterManagementService;
import org.diqube.threads.ExecutorManager;
import org.diqube.thrift.base.thrift.RNodeAddress;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.collect.Iterables;

/**
 * Manages the state of the diqube-server cluster and this node's state on other cluster nodes, and shares information
 * about that.
 *
 * <p>
 * This class ensures that the information other nodes have about this node is correct; it manages our node's address.
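 *
 * <p>
 * For illustration: the value behind {@link ConfigKey#CLUSTER_NODES} is expected to be a comma-separated list of
 * host:port pairs, as parsed by {@link #parseClusterNodes(String)}. The addresses in the following sketch are made-up
 * examples:
 *
 * <pre>
 * 192.168.0.10:5101,192.168.0.11:5101,some-host.example.com:5101
 * </pre>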
 *
 * @author Bastian Gloeckle
 */
@AutoInstatiate
public class ClusterManager implements ServingListener, TableLoadListener, OurNodeAddressStringProvider,
    ClusterNodeStatusDetailListener, OurNodeAddressProvider, ConsensusClusterNodeAddressProvider,
    ContextShutdownListener {
  private static final Logger logger = LoggerFactory.getLogger(ClusterManager.class);

  private static final String OUR_HOST_AUTOMATIC = "*";

  @Config(ConfigKey.OUR_HOST)
  private String ourHost;

  @Config(ConfigKey.PORT)
  private int ourPort;

  private NodeAddress ourHostAddr;

  @Config(ConfigKey.CLUSTER_NODES)
  private String clusterNodesConfigString;

  @Inject
  private ConnectionPool connectionPool;

  @InjectOptional
  private List<ClusterManagerListener> clusterManagerListeners;

  /** will contain "this", too! */
  @InjectOptional
  private List<ClusterNodeStatusDetailListener> clusterNodeDiedListeners;

  private List<NodeAddress> consensusClusterNodes = new ArrayList<>();

  @Inject
  private ClusterLayout clusterLayout;

  @Inject
  private ConsensusClient consensusClient;

  @Inject
  private ConsensusIsLeaderProvider consensusIsLeaderProvider;

  @Inject
  private LoadedTablesProvider loadedTablesProvider;

  @Inject
  private ExecutorManager executorManager;

  /**
   * Disable the methods of {@link ClusterNodeStatusDetailListener} on startup, until we have initially found some
   * cluster nodes.
   */
  private boolean clusterNodeStatusDetailListenerDisabled = true;

  private ExecutorService executorService;

  @PostConstruct
  public void initialize() {
    if (ourHost.equals(OUR_HOST_AUTOMATIC)) {
      try {
        InetAddress foundAddr = InetAddress.getLocalHost();
        ourHost = foundAddr.getHostAddress();
        logger.info("Using {} as our host address. We expect that other cluster nodes will be able to reach this "
            + "node under that address. If not, define a different host in the configuration!", ourHost);
      } catch (UnknownHostException e) {
        logger.error("Configuration said to identify our host automatically, "
            + "but was not able to inspect network interfaces.", e);
        throw new RuntimeException("Configuration said to identify our host automatically, "
            + "but was not able to inspect network interfaces.", e);
      }
    } else
      logger.info("Using {} as our host address. We expect that other cluster nodes will be able to reach this node "
          + "under that address!", ourHost);

    ourHostAddr = new NodeAddress(ourHost, (short) ourPort);

    executorService = executorManager.newCachedThreadPool("clustermanager-%d", new UncaughtExceptionHandler() {
      @Override
      public void uncaughtException(Thread t, Throwable e) {
        logger.error("Error while executing asynchronous ClusterManager task", e);
        // swallow otherwise, as we'd like to continue as well as possible.
      }
    });
  }

  @PreDestroy
  public void cleanup() {
    if (executorService != null)
      executorService.shutdownNow();
  }

  @Override
  @ShutdownBefore({ ConsensusClient.class, ConsensusServer.class })
  public void contextAboutToShutdown() {
    // Try to gracefully tell the ClusterLayout that we're gone. If that does not work within a second, skip it. The
    // other nodes might then try to submit stuff to our node, but will soon discover that we're down and remove us
    // from the ClusterLayout themselves.
    try {
      logger.debug("Trying to remove ourselves from the cluster layout, as we're shutting down...");
      executorService.submit(() -> {
        try (ClosableProvider<ClusterLayoutStateMachine> p =
            consensusClient.getStateMachineClient(ClusterLayoutStateMachine.class)) {
          p.getClient().removeNode(RemoveNode.local(ourHostAddr));
        } catch (ConsensusClusterUnavailableException e) {
          logger.warn("Could not access consensus cluster to remove ourselves from cluster layout.");
        }
      }).get(1, TimeUnit.SECONDS);
    } catch (TimeoutException | InterruptedException | ExecutionException e) {
      logger.warn("Could not deregister from cluster layout gracefully. The other cluster nodes might show exceptions "
          + "about this soon, but the cluster should recover.", e);
    }
  }
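
  /**
   * Parses the configured cluster node list into {@link NodeAddress}es.
   *
   * <p>
   * A short sketch of the accepted format (the concrete addresses are made-up examples): the input
   * "192.168.0.10:5101,some-host:5101" yields two addresses, whereas entries without a valid port (e.g. "some-host"
   * or "some-host:abc") are logged and skipped. The port is taken from after the last colon, since the split happens
   * via {@link String#lastIndexOf(String)}.
   *
   * @return the parsed addresses, or <code>null</code> if no valid address was found.
   */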
  private List<NodeAddress> parseClusterNodes(String clusterNodes) {
    List<NodeAddress> res = new ArrayList<>();

    for (String clusterNodeString : clusterNodes.split(",")) {
      int lastColon = clusterNodeString.lastIndexOf(":");
      if (lastColon == -1) {
        logger.warn("No port specified in '{}'. Ignoring.", clusterNodeString);
        continue;
      }
      if (lastColon == 0) {
        logger.warn("No host specified in '{}'. Ignoring.", clusterNodeString);
        continue;
      }

      short port;
      try {
        port = Short.valueOf(clusterNodeString.substring(lastColon + 1));
      } catch (NumberFormatException e) {
        logger.warn("Could not parse port in '{}'. Ignoring.", clusterNodeString);
        continue;
      }

      String host = clusterNodeString.substring(0, lastColon);
      res.add(new NodeAddress(host, port));
    }

    if (res.isEmpty())
      return null;

    return res;
  }

  @Override
  public void localServerStartedServing() {
    if (clusterNodesConfigString == null || "".equals(clusterNodesConfigString)) {
      logger.info("There are no cluster nodes configured, will therefore not connect anywhere.");
      if (clusterManagerListeners != null)
        clusterManagerListeners.forEach(l -> l.clusterInitialized());
      return;
    }

    List<NodeAddress> initialClusterNodes = parseClusterNodes(this.clusterNodesConfigString);
    if (initialClusterNodes == null) {
      logger.warn("No valid cluster nodes could be parsed from the configuration, will therefore not connect "
          + "anywhere.");
      if (clusterManagerListeners != null)
        clusterManagerListeners.forEach(l -> l.clusterInitialized());
      return;
    }

    logger.debug("Starting to communicate to cluster using the configured hosts ({})...", initialClusterNodes);

    try {
      // Ask the configured nodes we can contact for all cluster nodes they know. That list will later be used to
      // start up the consensus node.
      Set<RNodeAddress> allClusterNodes = new HashSet<>();
      for (NodeAddress nodeAddr : initialClusterNodes) {
        try (Connection<ClusterManagementService.Iface> conn = reserveConnection(nodeAddr)) {
          allClusterNodes.addAll(conn.getService().getAllKnownClusterNodes());
        } catch (ConnectionException | TException | IOException e) {
          logger.warn("Could not contact cluster node at {}.", nodeAddr, e);
        }
      }

      if (allClusterNodes.isEmpty()) {
        logger.warn("There are no cluster nodes alive, will therefore not connect anywhere.");
        if (clusterManagerListeners != null)
          clusterManagerListeners.forEach(l -> l.clusterInitialized());
        return;
      }

      allClusterNodes.forEach(remoteAddr -> consensusClusterNodes.add(new NodeAddress(remoteAddr)));
    } catch (InterruptedException e) {
      logger.error("Interrupted while starting to communicate with cluster", e);
      return;
    }

    logger.info("Gathered {} node addresses of the cluster (showing at most 100): {}", consensusClusterNodes.size(),
        Iterables.limit(consensusClusterNodes, 100));

    // Enable activity when dead or alive nodes are identified.
    clusterNodeStatusDetailListenerDisabled = false;

    if (clusterManagerListeners != null)
      clusterManagerListeners.forEach(l -> l.clusterInitialized());
  }

  private Connection<ClusterManagementService.Iface> reserveConnection(NodeAddress addr)
      throws ConnectionException, InterruptedException {
    return connectionPool.reserveConnection(ClusterManagementService.Iface.class, addr.createRemote(),
        null /* node will be removed automatically from ClusterManager, therefore no separate listener needed */);
  }

  @Override
  public void localServerStoppedServing() {
    // noop.
  }
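
  /**
   * Called when communication with a cluster node failed, i.e. that node seems to be dead.
   *
   * <p>
   * The node is not removed from the consensus cluster itself (that could enable split-brain situations), but its
   * information is removed from the {@link ClusterLayout} across the consensus cluster, so no new connections for
   * queries etc. will be opened to it.
   */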
  @Override
  public void nodeDied(RNodeAddress diedAddr) {
    if (clusterNodeStatusDetailListenerDisabled)
      // Disabled during startup, as we do not want to act on "dead" nodes of the config file.
      return;

    // This will typically be called when a connection to a node fails. We will not remove the node from the consensus
    // cluster (as that would allow split-brains), but we ensure that its information is removed from the clusterLayout
    // across the consensus cluster. That way, no connections will be opened to that cluster node for queries any more
    // etc. We have to ensure that we integrate the current information again as soon as the node gets back online
    // (= the node gets restarted, which would be a normal join to the consensus cluster, or e.g. a network partition
    // ends and we can communicate with the node again without it re-joining the cluster).

    if (diedAddr.isSetDefaultAddr()) {
      NodeAddress addr = new NodeAddress(diedAddr);
      logger.trace("Cluster node died. Checking consensus cluster if we need to distribute that information...");
      // Execute asynchronously, as this might take some time and we might even still be in startup (e.g. internal
      // consensus cluster server startup).
      executorService.execute(() -> {
        try {
          if (clusterLayout.isNodeKnown(addr)) {
            logger.info(
                "Cluster node died: {}. Distributing information on changed cluster layout in consensus cluster.",
                addr);
            // This might actually be executed by multiple cluster nodes in parallel, but that does not hurt that much,
            // as node deaths should be rare.
            try (ClosableProvider<ClusterLayoutStateMachine> p =
                consensusClient.getStateMachineClient(ClusterLayoutStateMachine.class)) {
              p.getClient().removeNode(RemoveNode.local(addr));
            } catch (ConsensusClusterUnavailableException e) {
              logger.warn("Could not remove node {} from cluster layout since consensus cluster is unavailable", addr);
            }
          } else
            logger.trace("Cluster node died. No need to distribute information since that node was unknown to the "
                + "consensus cluster anyway.");
        } catch (InterruptedException | ConsensusStateMachineClientInterruptedException
            | ConsensusClusterUnavailableException e) {
          // exit quietly.
        }
      });
    }
  }
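
  /**
   * Called when a cluster node seems to be alive (again).
   *
   * <p>
   * This is typically triggered on the consensus master, which periodically sends keepAlives to all nodes. If the node
   * is not yet known in the {@link ClusterLayout}, it is asked to publish the list of tables it serves.
   */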
  @Override
  public void nodeAlive(RNodeAddress remoteNodeAddr) throws InterruptedException {
    if (clusterNodeStatusDetailListenerDisabled)
      // Disabled during startup, as we are not yet interested in "alive" nodes - we will receive cluster layout
      // information automatically if we join a cluster (= our consensus log will be filled), and if we're a single
      // node setup, there are no other nodes anyway.
      return;

    // This will typically be called on the consensus master node when a new node joined or became alive again, as the
    // consensus master periodically sends keepAlives to all nodes. We ensure here that we get current information
    // about that new node.

    if (!consensusIsLeaderProvider.isLeader())
      // Only let the consensus leader find new alive nodes. This is to reduce the number of times a new node is asked
      // to "publishLoadedTables" and also to limit the number of times "clusterLayout.isNodeKnown" is called: This is
      // pretty slow on non-leader nodes, but we will receive a lot of "nodeAlive" calls.
      return;

    if (remoteNodeAddr.isSetDefaultAddr()) {
      NodeAddress addr = new NodeAddress(remoteNodeAddr);
      try {
        if (!clusterLayout.isNodeKnown(addr)) {
          logger.info("Cluster node seems to be accessible now: {}. As we do not have information on the tables this "
              + "new node serves, we ask it to publicize that.", addr);
          try (Connection<ClusterManagementService.Iface> conn = reserveConnection(addr)) {
            conn.getService().publishLoadedTablesInConsensus();
          } catch (ConnectionException | TException | IOException e) {
            logger.warn("Could not contact cluster node at {}.", addr, e);
          }
        }
      } catch (ConsensusClusterUnavailableException e) {
        logger.warn("Could not inform cluster about the node {} becoming alive, since the consensus "
            + "cluster is not reachable", addr, e);
      }
    }
  }

  @Override
  public synchronized void tableLoaded(String newTableName) throws AbortTableLoadException {
    logger.info("Informing consensus cluster of our updated table list.");
    try (ClosableProvider<ClusterLayoutStateMachine> p =
        consensusClient.getStateMachineClient(ClusterLayoutStateMachine.class)) {
      p.getClient().setTablesOfNode(SetTablesOfNode.local(ourHostAddr, loadedTablesProvider.getNamesOfLoadedTables()));
    } catch (ConsensusClusterUnavailableException e) {
      logger.error("Table cannot be loaded because consensus cluster is not available", e);
      throw new AbortTableLoadException("Table cannot be loaded because consensus cluster is not available", e);
    }
    logger.trace("Informed consensus cluster of our updated table list.");
  }

  @Override
  public void tableUnloaded(String tableName) {
    logger.info("Informing consensus cluster of our updated table list.");
    try (ClosableProvider<ClusterLayoutStateMachine> p =
        consensusClient.getStateMachineClient(ClusterLayoutStateMachine.class)) {
      p.getClient().setTablesOfNode(SetTablesOfNode.local(ourHostAddr, loadedTablesProvider.getNamesOfLoadedTables()));
      logger.trace("Informed consensus cluster of our updated table list.");
    } catch (ConsensusClusterUnavailableException e) {
      logger.warn("Could not inform consensus cluster that we do not serve the table any more.", e);
    }
  }

  @Override
  public NodeAddress getOurNodeAddress() {
    return ourHostAddr;
  }

  @Override
  public String getOurNodeAddressAsString() {
    return ourHostAddr.toString();
  }

  @Override
  public List<NodeAddress> getClusterNodeAddressesForConsensus() {
    return consensusClusterNodes;
  }
}