package water; import water.util.Log; /** * Extension used for checking failed nodes */ public class FailedNodeWatchdogExtension extends AbstractH2OExtension { private long watchdogClientRetryTimeout = 10000; private long watchdogClientConnectTimeout = 60000; private boolean watchDogClient = false; private boolean watchDogStopWithout = false; @Override public String getExtensionName() { return "Watchdog"; } @Override public void printHelp() { System.out.println( "\nFailed node watchdog extension:\n" + " -watchdog_client_retry_timeout\n" + " Time in milliseconds specifying in which intervals the failed nodes are checked. If not \n" + " specified, the default value of 10000 ms is used. \n" + " -watchdog_client\n" + " Same as the client except the that cluster is stopped when this client \n" + " disconnects from the rest of the cloud or the cloud is stopped when it doesn't \n" + " hear heartbeat from the client for specified amount of time. \n" + " -watchdog_client_connect_timeout\n" + " Time in milliseconds specifying how long to wait for watchdog client to\n" + " connect to the cluster before the cluster is stopped. \n" + " The default value of 10000 ms is used \n" + " -watchdog_stop_without_client\n" + " When set to true this property ensures that this cloud kills itself \n" + " when no watchdog client doesn't connect to the cluster for the specified timeout" ); } private String[] parseClient(String[] args){ for (int i = 0; i < args.length; i++) { H2O.OptString s = new H2O.OptString(args[i]); if(s.matches("watchdog_client")){ watchDogClient = true; H2O.ARGS.client = true; String[] new_args = new String[args.length - 1]; System.arraycopy(args, 0, new_args, 0, i); System.arraycopy(args, i + 1, new_args, i, args.length - (i + 1)); return new_args; } } return args; } private String[] parseClientStopWithout(String[] args){ for (int i = 0; i < args.length; i++) { H2O.OptString s = new H2O.OptString(args[i]); if(s.matches("watchdog_stop_without_client")){ watchDogStopWithout = true; String[] new_args = new String[args.length - 1]; System.arraycopy(args, 0, new_args, 0, i); System.arraycopy(args, i + 1, new_args, i, args.length - (i + 1)); return new_args; } } return args; } private String[] parseRetryTimeout(String args[]){ for (int i = 0; i < args.length; i++) { H2O.OptString s = new H2O.OptString(args[i]); if(s.matches("watchdog_client_retry_timeout")){ watchdogClientRetryTimeout = s.parseInt(args[i + 1]); String[] new_args = new String[args.length - 2]; System.arraycopy(args, 0, new_args, 0, i); System.arraycopy(args, i + 2, new_args, i, args.length - (i + 2)); return new_args; } } return args; } private String[] parseConnectionTimeout(String args[]){ for (int i = 0; i < args.length; i++) { H2O.OptString s = new H2O.OptString(args[i]); if(s.matches("watchdog_client_connect_timeout")){ watchdogClientConnectTimeout = s.parseInt(args[i + 1]); String[] new_args = new String[args.length - 2]; System.arraycopy(args, 0, new_args, 0, i); System.arraycopy(args, i + 2, new_args, i, args.length - (i + 2)); return new_args; } } return args; } @Override public String[] parseArguments(String[] args) { return parseClient(parseClientStopWithout(parseRetryTimeout(parseConnectionTimeout(args)))); } public void validateArguments() { if (watchdogClientRetryTimeout < 0) { H2O.parseFailed("Watchdog client retry timeout has to be positive: " + watchdogClientRetryTimeout); } if(watchdogClientConnectTimeout < 0) { H2O.parseFailed("Watchdog client connect timeout has to be positive: " + watchdogClientConnectTimeout); } } @Override public void onLocalNodeStarted() { if(watchDogStopWithout){ new CheckWatchdogConnectedThread().start(); } new FailedNodeWatchdogThread().start(); H2O.SELF._heartbeat._watchdog_client = watchDogClient; } private class CheckWatchdogConnectedThread extends Thread { public CheckWatchdogConnectedThread() { super("CheckWatchdogConnectedThread"); } @Override public void run() { try { sleep(watchdogClientConnectTimeout); boolean watchDogConnected = false; for(H2ONode client: H2O.getClients()){ if(client._heartbeat._watchdog_client){ watchDogConnected = true; break; } } if(!watchDogConnected){ // in this case we expect the watchdog to connect, however it is still not available // this is not a planned situation, exit with negative status Log.fatal("Stopping H2O cloud since the watchdog client never connected"); H2O.shutdown(-1); } } catch (InterruptedException e) { e.printStackTrace(); } } } /** * This method checks whether the client is disconnected from this node due to some problem such as client or network * is unreachable. */ private static void handleClientDisconnect(H2ONode node, long watchdogClientRetryTimeout) { if(node._heartbeat._watchdog_client){ Log.warn("Watchdog client " + node + " disconnected!"); WatchdogClientDisconnectedTask tsk = new WatchdogClientDisconnectedTask(node, watchdogClientRetryTimeout); Log.warn("Asking the rest of the nodes in the cloud whether watchdog client is really gone."); if((tsk.doAllNodes()).clientDisconnectedConsensus) { Log.fatal("Stopping H2O cloud since the watchdog client is disconnected from all nodes in the cluster!"); // we should fail with negative status as this is not planned shutdown H2O.shutdown(-1); } }else if(node._heartbeat._client) { Log.warn("Client "+ node +" disconnected!"); } // in both cases remove the client since the timeout is out if(node._heartbeat._client){ H2O.removeClient(node); if(H2O.isFlatfileEnabled()){ H2O.removeNodeFromFlatfile(node); } } } /** * Helper MR task used to detect clientDisconnectedConsensus on the timeout we last heard from the watchdog client */ private static class WatchdogClientDisconnectedTask extends MRTask<WatchdogClientDisconnectedTask> { private boolean clientDisconnectedConsensus = false; private H2ONode clientNode; private long watchdogClientRetryTimeout; WatchdogClientDisconnectedTask(H2ONode clientNode, long watchdogClientRetryTimeout) { this.clientNode = clientNode; this.watchdogClientRetryTimeout = watchdogClientRetryTimeout; } @Override public void reduce(WatchdogClientDisconnectedTask mrt) { this.clientDisconnectedConsensus = this.clientDisconnectedConsensus && mrt.clientDisconnectedConsensus; } @Override protected void setupLocal() { final H2ONode foundClient = H2O.getClientsByKey().get(clientNode._key); if (foundClient == null || isTimeoutExceeded(foundClient, watchdogClientRetryTimeout )) { // Agree on the consensus if this node does not see the client at all or if this node sees the client // however the timeout is out clientDisconnectedConsensus = true; } } } private static boolean isTimeoutExceeded(H2ONode client, long timeout) { return (System.currentTimeMillis() - client._last_heard_from) >= timeout; } /** * Thread used to run disconnect hooks on nodes who disconnects from the cloud */ private class FailedNodeWatchdogThread extends Thread { public FailedNodeWatchdogThread() { super("FailedNodeWatchdogThread"); setDaemon(true); } @Override public void run() { while (true) { for(H2ONode client: H2O.getClients()){ if(isTimeoutExceeded(client, watchdogClientRetryTimeout)){ handleClientDisconnect(client, watchdogClientRetryTimeout); } } try { Thread.sleep(watchdogClientRetryTimeout); } catch (InterruptedException ignore) {} } } } }