package water;

import java.util.Arrays;

import water.H2ONode.H2Okey;
import water.nbhm.NonBlockingHashMap;
import water.util.Log;

/**
 * (Not The) Paxos
 *
 * Used to define Cloud membership.  See:
 *   http://en.wikipedia.org/wiki/Paxos_%28computer_science%29
 *
 * Detects and builds a "cloud" - a cooperating group of nodes, with mutual
 * knowledge of each other.  Basically tracks all the nodes that *this* node
 * has ever heard of, and when *all* of the other nodes have heard of each
 * other, declares the situation as "commonKnowledge", and a Cloud.  This
 * algorithm differs from Paxos in a number of obvious ways:
 * - it is not robust against failing nodes
 * - it requires true global consensus (a Quorum of All)
 * - it is vastly simpler than Paxos
 *
 * @author <a href="mailto:cliffc@h2o.ai"></a>
 * @version 1.0
 */
public abstract class Paxos {
  // Whether or not we have common knowledge
  public static volatile boolean _commonKnowledge = false;
  // Whether or not we're allowing distributed-writes.  The cloud is not
  // allowed to change shape once we begin writing.
  public static volatile boolean _cloudLocked = false;

  public static final NonBlockingHashMap<H2Okey,H2ONode> PROPOSED = new NonBlockingHashMap<>();

  // ---
  // This is a packet announcing what Cloud this Node thinks is the current
  // Cloud, plus other status bits.
  static synchronized int doHeartbeat( H2ONode h2o ) {
    // Kill somebody if the jar files mismatch.  Do not attempt to deal with
    // mismatched jars.
    if( !h2o._heartbeat.check_jar_md5() ) {
      if( H2O.CLOUD.size() > 1 ) {
        Log.warn("Killing "+h2o+" because of H2O version mismatch (md5 differs).");
        UDPRebooted.T.mismatch.send(h2o);
      } else {
        H2O.die("Attempting to join "+h2o+" with an H2O version mismatch (md5 differs). (Is H2O already running?) Exiting.");
      }
      return 0;
    }

    if( h2o._heartbeat._cloud_name_hash != H2O.SELF._heartbeat._cloud_name_hash ) {
      // Ignore requests from this node, as they are coming from a different cluster
      return 0;
    }

    // I am not a client, but received a client heartbeat in flatfile mode.
    // That means somebody is trying to connect to this cloud,
    // => update the list of static hosts (it needs clean-up).
    if( !H2O.ARGS.client && H2O.isFlatfileEnabled()
        && h2o._heartbeat._client && !H2O.isNodeInFlatfile(h2o) ) {
      // Extend the static list of nodes to multicast to, in order to propagate
      // information to the client.
      H2O.addNodeToFlatfile(h2o);
      H2O.reportClient(h2o);
      // A new client `h2o` is connected, so we broadcast it around to the other nodes.
      // Note: this could cause a temporary flood of messages, since the other
      // nodes will later inform about the connected client as well.
      // Note: it would be helpful to have control over flatfile-based multicast,
      // to inject a small wait.
      UDPClientEvent.ClientEvent.Type.CONNECT.broadcast(h2o);
    } else if( H2O.ARGS.client && H2O.isFlatfileEnabled() && !H2O.isNodeInFlatfile(h2o) ) {
      // This node is a client and is using a flatfile to figure out the topology
      // of the cluster.  In this case we do not expect to have a complete
      // flatfile, but instead use information provided by the host we received
      // the heartbeat from.  That means the host is either already in our
      // flatfile, or it was notified about this client node via a node which is
      // already in the flatfile.
      H2O.addNodeToFlatfile(h2o);
    }
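
    // What follows is the membership vote itself: fold the sender into
    // PROPOSED and the additive cloud hash, then declare consensus only when
    // every known node advertises the same hash, the same cloud size, and the
    // common-knowledge bit.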

    // Never heard of this dude?  See if we want to kill him off for being cloud-locked.
    if( !PROPOSED.contains(h2o) && !h2o._heartbeat._client ) {
      if( _cloudLocked ) {
        Log.warn("Killing "+h2o+" because the cloud is no longer accepting new H2O nodes.");
        UDPRebooted.T.locked.send(h2o);
        return 0;
      }
      if( _commonKnowledge ) {
        _commonKnowledge = false;             // No longer sure about things
        H2O.SELF._heartbeat._common_knowledge = false;
        Log.debug("Cloud voting in progress");
      }
      // Add to the proposed set and update the cloud hash.  Do not add clients.
      H2ONode res = PROPOSED.putIfAbsent(h2o._key, h2o);
      assert res == null;
      H2O.SELF._heartbeat._cloud_hash += h2o.hashCode();
    } else if( _commonKnowledge ) {
      return 0;                 // Already know about you, nothing more to do
    }

    int chash = H2O.SELF._heartbeat._cloud_hash;
    assert chash == doHash() : "mismatched hash4, HB="+chash+" full="+doHash();
    assert !_commonKnowledge;

    // Do we have consensus now?
    H2ONode h2os[] = PROPOSED.values().toArray(new H2ONode[PROPOSED.size()]);
    if( H2O.ARGS.client && h2os.length == 0 )
      return 0;                 // Client stalls until it finds *some* cloud
    for( H2ONode h2o2 : h2os )
      if( chash != h2o2._heartbeat._cloud_hash )
        return print("Heartbeat hashes differ, self=0x"+Integer.toHexString(chash)+" "+h2o2+"=0x"+Integer.toHexString(h2o2._heartbeat._cloud_hash)+" ", PROPOSED);
    // Hashes are the same, so accept the new, larger cloud-size.
    H2O.CLOUD.set_next_Cloud(h2os, chash);

    // Demand that everybody has rolled forward to the same size before declaring consensus.
    boolean same_size = true;
    for( H2ONode h2o2 : h2os )
      same_size &= (h2o2._heartbeat._cloud_size == H2O.CLOUD.size());
    if( !same_size ) return 0;

    H2O.SELF._heartbeat._common_knowledge = true;
    for( H2ONode h2o2 : h2os )
      if( !h2o2._heartbeat._common_knowledge )
        return print("Missing common knowledge from all nodes!", PROPOSED);
    _commonKnowledge = true;    // Yup!  Have global consensus

    Paxos.class.notifyAll();    // Also, wake up a worker thread stuck in DKV.put
    Paxos.print("Announcing new Cloud Membership: ", H2O.CLOUD._memary);
    Log.info("Cloud of size ", H2O.CLOUD.size(), " formed ", H2O.CLOUD.toString());
    H2O.notifyAboutCloudSize(H2O.SELF_ADDRESS, H2O.API_PORT, H2O.CLOUD.size());
    return 0;
  }

  // Recompute the cloud hash from scratch: the sum of the hash codes of all
  // proposed members.
  static private int doHash() {
    int hash = 0;
    for( H2ONode h2o : PROPOSED.values() )
      hash += h2o.hashCode();
    assert hash != 0 || H2O.ARGS.client;
    return hash;
  }
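
  // Illustrative example (the values are made up, not taken from a real run):
  // if PROPOSED holds three nodes whose hashCode() values are 17, 23 and 31,
  // doHash() returns 17+23+31 == 71, and consensus in doHeartbeat() requires
  // every member's heartbeat to advertise _cloud_hash == 71 as well.  Because
  // the hash is a plain sum, nodes can be discovered in any order.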

  // Before we start doing distributed writes... block until the cloud
  // stabilizes.  After we start doing distributed writes, it is an error to
  // change cloud shape - the distributed writes will be in the wrong place.
  static void lockCloud(Object reason) {
    if( _cloudLocked ) return;  // Fast-path cutout
    lockCloud_impl(reason);
  }
  static private void lockCloud_impl(Object reason) {
    // Any fast-path cutouts must happen en route to here.
    Log.info("Locking cloud to new members, because "+reason.toString());
    synchronized(Paxos.class) {
      // Woken by doHeartbeat()'s notifyAll(); doHeartbeat is static
      // synchronized, so it holds this same monitor.
      while( !_commonKnowledge )
        try { Paxos.class.wait(); } catch( InterruptedException ignore ) { }
      _cloudLocked = true;
      // Remove nodes which are not in the cluster (e.g. nodes from the
      // flatfile which are not actually used).
      if( H2O.isFlatfileEnabled() ) {
        for( H2ONode n : H2O.getFlatfile() ) {
          if( !n._heartbeat._client && !PROPOSED.containsKey(n._key) ) {
            Log.info("Flatfile::" + n._key + " not active in this cloud. Removing it from the list.");
            n.stopSendThread();
          }
        }
      }
    }
  }

  static int print( String msg, NonBlockingHashMap<H2Okey,H2ONode> p ) {
    return print(msg, p.values().toArray(new H2ONode[p.size()]));
  }
  static int print( String msg, H2ONode h2os[] ) { return print(msg, h2os, ""); }
  static int print( String msg, H2ONode h2os[], String msg2 ) {
    Log.debug(msg, Arrays.toString(h2os), msg2);
    return 0;                   // handy flow-coding return
  }
}
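
// Hedged usage sketch (the call sites below are illustrative, not taken from
// this file): a node feeds each incoming heartbeat to Paxos.doHeartbeat(sender),
// and any code path about to perform its first distributed write freezes the
// cloud shape with Paxos.lockCloud("starting distributed writes"), which
// blocks until _commonKnowledge is reached.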