package water;
import java.util.Arrays;
import water.H2ONode.H2Okey;
import water.nbhm.NonBlockingHashMap;
import water.util.Log;
/**
* (Not The) Paxos
*
* Used to define Cloud membership. See:
* http://en.wikipedia.org/wiki/Paxos_%28computer_science%29
*
* Detects and builds a "cloud" - a cooperating group of nodes, with mutual
* knowledge of each other. Basically tracks all the nodes that *this* node
* has ever heard of, and when *all* of the other nodes have all heard of each
* other, declares the situation as "commonKnowledge", and a Cloud. This
* algorithm differs from Paxos in a number of obvious ways:
* - it is not robust against failing nodes
* - it requires true global consensus (a Quorum of All)
* - it is vastly simpler than Paxos
*
* @author <a href="mailto:cliffc@h2o.ai">cliffc@h2o.ai</a>
* @version 1.0
*/
public abstract class Paxos {
  // Whether or not we have common knowledge
  public static volatile boolean _commonKnowledge = false;
  // Whether or not we're allowing distributed-writes.  The cloud is not
  // allowed to change shape once we begin writing.
  public static volatile boolean _cloudLocked = false;
  // Every node this node has heard of so far, keyed by the node's H2Okey.
  public static NonBlockingHashMap<H2Okey,H2ONode> PROPOSED = new NonBlockingHashMap<H2Okey,H2ONode>();

  // ---
  /**
   * Process a heartbeat from {@code h2o}.  This is a packet announcing what
   * Cloud that Node thinks is the current Cloud, plus other status bits.
   *
   * Runs under the {@code Paxos.class} monitor (static synchronized).  In order:
   * <ol>
   * <li>rejects the sender if its jar md5 check fails (unless md5skip is set);</li>
   * <li>if the sender is not yet in {@link #PROPOSED}: rejects it when the
   *     cloud is locked, otherwise drops common knowledge, adds the sender to
   *     PROPOSED and folds its hashCode into this node's cloud hash;</li>
   * <li>checks that every proposed node reports the same cloud hash, the same
   *     cloud size and common knowledge; only then sets
   *     {@link #_commonKnowledge} and wakes threads waiting in
   *     {@link #lockCloud()}.</li>
   * </ol>
   *
   * @param h2o the node whose heartbeat is being processed
   * @return always 0 (handy for flow-coding, see {@link #print(String, H2ONode[], String)})
   */
  static synchronized int doHeartbeat( H2ONode h2o ) {
    // Kill somebody if the jar files mismatch.  Do not attempt to deal with
    // mismatched jars.
    if( !H2O.OPT_ARGS.md5skip && !h2o._heartbeat.check_jar_md5() ) {
      if( H2O.CLOUD.size() > 1 ) {
        // We are already part of a multi-node cloud: the sender is the odd one out.
        Log.warn("Killing "+h2o+" because of H2O version mismatch (md5 differs).");
        UDPRebooted.T.mismatch.send(h2o);
      } else {
        // We are (at most) alone: give up ourselves rather than the sender.
        Log.err("Attempting to join "+h2o+" with an H2O version mismatch (md5 differs). (Is H2O already running?) Exiting.");
        H2O.exit(-1);
      }
      return 0;
    }

    // Never heard of this dude?  See if we want to kill him off for being cloud-locked.
    // Look the node up by key: that is the same key the putIfAbsent below uses,
    // so the membership test and the insert (and its assert) always agree, and
    // we avoid the linear value scan done by the legacy contains(value) call.
    if( !PROPOSED.containsKey(h2o._key) ) {
      if( _cloudLocked ) {
        Log.warn("Killing "+h2o+" because the cloud is no longer accepting new H2O nodes.");
        UDPRebooted.T.locked.send(h2o);
        return 0;
      }
      if( _commonKnowledge ) {
        _commonKnowledge = false; // No longer sure about things
        H2O.SELF._heartbeat._common_knowledge = false;
        Log.debug("Cloud voting in progress");
      }

      // Add to proposed set, update cloud hash
      H2ONode res = PROPOSED.putIfAbsent(h2o._key,h2o);
      assert res==null;
      H2O.SELF._heartbeat._cloud_hash += h2o.hashCode();

    } else if( _commonKnowledge ) {
      return 0;                 // Already know about you, nothing more to do
    }

    // The incrementally-maintained hash must match a from-scratch recompute.
    int chash = H2O.SELF._heartbeat._cloud_hash, dummy = 0;
    assert chash == (dummy=doHash()) : "mismatched hash4, HB="+chash+" full="+dummy;
    assert _commonKnowledge==false;

    // Do we have consensus now?
    H2ONode h2os[] = PROPOSED.values().toArray(new H2ONode[0]);
    for( H2ONode h2o2 : h2os )
      if( chash != h2o2._heartbeat._cloud_hash )
        return print("Heartbeat hashes differ, self=0x"+Integer.toHexString(chash)+" "+h2o2+"=0x"+Integer.toHexString(h2o2._heartbeat._cloud_hash)+" ",PROPOSED);

    // Hashes are same, so accept the new larger cloud-size
    H2O.CLOUD.set_next_Cloud(h2os,chash);

    // Demand everybody has rolled forward to same size before consensus
    boolean same_size=true;
    for( H2ONode h2o2 : h2os )
      same_size &= (h2o2._heartbeat._cloud_size == H2O.CLOUD.size());
    if( !same_size ) return 0;

    // We agree on hash and size; advertise that, then require it of everyone.
    H2O.SELF._heartbeat._common_knowledge = true;
    for( H2ONode h2o2 : h2os )
      if( !h2o2._heartbeat._common_knowledge ) {
        return print("Missing common knowledge from all nodes!" ,PROPOSED);
      }
    _commonKnowledge = true;    // Yup!  Have global consensus

    Paxos.class.notifyAll(); // Also, wake up a worker thread stuck in DKV.put
    Paxos.print("Announcing new Cloud Membership: ", H2O.CLOUD._memary);
    Log.info("Cloud of size ", H2O.CLOUD.size(), " formed ", H2O.CLOUD.toPrettyString());
    H2O.notifyAboutCloudSize(H2O.SELF_ADDRESS, H2O.API_PORT, H2O.CLOUD.size());
    return 0;
  }

  /**
   * Recompute the cloud hash from scratch as the sum of the hashCodes of all
   * nodes in {@link #PROPOSED}.  Used only to cross-check the incrementally
   * maintained heartbeat hash.
   *
   * @return the summed hash; asserted to be non-zero
   */
  static private int doHash() {
    int hash = 0;
    for( H2ONode h2o : PROPOSED.values() )
      hash += h2o.hashCode();
    assert hash != 0;
    return hash;
  }

  /**
   * Before we start doing distributed writes... block until the cloud
   * stabilizes.  After we start doing distributed writes, it is an error to
   * change cloud shape - the distributed writes will be in the wrong place.
   *
   * Blocks on the {@code Paxos.class} monitor until {@link #_commonKnowledge}
   * is set, then sets {@link #_cloudLocked}.  An interrupt received while
   * waiting does not abort the wait (the cloud must still be locked before
   * returning), but the thread's interrupt status is restored on exit so the
   * caller can still observe it.
   */
  static void lockCloud() {
    if( _cloudLocked ) return;  // Fast-path cutout
    boolean interrupted = false;
    synchronized(Paxos.class) {
      while( !_commonKnowledge ) {
        try { Paxos.class.wait(); }
        catch( InterruptedException ie ) { interrupted = true; } // Remember; keep waiting
      }
      _cloudLocked = true;
    }
    // Do not swallow the interrupt: re-assert it once we are done waiting.
    if( interrupted ) Thread.currentThread().interrupt();
  }

  /**
   * Debug-log {@code msg} followed by all the nodes currently in {@code p}.
   *
   * @param msg message prefix
   * @param p   map whose values are printed
   * @return always 0
   */
  static int print( String msg, NonBlockingHashMap<H2Okey,H2ONode> p ) {
    return print(msg,p.values().toArray(new H2ONode[0]));
  }

  /**
   * Debug-log {@code msg} followed by the given nodes.
   *
   * @param msg  message prefix
   * @param h2os nodes to print
   * @return always 0
   */
  static int print( String msg, H2ONode h2os[] ) { return print(msg,h2os,""); }

  /**
   * Debug-log {@code msg}, the given nodes, then {@code msg2}.
   *
   * @param msg  message prefix
   * @param h2os nodes to print (rendered with {@link Arrays#toString(Object[])})
   * @param msg2 message suffix
   * @return always 0, so callers can write {@code return print(...)}
   */
  static int print( String msg, H2ONode h2os[], String msg2 ) {
    Log.debug(msg,Arrays.toString(h2os),msg2);
    return 0;                   // handy flow-coding return
  }
}