package water;
/**
* Distributed Key/Value Store
*
* This class handles the distribution pattern for the K/V store: each Key has a
* HOME node that holds the master copy, while other nodes hold cached copies
* that are invalidated or refreshed as values change.
*
* @author <a href="mailto:cliffc@h2o.ai">cliffc@h2o.ai</a>
* @version 1.0
*/
public abstract class DKV {
// This put is a top-level user-update, and not a reflected or retried
// update; i.e., the user has initiated a change against the K/V store.
// This is a WEAK update: it is not strongly ordered with other updates.
static public Value put( Key key, Value val ) { return put(key,val,null); }
static public Value put( Key key, Value val, Futures fs ) { return put(key,val,fs,false);}
static public Value put( Key key, Value val, Futures fs, boolean dontCache ) {
assert key != null;
assert val==null || val._key == key:"non-matching keys " + ((Object)key).toString() + " != " + ((Object)val._key).toString();
while( true ) {
Value old = H2O.raw_get(key); // Raw-get: do not lazy-manifest if overwriting
Value res = DputIfMatch(key,val,old,fs,dontCache);
if( res == old ) return old; // PUT is globally visible now?
if( val != null && val._key != key ) key = val._key;
}
}
static public Value put( Key key, Iced v ) { return put(key,v,null); }
static public Value put( Key key, Iced v, Futures fs ) {
return put(key,new Value(key,v),fs);
}
static public Value put( Key key, Iced v, Futures fs, boolean dontCache ) {
return put(key,new Value(key,v),fs,dontCache);
}
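// Example usage (a sketch, not part of this class): several puts can share one
// Futures and block once for all of the pending distributed work.  The key names
// and the 'pojo1'/'pojo2' Iced instances below are hypothetical.
//   Futures fs = new Futures();
//   DKV.put(Key.make("k1"), pojo1, fs);
//   DKV.put(Key.make("k2"), pojo2, fs);
//   fs.blockForPending();   // wait for the replication/invalidation side of both puts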
// Remove this Key
static public Value remove( Key key ) { return remove(key,null); }
static public Value remove( Key key, Futures fs ) { return put(key,null,fs); }
// Do a PUT, and on success trigger replication. Some callers need the old
// value, and some callers need the Futures so we can block later to ensure
// the result is there. Many callers don't need either value. So rather
// than making a special object to return the pair of values, I've settled
// for a "callers pay" model: callers that want to block on the distributed
// part of the PUT pass in their own Futures, and the pending work is added
// to it. The return value is the prior Value on success, or the conflicting
// current Value if the compare-and-swap against 'old' failed.
static public Value DputIfMatch( Key key, Value val, Value old, Futures fs) {
return DputIfMatch(key, val, old, fs, false);
}
static public Value DputIfMatch( Key key, Value val, Value old, Futures fs, boolean dontCache ) {
// First: I must block repeated remote PUTs to the same Key until all prior
// ones complete - the home node needs to see these PUTs in order.
// Repeated PUTs on the home node are already ordered.
if( old != null && !key.home() ) old.startRemotePut();
// local update first, since this is a weak update
Value res = H2O.putIfMatch(key,val,old);
if( res != old ) // Failed?
return res; // Return fail value
// Check for trivial success: no need to invalidate remotes if the new
// value equals the old.
if( old != null && old == val ) return old; // Trivial success?
if( old != null && val != null && val.equals(old) )
return old; // Less trivial success, but no network i/o
// Before we start doing distributed writes... block until the cloud
// stabilizes. After we start doing distributed writes, it is an error to
// change cloud shape - the distributed writes will be in the wrong place.
Paxos.lockCloud();
// The 'D' part of DputIfMatch: do Distribution.
// If PUT is on HOME, invalidate remote caches
// If PUT is on non-HOME, replicate/push to HOME
if( key.home() ) { // On HOME?
if( old != null ) old.lockAndInvalidate(H2O.SELF,fs);
} else { // On non-HOME?
// Start a write, but do not block for it
TaskPutKey.put(key.home_node(),key,val,fs, dontCache);
}
return old;
}
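// Example usage (a sketch): DputIfMatch behaves like a distributed compare-and-swap,
// so an optimistic update retries until its 'old' snapshot matches.  'updated()' is a
// hypothetical helper that builds the new Value from the old one.
//   while( true ) {
//     Value old = DKV.get(k);
//     Value res = DKV.DputIfMatch(k, updated(old), old, null);
//     if( res == old ) break;   // success: our value is now installed
//     // else we lost the race; 'res' holds the conflicting current Value, so retry
//   }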
// Stall until all existing writes have completed.
// Used to order successive writes.
static public void write_barrier() {
for( H2ONode h2o : H2O.CLOUD._memary )
for( RPC rpc : h2o.tasks() )
if( rpc._dt instanceof TaskPutKey || rpc._dt instanceof Atomic )
rpc.get();
}
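// Example usage (a sketch): write_barrier orders two otherwise-weak writes, e.g.
// making sure a data Key is fully written before a "done" flag Key is published.
//   DKV.put(dataKey, dataValue);   // weak write; replication may still be in flight
//   DKV.write_barrier();           // stall until all outstanding TaskPutKey/Atomic RPCs finish
//   DKV.put(doneKey, doneValue);   // readers that see 'doneKey' should also see 'dataKey'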
// User-Weak-Get a Key from the distributed cloud.
static public Value get( Key key, int len, int priority ) {
while( true ) {
// Read the Cloud once per get-attempt, to keep a consistent snapshot.
H2O cloud = H2O.CLOUD;
Value val = H2O.get(key);
// Hit in local cache?
if( val != null ) {
if( len > val._max ) len = val._max; // See if we have enough data cached locally
if( len == 0 || val.rawMem() != null || val.rawPOJO() != null || val.isPersisted() ) return val;
assert !key.home(); // Master must have *something*; we got nothing & need to fetch
}
// While in theory we could read from any replica, we always need to
// inform the home-node that his copy has been Shared... in case it
// changes and he needs to issue an invalidate. For now, always and only
// fetch from the Home node.
H2ONode home = cloud._memary[key.home(cloud)];
// If we missed in the cache AND we are the home node, then there is
// no V for this K (or we have a disk failure).
if( home == H2O.SELF ) return null;
// Pending write to same key from this node? Take that write instead.
// Moral equivalent of "peeking into the cpu store buffer". Can happen,
// e.g., because a prior 'put' of a null (i.e. a remove) is still mid-
// send to the remote, so the local get has missed above, but a remote
// get still might 'win' because the remote 'remove' is still in-progress.
for( RPC<?> rpc : home.tasks() )
if( rpc._dt instanceof TaskPutKey ) {
assert rpc._target == home;
TaskPutKey tpk = (TaskPutKey)rpc._dt;
Key k = tpk._key;
if( k != null && key.equals(k) )
return tpk._xval;
}
return TaskGetKey.get(home,key,priority);
}
}
static public Value get( Key key ) { return get(key,Integer.MAX_VALUE,H2O.GET_KEY_PRIORITY); }
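// Example usage (a sketch): a weak get, where a null result means the Key has no
// mapping anywhere in the cloud.  The cast target 'MyPojo' is hypothetical, and the
// example assumes Value.get() deserializes the cached POJO.
//   Value v = DKV.get(k);                      // may fetch from the Key's HOME node
//   MyPojo p = (v == null) ? null : (MyPojo)v.get();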
}