package water;

/** A Distributed Key/Value Store.
 * <p>
 * Functions to Get and Put Values into the K/V store by Key.
 * <p>
 * The <em>Java Memory Model</em> is observed for all operations.  Reads/Gets
 * will block until the data is available, and will pull from the local cache
 * if possible.
 * <p>
 * Writes/Puts do not block directly, but take a Futures argument.  Typically
 * a Put requires some kind of coherency traffic and perhaps multiple network
 * hops.  The Futures argument can be used to tell when a given Put (or a
 * collection of them) has completed.  Calls to Put without a Futures merely
 * make one internally and block till completion.
 * <p>
 * <em><b>Performance Concerns</b></em>
 * <p>
 * Keys can be cached locally, or not.  Cached reads take no more time than a
 * NonBlockingHashMap lookup (typically a hundred nanos or so).  Remote reads
 * require the serialized POJO to pass over the network, plus a little bit of
 * management logic; time is typically completely determined by network speeds
 * and object size.
 * <p>
 * Local Puts (one where the Key is homed on this Node) update directly in the
 * local K/V store, taking no more time than a NonBlockingHashMap write.
 * Remote Puts will serialize and ship data over the wire, taking time related
 * to object size and network speed.
 * <p>
 * Blocking for a Put to complete takes longer, requiring all invalidates to
 * have happened and perhaps a response from the home node (multiple
 * network-hop latencies); the invalidates and response are typically a single
 * UDP packet, but must make a round-trip.
 * <p>
 * Puts to unrelated Keys can all proceed in parallel, and will typically be
 * network bound, and can be blocked for in bulk by a single Futures argument.
 * <p>
 * Puts to the same Key will be serialized (the first Put will fully complete,
 * including all invalidates, before a 2nd Put to the same Key from the same
 * Node can proceed).  Assuming no other Node does a Get on this Key, no
 * invalidates will be required for the 2nd and later Puts and they will need
 * only the single round-trip.
 * <p>
 * Note that this class works on one Key at a time, and does not understand
 * composite Key structures (such as a {@link water.fvec.Vec} Key and all its
 * related {@link water.fvec.Chunk} Keys) - instead it serves as the building
 * block for such structures.
 * <p>
 * @author <a href="mailto:cliffc@h2o.ai"></a>
 * @version 1.0
 */
public abstract class DKV {

  // ---------------------------------------------------------------------
  // Put overloads.  All of them funnel into the 4-arg
  // put(Key,Value,Futures,boolean), which in turn drives DputIfMatch.
  // "Blocking" variants make a private Futures and block on it; "Caching"
  // variants pass dontCache=false so the Value is kept in the local store.
  // ---------------------------------------------------------------------

  /** Make the mapping <em>key -> v</em>.  Blocking, caching. */
  static public Value put( Key key, Iced v ) { return put(key,new Value(key,v)); }

  /** Make the mapping <em>key -> v</em>.  Caching. */
  static public Value put( Key key, Iced v, Futures fs ) { return put(key,new Value(key,v),fs); }

  /** Make the mapping <em>key -> v</em>. */
  static public Value put( Key key, Iced v, Futures fs,boolean dontCache ) { return put(key,new Value(key,v),fs,dontCache); }

  /** Make the mapping <em>keyed._key -> keyed</em>.  Blocking, caching. */
  static public Value put( Keyed keyed ) { return put(keyed._key,new Value(keyed._key,keyed)); }

  /** Make the mapping <em>keyed._key -> keyed</em>.  Caching. */
  static public Value put( Keyed keyed, Futures fs ) { return put(keyed._key,new Value(keyed._key,keyed),fs); }

  /** Make the mapping <em>key -> val</em>.  Blocking, caching.
   *  Creates a private Futures and blocks until the Put (including all
   *  coherency traffic) has completed globally. */
  static public Value put( Key key, Value val ) {
    Futures fs = new Futures();
    Value old = put(key,val,fs);
    // Block here so the caller observes a fully-completed, globally-visible Put.
    fs.blockForPending();
    return old;
  }

  /** Make the mapping <em>key -> val</em>.  Caching. */
  static public Value put( Key key, Value val, Futures fs ) { return put(key,val,fs,false);}

  /** Make the mapping <em>key -> val</em>.
   *  Classic lock-free update loop: snapshot the current mapping, attempt a
   *  compare-and-swap via DputIfMatch, retry on CAS failure (some other
   *  writer raced us and changed the mapping between snapshot and swap). */
  static public Value put( Key key, Value val, Futures fs, boolean dontCache ) {
    assert key != null;
    assert val==null || val._key == key:"non-matching keys " + key + " != " + val._key;
    while( true ) {
      Value old = Value.STORE_get(key); // Raw-get: do not lazy-manifest if overwriting
      Value res = DputIfMatch(key,val,old,fs,dontCache);
      if( res == old ) return old; // PUT is globally visible now?
      // NOTE(review): the assert above guarantees val._key == key when
      // asserts are enabled, so this re-aim of 'key' looks like it can only
      // fire in assert-disabled builds - confirm intent before touching.
      if( val != null && val._key != key ) key = val._key;
    }
  }

  /** Remove any mapping for <em>key</em>.  Blocking.
   *  Implemented as a Put of null (a "null Value" tombstone on home). */
  static public Value remove( Key key ) { return put(key,null); }

  /** Remove any mapping for <em>key</em>. */
  static public Value remove( Key key, Futures fs ) { return put(key,null,fs); }

  /** Default caching call to {@link #DputIfMatch(Key,Value,Value,Futures,boolean)} */
  static public Value DputIfMatch( Key key, Value val, Value old, Futures fs) { return DputIfMatch(key, val, old, fs, false); }

  /** Update the mapping for Key <em>key</em>, from Value <em>old</em> to Value
   *  <em>val</em>.  Fails if the Key is not mapped to <em>old</em>, returning
   *  the Value it IS mapped to.  Takes a required {@link Futures}, which can
   *  be used to note when the operation has completed globally.  If the
   *  <em>dontCache</em> hint is passed in, the Value <em>val</em> is NOT
   *  cached locally, useful when streaming a large dataset through and
   *  expecting most of the data to eventually be homed remotely.
   *  <p>
   *  Additionally, this operation <em>locks</em> the Cloud to the current size.
   *  No new Nodes may join after a Key is successfully entered into the DKV.
   *  <p>
   *  @return The Value this Key used to be mapped to; if the returned
   *  Value.equals(old) then the update succeeded, else it failed. */
  static public Value DputIfMatch( Key key, Value val, Value old, Futures fs, boolean dontCache ) {
    // For debugging where keys are created from
    // try { System.err.flush(); System.err.println(key); Thread.dumpStack(); System.err.flush(); } catch (Throwable t) {}

    // First: I must block repeated remote PUTs to the same Key until all prior
    // ones complete - the home node needs to see these PUTs in order.
    // Repeated PUTs on the home node are already ordered.
    if( old != null && !key.home() ) old.startRemotePut();

    // local update first, since this is a weak update
    // On home, a remove (null val) is recorded as an explicit null-Value so
    // the store still CASes against 'old' rather than racing a bare delete.
    if( val == null && key.home() ) val = Value.makeNull(key);
    Value res = H2O.putIfMatch(key,val,old);
    if( res != old )            // Failed?
      return res;               // Return fail value

    // Check for trivial success: no need to invalidate remotes if the new
    // value equals the old.
    // NOTE(review): these System.out.println calls look like leftover debug
    // output on a hot path - presumably they should go through the project's
    // logging facility or be removed; confirm before changing (stdout output
    // is observable behavior).
    if( old != null && old == val ) {
      System.out.println("No invalidate, new==old");
      return old;               // Trivial success?
    }
    if( old != null && val != null && val.equals(old) ) {
      System.out.println("No invalidate, new.equals(old)");
      return old;               // Less trivial success, but no network i/o
    }

    // Before we start doing distributed writes... block until the cloud
    // stabilizes.  After we start doing distributed writes, it is an error to
    // change cloud shape - the distributed writes will be in the wrong place.
    Paxos.lockCloud(key);

    // The 'D' part of DputIfMatch: do Distribution.
    // If PUT is on HOME, invalidate remote caches
    // If PUT is on non-HOME, replicate/push to HOME
    if( key.home() ) {          // On HOME?
      if( old != null ) old.lockAndInvalidate(H2O.SELF,val,fs);
      else val.lowerActiveGetCount(null); // Remove initial read-lock, accounting for pending inv counts
    } else {                    // On non-HOME?
      // Start a write, but do not block for it
      TaskPutKey.put(key.home_node(),key,val,fs, dontCache);
    }
    return old;
  }

  // Stall until all existing writes have completed.
  // Used to order successive writes.
  /** Stall until all outstanding distributed writes (TaskPutKey and Atomic
   *  RPCs to any node in the cloud) have completed.  Blocking. */
  static void write_barrier() {
    for( H2ONode h2o : H2O.CLOUD._memary )
      // NOTE(review): raw RPC type here; parameterizing would need the task
      // type from h2o.tasks(), which is not visible in this file.
      for( RPC rpc : h2o.tasks() )
        if( rpc._dt instanceof TaskPutKey || rpc._dt instanceof Atomic )
          rpc.get(); // Block for this write to finish
  }

  /** Get the POJO mapped to the Key formed from <em>key</em>, or null if the
   *  string is null or there is no mapping.  Blocking.
   *  NOTE(review): unchecked cast to T - the caller's requested type is not
   *  verified at runtime here. */
  static public <T extends Iced> T getGet(String key) { return key == null ? null : (T)getGet(Key.make(key)); }

  /** Get the POJO mapped to <em>key</em>, or null if key is null or there is
   *  no mapping.  Blocking; lazily deserializes the Value into its POJO. */
  static public <T extends Iced> T getGet(Key key) {
    if (null == key) return null;
    Value v = get(key);
    if (null == v) return null;
    return v.get();
  }

  /** Return the {@link Value} mapped to Key <em>key</em>, or null if no
   *  mapping.  Blocks till data available, always caches.
   *  @return The {@link Value} mapped to Key <em>key</em>, or null if no
   *  mapping. */
  static public Value get ( Key key ) { return get(key,true ); }

  /** Prefetch and cache the Value for Key <em>key</em>.  Non-blocking. */
  static public void prefetch( Key key ) { get(key,false); }

  /** Return the {@link Value} mapped to Key formed by <em>key_name</em>, or
   *  null if no mapping.  Blocks till data available, always caches.
   *  @return The {@link Value} mapped to Key formed by <em>key_name</em>, or
   *  null if no mapping. */
  static public Value get ( String key_name) { return get(Key.make(key_name),true ); }

  /** Prefetch and cache the Value for Key formed by <em>key_name</em>.
   *  Non-blocking. */
  static public void prefetch( String key_name ) { get(Key.make(key_name),false); }

  /** Shared implementation for get/prefetch: local-cache hit, then
   *  pending-write peek, then remote fetch from the Key's home node.
   *  @param blocking if true, block for the remote fetch; if false, start the
   *         fetch and return null immediately (prefetch). */
  static private Value get( Key key, boolean blocking ) {
    // Read the Cloud once per put-attempt, to keep a consistent snapshot.
    H2O cloud = H2O.CLOUD;
    Value val = Value.STORE_get(key);
    // Hit in local cache?  Only usable if it actually carries payload
    // (raw bytes, a manifested POJO, or a persisted copy).
    if( val != null ) {
      if( val.rawMem() != null || val.rawPOJO() != null || val.isPersisted() ) return val;
      assert !key.home(); // Master must have *something*; we got nothing & need to fetch
    }

    // While in theory we could read from any replica, we always need to
    // inform the home-node that his copy has been Shared... in case it
    // changes and he needs to issue an invalidate.  For now, always and only
    // fetch from the Home node.
    H2ONode home = cloud._memary[key.home(cloud)];

    // If we missed in the cache AND we are the home node, then there is
    // no V for this K (or we have a disk failure).
    if( home == H2O.SELF ) return null;

    // Pending write to same key from this node?  Take that write instead.
    // Moral equivalent of "peeking into the cpu store buffer".  Can happen,
    // e.g., because a prior 'put' of a null (i.e. a remove) is still mid-
    // send to the remote, so the local get has missed above, but a remote
    // get still might 'win' because the remote 'remove' is still in-progress.
    TaskPutKey tpk = home.pendingPutKey(key);
    if( tpk != null ) return tpk._xval == null || tpk._xval.isNull() ? null : tpk._xval;

    // Get data "the hard way": issue a remote TaskGetKey RPC to home.
    RPC<TaskGetKey> tgk = TaskGetKey.start(home,key);
    return blocking ? TaskGetKey.get(tgk) : null; // Block for it, or not
  }
}