package water;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import jsr166y.ForkJoinPool;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;
import water.util.StringUtils;
/** The core Value stored in the distributed K/V store, used to cache Plain Old
* Java Objects, and maintain coherency around the cluster. It contains an
* underlying byte[] which may be spilled to disk and freed by the {@link
* MemoryManager}, which is the {@link Iced} serialized version of the POJO,
* and a cached copy of the POJO itself.
* <p>
* Requests to extract the POJO from the Value object first try to return the
* cached POJO. If that is missing, then they will re-inflate the POJO from
* the {@link Iced} byte[]. If that is missing it is only because the byte[]
* was swapped to disk by the {@link Cleaner}. It will be reloaded from disk
* and then inflated as normal.
* <p>
* The H2O {@link DKV} supports the full <em>Java Memory Model</em> coherency
* but only with Gets and Puts. Normal Java updates to the cached POJO are
* local-node visible (due to X86 and Java coherency rules) but NOT cluster-wide
* visible until a Put completes after the update.
* <p>
* By the same token, updates to the POJO are not reflected in the serialized
* form nor the disk-spill copy unless a Put is triggered. As long as a local
* thread keeps a pointer to the POJO, they can update it at will. If they
* wish to recover the POJO from the DKV at a later time with all updates
* intact, they <em>must</em> do a final Put after all updates.
* <p>
* Value objects maintain the needed coherency state, as well as any cached
* copies, plus a bunch of utility and convenience functions.
*/
public final class Value extends Iced implements ForkJoinPool.ManagedBlocker {
/** The Key part of a Key/Value store. Transient, because the Value is
* typically found via its Key, and so the Key is available before we get
* the Value and does not need to be passed around the wire. Not final,
* because Keys are interned slowly (for faster compares) and periodically a
* Value's Key will be updated to an interned but equivalent Key.
* <p>
* Should not be set by any user code. */
public transient Key _key;
// ---
// Type-id of serialized object; see TypeMap for the list.
// Might be a primitive array type, or an Iced POJO
private short _type;
/** @return the TypeMap type-id of the serialized object, widened to an int. */
public int type() { return _type; }
/** Class name of the embedded POJO, without needing an actual POJO.
* @return the name {@link TypeMap#className} reports for this Value's type-id */
public String className() { return TypeMap.className(_type); }
// Max size of Values before we start asserting.
// Sizes around this big, or larger are probably true errors.
// In any case, they will cause issues with both GC (giant pause times on
// many collectors) and I/O (long term blocking of TCP I/O channels to
// service a single request, causing starvation of other requests).
// NOTE: the size checks in this class are written "size < MAX", so with MAX
// at Integer.MAX_VALUE they only trip on a size of exactly Integer.MAX_VALUE.
public static final int MAX = Integer.MAX_VALUE; //DeepWater models can contain a single byte[] state as large as 3GB
/** Size of the serialized wad of bits. Values are wads of bits; known small
* enough to 'chunk' politely on disk, or fit in a Java heap (larger Vecs
* are built via Chunks) but (much) larger than a UDP packet. Values can
* point to either the disk or ram version or both. There's no compression
* smarts (done by the big data Chunks) nor de-dup smarts (done by the
* nature of a K/V). This is just a local placeholder for some user bits
* being held at this local Node.
* <p>
* Set from the byte[] length wherever the bytes are at hand (constructors,
* {@code read_impl}); otherwise taken from the caller-supplied size. */
public int _max;
// ---
// An array of this Value when cached in DRAM, or NULL if not cached. The
// contents of _mem are immutable (Key/Value mappings can be changed by an
// explicit PUT action). Cleared to null asynchronously by the memory
// manager (but only if persisted to some disk or in a POJO). Can be filled
// in by reloading from disk, or by serializing a POJO.
private volatile byte[] _mem;
// Raw peek at the cached bytes: returns the current _mem without trying to
// rebuild or reload it, so the result may be null.
final byte[] rawMem() { return _mem; }
// ---
// A POJO version of the _mem array, or null if the _mem has not been
// serialized or if _mem is primitive data and not a POJO. Cleared to null
// asynchronously by the memory manager (but only if persisted to some disk,
// or in the _mem array). Can be filled in by deserializing the _mem array.
// NOTE THAT IF YOU MODIFY any fields of a POJO that is part of a Value,
// - this is NOT the recommended programming style,
// - those changes are visible to all CPUs on the writing node,
// - but not to other nodes, and
// - the POJO might be dropped by the MemoryManager and reconstituted from
// disk and/or the byte array back to its original form, losing your changes.
private volatile Freezable _pojo;
// Raw peek at the cached POJO: returns the current _pojo without trying to
// inflate it from bytes, so the result may be null.
Freezable rawPOJO() { return _pojo; }
/** Invalidate byte[] cache. Only used to eagerly free memory, for data
* which is expected to be read-once.
* <p>
* Only nulls the {@code _mem} reference; the POJO cache and any persisted
* copy are left untouched. */
public final void freeMem() {
// Sanity check (assert-only): the bytes should still be recoverable from
// the backend or the cached POJO, unless this is a Chunk key.
assert isPersisted() || _pojo != null || _key.isChunkKey();
_mem = null;
}
/** Invalidate POJO cache. Only used to eagerly free memory, for data
* which is expected to be read-once.
* <p>
* Only nulls the {@code _pojo} reference; the byte[] cache and any persisted
* copy are left untouched. */
public final void freePOJO() {
// Sanity check (assert-only): the POJO should still be re-inflatable from
// the backend or from the cached bytes.
assert isPersisted() || _mem != null;
_pojo = null;
}
/** Fast-path accessor for the serialized bytes - final method for speed.
 *  Returns the cached byte[] when present; otherwise rebuilds it, in order
 *  of preference, from the cached POJO, as an empty array when the Value has
 *  zero size, or by loading from the persistence backend.  The rebuilt array
 *  is cached in {@code _mem} before being returned.  Intended never to
 *  return NULL.
 *  @return byte[] holding the serialized POJO */
public final byte[] memOrLoad() {
  final byte[] cached = _mem;            // Single volatile read
  if( cached != null ) return cached;
  final Freezable obj = _pojo;           // Single volatile read
  final byte[] rebuilt;
  if( obj != null )     rebuilt = obj.asBytes();   // Serialize the POJO
  else if( _max == 0 )  rebuilt = new byte[0];     // Zero-sized Value
  else                  rebuilt = loadPersist();   // Pull back from the backend
  _mem = rebuilt;                        // Cache for the next caller
  return rebuilt;
}
// An empty shell of a Value: it claims a non-zero size but holds no local
// bytes, no local POJO and is not persisted.  The Value is still "real";
// any attempt to look at it will require a remote fetch.
final boolean isEmpty() {
  if( _max <= 0 )     return false;   // Zero-sized: nothing to fetch
  if( _mem  != null ) return false;   // Bytes are local
  if( _pojo != null ) return false;   // POJO is local
  return !isPersisted();              // Nothing on the backend either?
}
/** Fast-path accessor for the POJO as an {@link Iced} subclass - final
 *  method for speed.  Records the access time, then returns the cached POJO
 *  if there is one; otherwise makes a fresh instance of this Value's type,
 *  reloads it from the serialized bytes (see {@link #memOrLoad}) and caches
 *  the result.  Intended never to return NULL.
 *  @return The POJO, probably the cached instance. */
public final <T extends Iced> T get() {
  touch();
  final Iced cached = (Iced)_pojo;      // Single volatile read
  if( cached == null ) {                // Cache miss: inflate from bytes
    final Iced blank = TypeMap.newInstance(_type);
    return (T)(_pojo = blank.reloadFromBytes(memOrLoad()));
  }
  return (T)cached;
}
/** Fast-path accessor for the POJO as a {@link Freezable} - final method for
 *  speed.  Delegates to {@link #getFreezable()} and, when assertions are
 *  enabled, additionally checks that the result is an instance of the
 *  requested class.
 *  @param fc the class the POJO is expected to be assignable to
 *  @return The POJO, probably the cached instance. */
public final <T extends Freezable> T get(Class<T> fc) {
  final T inflated = getFreezable();
  assert fc.isAssignableFrom(inflated.getClass());
  return inflated;
}
/** Fast-path accessor for the POJO as a {@link Freezable} - final method for
 *  speed.  Records the access time, then returns the cached POJO if there is
 *  one; otherwise makes a fresh Freezable of this Value's type, reloads it
 *  in place from the serialized bytes (see {@link #memOrLoad}) and caches it.
 *  Intended never to return NULL.
 *  @return The POJO, probably the cached instance. */
public final <T extends Freezable> T getFreezable() {
  touch();
  final Freezable cached = _pojo;       // Single volatile read
  if( cached != null ) return (T)cached;
  // Cache miss: build an empty instance and fill it from the bytes
  final Freezable fresh = TypeMap.newFreezable(_type);
  fresh.reloadFromBytes(memOrLoad());
  _pojo = fresh;
  return (T)fresh;
}
// ---
// Time of last access to this value, in System.currentTimeMillis() units.
// Initialized to the construction time; not serialized (transient).
transient long _lastAccessedTime = System.currentTimeMillis();
// Stamp the access time with "now"; called by the POJO getters and read_impl.
private void touch() {_lastAccessedTime = System.currentTimeMillis();}
// Exposed and used for testing only; used to trigger premature cleaning/disk-swapping
void touchAt(long time) {_lastAccessedTime = time;}
// ---
// Backend persistence info. 3 bits are reserved for 8 different flavors of
// backend storage. 1 bit for whether or not the latest _mem field is
// entirely persisted on the backend storage. The low 3 bits are final. The
// other bit monotonically changes from 0->1. The deleted bit ALSO
// monotonically changes 0->1. These two bits cannot be combined without the
// use of atomic operations.
private volatile byte _persist; // 1 bit of disk/notdisk; 3 bits of backend flavor
// Backend flavor codes; these live in the low 3 bits of _persist.
public final static byte ICE = 1<<0; // ICE: distributed local disks
public final static byte HDFS= 2<<0; // HDFS: backed by Hadoop cluster
public final static byte S3 = 3<<0; // Amazon S3
public final static byte NFS = 4<<0; // NFS: Standard file system
public final static byte TCP = 7<<0; // TCP: For profile purposes, not a storage system
// Mask selecting the low 3 (backend flavor) bits of _persist
private final static byte BACKEND_MASK = (8-1);
// The backend flavor code of this Value, with the on-disk bit stripped
final byte backend() { return (byte)(_persist&BACKEND_MASK); }
// Convenience tests of the backend flavor
boolean onICE (){ return (backend()) == ICE; }
private boolean onHDFS(){ return (backend()) == HDFS; }
private boolean onNFS (){ return (backend()) == NFS; }
private boolean onS3 (){ return (backend()) == S3; }
// Manipulate the on-disk bit (bit 3 of _persist)
private final static byte NOTdsk = 0<<3; // latest _mem is persisted or not
private final static byte ON_dsk = 1<<3;
/** Check if the backing byte[] has been saved-to-disk
* @return true if the on-disk bit of the persist byte is set */
public final boolean isPersisted() { return (_persist&ON_dsk)!=0; }
/** Set the on-disk bit. A plain read-modify-write of a volatile field, so
* concurrent callers are not safe against each other. */
public final void setDsk() { _persist |= ON_dsk; } // note: not atomic, but only monotonically set bit
// Deleted flag; kept in its own field so it never needs to be merged into
// _persist (see the note above about atomic operations).
private volatile byte _deleted; // 1 bit of deleted
/** @return true once {@link #setDel} has been called */
public final boolean isDeleted() { return _deleted != 0; }
/** Flag this Value as deleted; nothing in this class clears the flag again. */
public final void setDel() { _deleted=1; } // note: not atomic, but only monotonically set bit
/** Best-effort store complete Values to disk.
* <p>
* Does nothing if the Value is already flagged deleted or persisted.
* Otherwise writes through the persist manager, sets the on-disk bit, and
* then re-checks the deleted flag so that a delete which raced with the
* write still removes the freshly written data.
* @throws java.io.IOException declared for failures of the backend store call */
void storePersist() throws java.io.IOException {
// State is (persisted-bit, deleted-bit):
// 00 then start writing
// 01 delete requested; do not write
// 10 already written; do nothing
// 11 already written & deleted; do nothing
if( isDeleted() ) return; // 01 and 11 cases
if( isPersisted() ) return; // 10 case
H2O.getPM().store(backend(), this); // Write to disk
// 00 -> 10 expected, set write bit
// 10 assert; only Cleaner writes
// 01 delete-during-write; delete again
// 11 assert; only Cleaner writes
assert !isPersisted(); // Only Cleaner writes
setDsk(); // Not locked, not atomic, so can only called by one thread: Cleaner
// Order matters: removePersist() sets the deleted flag BEFORE testing the
// persisted bit, and this code sets the persisted bit BEFORE testing the
// deleted flag, so at least one side sees the other's update and deletes.
if( isDeleted() ) // Check del bit AFTER setting persist bit; close race with deleting user thread
H2O.getPM().delete(backend(), this); // Possibly nothing to delete (race with writer)
}
/** Remove dead Values from disk.
* <p>
* Only acts on ICE-backed Values that are not already flagged deleted. Sets
* the deleted flag first, and only asks the persist manager to delete when
* the on-disk bit is already set. The in-memory caches are deliberately
* left alone. */
public void removePersist() {
// do not yank memory, as we could have a racing get hold on to this
// free_mem();
// State is (persisted-bit, deleted-bit):
// 00 -> 01 try to delete (racing, probably nothing to delete)
// 01 double delete; do nothing
// 10 -> 11 delete
// 11 double delete; do nothing
if( !onICE() ) return; // Wrong filestore?
if( isDeleted() ) return; // Already deleted?
// Order matters: pairs with storePersist(), which sets the persisted bit
// and THEN tests the deleted flag.
setDel(); // Set del bit BEFORE testing isPersist
if( !isPersisted() ) return;// Nothing there
H2O.getPM().delete(backend(), this); // Possibly nothing to delete (race with writer)
}
/** Load the bytes of a completely persisted Value back from its backend.
 *  Expects (assert-only) that the Value is persisted on entry and not
 *  flagged deleted once the load has finished.  An {@link IOException} from
 *  the backend is logged and rethrown via {@code Log.throwErr}.
 *  @return the bytes handed back by the persist manager */
byte[] loadPersist() {
  // State is (persisted-bit, deleted-bit):
  //  00 assert: not written yet
  //  01 assert: load-after-delete
  //  10 expected; read
  //  11 assert: load-after-delete
  assert isPersisted();
  final byte[] bits;
  try {
    bits = H2O.getPM().load(backend(), this);
  } catch( IOException ioe ) {
    throw Log.throwErr(ioe);
  }
  assert !isDeleted();          // Race in user-land: load-after-delete
  return bits;
}
/** Short name of the backend this Value is persisted on.
 *  @return see {@link #nameOfPersist(int)} */
String nameOfPersist() {
  final byte flavor = backend();
  return nameOfPersist(flavor);
}
/** One of ICE, HDFS, S3, NFS or TCP, according to the given backend code.
 *  @param x a backend flavor code, e.g. the result of {@code backend()}
 *  @return Short String of the persistence name, or null for an unknown code */
public static String nameOfPersist(int x) {
  if( x == ICE  ) return "ICE";
  if( x == HDFS ) return "HDFS";
  if( x == S3   ) return "S3";
  if( x == NFS  ) return "NFS";
  if( x == TCP  ) return "TCP";
  return null;                  // Not a known backend flavor
}
/** Check if the class registered for a type-id is a subtype of the given
 *  class.  Does not require the POJO: the check is made against the class of
 *  the instance {@link TypeMap#theFreezable} returns for the type-id.
 *  @param type a TypeMap type-id
 *  @param clz the candidate superclass or interface
 *  @return True if the type-id's class is assignable to {@code clz}. */
public static boolean isSubclassOf(int type, Class<?> clz) {
  // Class<?> instead of the raw Class: same erasure, so existing callers are
  // unaffected, but no raw-type warning.
  return clz.isAssignableFrom(TypeMap.theFreezable(type).getClass());
}
/** Check if the Value's POJO is a {@link Key} subtype.  Does not require the POJO.
 *  @return True if the Value's POJO is a {@link Key} subtype; always false for a raw byte[] Value. */
public boolean isKey() {
  if( _type == TypeMap.PRIM_B ) return false;
  return TypeMap.theFreezable(_type) instanceof Key;
}
/** Check if the Value's POJO is a {@link Frame} subtype.  Does not require the POJO.
 *  @return True if the Value's POJO is a {@link Frame} subtype; always false for a raw byte[] Value. */
public boolean isFrame() {
  if( _type == TypeMap.PRIM_B ) return false;
  return TypeMap.theFreezable(_type) instanceof Frame;
}
/** Check if the Value's POJO is a {@link water.fvec.Vec.VectorGroup} subtype.  Does not require the POJO.
 *  @return True if the Value's type-id is {@code TypeMap.VECGROUP}. */
public boolean isVecGroup() {
  return TypeMap.VECGROUP == _type;
}
/** Check if the Value's POJO is a {@link water.fvec.Vec.ESPC} subtype.  Does not require the POJO.
 *  @return True if the Value's type-id is {@code TypeMap.ESPCGROUP}. */
public boolean isESPCGroup() {
  return TypeMap.ESPCGROUP == _type;
}
/** Check if the Value's POJO is a {@link Lockable} subtype.  Does not require the POJO.
 *  @return True if the Value's POJO is a {@link Lockable} subtype; always false for a raw byte[] Value. */
public boolean isLockable() {
  if( _type == TypeMap.PRIM_B ) return false;
  return TypeMap.theFreezable(_type) instanceof Lockable;
}
/** Check if the Value's POJO is a {@link Vec} subtype.  Does not require the POJO.
 *  @return True if the Value's POJO is a {@link Vec} subtype; always false for a raw byte[] Value. */
public boolean isVec() {
  if( _type == TypeMap.PRIM_B ) return false;
  return TypeMap.theFreezable(_type) instanceof Vec;
}
/** Check if the Value's POJO is a {@link hex.Model} subtype.  Does not require the POJO.
 *  @return True if the Value's POJO is a {@link hex.Model} subtype; always false for a raw byte[] Value. */
public boolean isModel() {
  if( _type == TypeMap.PRIM_B ) return false;
  return TypeMap.theFreezable(_type) instanceof hex.Model;
}
/** Check if the Value's POJO is a {@link Job} subtype.  Does not require the POJO.
 *  @return True if the Value's POJO is a {@link Job} subtype; always false for a raw byte[] Value. */
public boolean isJob() {
  if( _type == TypeMap.PRIM_B ) return false;
  return TypeMap.theFreezable(_type) instanceof Job;
}
/** The class of the instance {@link TypeMap#theFreezable} returns for this
 *  Value's type-id.  Does not require the POJO.
 *  @return that instance's runtime class */
public Class<? extends Freezable> theFreezableClass() {
  return TypeMap.theFreezable(_type).getClass();
}
// --------------------------------------------------------------------------
/** Construct a Value from all parts; not needed for most uses.  This special
 *  constructor is used by {@link water.fvec} to build Value objects over
 *  already-existing Files, so that the File contents will be lazily
 *  swapped-in as the Values are first used.
 *  @param k    the Key this Value is stored under
 *  @param max  size of the serialized bytes
 *  @param mem  the serialized bytes, or null if not in memory; when given,
 *              its length is expected (assert-only) to equal {@code max}
 *  @param type TypeMap type-id of the serialized object
 *  @param be   persist byte: backend flavor, plus the on-disk bit for
 *              non-ICE backends */
public Value(Key k, int max, byte[] mem, short type, byte be ) {
  assert mem==null || mem.length==max;
  assert max < MAX : "Value size=0x"+Integer.toHexString(max);
  _key = k;
  _max = max;
  _mem = mem;
  _type = type;
  _pojo = null;
  _persist = initialPersist(be);
  _rwlock = new AtomicInteger(1);
  _replicas = null;
}
/** Wrap a raw byte[] as an ICE-backed Value of primitive-byte type. */
Value(Key k, byte[] mem ) {
  this(k, mem.length, mem, TypeMap.PRIM_B, ICE);
}
/** Wrap the bytes of a String (per {@code StringUtils.bytesOf}) as a raw byte[] Value. */
Value(Key k, String s ) {
  this(k, StringUtils.bytesOf(s));
}
/** Build an ICE-backed Value from an {@link Iced} POJO. */
Value(Key k, Iced pojo ) {
  this(k,pojo,ICE);
}
/** Build a Value from an {@link Iced} POJO on the given backend.  The POJO is
 *  serialized immediately; both the POJO and its bytes are cached. */
Value(Key k, Iced pojo, byte be ) {
  _key = k;
  _pojo = pojo;
  _type = (short)pojo.frozenType();
  final byte[] bits = pojo.asBytes();
  _mem = bits;
  _max = bits.length;
  assert _max < MAX : "Value size = " + _max + " (0x"+Integer.toHexString(_max) + ") >= (MAX=" + MAX + ").";
  _persist = initialPersist(be);
  _rwlock = new AtomicInteger(1);
  _replicas = null;
}
/** Build a Value from a POJO WITHOUT serializing it: only the POJO is cached
 *  and the serialized size is taken on trust from the caller.
 *  @param pojoByteSz the size to record as the serialized size */
public Value(Key k, Freezable pojo, int pojoByteSz, byte be) {
  _key = k;
  _pojo = pojo;
  _type = (short)pojo.frozenType();
  _mem = null;
  _max = pojoByteSz;
  _persist = initialPersist(be);
  _rwlock = new AtomicInteger(1);
  _replicas = null;
}
/** Standard constructor to build a Value from a POJO and a Key. */
public Value(Key k, Freezable pojo) {
  this(k,pojo,ICE);
}
/** Build a Value from a {@link Freezable} POJO on the given backend.  The POJO
 *  is serialized immediately; both the POJO and its bytes are cached. */
Value(Key k, Freezable pojo, byte be) {
  _key = k;
  _pojo = pojo;
  _type = (short)pojo.frozenType();
  final byte[] bits = pojo.asBytes();
  _mem = bits;
  _max = bits.length;
  _persist = initialPersist(be);
  _rwlock = new AtomicInteger(1);
  _replicas = null;
}
// Initial persist byte for a newly constructed Value.
// For the ICE backend, assume new values are not-yet-written, so keep only
// the backend flavor bits.  For other backends (e.g. HDFS & NFS), assume the
// Value comes from global data and preserve the passed-in persist bits.
private static byte initialPersist( byte be ) {
  final byte flavor = (byte)(be&BACKEND_MASK);
  return (flavor==ICE) ? flavor : be;
}
// Custom serializer.  The _mem field is racily cleared by the MemoryManager,
// so the normal serializer might ship over a null instead of the intended
// byte[]; going through memOrLoad() rebuilds the bytes when needed.  Wire
// format: 1 byte of persist info, 2 bytes of type-id, then the byte array.
// (The Value is NOT on the deserializing machine's disk; see read_impl.)
public final AutoBuffer write_impl( AutoBuffer ab ) {
  AutoBuffer out = ab.put1(_persist);
  out = out.put2(_type);
  return out.putA1(memOrLoad());
}
// Custom deserializer, mirror of write_impl: reads persist byte, type-id and
// byte array; sets _max from the array length; resets the POJO cache,
// replicas and rwlock; stamps the access time.
public final Value read_impl(AutoBuffer bb) {
  assert _key == null;                  // Not set yet
  // Keep the persistence backend flavor but strip off the saved-to-disk bit
  final int flavorBits = bb.get1()&BACKEND_MASK;
  _persist = (byte)flavorBits;
  _type = (short) bb.get2();
  final byte[] payload = bb.getA1();
  _mem = payload;
  _max = payload.length;
  assert _max < MAX : "Value size=0x"+Integer.toHexString(_max)+" during read is larger than "+Integer.toHexString(MAX)+", type: "+TypeMap.className(_type);
  _pojo = null;
  // On remote nodes _rwlock is initialized to 1 (signaling a remote PUT is
  // in progress), flips to -1 when the remote PUT is done, or +2 if a notify
  // needs to happen.  A freshly deserialized Value starts out as "done".
  _rwlock = new AtomicInteger(-1);
  _replicas = null;
  touch();
  return this;
}
// ---------------------
// Ordering of K/V's! This field tracks a bunch of things used in ordering
// updates to the same Key. Ordering Rules:
// - Program Order. You see your own writes. All writes in a single thread
// strongly ordered (writes never roll back). In particular can:
// PUT(v1), GET, PUT(null) and The Right Thing happens.
// - Unrelated writes can race (unless fencing).
// - Writes are not atomic: some people can see a write ahead of others.
// - Last-write-wins: if we do a zillion writes to the same Key then wait "a
// long time", then do reads all reads will see the same last value.
// - Blocking on a PUT stalls until the PUT is cloud-wide visible
//
// For comparison to H2O get/put MM
// IA Memory Ordering, 8 principles from Rich Hudson, Intel
// 1. Loads are not reordered with other loads
// 2. Stores are not reordered with other stores
// 3. Stores are not reordered with older loads
// 4. Loads may be reordered with older stores to different locations but not
// with older stores to the same location
// 5. In a multiprocessor system, memory ordering obeys causality (memory
// ordering respects transitive visibility).
// 6. In a multiprocessor system, stores to the same location have a total order
// 7. In a multiprocessor system, locked instructions have a total order
// 8. Loads and stores are not reordered with locked instructions.
//
// My (KN, CNC) interpretation of H2O MM from today:
// 1. Gets are not reordered with other Gets
// 2. Puts may be reordered with Puts to different Keys.
// 3. Puts may be reordered with older Gets to different Keys, but not with
// older Gets to the same Key.
// 4. Gets may be reordered with older Puts to different Keys but not with
// older Puts to the same Key.
// 5. Get/Put amongst threads doesn't obey causality
// 6. Puts to the same Key have a total order.
// 7. no such thing. although RMW operation exists with Put-like constraints.
// 8. Gets and Puts may be reordered with RMW operations
// 9. A write barrier exists that creates Sequential Consistency. Same-key
// ordering (3-4) can't be used to create the effect.
//
// A Reader/Writer lock for the home node to control racing Gets and Puts.
// - 0 for unlocked
// - +N for locked by N concurrent GETs-in-flight
// - -1 for write-locked
//
// An ACKACK from the client GET lowers the reader lock count.
//
// Home node PUTs alter which Value is mapped to a Key, then they block until
// there are no active GETs, then atomically set the write-lock, then send
// out invalidates to all the replicas. PUTs return when all invalidates
// have reported back.
//
// An initial remote PUT will default the value to 1. A 2nd PUT attempt will
// block until the 1st one completes (multiple writes to the same Key from
// the same JVM block, so there is at most 1 outstanding write to the same
// Key from the same JVM). The 2nd PUT will CAS the value to 2, indicating
// the need for the finishing 1st PUT to call notify().
//
// Note that this sequence involves a lot of blocking on repeated writes with
// cached readers, but not the readers - i.e., writes are slow to complete.
// The reader/writer lock word; not serialized.
private transient AtomicInteger _rwlock;
// Single choke-point for every compare-and-set on _rwlock.  The msg argument
// is a label for the transition; it is only of use when tracing is added
// here by hand (e.g. printing _key, old, nnn and msg on success).
private boolean RW_CAS( int old, int nnn, String msg ) {
  return _rwlock.compareAndSet(old,nnn);
}
// List of who is replicated where: one byte per node index, set to 1 for
// nodes known to hold a replica.  Lazily allocated.
private volatile byte[] _replicas;
private static final AtomicReferenceFieldUpdater<Value,byte[]> REPLICAS_UPDATER =
  AtomicReferenceFieldUpdater.newUpdater(Value.class,byte[].class, "_replicas");
// Returns the replica array, atomically installing a fresh one on first use.
// If another thread wins the install race, its array is returned instead.
private byte[] replicas( ) {
  final byte[] existing = _replicas;
  if( existing != null ) return existing;
  final byte[] fresh = new byte[H2O.CLOUD.size()+1/*1-based numbering*/+10/*limit of 10 clients*/];
  if( !REPLICAS_UPDATER.compareAndSet(this,null,fresh) ) {
    final byte[] winner = _replicas;    // CAS failed, so someone else installed one
    assert winner != null;
    return winner;
  }
  return fresh;
}
// Bump the read lock, once per pending-GET or pending-Invalidate.
// Returns false, without changing the lock, if the Value is write-locked
// (rwlock == -1); otherwise retries until the increment lands.
boolean read_lock() {
  for(;;) {                             // Retry while racing GETs move the counter
    final int cur = _rwlock.get();
    if( cur == -1 ) return false;       // Write-locked; no new replications.  Read fails to read *this* value
    assert cur >= 0;                    // Not negative
    if( RW_CAS(cur,cur+1,"rlock+") ) return true;
  }
}
/** Record h2o in the replica list, provided the Value is not write-locked.
 *  Takes a read-lock first (bumping the active Get count, which remains
 *  until the Get completes and we receive an ACKACK) and only then marks the
 *  replica.
 *  @param h2o the remote node receiving a copy; must not be this node
 *  @return false if the Value is write-locked against further replication */
boolean setReplica( H2ONode h2o ) {
  assert _key.home();                   // Only the HOME node for a key tracks replicas
  assert h2o != H2O.SELF;               // Do not track self as a replica
  final boolean locked = read_lock();
  if( locked ) {
    // Narrow non-race here.  There is a time window where the rwlock count
    // went up, but the replica list does not account for the new replica.
    // However, the rwlock cannot go down until an ACKACK is received, and
    // the ACK (hence ACKACK) doesn't go out until after this function returns.
    final byte[] reps = replicas();
    reps[h2o._unique_idx] = 1;
  }
  return locked;                        // When true: rwlock taken and replica recorded
}
/** Atomically lower active GET and Invalidate count.
* <p>
* Decrements the rwlock by one, retrying if a racing update moves the
* counter. When the count reaches zero, wakes every thread waiting on this
* Value's monitor.
* @param h2o the node whose GET or Invalidate completed; may be null (as
* passed by {@code lockAndInvalidate}), which skips the replica-bit
* sanity check */
void lowerActiveGetCount( H2ONode h2o ) {
assert _key.home(); // Only the HOME node for a key tracks replicas
assert h2o != H2O.SELF;// Do not track self as a replica
while( true ) { // Repeat, in case racing GETs are bumping the counter
int old = _rwlock.get(); // Read the lock-word
assert old > 0; // Since lowering, must be at least 1
assert old != -1; // Not write-locked, because we are an active reader
assert (h2o==null) || (_replicas!=null && _replicas[h2o._unique_idx]==1); // Self-bit is set
if( RW_CAS(old,old-1,"rlock-") ) {
if( old-1 == 0 ) // GET count fell to zero?
synchronized( this ) { notifyAll(); } // Notify any pending blocked PUTs
return; // Count lowered; done. (The loop only repeats when the CAS fails.)
}
}
}
/** This value was atomically extracted from the local STORE by a successful
* TaskPutKey attempt (only 1 thread can ever extract and thus call here).
* No future lookups will find this Value, but there may be existing uses.
* Atomically set the rwlock count to -1 locking it from further GETs and
* ship out invalidates to caching replicas. May need to block on active
* GETs. Updates a set of Future invalidates that can be blocked against.
* @param sender the node that issued the PUT; no invalidate is sent to it
* @param newval the replacement Value; expected to start read-locked
* @param fs Futures handed to each invalidate that is sent
* @return the same {@code fs} that was passed in */
Futures lockAndInvalidate( H2ONode sender, Value newval, Futures fs ) {
assert _key.home(); // Only the HOME node for a key tracks replicas
assert newval._rwlock.get() >= 1; // starts read-locked
// Write-Lock against further GETs
while( true ) { // Repeat, in case racing GETs are bumping the counter
int old = _rwlock.get();
assert old >= 0 : _key+", rwlock="+old; // Count does not go negative
assert old != -1; // Only the thread doing a PUT ever locks
if( old !=0 ) { // has readers?
// Active readers: need to block until the GETs (of this very Value!)
// all complete, before we can invalidate this Value - lest a racing
// Invalidate bypass a GET.
// An interrupt is deliberately ignored here; the loop simply re-reads
// the lock word and blocks again if readers remain.
try { ForkJoinPool.managedBlock(this); } catch( InterruptedException ignore ) { }
} else if( RW_CAS(0,-1,"wlock") )
break; // Got the write-lock!
}
// We have the set of Nodes with replicas now. Ship out invalidates.
// Bump the newval read-lock by 1 for each pending invalidate
byte[] r = _replicas;
if( r!=null ) { // No replicas, nothing to invalidate
int max = r.length;
// Replica array is indexed by node index; skip the sender of the PUT
for( int i=0; i<max; i++ )
if( r[i]==1 && H2ONode.IDX[i] != sender )
TaskInvalidateKey.invalidate(H2ONode.IDX[i],_key,newval,fs);
}
newval.lowerActiveGetCount(null); // Remove initial read-lock, accounting for pending inv counts
return fs;
}
// Block the calling thread while this Value still has active readers, i.e.
// while the rwlock count is positive.  Returns at once if there are no
// readers, or if this Value was already replaced with a later value
// (write-locked, count -1).  Does not take any lock itself.
void blockTillNoReaders( ) {
  assert _key.home();                   // Only the HOME node for a key tracks replicas
  while( _rwlock.get() > 0 ) {          // Re-check, in case racing GETs are bumping the counter
    // Active readers: block until the GETs (of this very Value!) all complete
    try { ForkJoinPool.managedBlock(this); } catch( InterruptedException ignore ) { }
  }
}
/** Initialize the _replicas field for a PUT.  Used on the Home node (for
 *  remote PUTs) on a new Value about to be PUT: installs the Key, marks the
 *  one replica we know about, and sets the lock count to 1.
 *  @param h2o the remote node known to hold a copy; must not be this node
 *  @param key the Key for this Value; must be homed on this node */
void initReplicaHome( H2ONode h2o, Key key ) {
  assert key.home();
  assert _key == null;                  // This is THE initializing key write for serialized Values
  assert h2o != H2O.SELF;               // Do not track self as a replica
  _key = key;
  // Set the replica bit for the one node we know about; the rest stay clear
  final byte[] reps = replicas();
  reps[h2o._unique_idx] = 1;
  // An initial read-lock, so a fast PUT cannot wipe this one out before
  // invalidates have a chance of being counted
  _rwlock.set(1);
}
/** Block this thread until all prior remote PUTs complete - to force
* remote-PUT ordering on the home node.
* <p>
* Loops until the rwlock reads -1 ("remote put is done"). While it is not,
* the lock is moved from 1 to 2 (or found already at 2) to request a
* notify from the completing PUT, and the thread blocks. */
void startRemotePut() {
assert !_key.home();
int x;
// assert I am waiting on threads with higher priority?
while( (x=_rwlock.get()) != -1 ) // Spin until rwlock==-1
// Only block once the lock word is 2; if the CAS from 1 fails, re-read.
if( x == 2 || RW_CAS(1,2,"remote_need_notify") )
try { ForkJoinPool.managedBlock(this); } catch( InterruptedException ignore ) { }
}
/** The PUT for this Value has completed. Wakeup any blocked later PUTs.
* <p>
* Moves the rwlock to -1: directly from 1 when nobody asked for a notify,
* otherwise from 2 while holding this Value's monitor, followed by a
* {@code notifyAll}. */
void completeRemotePut() {
assert !_key.home();
// Attempt an eager blind attempt, assuming no blocked pending notifies
if( RW_CAS(1, -1,"remote_complete") ) return;
// Lock word was not 1, so a later PUT has requested a notify (state 2).
synchronized(this) {
boolean res = RW_CAS(2, -1,"remote_do_notify");
assert res; // Must succeed
notifyAll(); // Wake up pending blocked PUTs
}
}
// Construct a Value which behaves like a "null" or "deleted" Value, but
// allows for counting pending invalidates on the delete operation... and can
// thus stall future Puts overriding the deletion until the delete completes.
// The special Null has size 0, no bytes, type-id 0 and the TCP backend code.
static Value makeNull( Key key ) {
  assert key.home();
  return new Value(key,0,null,(short)0,TCP);
}
// True for the special Null built by makeNull, recognised by type-id 0.
boolean isNull() {
  assert _type != 0 || _key.home();     // A special Null is only expected on its home node
  return _type == 0;
}
// Get from the local STORE.  A missing mapping and a special Null Value both
// read as null to the caller.  When a special Null is found and its lock
// count is 0 (no pending invalidates on it), make a one-shot, best-effort
// attempt to replace it in the STORE with a true null.
// Returns the not-Null value, or null.
public static Value STORE_get( Key key ) {
  final Value found = H2O.STORE.get(key);
  if( found != null && found.isNull() ) {
    // One-shot throwaway attempt at upgrading the special Null to a true null
    if( found._rwlock.get()==0 ) H2O.putIfMatch(key,null,found);
    return null;                        // Special null, but missing from callers point of view
  }
  return found;                         // A real Value, or a true null
}
/** Return true if blocking is unnecessary.
 *  Alas, used in TWO places and the blocking API forces them to share here:
 *  <ul>
 *  <li>Home key: releasable once the active-GET count has fallen to zero,
 *      or the Value is write-locked (lock word &lt;= 0).</li>
 *  <li>Remote key: releasable once the active-PUT lock has hit -1.</li>
 *  </ul> */
@Override public boolean isReleasable() {
  final int lock = _rwlock.get();
  // Home-key blocking: wait for readers to drain, or blocking on deleted object
  if( _key.home() ) return lock <= 0;
  // Remote-key blocking: either waiting (2) or done (-1) but not started (1)
  assert lock == 2 || lock == -1;
  return lock == -1;                    // done!
}
/** Possibly blocks the current thread, waiting on this Value's monitor until
 *  {@link #isReleasable} reports true.  Used by the FJ Pool management to
 *  spawn threads to prevent deadlock if otherwise all threads would block on
 *  waits.  Interrupts are ignored: the wait is simply retried.
 *  @return always true, once blocking is no longer necessary */
@Override public synchronized boolean block() {
  for(;;) {
    if( isReleasable() ) return true;
    try { wait(); } catch( InterruptedException ignore ) { /* re-check and wait again */ }
  }
}
}