package water;
import water.util.ReflectionUtils;
import water.util.StringUtils;
import water.util.UnsafeUtils;
import water.fvec.*;
import java.util.Arrays;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLongFieldUpdater;
/**
* Keys! H2O supports a distributed Key/Value store, with exact Java Memory
* Model consistency. Keys are a means to find a {@link Value} somewhere in
* the Cloud, to cache it locally, to allow globally consistent updates to a
* {@link Value}. Keys have a *home*, a specific Node in the Cloud, which is
* computable from the Key itself. The Key's home node breaks ties on racing
* updates, and tracks caching copies (via a hardware-like MESI protocol), but
* otherwise is not involved in the DKV. All operations on the DKV, including
* Gets and Puts, are found in {@link DKV}.
* <p>
* Keys are defined as a simple byte-array, plus a hashCode and a small cache
* of Cloud-specific information. The first byte of the byte-array determines
* if this is a user-visible Key or an internal system Key; an initial byte of
* &lt;32 is a system Key. User keys are generally externally visible, system
* keys are generally limited to things kept internal to the H2O Cloud. Keys
* might be a high-count item, hence we care about the size.
* <p>
* System keys for {@link Job}, {@link Vec}, {@link Chunk} and {@link
* water.fvec.Vec.VectorGroup} have special initial bytes; Keys for these classes can be
* determined without loading the underlying Value. Layout for {@link Vec} and
* {@link Chunk} is further restricted, so there is an efficient mapping
* between a numbered Chunk and its associated Vec.
* <p>
* System keys (other than the restricted Vec and Chunk keys) can have their
* home node forced, by setting the desired home node in the first few Key
* bytes. Otherwise home nodes are selected by pseudo-random hash. Selecting
* a home node is sometimes useful for Keys with very high update rates coming
* from a specific Node.
* <p>
* @author <a href="mailto:cliffc@h2o.ai">cliffc@h2o.ai</a>
* @version 1.0
*/
final public class Key<T extends Keyed> extends Iced<Key<T>> implements Comparable {
// The Key!!!
// Limited to KEY_LENGTH (512) bytes - to fit better in UDP packets. The
// limit is enforced by the private constructor, which rejects longer arrays.
static final int KEY_LENGTH = 512;
public final byte[] _kb; // Key bytes, wire-line protocol
transient final int _hash; // Hash on key alone (and not value); computed once from _kb in the constructor
// The user keys must be ASCII, so the values 0..31 are reserved for system
// keys. When you create a system key, please do add its number to this list.
// type() reports any first byte >= 32 as USER_KEY.
static final byte BUILT_IN_KEY = 2;
public static final byte JOB = 3;
public static final byte VEC = 4; // Vec
public static final byte CHK = 5; // Chunk
public static final byte GRP = 6; // Vec.VectorGroup
public static final byte HIDDEN_USER_KEY = 31;
public static final byte USER_KEY = 32;
// For Fluid Vectors, we have a special Key layout. Fields in order, with
// their width in bytes:
// 1 byte  (offset 0) - key type byte, one of VEC, CHK or GRP
// 1 byte  (offset 1) - homing byte, always -1/0xFF as these keys use the hash to figure their home out
// 4 bytes (offset 2) - Vector Group
// 4 bytes (offset 6) - Chunk # for CHK (this is the offset D() reads it from), or 0xFFFFFFFF for VEC
static final int VEC_PREFIX_LEN = 1+1+4+4;
/** Tests whether this is a {@link Vec} Key, i.e. whether the first key byte is {@link #VEC}.
 *  @return true if this is a {@link Vec} Key; false when the key bytes are null or empty */
public final boolean isVec() {
  if( _kb == null || _kb.length == 0 ) return false;
  return _kb[0] == VEC;
}
/** Tests whether this is a {@link Chunk} Key, i.e. whether the first key byte is {@link #CHK}.
 *  @return true if this is a {@link Chunk} Key; false when the key bytes are null or empty */
public final boolean isChunkKey() {
  if( _kb == null || _kb.length == 0 ) return false;
  return _kb[0] == CHK;
}
/** Maps a {@link Chunk} Key to the Key of its {@link Vec}; delegates to
 *  {@code Vec.getVecKey}. Asserts that this Key is a Chunk Key.
 *  @return the {@link Vec} Key for this {@link Chunk} Key */
public final Key getVecKey() {
  assert isChunkKey();
  return water.fvec.Vec.getVecKey(this);
}
/** Convenience function to fetch the key contents from the DKV.
 *  @return null if the Key is not mapped in the DKV, otherwise the mapped
 *  value cast (unchecked) to this Key's type parameter */
public final T get() {
  final Value mapped = DKV.get(this);
  if( mapped == null ) return null;
  return (T)mapped.get();
}
// *Desired* distribution function on keys & replication factor. Replica #0
// is the master, replica #1, 2, 3, etc represent additional desired
// replication nodes. Note that this function is just the distribution
// function - it does not DO any replication, nor does it dictate any policy
// on how fast replication occurs.
//
// Returns a node index into the current cloud, or -1 when the cloud is
// empty. NOTE(review): earlier documentation promised -1 for a nonsense
// replica (e.g. replica #3 in a 2-Node system), but as written the result for
// any other input is always reduced modulo the cloud size - confirm intent.
int D( int repl ) {
int hsz = H2O.CLOUD.size();
if (0 == hsz) return -1; // Clients starting up find no cloud, be unable to home keys
// See if this is a specifically homed Key. For system keys byte 1 is the
// count of home nodes embedded in the key (see the homed make(...) factory);
// Vec/Chunk keys carry -1 there, so they never take this branch.
if( !user_allowed() && repl < _kb[1] ) { // Asking for a replica# from the homed list?
assert repl == 0 : "No replication is support now";
assert _kb[0] != Key.CHK;
// The serialized home nodes start at byte 2, one fixed-size record each.
H2ONode h2o = H2ONode.intern(_kb,2+repl*(H2ONode.H2Okey.SIZE /* serialized bytesize of H2OKey - depends on IP protocol */));
// Reverse the home to the index
int idx = h2o.index();
if( idx >= 0 ) return idx;
// Else homed to a node which is no longer in the cloud!
// Fall back to the normal home mode
}
// Distribution of Fluid Vectors is a special case.
// Fluid Vectors are grouped into vector groups, each of which must have
// the same distribution of chunks so that MRTask run over group of
// vectors will keep data-locality. The fluid vecs from the same group
// share the same key pattern + each has 4 bytes identifying particular
// vector in the group. Since we need the same chunks end up on the same
// node in the group, we need to skip the 4 bytes containing vec# from the
// hash. Apart from that, we keep the previous mode of operation, so that
// ByteVec would have first 64MB distributed around cloud randomly and then
// go round-robin in 64MB chunks.
if( _kb[0] == CHK ) {
// Homed Chunk? Not supported: Chunk keys must carry -1 in the homing byte.
if( _kb[1] != -1 ) throw H2O.fail();
// For round-robin on Chunks in the following pattern:
// 1 Chunk-per-node, until all nodes have 1 chunk (max parallelism).
// Then 2 chunks-per-node, once around, then 4, then 8, then 16.
// Getting several chunks-in-a-row on a single Node means that stencil
// calculations that step off the end of one chunk into the next won't
// force a chunk local - replicating the data. If all chunks round robin
// exactly, then any stencil calc will double the cached volume of data
// (every node will have its own chunk, plus a cached next-chunk).
// Above 16-chunks-in-a-row we hit diminishing returns.
int cidx = UnsafeUtils.get4(_kb, 1 + 1 + 4); // Chunk index, read from key offset 6
int x = cidx/hsz; // Multiples of cluster size
// 0 -> 1st trip around the cluster; nidx= (cidx- 0*hsz)>>0
// 1,2 -> 2nd & 3rd trip; allocate in pairs: nidx= (cidx- 1*hsz)>>1
// 3,4,5,6 -> next 4 rounds; allocate in quads: nidx= (cidx- 3*hsz)>>2
// 7-14 -> next 8 rounds in octets: nidx= (cidx- 7*hsz)>>3
// 15+ -> remaining rounds in groups of 16: nidx= (cidx-15*hsz)>>4
// z is the log2 of the run length (chunks-in-a-row per node) for this trip.
int z = x==0 ? 0 : (x<=2 ? 1 : (x<=6 ? 2 : (x<=14 ? 3 : 4)));
int nidx = (cidx-((1<<z)-1)*hsz)>>z;
// Mask off the sign bit so the modulo result is a valid non-negative index.
return ((nidx+repl)&0x7FFFFFFF) % hsz;
}
// Easy Cheesy Stupid: hash-based home, sign bit masked off before the modulo.
return ((_hash+repl)&0x7FFFFFFF) % hsz;
}
/** List of illegal characters which are not allowed in user keys.
 *  NOTE(review): nothing in this class consults the list; presumably it is
 *  enforced by callers elsewhere - confirm. */
static final CharSequence ILLEGAL_USER_KEY_CHARS = " !@#$%^&*()+={}[]|\\;:\"'<>,/?";
// 64 bits of Cloud-specific cached stuff. It is changed atomically by any
// thread that visits it and has the wrong Cloud. It has to be read *in the
// context of a specific Cloud*, since a re-read may be for another Cloud.
// Bit layout, as packed by build_cache and unpacked by the static accessors:
//   bits  0- 7  cloud index
//   bits  8-23  home node index
//   bits 24-31  replica number
//   bits 32-39  desired replication factor
//   bits 40-63  zero
private transient volatile long _cache;
// Updater used by set_cache to compare-and-set the _cache word.
private static final AtomicLongFieldUpdater<Key> _cacheUpdater =
AtomicLongFieldUpdater.newUpdater(Key.class, "_cache");
// Accessors and updaters for the Cloud-specific cached stuff.
// The Cloud index is the low byte of the cache word, a byte uniquely
// identifying the last 256 Clouds. It changes atomically with the _cache
// word, so we can tell which Cloud this data is a cache of.
private static int cloud( long cache ) {
  final int low32 = (int)cache;
  return low32 & 0xFF;
}
// Shortcut node index for Home replica#0, stored in bits 8-23 of the cache
// word. This replica is responsible for breaking ties on writes. The field
// is an unsigned 16-bit value ('char'-like), so the limit is 65534 Cloud
// members; the all-ones pattern (a packed -1) is reserved for a bare-key.
private static int home ( long cache ) {
  final long shifted = cache >>> 8;
  return (int)(shifted & 0xFFFFL);
}
// Our replica #, or -1 if we're not one of the first 127 replicas. This
// value is found using the Cloud distribution function and changes for a
// changed Cloud. It lives in bits 24-31 of the cache word and is read back
// as a SIGNED byte: build_cache packs "not a replica" (-1) as 0xFF, and the
// sign extension is what turns that back into -1. (Masking the byte with
// 0xFF again, as was done before, turned the -1 sentinel into 255.)
private static int replica(long cache) {
  return (byte)(cache >>> 24);
}
// Desired replication factor, stored in bits 32-39 of the cache word. Can
// be zero for temp keys. Not allowed to change later, because it messes
// with e.g. meta-data on disk.
private static int desired(long cache) {
  final long shifted = cache >>> 32;
  return (int)(shifted & 0xFFL);
}
// Pack the four Cloud-specific fields into one cache word. Each argument is
// truncated to its field width: cidx to 8 bits (bits 0-7), home to 16 bits
// (bits 8-23), replica to 8 bits (bits 24-31), desired to 8 bits (bits
// 32-39). Bits 40-63 are left zero.
private static long build_cache( int cidx, int home, int replica, int desired ) {
  long word = 0L;
  word |= (long)(cidx    & 0xFF);
  word |= (long)(home    & 0xFFFF) <<  8;
  word |= (long)(replica & 0xFF)   << 24;
  word |= (long)(desired & 0xFF)   << 32;
  return word;
}
// Home node index of this Key in the given Cloud, taken from the
// (possibly refreshed) cache word for that Cloud.
int home ( H2O cloud ) {
  final long info = cloud_info(cloud);
  return home(info);
}
// This node's replica number for this Key in the given Cloud, taken from the
// (possibly refreshed) cache word for that Cloud.
int replica( H2O cloud ) {
  final long info = cloud_info(cloud);
  return replica(info);
}
// Desired replication factor, read straight from the current cache word
// (no Cloud refresh is performed here).
int desired( ) {
  final long snapshot = _cache;
  return desired(snapshot);
}
/** Tests whether this node is the Key's home, by identity comparison of
 *  {@link #home_node} against {@code H2O.SELF}.
 *  @return true if the {@link #home_node} is the current node */
public boolean home() {
  final H2ONode owner = home_node();
  return owner == H2O.SELF;
}
/** Looks up the home node for this Key in the current Cloud. The Cloud is
 *  read once so the index and the member array come from the same Cloud.
 *  @return the home node for this Key */
public H2ONode home_node( ) {
  final H2O snapshot = H2O.CLOUD;
  final H2ONode[] members = snapshot._memary;
  final int homeIdx = home(snapshot);
  return members[homeIdx];
}
// Update the cache, but only to strictly newer Clouds.
// Spins on a compare-and-set of the _cache word. Returns true once the word
// holds 'cache' (either it already did, or the CAS succeeded); returns false
// when H2O.larger reports that the Cloud index in 'cache' is not larger than
// the one currently stored.
private boolean set_cache( long cache ) {
while( true ) { // Spin till get it
long old = _cache; // Read once at the start
if( !H2O.larger(cloud(cache),cloud(old)) ) // Rolling backwards?
// Attempt to set for an older Cloud. Blow out with a failure; caller
// should retry on a new Cloud.
return false;
// Two cache words for the same Cloud index must be identical.
assert cloud(cache) != cloud(old) || cache == old;
if( old == cache ) return true; // Fast-path cutout
if( _cacheUpdater.compareAndSet(this,old,cache) ) return true;
// Can fail if the cache is really old, and just got updated to a version
// which is still not the latest, and we are trying to update it again.
}
}
// Return the info word for this Cloud. Use the cache if possible; otherwise
// recompute home and replica number with the distribution function D, try to
// publish the new word, and return it regardless of whether publishing won.
long cloud_info( H2O cloud ) {
  final long cached = _cache;
  // Cached for this Cloud already? This should be the 99% fast case.
  if( cloud(cached) == cloud._idx ) return cached;
  // Cache missed! Probably it just needs (atomic) updating.
  // But we might be holding the stale cloud...
  // Home Node in this Cloud; stored as an unsigned 16-bit value.
  final char homeIdx = (char)D(0);
  // The desired replication factor is carried over from the old word.
  final int wanted = desired(cached);
  // Which replica # am I, if any? Stop at the first match.
  int myReplica = -1;
  int r = 0;
  while( myReplica < 0 && r < wanted ) {
    final int nodeIdx = D(r);
    if( nodeIdx >= 0 && cloud._memary[nodeIdx] == H2O.SELF ) myReplica = r;
    r++;
  }
  final long fresh = build_cache(cloud._idx, homeIdx, myReplica, wanted);
  set_cache(fresh); // Attempt to upgrade cache, but ignore failure
  return fresh;     // Return the magic word for this Cloud
}
// Default desired replication factor. Unless specified otherwise, all new
// k-v pairs start with this replication factor; it is what the make(...)
// factories without an explicit 'rf' argument pass along.
static final byte DEFAULT_DESIRED_REPLICA_FACTOR = 1;
// Construct a new Key around the given bytes (the array is kept, not
// copied) and precompute its hash. Rejects arrays longer than KEY_LENGTH.
private Key(byte[] kb) {
  final int len = kb.length;
  if( len > KEY_LENGTH ) throw new IllegalArgumentException("Key length would be "+len);
  _kb = kb;
  // Quicky hash: http://en.wikipedia.org/wiki/Jenkins_hash_function
  // Bytes are added sign-extended and the right shifts are arithmetic; this
  // exact arithmetic defines every Key's hash, so it must not be altered.
  int h = 0;
  for( int i = 0; i < len; i++ ) {
    h += kb[i];
    h += h << 10;
    h ^= h >> 6;
  }
  h += h << 3;
  h ^= h >> 11;
  h += h << 15;
  _hash = h;
}
// Make new Keys. Optimistically attempt interning, but no guarantee: if
// H2O.getk already knows an equal Key that one is returned, otherwise the
// freshly built Key is primed with the desired replication factor and
// returned. A replication factor of -1 is rejected.
static <P extends Keyed> Key<P> make(byte[] kb, byte rf) {
  if( rf == -1 ) throw new IllegalArgumentException();
  final Key fresh = new Key(kb);
  final Key interned = H2O.getk(fresh); // The interned version, if any
  if( interned != null ) return interned;
  // Seed the cache with the desired replication factor and a fake (previous)
  // cloud index, so the next call sees a miss and computes the real data.
  final H2O cloud = H2O.CLOUD; // Read once
  fresh._cache = build_cache(cloud._idx-1,0,0,rf);
  fresh.cloud_info(cloud);
  return fresh;
}
/** A random string, useful as a Key name or partial Key suffix. It is an
 *  underscore followed by the hex form of the low and then the high 64 bits
 *  of a random UUID (no zero padding, so the length varies).
 *  @return A random short string */
public static String rand() {
  final UUID uuid = UUID.randomUUID();
  final StringBuilder name = new StringBuilder("_");
  name.append(Long.toHexString(uuid.getLeastSignificantBits()));
  name.append(Long.toHexString(uuid.getMostSignificantBits()));
  return name.toString();
}
/** Factory making a Key from a byte[], using the default desired
 *  replication factor.
 *  @param kb the key bytes
 *  @return Desired Key */
public static <P extends Keyed> Key<P> make(byte[] kb) {
  final byte rf = DEFAULT_DESIRED_REPLICA_FACTOR;
  return make(kb, rf);
}
/** Factory making a Key from a String. A null name is replaced by a
 *  {@link #rand random} one; the name is then decoded into key bytes.
 *  @param s the key name, or null for a random name
 *  @return Desired Key */
public static <P extends Keyed> Key<P> make(String s) {
  final String name = (s == null) ? rand() : s;
  return make(decodeKeyName(name));
}
/** Factory making a system Key of type {@link #BUILT_IN_KEY} from a name,
 *  with the default replication factor and no forced home node.
 *  @param s the key name
 *  @return Desired Key */
public static <P extends Keyed> Key<P> makeSystem(String s) {
  final byte rf = DEFAULT_DESIRED_REPLICA_FACTOR;
  return make(s, rf, BUILT_IN_KEY, false);
}
/** Factory making a Key of type {@link #HIDDEN_USER_KEY} from a name, with
 *  the default replication factor and no forced home node.
 *  @param s the key name
 *  @return Desired Key */
public static <P extends Keyed> Key<P> makeUserHidden(String s) {
  final byte rf = DEFAULT_DESIRED_REPLICA_FACTOR;
  return make(s, rf, HIDDEN_USER_KEY, false);
}
/**
 * Make a random key, homed to a given node. The key is a
 * {@link #BUILT_IN_KEY} system key with a {@link #rand random} name, and
 * the placement is passed as a hint rather than a requirement.
 * @param node a node at which the new key is homed.
 * @return the new key
 */
public static <P extends Keyed> Key<P> make(H2ONode node) {
  final byte[] randomName = decodeKeyName(rand());
  final byte rf = DEFAULT_DESIRED_REPLICA_FACTOR;
  return make(randomName, rf, BUILT_IN_KEY, false, node);
}
// Factory making a Key from a String name with an explicit desired
// replication factor; the name is decoded into key bytes first.
static <P extends Keyed> Key<P> make(String s, byte rf) {
  final byte[] kb = decodeKeyName(s);
  return make(kb, rf);
}
/** Factory making a random Key, named by {@link #rand}.
 *  @return Desired Key */
public static <P extends Keyed> Key<P> make() {
  final String name = rand();
  return make(name);
}
/** Factory making a homed system Key. Requires the initial system byte but
 *  then allows a String for the remaining bytes. Requires a list of exactly
 *  one H2ONode to home at. The hint specifies if it is an error to name an
 *  H2ONode that is NOT in the Cloud, or if some other H2ONode can be
 *  substituted. The rf parameter and passing more than 1 H2ONode are both
 *  deprecated.
 *  @param s name supplying the remaining key bytes
 *  @param rf desired replication factor
 *  @param systemType the initial system byte
 *  @param hint passed through as the placement-required flag
 *  @param replicas the node(s) to home at
 *  @return the desired Key */
public static <P extends Keyed> Key<P> make(String s, byte rf, byte systemType, boolean hint, H2ONode... replicas) {
  final byte[] kb = decodeKeyName(s);
  return make(kb, rf, systemType, hint, replicas);
}
/** Factory making a homed system Key. Requires the initial system byte and
 *  uses {@link #rand} for the remaining bytes. Requires a list of exactly
 *  one H2ONode to home at. The hint specifies if it is an error to name an
 *  H2ONode that is NOT in the Cloud, or if some other H2ONode can be
 *  substituted. The rf parameter and passing more than 1 H2ONode are both
 *  deprecated.
 *  @param rf desired replication factor
 *  @param systemType the initial system byte
 *  @param hint passed through as the placement-required flag
 *  @param replicas the node(s) to home at
 *  @return the desired Key */
public static <P extends Keyed> Key<P> make(byte rf, byte systemType, boolean hint, H2ONode... replicas) {
  final String name = rand();
  return make(name, rf, systemType, hint, replicas);
}
/** Make a system Key which is homed to specific nodes.
 *  @param kb the caller's key bytes, appended after the homing prefix
 *  @param rf desired replication factor
 *  @param systemType the initial system byte; asserted to be below 32
 *  @param required if true, every replica is asserted to be in the Cloud;
 *         if false, placement is a hint and is dropped when any replica is
 *         outside the Cloud
 *  @param replicas the node(s) to home at; asserted to be at most 3
 *  @return the desired Key */
public static <P extends Keyed> Key<P> make(byte[] kb, byte rf, byte systemType, boolean required, H2ONode... replicas) {
  // no more than 3 replicas allowed to be stored in the key
  assert 0 <=replicas.length && replicas.length<=3;
  assert systemType<32; // only system keys allowed
  boolean allInCloud = true;
  for( H2ONode candidate : replicas ) {
    if( !H2O.CLOUD.contains(candidate) ) allInCloud = false;
  }
  if( required ) {
    assert allInCloud; // If required placement, error to find a client as the home
  } else if( !allInCloud ) {
    replicas = new H2ONode[0]; // Placement is a hint & cannot be honored: ignore it
  }
  // Key bytes are written in this order:
  //   1 byte  - systemType, from 0-31
  //   1 byte  - replica count
  //   n bytes - each replica node, as serialized by H2ONode.write
  //   4 bytes - the int -1
  //   rest    - the original kb
  final AutoBuffer ab = new AutoBuffer();
  ab.put1(systemType).put1(replicas.length);
  for( H2ONode homeNode : replicas ) homeNode.write(ab);
  ab.put4(-1);
  ab.putA1(kb,kb.length);
  final byte[] keyBytes = Arrays.copyOf(ab.buf(),ab.position());
  return make(keyBytes,rf);
}
/** Remove a Key from the DKV, including any embedded Keys. Blocks until the
 *  pending removal work collected by {@link #remove(Futures)} has finished.
 */
public void remove() {
  final Futures pending = remove(new Futures());
  pending.blockForPending();
}
/** Remove this Key's value from the DKV without blocking: if the Key is
 *  mapped, the mapped {@link Keyed} is asked to remove itself, handing it the
 *  given Futures. Does nothing when the Key is not mapped.
 *  @param fs collector for pending work
 *  @return the same {@code fs} that was passed in */
public Futures remove(Futures fs) {
  final Value val = DKV.get(this);
  if( val == null ) return fs;
  ((Keyed)val.get()).remove(fs);
  return fs;
}
/** Tests whether this is a {@link #USER_KEY} and not a system key, as
 *  reported by {@link #type}.
 *  @return True if a {@link #USER_KEY} and not a system key */
public boolean user_allowed() {
  final int kind = type();
  return kind == USER_KEY;
}
/** System type/byte of a Key, or the constant {@link #USER_KEY}. The first
 *  key byte is read as unsigned; any value of 32 or more is a user key.
 *  @return Key type */
public int type() {
  final int first = _kb[0] & 0xff;
  if( first >= 32 ) return USER_KEY;
  return first;
}
/** Return the classname for the Value that this Key points to, if any (e.g., "water.fvec.Frame").
 *  @return the class name reported by the mapped Value, or null if the Key is not mapped */
public String valueClass() {
  // Key<Keyed> has no concrete parameterized subclasses (e.g. class FrameKey
  // extends Key<Frame>), so the type parameter is not available at runtime.
  // See:
  // http://www.javacodegeeks.com/2013/12/advanced-java-generics-retreiving-generic-type-arguments.html
  //
  // Instead, ask the Value the Key currently points to.
  final Value v = DKV.get(this);
  return v == null ? null : v.className();
}
/** Return the base classname (not including the package) for the Value that this Key points to, if any (e.g., "Frame").
 *  @return the last dot-separated element of {@link #valueClass}, or null if that is null */
public String valueClassSimple() {
  final String qualified = this.valueClass();
  if( qualified == null ) return null;
  final String[] parts = qualified.split("\\.");
  final int lastIdx = parts.length - 1;
  return parts[lastIdx];
}
// Delimiter used to hexalate displayed keys: toString() wraps the hex-encoded
// part of a key in a pair of these, and decodeKeyName() recognises a leading
// one as the start of a hex-encoded prefix.
static final char MAGIC_CHAR = '$'; // Used to hexalate displayed keys
// Lower-case hex digits, indexed by nibble value; used by toString().
private static final char[] HEX = "0123456789abcdef".toCharArray();
/** Converts the key to a displayable string.
 *
 * A byte is shown verbatim when it is printable ASCII (space through '~')
 * and is not the magic character '$'. If every key byte qualifies, the
 * result is simply those characters. Otherwise everything up to and
 * including the LAST byte that does not qualify is written as lower-case
 * hex between a pair of '$' characters, followed by the remaining bytes
 * verbatim.
 *
 * Bytes are always converted one-to-one to chars, so the result does not
 * depend on the platform default charset.
 *
 * @return key as a printable string
 */
@Override public String toString() {
  // Scan backwards for the last byte that cannot be shown verbatim;
  // 'last' ends up as its index, or -1 if there is none.
  int last = _kb.length;
  while( --last >= 0 ) {
    final byte b = _kb[last];
    final boolean verbatim = ' ' <= b && b <= '~' && b != MAGIC_CHAR;
    if( !verbatim ) break;
  }
  // Two output chars per hexed byte, one per verbatim byte, plus delimiters.
  final StringBuilder sb = new StringBuilder(_kb.length + last + 3);
  if( last >= 0 ) {
    sb.append(MAGIC_CHAR);
    for( int i = 0; i <= last; ++i ) {
      final byte a = _kb[i];
      sb.append(HEX[(a >> 4) & 0x0F]);
      sb.append(HEX[(a >> 0) & 0x0F]);
    }
    sb.append(MAGIC_CHAR);
  }
  // Everything after 'last' is printable ASCII, so a plain cast is exact.
  for( int i = last + 1; i < _kb.length; ++i ) sb.append((char)_kb[i]);
  return sb.toString();
}
// Decode a key name into key bytes.
//  - null or empty names decode to null.
//  - A name starting with MAGIC_CHAR must look like "$<hex>$<tail>": the hex
//    digits between the two magic chars are decoded pairwise into bytes and
//    the tail is appended after them.
//  - Any other name is converted char-by-char, keeping the low byte of each.
// Throws IllegalArgumentException if the closing magic char is missing, if
// the hex section has an odd number of digits, or if it contains a character
// that is not a hex digit. Upper-case hex digits are accepted.
private static byte[] decodeKeyName(String what) {
  if( what==null || what.length()==0 ) return null;
  if( what.charAt(0) != MAGIC_CHAR ) {
    final byte[] plain = new byte[what.length()];
    for( int i=0; i<plain.length; i++ ) plain[i] = (byte)what.charAt(i);
    return plain;
  }
  final int end = what.indexOf(MAGIC_CHAR,1);
  if( end < 0 ) throw new IllegalArgumentException("No matching magic '"+MAGIC_CHAR+"', key name is not legal");
  final int hexLen = end - 1;
  // An odd digit count used to run off the end of the result array.
  if( (hexLen & 1) != 0 )
    throw new IllegalArgumentException("Odd number of hex digits ("+hexLen+") between magic '"+MAGIC_CHAR+"', key name is not legal");
  final String tail = what.substring(end+1);
  final byte[] res = new byte[hexLen/2 + tail.length()];
  int r = 0;
  for( int i = 1; i < end; i+=2 ) {
    final int hi = hexNibble(what.charAt(i));
    final int lo = hexNibble(what.charAt(i+1));
    res[r++] = (byte)(hi << 4 | lo);
  }
  // NOTE(review): copies tail.length() bytes, which assumes
  // StringUtils.bytesOf yields one byte per char (true for ASCII tails) - confirm.
  System.arraycopy(StringUtils.bytesOf(tail), 0, res, r, tail.length());
  return res;
}
// Value (0-15) of one ASCII hex digit, either case; rejects anything else.
private static int hexNibble(char c) {
  if( '0' <= c && c <= '9' ) return c - '0';
  if( 'a' <= c && c <= 'f' ) return c - 'a' + 10;
  if( 'A' <= c && c <= 'F' ) return c - 'A' + 10;
  throw new IllegalArgumentException("Illegal hex digit '"+c+"', key name is not legal");
}
/** Hash of the key bytes only, precomputed in the constructor.
 *  @return the cached hash */
@Override public int hashCode() {
  return _hash;
}
/** Two Keys are equal when their key bytes are equal. Any argument that is
 *  not a Key (including null) is simply unequal; it never causes a
 *  ClassCastException.
 *  @param o the object to compare against
 *  @return true if {@code o} is a Key with the same bytes */
@Override public boolean equals( Object o ) {
  if( this == o ) return true;
  if( !(o instanceof Key) ) return false; // Also covers null
  Key k = (Key)o;
  if( _hash != k._hash ) return false;    // Cheap reject before comparing bytes
  return Arrays.equals(k._kb,_kb);
}
/** Lexically ordered Key comparison, so Keys can be sorted. Modestly
 *  expensive: both sides are rendered with {@code toString()} and the
 *  resulting strings are compared. The argument is asserted to be a Key. */
@Override public int compareTo(Object o) {
  assert (o instanceof Key);
  final String mine = this.toString();
  final String theirs = o.toString();
  return mine.compareTo(theirs);
}
// Serialization hook: a Key is written as just its key bytes.
public static final AutoBuffer write_impl(Key k, AutoBuffer ab) {
  final byte[] keyBytes = k._kb;
  return ab.putA1(keyBytes);
}
// Deserialization hook: reads the key bytes and goes through make(), so the
// result may be an already-interned Key. The 'k' argument is not used.
public static final Key read_impl(Key k, AutoBuffer ab) {
  final byte[] keyBytes = ab.getA1();
  return make(keyBytes);
}
// JSON hook: emits a "name" entry holding the Key's toString() form, a comma,
// and a "type" entry holding the simple name of the class that
// ReflectionUtils.findActualClassParameter reports for the Key's class.
public static final AutoBuffer writeJSON_impl( Key k, AutoBuffer ab ) {
  final String keyName = k.toString();
  ab.putJSONStr("name", keyName);
  ab.put1(',');
  final String typeName = ReflectionUtils.findActualClassParameter(k.getClass(), 0).getSimpleName();
  ab.putJSONStr("type", typeName);
  return ab;
}
}