package water;
import java.util.Arrays;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicLongFieldUpdater;
import water.util.UnsafeUtils;
/**
* Keys
*
* This class defines:
* - A Key's bytes (name) and hash
* - Known Disk and memory replicas.
* - A cache of somewhat expensive to compute stuff related to the current
* Cloud, plus a byte of the desired replication factor.
*
* Keys are expected to be a high-count item, hence the care about size.
*
* Keys are *interned* in the local K/V store (a non-blocking hash set) and are
* kept pointer-equivalent (via the interning) for cheap compares. The
* interning only happens after a successful insert in the local H2O.STORE via
* H2O.put_if_later.
*
* @author <a href="mailto:cliffc@h2o.ai">cliffc@h2o.ai</a>
* @version 1.0
*/
public final class Key extends Iced implements Comparable {
  // The Key!!!
  // Limited to 512 random bytes - to fit better in UDP packets.
  public static final int KEY_LENGTH = 512;
  public byte[] _kb;            // Key bytes, wire-line protocol
  transient int _hash;          // Hash on key alone (and not value)

  // The user keys must be ASCII, so the values 0..31 are reserved for system
  // keys.  The first key byte carries the type (see type()).  When you create
  // a system key, please do add its number to this list.
  public static final byte BUILT_IN_KEY = 2; // C.f. Constants.BUILT_IN_KEY_*
  public static final byte JOB = 3;
  public static final byte VEC = 4;
  public static final byte DVEC = 5;
  public static final byte VGROUP = 6; // vector group
  public static final byte DFJ_INTERNAL_USER = 7;
  public static final byte HIDDEN_USER_KEY = 31;
  public static final byte USER_KEY = 32;

  // *Desired* distribution function on keys & replication factor.  Replica #0
  // is the master, replica #1, 2, 3, etc represent additional desired
  // replication nodes.  Note that this function is just the distribution
  // function - it does not DO any replication, nor does it dictate any policy
  // on how fast replication occurs.
  //
  // Returns a node index into the current H2O.CLOUD.
  // NOTE(review): earlier documentation promised -1 for a nonsensical replica
  // number (e.g. replica #3 in a 2-Node system), but no path below returns a
  // negative value: the homed path only returns when the index is >= 0, and
  // the other two paths return a non-negative value modulo the cloud size.
  // NOTE(review): every path divides by H2O.CLOUD.size(); this presumably is
  // never zero once a cloud is formed - confirm.
  int D( int repl ) {
    int hsz = H2O.CLOUD.size();

    // See if this is a specifically homed Key
    if( !user_allowed() && repl < _kb[1] ) { // Asking for a replica# from the homed list?
      assert _kb[0] != Key.DVEC;
      H2ONode h2o = H2ONode.intern(_kb,2+repl*(4+2/*serialized bytesize of H2OKey*/));
      // Reverse the home to the index
      int idx = h2o.index();
      if( idx >= 0 ) return idx;
      // Else homed to a node which is no longer in the cloud!
      // Fall back to the normal home mode
    }

    // Distribution of Fluid Vectors is a special case.
    // Fluid Vectors are grouped into vector groups, each of which must have
    // the same distribution of chunks so that MRTask2 run over group of
    // vectors will keep data-locality.  The fluid vecs from the same group
    // share the same key pattern + each has 4 bytes identifying particular
    // vector in the group.  Since we need the same chunks end up on the same
    // node in the group, we need to skip the 4 bytes containing vec# from the
    // hash.  Apart from that, we keep the previous mode of operation, so that
    // ByteVec would have first 64MB distributed around cloud randomly and then
    // go round-robin in 64MB chunks.
    if( _kb[0] == DVEC ) {
      // Homed Chunk?
      if( _kb[1] != -1 ) throw H2O.unimpl();
      // For round-robin on Chunks in the following pattern:
      // 1 Chunk-per-node, until all nodes have 1 chunk (max parallelism).
      // Then 2 chunks-per-node, once around, then 4, then 8, then 16.
      // Getting several chunks-in-a-row on a single Node means that stencil
      // calculations that step off the end of one chunk into the next won't
      // force a chunk local - replicating the data.  If all chunks round robin
      // exactly, then any stencil calc will double the cached volume of data
      // (every node will have its own chunk, plus a cached next-chunk).
      // Above 16-chunks-in-a-row we hit diminishing returns.
      int cidx = UnsafeUtils.get4(_kb, 1 + 1 + 4); // Chunk index
      int x = cidx/hsz;         // Multiples of cluster size
      // 0       -> 1st trip around the cluster;         nidx= (cidx- 0*hsz)>>0
      // 1,2     -> 2nd & 3rd trip; allocate in pairs:   nidx= (cidx- 1*hsz)>>1
      // 3,4,5,6 -> next 4 rounds; allocate in quads:    nidx= (cidx- 3*hsz)>>2
      // 7-14    -> next 8 rounds in octets:             nidx= (cidx- 7*hsz)>>3
      // 15+     -> remaining rounds in groups of 16:    nidx= (cidx-15*hsz)>>4
      int z = x==0 ? 0 : (x<=2 ? 1 : (x<=6 ? 2 : (x<=14 ? 3 : 4)));
      int nidx = (cidx-((1<<z)-1)*hsz)>>z;
      return ((nidx+repl)&0x7FFFFFFF) % hsz;
    }

    // Easy Cheesy Stupid:
    return ((_hash+repl)&0x7FFFFFFF) % hsz;
  }

  /** List of illegal characters which are not allowed in user keys. */
  public static final CharSequence ILLEGAL_USER_KEY_CHARS = " !@#$%^&*()+={}[]|\\;:\"'<>,/?";

  // 64 bits of Cloud-specific cached stuff.  It is changed atomically by any
  // thread that visits it and has the wrong Cloud.  It has to be read *in the
  // context of a specific Cloud*, since a re-read may be for another Cloud.
  // Layout (see build_cache): bits 0-7 cloud index, bits 8-23 home node,
  // bits 24-31 replica number, bits 32-39 desired replication factor.
  private transient volatile long _cache;
  private static final AtomicLongFieldUpdater<Key> _cacheUpdater =
    AtomicLongFieldUpdater.newUpdater(Key.class, "_cache");

  /** @return true if the key has at least one byte and its type byte is VEC. */
  public final boolean isVec     () { return _kb != null && _kb.length > 0 && _kb[0] == VEC; }
  /** @return true if the key has at least one byte and its type byte is DVEC. */
  public final boolean isChunkKey() { return _kb != null && _kb.length > 0 && _kb[0] == DVEC; }
  /** @return the result of water.fvec.Vec.getVecKey for this key; asserts this is a chunk key. */
  public final Key getVecKey() { assert isChunkKey(); return water.fvec.Vec.getVecKey(this); }

  // Accessors and updaters for the Cloud-specific cached stuff.

  // The Cloud index, a byte uniquely identifying the last 256 Clouds.  It
  // changes atomically with the _cache word, so we can tell which Cloud this
  // data is a cache of.
  private static int cloud( long cache ) { return (int)(cache>>> 0)&0x00FF; }

  // Shortcut node index for Home replica#0.  This replica is responsible for
  // breaking ties on writes.  'char' because I want an unsigned 16bit thing,
  // limit of 65534 Cloud members.  -1 is reserved for a bare-key
  private static int home ( long cache ) { return (int)(cache>>> 8)&0xFFFF; }

  // Our replica #, or -1 if we are not a replica.  This value is found using
  // the Cloud distribution function and changes for a changed Cloud.
  // build_cache stores -1 as the byte 0xFF, so 0xFF is decoded back to -1
  // here; a plain mask would have reported "not a replica" as 255.
  private static int replica(long cache) {
    int r = (int)(cache>>>24)&0x00FF;
    return r == 0x00FF ? -1 : r;
  }

  // Desired replication factor.  Can be zero for temp keys.  Not allowed to
  // change later, because it messes with e.g. meta-data on disk.
  private static int desired(long cache) { return (int)(cache>>>32)&0x00FF; }

  // Pack the four cached fields into one word; each argument is masked to its
  // field width, so a replica of -1 is stored as 0xFF.
  private static long build_cache( int cidx, int home, int replica, int desired ) {
    return                      // Build the new cache word
      ((long)(cidx   &0xFF  )<< 0) |
      ((long)(home   &0xFFFF)<< 8) |
      ((long)(replica&0xFF  )<<24) |
      ((long)(desired&0xFF  )<<32) |
      ((long)(0             )<<40);
  }

  /** @return index of the home node (replica #0) of this key in the given cloud. */
  public int home   ( H2O cloud ) { return home   (cloud_info(cloud)); }
  /** @return this node's replica number for the key in the given cloud, or -1 if it is not a replica. */
  public int replica( H2O cloud ) { return replica(cloud_info(cloud)); }
  /** @return the desired replication factor currently held in the cache word. */
  public int desired(           ) { return desired(_cache); }
  /** @return true if H2O.SELF is the home node of this key in the current cloud. */
  public boolean home()           { return home_node()==H2O.SELF; }
  /** @return the home node of this key, looked up in a single read of H2O.CLOUD. */
  public H2ONode home_node( ) {
    H2O cloud = H2O.CLOUD;      // Read once so index and member array agree
    return cloud._memary[home(cloud)];
  }

  // Update the cache, but only to strictly newer Clouds.
  // Returns true if the cache now holds the given word, false if the word
  // was rejected by the H2O.larger check against the currently cached cloud.
  private boolean set_cache( long cache ) {
    while( true ) {             // Spin till get it
      long old = _cache;        // Read once at the start
      if( !H2O.larger(cloud(cache),cloud(old)) ) // Rolling backwards?
        // Attempt to set for an older Cloud.  Blow out with a failure; caller
        // should retry on a new Cloud.
        return false;
      assert cloud(cache) != cloud(old) || cache == old;
      if( old == cache ) return true; // Fast-path cutout
      if( _cacheUpdater.compareAndSet(this,old,cache) ) return true;
      // Can fail if the cache is really old, and just got updated to a version
      // which is still not the latest, and we are trying to update it again.
    }
  }

  // Return the info word for this Cloud.  Use the cache if possible.
  // NOTE(review): D() consults H2O.CLOUD, not the 'cloud' argument; if the
  // two differ, the indices computed below may not match cloud._memary.
  public long cloud_info( H2O cloud ) {
    long x = _cache;
    // See if cached for this Cloud.  This should be the 99% fast case.
    if( cloud(x) == cloud._idx ) return x;

    // Cache missed!  Probably it just needs (atomic) updating.
    // But we might be holding the stale cloud...
    // Figure out home Node in this Cloud
    char home = (char)D(0);
    // Figure out what replica # I am, if any
    int desired = desired(x);
    int replica = -1;
    for( int i=0; i<desired; i++ ) {
      int idx = D(i);
      if( idx >= 0 && cloud._memary[idx] == H2O.SELF ) {
        replica = i;
        break;
      }
    }
    long cache = build_cache(cloud._idx,home,replica,desired);
    set_cache(cache);           // Attempt to upgrade cache, but ignore failure
    return cache;               // Return the magic word for this Cloud
  }

  // Default desired replication factor.  Unless specified otherwise, all new
  // k-v pairs start with this replication factor.
  public static final byte DEFAULT_DESIRED_REPLICA_FACTOR = 2;

  // Construct a new Key from its raw bytes and compute its hash.
  // Throws IllegalArgumentException if the bytes are null or longer than
  // KEY_LENGTH.
  private Key(byte[] kb) {
    if( kb == null ) throw new IllegalArgumentException("Key bytes must not be null");
    if( kb.length > KEY_LENGTH ) throw new IllegalArgumentException("Key length would be "+kb.length);
    _kb = kb;
    // Quicky hash: http://en.wikipedia.org/wiki/Jenkins_hash_function
    // The exact arithmetic (signed bytes, signed shifts) decides key
    // placement via D(), so it must not be "corrected".
    int hash = 0;
    for( byte b : kb ) {
      hash += b;
      hash += (hash << 10);
      hash ^= (hash >> 6);
    }
    hash += (hash << 3);
    hash ^= (hash >> 11);
    hash += (hash << 15);
    _hash = hash;
  }

  // Make new Keys.  Optimistically attempt interning, but no guarantee.
  // rf is the desired replication factor; -1 is rejected because the cache
  // word uses it as the "not a replica" marker width-mate (0xFF).
  static public Key make(byte[] kb, byte rf) {
    if( rf == -1 ) throw new IllegalArgumentException("Illegal replication factor: -1");
    Key key = new Key(kb);
    Key key2 = H2O.getk(key);   // Get the interned version, if any
    if( key2 != null )          // There is one!  Return it instead
      return key2;

    // Set the cache with desired replication factor, and a fake cloud index
    H2O cloud = H2O.CLOUD;      // Read once
    key._cache = build_cache(cloud._idx-1,0,0,rf);
    key.cloud_info(cloud);      // Now compute & cache the real data
    return key;
  }

  // A random string, useful as a Key name or partial Key suffix.
  // Format: "_" + hex(least-significant UUID bits) + hex(most-significant
  // bits).  Long.toHexString does not zero-pad, so the length varies.
  static public String rand() {
    UUID uid = UUID.randomUUID();
    long l1 = uid.getLeastSignificantBits();
    long l2 = uid. getMostSignificantBits();
    return "_"+Long.toHexString(l1)+Long.toHexString(l2);
  }

  /** Make a key from raw bytes with the default replication factor. */
  static public Key make(byte[] kb)         { return make(kb,DEFAULT_DESIRED_REPLICA_FACTOR); }
  /** Make a key from a name (see decodeKeyName) with the default replication factor. */
  static public Key make(String s)          { return make(decodeKeyName(s));}
  /** Make a key from a name (see decodeKeyName) with the given replication factor. */
  static public Key make(String s, byte rf) { return make(decodeKeyName(s), rf);}
  /** Make a key with a random name (see rand) and the default replication factor. */
  static public Key make()                  { return make(rand()); }

  // Make a particular system key that is homed to given node and possibly
  // specifies also other 2 replicas.  Works for both IPv4 and IPv6 addresses.
  // If the addresses are not specified, returns a key with no home information.
  static public Key make(String s, byte rf, byte systemType, H2ONode... replicas) {
    return make(decodeKeyName(s),rf,systemType,replicas);
  }
  // Same as above, with a random key name.
  static public Key make(byte rf, byte systemType, H2ONode... replicas) {
    return make(rand(),rf,systemType,replicas);
  }

  // Make a Key which is homed to specific nodes.
  static public Key make(byte[] kb, byte rf, byte systemType, H2ONode... replicas) {
    // no more than 3 replicas allowed to be stored in the key
    assert 0 <=replicas.length && replicas.length<=3;
    assert systemType<32;       // only system keys allowed
    // Key byte layout, in the order written below:
    // 0    - systemType, from 0-31
    // 1    - replica-count, plus up to 3 bits for ip4 vs ip6
    // 2-n  - zero, one, two or 3 IP4 (4+2 bytes) or IP6 (16+2 bytes) addresses
    // next - 4 bytes of chunk#, or -1 for masters (always -1 here)
    // rest - repeat of the original kb
    AutoBuffer ab = new AutoBuffer();
    ab.put1(systemType).put1(replicas.length);
    for( H2ONode h2o : replicas )
      h2o.write(ab);
    ab.put4(-1);
    ab.putA1(kb,kb.length);
    return make(Arrays.copyOf(ab.buf(),ab.position()),rf);
  }

  // Turn a key name into a system key by prefixing the decoded name with the
  // BUILT_IN_KEY type byte.  (Despite older documentation, the type used is
  // BUILT_IN_KEY, not HIDDEN_USER_KEY.)
  // Throws IllegalArgumentException if the name is null or empty.
  final public static Key makeSystem(String s) {
    byte[] kb= decodeKeyName(s);
    if( kb == null )            // decodeKeyName yields null for null/empty names
      throw new IllegalArgumentException("System key name must not be null or empty");
    byte[] kb2 = new byte[kb.length+1];
    System.arraycopy(kb,0,kb2,1,kb.length);
    kb2[0] = Key.BUILT_IN_KEY;
    return Key.make(kb2);
  }

  // Custom Serialization Reader: Keys must be interned on construction.
  @Override public final Key read(AutoBuffer bb) { return make(bb.getA1()); }
  // Only the key bytes go on the wire; hash and cache are recomputed on read.
  @Override public final AutoBuffer write(AutoBuffer bb) { return bb.putA1(_kb); }
  @Override public final AutoBuffer writeJSON(AutoBuffer bb) { return bb.putJSONStr(toString()); }

  // User keys must be all ASCII, but we only check the 1st byte
  public boolean user_allowed() {
    return (_kb[0]&0xFF) >= 32;
  }

  // Returns the type of the key: USER_KEY for any user key, otherwise the
  // system type held in the first byte.
  public int type() {
    return user_allowed() ? USER_KEY : (_kb[0]&0xff);
  }

  /** Delimiter wrapped around the hex-encoded prefix of a printed key name. */
  public static final char MAGIC_CHAR = '$';
  // Lower-case hex digits used by toString()
  private static final char[] HEX = "0123456789abcdef".toCharArray();

  /** Converts the key to HTML displayable string.
   *
   * If every byte is printable ASCII (' '..'~') other than the magic char,
   * the bytes are returned as text.  Otherwise everything up to and
   * including the last offending byte is written as lower-case hex between
   * two magic chars, followed by the remaining printable tail.
   *
   * @return key as a printable string
   */
  @Override
  public String toString() {
    // Scan backwards for the last byte that cannot be printed as-is
    int len = _kb.length;
    while( --len >= 0 ) {
      char a = (char) _kb[len];
      if (' ' <= a && a <= '#') continue;
      // then we have $ which is not allowed
      if ('%' <= a && a <= '~') continue;
      break;
    }
    if (len>=0) {
      StringBuilder sb = new StringBuilder();
      sb.append(MAGIC_CHAR);
      for( int i = 0; i <= len; ++i ) {
        byte a = _kb[i];
        sb.append(HEX[(a >> 4) & 0x0F]);
        sb.append(HEX[(a >> 0) & 0x0F]);
      }
      sb.append(MAGIC_CHAR);
      for( int i = len + 1; i < _kb.length; ++i ) sb.append((char)_kb[i]);
      return sb.toString();
    } else {
      // All bytes are printable ASCII here
      return new String(_kb);
    }
  }

  // Inverse of toString(): turn a key name into key bytes.
  // - null or empty names decode to null.
  // - Names not starting with MAGIC_CHAR are encoded with the platform
  //   default charset (kept as-is so existing key bytes do not change).
  // - Names of the form $hex$tail decode the hex pairs (either case) and
  //   append the encoded tail.
  // Throws IllegalArgumentException for a missing closing MAGIC_CHAR, an odd
  // number of hex digits, or a non-hex character in the hex section.
  private static byte[] decodeKeyName(String what) {
    if( what==null ) return null;
    if( what.length()==0 ) return null;
    if( what.charAt(0) != MAGIC_CHAR ) return what.getBytes();

    int len = what.indexOf(MAGIC_CHAR,1); // Index of the closing magic char
    if( len < 0 )
      throw new IllegalArgumentException("Key name '"+what+"' has no closing '"+MAGIC_CHAR+"'");
    if( ((len-1) & 1) != 0 )
      throw new IllegalArgumentException("Key name '"+what+"' has an odd number of hex digits");
    // Size the result by the tail's encoded byte count, not its char count
    byte[] tail = what.substring(len+1).getBytes();
    byte[] res = new byte[(len-1)/2 + tail.length];
    int r = 0;
    for( int i = 1; i < len; i+=2 ) {
      int h = Character.digit(what.charAt(i  ),16);
      int l = Character.digit(what.charAt(i+1),16);
      if( h < 0 || l < 0 )
        throw new IllegalArgumentException("Key name '"+what+"' has a non-hex character at index "+(h < 0 ? i : i+1));
      res[r++] = (byte)(h << 4 | l);
    }
    System.arraycopy(tail, 0, res, r, tail.length);
    return res;
  }

  @Override public int hashCode() { return _hash; }

  // Keys are equal when their bytes are equal.  A key always equals itself;
  // non-Key arguments (including null) and keys without bytes are unequal.
  @Override public boolean equals( Object o ) {
    if( this == o ) return true;
    if( !(o instanceof Key) ) return false; // Also covers o == null
    Key k = (Key)o;
    if( k._kb == null || _kb == null ) return false;
    return Arrays.equals(k._kb,_kb);
  }

  // Orders keys by their printable form (see toString()).
  @Override public int compareTo(Object o) {
    assert (o instanceof Key);
    return this.toString().compareTo(o.toString());
  }

  // Simple wrapper class defining an array-of-keys that is serializable.
  // Note that if you modify any fields of a POJO that is part of a Value,
  // - this is not the recommended programming style,
  // - those changes are visible to all on the node,
  // - but not to other nodes
  // - and the POJO might be dropped by the MemoryManager and reconstituted from
  //   disk and/or the byte array back to its original form.
  public static class Ary extends Iced {
    public final Key[] _keys;
    Ary( Key[] keys ) { _keys = keys; }
  }

  /** Describes a key for humans: its type name, the replica-count byte of a
   *  system key, and its printable name.
   *
   *  User keys (first byte >= 32) are reported as "user" with a replica count
   *  of 0, since they carry no replica-count byte.  Keys too short to hold a
   *  type or replica byte are reported as "UNKNOWN" / 0 instead of throwing.
   *
   *  @param k key to describe
   *  @return description of the form "Key { type: T,replicas: N,NAME}"
   */
  public static String toPrettyString(Key k) {
    byte[] kb = k._kb;
    int type = kb.length == 0 ? -1 : k.type(); // -1 falls through to UNKNOWN
    StringBuilder sb = new StringBuilder("Key { type: ");
    switch( type ) {
    case 0:                 sb.append("arraylet chunk"); break;
    case BUILT_IN_KEY:      sb.append("build-in");       break;
    case JOB:               sb.append("job");            break;
    case VEC:               sb.append("vec");            break;
    case DVEC:              sb.append("dvec");           break;
    case VGROUP:            sb.append("vgroup");         break;
    case DFJ_INTERNAL_USER: sb.append("DFJ internal");   break;
    case HIDDEN_USER_KEY:   sb.append("hidden user");    break;
    case USER_KEY:          sb.append("user");           break;
    default:                sb.append("UNKNOWN");        break;
    }
    // Byte 1 is only a replica count for system keys; for user keys it is
    // part of the name, and for 1-byte keys it does not exist.
    int replicas = (type != USER_KEY && kb.length > 1) ? kb[1] : 0;
    sb.append(",replicas: ").append(replicas).append(",");
    sb.append(k.toString()).append("}");
    return sb.toString();
  }
}