package water.fvec;

import jsr166y.CountedCompleter;
import water.*;
import water.nbhm.NonBlockingHashMapLong;
import water.util.Utils;
import java.util.Arrays;
import java.util.UUID;
import java.util.concurrent.Future;
import static water.util.Utils.seq;

/** A single distributed vector column.
 * <p>
 * A distributed vector has a count of elements, an element-to-chunk mapping, a
 * Java type (mostly determines rounding on store and display), and functions
 * to directly load elements without further indirections.  The data is
 * compressed, or backed by disk or both.  *Writing* to elements may throw if
 * the backing data is read-only (file backed).
 * <p>
 * <pre>
 *  Vec Key format is:  Key. VEC - byte, 0 - byte,   0    - int, normal Key bytes.
 * DVec Key format is:  Key.DVEC - byte, 0 - byte, chunk# - int, normal Key bytes.
 * </pre>
 *
 * The main API is at, set, and isNA:<br>
 *<pre>
 *   double  at  ( long row );  // Returns the value expressed as a double.  NaN if missing.
 *   long    at8 ( long row );  // Returns the value expressed as a long.  Throws if missing.
 *   boolean isNA( long row );  // True if the value is missing.
 *   set( long row, double d ); // Stores a double; NaN will be treated as missing.
 *   set( long row, long l );   // Stores a long; throws if l exceeds what fits in a double and any floats are ever set.
 *   setNA( long row );         // Sets the value as missing.
 * </pre>
 *
 * Note this dangerous scenario: loading a missing value as a double, and
 * setting it as a long: <pre>
 *   set(row,(long)at(row)); // Danger!
 *</pre>
 * The cast from a Double.NaN to a long produces a zero!  This code will
 * replace a missing value with a zero.
 *
 * @author Cliff Click
 */
public class Vec extends Iced {
  /** Key mapping a Value which holds this Vec. */
  final public Key _key;        // Top-level key

  /** Element-start per chunk.  Always zero for chunk 0.  One more entry than
   *  chunks, so the last entry is the total number of rows.
   *  This field is dead/ignored in subclasses that are guaranteed to have
   *  fixed-sized chunks such as file-backed Vecs. */
  final public long _espc[];

  /** Enum/factor/categorical names.  Null for non-categorical columns. */
  public String [] _domain;

  /** Time parse pattern index (see ParseTime.TIME_PARSE), or -1 for not-a-time. */
  public byte _time;

  /** RollupStats: min/max/mean of this Vec lazily computed. */
  private double _min, _max, _mean, _sigma;
  long _size;                   // Size of compressed vector data, in bytes
  boolean _isInt;               // All ints
  boolean _isUUID;              // All UUIDs (or zero or missing)

  /** The count of missing elements.... or -2 if we have active writers and no
   *  rollup info can be computed (because the vector is being rapidly
   *  modified!), or -1 if rollups have not been computed since the last
   *  modification.  Volatile: the rollup-publishing code writes this field
   *  LAST so readers that see a non-negative count also see valid stats. */
  volatile long _naCnt=-1;

  // Timestamp of the last write; used to decide whether a cached checksum is
  // still valid (see checksum()).
  private long _last_write_timestamp = System.currentTimeMillis();
  private long _checksum_timestamp = -1;
  private long _checksum = 0;

  /** Main default constructor; requires the caller understand Chunk layout
   *  already, along with count of missing elements. */
  public Vec( Key key, long espc[]) { this(key, espc, null); }
  public Vec( Key key, long espc[], String[] domain) { this(key,espc,domain,false,(byte)-1); }
  public Vec( Key key, long espc[], String[] domain, boolean hasUUID, byte time) {
    assert key._kb[0]==Key.VEC;
    _key = key;
    _espc = espc;
    _time = time;               // is-a-time, or not (and what flavor used to parse time)
    _isUUID = hasUUID;          // all-or-nothing UUIDs
    _domain = domain;
  }

  /** Copy-layout constructor: same chunk layout (espc) as {@code v}; must be
   *  in the same VectorGroup so chunks are co-located. */
  protected Vec( Key key, Vec v ) { this(key, v._espc); assert group()==v.group(); }

  /** Make {@code n} new all-zero Vecs with this Vec's chunk layout. */
  public Vec [] makeZeros(int n){return makeZeros(n,null,null,null);}
  public Vec [] makeZeros(int n, String [][] domain, boolean[] uuids, byte[] times){ return makeCons(n, 0, domain, uuids, times);}

  /** Make {@code n} new constant Vecs (every element == {@code l}) sharing
   *  this Vec's chunk layout and VectorGroup.  The filling task runs on all
   *  nodes; each node writes only the chunks homed to it, then the Vec
   *  headers are published to the K/V store on completion. */
  public Vec [] makeCons(int n, final long l, String [][] domain, boolean[] uuids, byte[] times){
    if( _espc == null ) throw H2O.unimpl(); // need to make espc for e.g. NFSFileVecs!
    final int nchunks = nChunks();
    Key [] keys = group().addVecs(n);       // Reserve n fresh keys in this group
    final Vec [] vs = new Vec[keys.length];
    for(int i = 0; i < vs.length; ++i)
      vs[i] = new Vec(keys[i],_espc,
                      domain == null ? null     : domain[i],
                      uuids  == null ? false    : uuids [i],
                      times  == null ? (byte)-1 : times [i]);
    new DRemoteTask(){
      @Override public void lcompute(){
        addToPendingCount(vs.length);       // One sub-task per new Vec
        for(int i = 0; i < vs.length; ++i){
          final int fi = i;
          new H2O.H2OCountedCompleter(this){
            @Override public void compute2(){
              long row=0;                   // Start row
              Key k;
              for( int i=0; i<nchunks; i++ ) {
                long nrow = chunk2StartElem(i+1); // Next row
                // Only write chunks homed on this node
                if((k = vs[fi].chunkKey(i)).home())
                  DKV.put(k,new C0LChunk(l,(int)(nrow-row)),_fs);
                row = nrow;
              }
              tryComplete();
            }
          }.fork();
        }
        tryComplete();
      }
      @Override public final void lonCompletion( CountedCompleter caller ) {
        // All chunks written: publish the Vec headers (home nodes only)
        Futures fs = new Futures();
        for(Vec v:vs) if(v._key.home()) DKV.put(v._key,v,fs);
        fs.blockForPending();
      }
      @Override public void reduce(DRemoteTask drt){}
    }.invokeOnAllNodes();
    return vs;
  }

  /**
   * Create an array of Vecs from scratch.
   * @param rows Length of each vec
   * @param cols Number of vecs
   * @param val Constant value (long)
   * @param domain Factor levels (for factor columns)
   * @return Array of Vecs
   */
  static public Vec [] makeNewCons(final long rows, final int cols, final long val, final String [][] domain){
    // Aim for ~4 chunks per CPU across the cloud, but never more chunks than rows.
    int chunks = Math.min((int)rows, 4*H2O.NUMCPUS*H2O.CLOUD.size());
    // NOTE(review): rows==0 makes chunks==0 and the division below throws
    // ArithmeticException — TODO confirm callers never pass rows==0.
    long[] espc = new long[chunks+1];
    for (int i = 0; i<=chunks; ++i)
      espc[i] = i * rows / chunks;
    Vec v = new Vec(Vec.newKey(), espc);
    return v.makeCons(cols, val, domain,null,null);
  }

  /** Make a new vector with the same size and data layout as the old one, and
   *  initialized to zero. */
  public Vec makeZero() { return makeCon(0); }
  public Vec makeZero(String[] domain) { return makeCon(0, domain); }

  /** Make a new vector with the same size and data layout as the old one, and
   *  initialized to a constant.
   */
  public Vec makeCon( final long l ) { return makeCon(l, null); }

  /** Make a new constant (all elements == {@code l}) Vec sharing this Vec's
   *  chunk layout and group.  Chunks are written on their home nodes, then
   *  the Vec header is published. */
  public Vec makeCon( final long l, String[] domain ) {
    Futures fs = new Futures();
    if( _espc == null ) throw H2O.unimpl(); // need to make espc for e.g. NFSFileVecs!
    final int nchunks = nChunks();
    final Vec v0 = new Vec(group().addVecs(1)[0],_espc, domain);
    new DRemoteTask(){
      @Override public void lcompute(){
        long row=0;                     // Start row
        Key k;
        for( int i=0; i<nchunks; i++ ) {
          long nrow = chunk2StartElem(i+1); // Next row
          if((k = v0.chunkKey(i)).home())   // Only write home chunks
            DKV.put(k,new C0LChunk(l,(int)(nrow-row)),_fs);
          row = nrow;
        }
        tryComplete();
      }
      @Override public void reduce(DRemoteTask drt){}
    }.invokeOnAllNodes();
    DKV.put(v0._key,v0,fs);             // Publish the Vec header
    fs.blockForPending();
    return v0;
  }

  /** Make a new constant double Vec; integral-valued doubles delegate to the
   *  (more compactly stored) long version. */
  public Vec makeCon( final double d ) {
    Futures fs = new Futures();
    if( _espc == null ) throw H2O.unimpl(); // need to make espc for e.g. NFSFileVecs!
    if( (long)d==d ) return makeCon((long)d);
    final int nchunks = nChunks();
    final Vec v0 = new Vec(group().addVecs(1)[0],_espc);
    new DRemoteTask(){
      @Override public void lcompute(){
        getFutures();
        long row=0;                     // Start row
        Key k;
        for( int i=0; i<nchunks; i++ ) {
          long nrow = chunk2StartElem(i+1); // Next row
          if((k = v0.chunkKey(i)).home())   // Only write home chunks
            DKV.put(k,new C0DChunk(d,(int)(nrow-row)),_fs);
          row = nrow;
        }
        tryComplete();
      }
      @Override public void reduce(DRemoteTask drt){}
    }.invokeOnAllNodes();
    DKV.put(v0._key,v0,fs);             // Publish the Vec header
    fs.blockForPending();
    return v0;
  }

  /** Make a Vec of length {@code len} holding the sequence 1,2,...,len. */
  public static Vec makeSeq( long len) {
    return new MRTask2() {
      @Override public void map(Chunk[] cs) {
        for (int i = 0; i < cs.length; i++) {
          Chunk c = cs[i];
          for (int r = 0; r < c._len; r++)
            c.set0(r, r+1+c._start);    // Global row number + 1
        }
      }
    }.doAll(makeConSeq(0, len)).vecs(0);
  }

  /** Make a constant Vec of length {@code len}, chunked by the default chunk
   *  size, valued {@code x} everywhere. */
  public static Vec makeConSeq(double x, long len) {
    final int CHUNK_SZ = 1 << H2O.LOG_CHK;
    int chunks = (int)Math.ceil((double)len / CHUNK_SZ);
    long[] espc = new long[chunks+1];
    for (int i = 1; i<=chunks; ++i)
      espc[i] = Math.min(espc[i-1] + CHUNK_SZ, len); // Last chunk may be short
    return new Vec(VectorGroup.VG_LEN1.addVec(), espc).makeCon(x);
  }

  /** Create a new 1-element vector in the shared vector group for 1-element
   *  vectors. */
  public static Vec make1Elem(double d) {
    return make1Elem(Vec.VectorGroup.VG_LEN1.addVec(), d);
  }

  /** Create a new 1-element vector representing a scalar value. */
  public static Vec make1Elem(Key key, double d) {
    assert key.isVec();
    Vec v = new Vec(key,new long[]{0,1});
    Futures fs = new Futures();
    DKV.put(v.chunkKey(0),new C0DChunk(d,1),fs);
    DKV.put(key,v,fs);
    fs.blockForPending();
    return v;
  }

  /** Create a vector transforming values according given domain map.
   * @see Vec#makeTransf(int[], int[], String[]) */
  public Vec makeTransf(final int[][] map, String[] finalDomain) {
    return makeTransf(map[0], map[1], finalDomain);
  }

  /**
   * Creates a new transformation from given values to given indexes of
   * given domain.
   * @param values values being mapped from
   * @param indexes values being mapped to
   * @param domain domain of new vector
   * @return always return a new vector which maps given values into a new domain
   */
  public Vec makeTransf(final int[] values, final int[] indexes, final String[] domain) {
    if( _espc == null ) throw H2O.unimpl();
    Vec v0 = new TransfVec(values, indexes, domain, this._key, group().addVecs(1)[0],_espc);
    UKV.put(v0._key,v0);
    return v0;
  }

  /**
   * Makes a new transformation vector with identity mapping.
   * @return a new transformation vector
   * @see Vec#makeTransf(int[], int[], String[])
   */
  Vec makeIdentityTransf() {
    assert _domain != null : "Cannot make an identity transformation of non-enum vector!";
    return makeTransf(seq(0, _domain.length), null, _domain);
  }

  /**
   * Makes a new transformation vector from given values to
   * values 0..domain size.
   * @param values values which are mapped from
   * @param domain target domain which is mapped to
   * @return a new transformation vector providing mapping between given values and target domain.
   * @see Vec#makeTransf(int[], int[], String[])
   */
  Vec makeSimpleTransf(long[] values, String[] domain) {
    // Narrow the long values to ints; makeTransf works on int domains
    int is[] = new int[values.length];
    for( int i=0; i<values.length; i++ )
      is[i] = (int)values[i];
    return makeTransf(is, null, domain);
  }

  /** This Vec does not have dependent hidden Vec it uses.
   * @return dependent hidden vector or <code>null</code>
   */
  public Vec masterVec() { return null; }

  /**
   * Adapt given vector <code>v</code> to this vector.
   * I.e., unify domains, compute transformation, and call makeTransf().
   *
   * This vector is a leader - it determines a domain (i.e., {@link #domain()}) and mapping between values stored in vector
   * and domain values.
   * The vector <code>v</code> can contain different domain (subset, superset), hence the values stored in the vector
   * has to be transformed to the values determined by this vector. The resulting vector domain is the
   * same as this vector domain.
   *
   * Always returns a new vector and user's responsibility is delete the vector.
   *
   * @param v vector which should be adapter in according this vector.
   * @param exact should vector match exactly (recommended value is true).
   * @return a new vector which implements transformation of original values.
   */
  /*// Not used any more in code ??
  public Vec adaptTo(Vec v, boolean exact) {
    assert isInt() : "This vector has to be int/enum vector!";
    int[] domain = null;
    // Compute domain of this vector
    // - if vector is enum, use domain directly
    // - if vector is int, then vector numeric domain is collected and transformed to string domain
    // and then adapted
    String[] sdomain = (_domain == null)
      ? Utils.toStringMap(domain = new CollectDomain(this).doAll(this).domain()) // it is number-column
      : domain(); // it is enum
    // Compute transformation - domain map - each value in an array is one value from vector domain, its index
    // represents an index into string domain representation.
    int[] domMap = Model.getDomainMapping(v._domain, sdomain, exact);
    if (domain!=null) {
      // do a mapping from INT -> ENUM -> this vector ENUM
      domMap = Utils.compose(Utils.mapping(domain), domMap);
    }
    return this.makeTransf(domMap, sdomain);
  }*/

  /** Number of elements in the vector.  Overridden by subclasses that compute
   *  length in an alternative way, such as file-backed Vecs. */
  public long length() { return _espc[_espc.length-1]; }

  /** Number of chunks.  Overridden by subclasses that compute chunks in an
   *  alternative way, such as file-backed Vecs. */
  public int nChunks() { return _espc.length-1; }

  /** Whether or not this column parsed as a time, and if so what pattern was
   *  used. */
  public final boolean isTime(){ return _time>=0; }
  public final int timeMode(){ return _time; }
  public final String timeParse(){ return ParseTime.TIME_PARSE[_time]; }

  /** Map the integer value for a enum/factor/categorical to it's String.
   *  Error if it is not an ENUM.  */
  public String domain(long i) { return _domain[(int)i]; }

  /** Return an array of domains.  This is eagerly manifested for enum or
   *  categorical columns.  Returns null for non-Enum/factor columns. */
  public String[] domain() { return _domain; }

  /** Returns cardinality for enum domain or -1 for other types. */
  public int cardinality() { return isEnum() ? _domain.length : -1; }

  /** Transform this vector to enum.
   *  If the vector is integer vector then its domain is collected and transformed to
   *  corresponding strings.
   *  If the vector is enum an identity transformation vector is returned.
   *  Transformation is done by a {@link TransfVec} which provides a mapping between values.
   *
   *  @return always returns a new vector and the caller is responsible for vector deletion!
   */
  public Vec toEnum() {
    if( isEnum() ) return this.makeIdentityTransf(); // Make an identity transformation of this vector
    if( !isInt() ) throw new IllegalArgumentException("Enum conversion only works on integer columns");
    long[] domain;
    String[] sdomain = Utils.toString(domain = new CollectDomain(this).doAll(this).domain());
    if( domain.length > H2O.DATA_MAX_FACTOR_LEVELS )
      throw new IllegalArgumentException("Column domain is too large to be represented as an enum: " + domain.length + " > " + H2O.DATA_MAX_FACTOR_LEVELS + ". Launch H2O with -data_max_factor_levels <N>.");
    return this.makeSimpleTransf(domain, sdomain);
  }

  /** Default read/write behavior for Vecs.  File-backed Vecs are read-only. */
  protected boolean readable() { return true ; }
  /** Default read/write behavior for Vecs.  AppendableVecs are write-only. */
  protected boolean writable() { return true; }

  /** Return column min - lazily computed as needed. */
  public double min()  { return rollupStats()._min; }
  /** Return column max - lazily computed as needed. */
  public double max()  { return rollupStats()._max; }
  /** Return column mean - lazily computed as needed. */
  public double mean() { return rollupStats()._mean; }
  /** Return column standard deviation - lazily computed as needed. */
  public double sigma(){ return rollupStats()._sigma; }
  /** Return column missing-element-count - lazily computed as needed. */
  public long  naCnt() { return rollupStats()._naCnt; }
  /** Is all integers? */
  public boolean isInt(){return rollupStats()._isInt; }
  /** Size of compressed vector data. */
  public long byteSize(){return rollupStats()._size; }

  /** Lazily compute (and cache) a checksum over all chunk data.  The cached
   *  value is reused only while the last-write timestamp is unchanged. */
  public long checksum() {
    final long now = _last_write_timestamp;
    // TODO: someone can be writing while we're checksuming. . .
    if (-1 != now && now == _checksum_timestamp) {
      return _checksum;                 // Cache still valid
    }
    final long checksum = new ChecksummerTask().doAll(this).getChecksum();
    // Publish the cached checksum to the K/V copy of this Vec too
    new TAtomic<Vec>() {
      @Override public Vec atomic(Vec v) {
        if (v != null) {
          v._checksum = checksum;
          v._checksum_timestamp = now;
        }
        return v;
      }
    }.invoke(_key);
    this._checksum = checksum;
    this._checksum_timestamp = now;
    return checksum;
  }

  /** Is the column a factor/categorical/enum?  Note: all "isEnum()" columns
   *  are are also "isInt()" but not vice-versa. */
  public final boolean isEnum(){return _domain != null;}
  public final boolean isUUID(){return _isUUID;}

  /** Is the column constant.
   * <p>Returns true if the column contains only constant values and it is not full of NAs.</p> */
  public final boolean isConst() { return min() == max(); }
  /** Is the column bad.
   * <p>Returns true if the column is full of NAs.</p> */
  public final boolean isBad() { return naCnt() == length(); }

  /** MRTask2 comparing two Vecs chunk-by-chunk; floating point values compare
   *  equal within {@code fpointPrecision}.  Same-class chunks (except
   *  C8DChunk, whose NaNs may differ bitwise) are compared by raw memory. */
  public static class VecIdenticalTask extends MRTask2<VecIdenticalTask> {
    final double fpointPrecision;
    VecIdenticalTask(H2O.H2OCountedCompleter cc, double precision){super(cc); fpointPrecision = precision;}
    boolean _res;               // OUTPUT: true iff all compared chunks matched
    @Override public void map(Chunk c1, Chunk c2){
      if(!(c1 instanceof C8DChunk) && c1.getClass().equals(c2.getClass()))
        _res = Arrays.equals(c1._mem,c2._mem); // Fast path: identical compression => compare bytes
      else {
        if(c1._len != c2._len)return;          // _res stays false
        if(c1.hasFloat()){
          if(!c2.hasFloat())return;
          for(int i = 0; i < c1._len; ++i) {
            double diff = c1.at0(i) - c2.at0(i);
            if(diff > fpointPrecision || -diff > fpointPrecision)return;
          }
        } else {
          if(c2.hasFloat())return;
          for(int i = 0; i < c1._len; ++i)
            if(c1.at80(i) != c2.at80(i))return;
        }
        _res = true;
      }
    }
    @Override public void reduce(VecIdenticalTask bt){_res = _res && bt._res;}
  }

  /** Is the column contains float values.
   */
  public final boolean isFloat() { return !isEnum() && !isInt(); }
  public final boolean isByteVec() { return (this instanceof ByteVec); }

  /** Copy rollups from a completed RollupStats task into this Vec.  The
   *  volatile {@code _naCnt} is written LAST to publish the other stats. */
  Vec setRollupStats( RollupStats rs ) {
    _min  = rs._min; _max = rs._max; _mean = rs._mean;
    // Sample standard deviation (divide by n-1).
    // NOTE(review): rs._rows <= 1 yields NaN/Infinity here — TODO confirm intended.
    _sigma = Math.sqrt(rs._sigma / (rs._rows - 1));
    _size =rs._size;
    _isInt= rs._isInt;
    if( rs._rows == 0 )         // All rows missing?  Then no rollups
      _min = _max = _mean = _sigma = Double.NaN;
    _naCnt= rs._naCnt;          // Volatile write last to announce all stats ready
    return this;
  }

  /** Copy already-computed rollups from another (fresher) copy of this Vec. */
  Vec setRollupStats( Vec v ) {
    _min  = v._min;  _max  = v._max; _mean = v._mean;
    _sigma = v._sigma;
    _size = v._size;
    _isInt = v._isInt;
    _naCnt= v._naCnt;           // Volatile write last to announce all stats ready
    return this;
  }

  /** Compute the roll-up stats as-needed, and copy into the Vec object */
  public Vec rollupStats() { return rollupStats(null); }

  // Allow a bunch of rollups to run in parallel.  If Futures is passed in, run
  // the rollup in the background.  *Always* returns "this".
  public Vec rollupStats(Futures fs) {
    Vec vthis = DKV.get(_key).get();    // Freshest copy from the K/V store
    if( vthis._naCnt==-2 )
      throw new IllegalArgumentException("Cannot ask for roll-up stats while the vector is being actively written.");
    if( vthis._naCnt>= 0 )      // KV store has a better answer
      return vthis == this ? this : setRollupStats(vthis);
    // KV store reports we need to recompute
    RollupStats rs = new RollupStats().dfork(this);
    if(fs != null) fs.add(rs);  // Background mode: caller will block later
    else setRollupStats(rs.getResult());
    return this;
  }

  /** A private class to compute the rollup stats */
  private static class RollupStats extends MRTask2<RollupStats> {
    double _min=Double.MAX_VALUE, _max=-Double.MAX_VALUE, _mean, _sigma;
    long _rows, _naCnt, _size;
    boolean _isInt=true;
    @Override public void postGlobal(){
      final RollupStats rs = this;
      _fr.vecs()[0].setRollupStats(rs); // Install locally...
      // Now do this remotely also
      new TAtomic<Vec>() {
        @Override public Vec atomic(Vec v) {
          if( v!=null && v._naCnt == -1 ) v.setRollupStats(rs);
          return v;
        }
      }.fork(_fr._keys[0]);
    }
    @Override public void map( Chunk c ) {
      _size = c.byteSize();
      // UUID columns do not compute min/max/mean/sigma
      if( c._vec._isUUID ) {
        _min = _max = _mean = _sigma = Double.NaN;
        for( int i=0; i<c._len; i++ ) {
          if( c.isNA0(i) ) _naCnt++; else _rows++;
        }
        return;
      }
      // All other columns have useful rollups
      for( int i=0; i<c._len; i++ ) {
        double d = c.at0(i);
        if( Double.isNaN(d) ) _naCnt++;
        else {
          if( d < _min ) _min = d;
          if( d > _max ) _max = d;
          _mean += d;
          _rows++;
          if( _isInt && ((long)d) != d ) _isInt = false;
        }
      }
      _mean = _mean / _rows;    // Chunk-local mean; merged properly in reduce()
      // Second pass: sum of squared deviations about the chunk-local mean
      for( int i=0; i<c._len; i++ ) {
        if( !c.isNA0(i) ) {
          double d = c.at0(i);
          _sigma += (d - _mean) * (d - _mean);
        }
      }
    }
    @Override public void reduce( RollupStats rs ) {
      _min = Math.min(_min,rs._min);
      _max = Math.max(_max,rs._max);
      _naCnt += rs._naCnt;
      // Pairwise merge of mean and sum-of-squares (parallel variance formula)
      double delta = _mean - rs._mean;
      if (_rows == 0) { _mean = rs._mean;  _sigma = rs._sigma; }
      else if (rs._rows > 0) {
        _mean = (_mean*_rows + rs._mean*rs._rows)/(_rows + rs._rows);
        _sigma = _sigma + rs._sigma + delta*delta * _rows*rs._rows / (_rows+rs._rows);
      }
      _rows += rs._rows;
      _size += rs._size;
      _isInt &= rs._isInt;
    }
    // Just toooo common to report always.  Drowning in multi-megabyte log file writes.
    @Override public boolean logVerbose() { return false; }
  } // class RollupStats

  /** A private class to compute a deterministic, order-independent checksum
   *  over all rows (XOR folds of row number and value). */
  private static class ChecksummerTask extends MRTask2<ChecksummerTask> {
    public long checksum = 0;
    public long getChecksum() { return checksum; }
    @Override public void map( Chunk c ) {
      long _start = c._start;
      for( int i=0; i<c._len; i++ ) {
        long l = 81985529216486895L; // 0x0123456789ABCDEF - sentinel for NAs
        if (! c.isNA0(i)) {
          if (c instanceof C16Chunk) {
            // Fold both 64-bit halves of a UUID
            l = c.at16l0(i);
            l ^= (37 * c.at16h0(i));
          } else {
            l = c.at80(i);
          }
        }
        long global_row = _start + i;
        checksum ^= (17 * global_row);
        checksum ^= (23 * l);
      }
    } // map()
    @Override public void reduce( ChecksummerTask that ) { this.checksum ^= that.checksum; }
  } // class ChecksummerTask

  /** Writing into this Vector from *some* chunk.  Immediately clear all caches
   *  (_min, _max, _mean, etc).  Can be called repeatedly from one or all
   *  chunks.  Per-chunk row-counts will not be changing, just row contents and
   *  caches of row contents. */
  public void preWriting( ) {
    if( _naCnt == -2 ) return;  // Already set
    _naCnt = -2;                // Flag: active writers, rollups invalid
    if( !writable() ) throw new IllegalArgumentException("Vector not writable");
    // Set remotely lazily.  This will trigger a cloud-wide invalidate of the
    // existing Vec, and eventually we'll have to load a fresh copy of the Vec
    // with active writing turned on, and caching disabled.
    new TAtomic<Vec>() {
      @Override public Vec atomic(Vec v) { if( v!=null ) v._naCnt=-2; return v; }
    }.invoke(_key);
  }

  /** Stop writing into this Vec.  Rollup stats will again (lazily) be
   *  computed. */
  public void postWrite() {
    Vec vthis = DKV.get(_key).get();
    if( vthis._naCnt==-2 ) {
      _naCnt = vthis._naCnt=-1; // Writers done; rollups stale but computable
      new TAtomic<Vec>() {
        @Override public Vec atomic(Vec v) {
          if( v != null ) {
            v._last_write_timestamp = System.currentTimeMillis();
            if (v._naCnt==-2 ) { v._naCnt=-1; }
          } // ! null
          return v;
        }
      }.invoke(_key);
    }
  }

  /** Convert a row# to a chunk#.  For constant-sized chunks this is a little
   *  shift-and-add math.
   *  For variable-sized chunks this is a binary search,
   *  with a sane API (JDK has an insane API).  Overridden by subclasses that
   *  compute chunks in an alternative way, such as file-backed Vecs. */
  public int elem2ChunkIdx(long i) {
    assert 0 <= i && i < length() : "0 <= "+i+" < "+length();
    int lo=0, hi = nChunks();
    while( lo < hi-1 ) {        // Binary search over _espc for the chunk holding row i
      int mid = (hi+lo)>>>1;
      if( i < _espc[mid] ) hi = mid;
      else                 lo = mid;
    }
    while( _espc[lo+1] == i ) lo++; // Skip over empty chunks
    return lo;
  }

  /** Convert a chunk-index into a starting row #.  For constant-sized chunks
   *  this is a little shift-and-add math.  For variable-sized chunks this is
   *  a table lookup. */
  public long chunk2StartElem( int cidx ) { return _espc[cidx]; }

  /** Number of rows in chunk. Does not fetch chunk content. */
  public int chunkLen( int cidx ) { return (int) (_espc[cidx + 1] - _espc[cidx]); }

  /** Get a Vec Key from Chunk Key, without loading the Chunk */
  static public Key getVecKey( Key key ) {
    assert key._kb[0]==Key.DVEC;
    byte [] bits = key._kb.clone();
    bits[0] = Key.VEC;
    UDP.set4(bits,6,-1); // chunk#
    return Key.make(bits);
  }

  /** Get a Chunk Key from a chunk-index.  Basically the index-to-key map. */
  public Key chunkKey(int cidx ) { return chunkKey(_key,cidx); }
  static public Key chunkKey(Key veckey, int cidx ) {
    byte [] bits = veckey._kb.clone();
    bits[0] = Key.DVEC;
    UDP.set4(bits,6,cidx); // chunk#
    return Key.make(bits);
  }

  /** Get a Chunk's Value by index.  Basically the index-to-key map,
   *  plus the {@code DKV.get()}.  Warning: this pulls the data locally;
   *  using this call on every Chunk index on the same node will
   *  probably trigger an OOM!  */
  public Value chunkIdx( int cidx ) {
    Value val = DKV.get(chunkKey(cidx));
    assert checkMissing(cidx,val);
    return val;
  }

  protected boolean checkMissing(int cidx, Value val) {
    if( val != null ) return true;
    System.out.println("Error: Missing chunk "+cidx+" for "+_key);
    return false;
  }

  /** Make a new random Key that fits the requirements for a Vec key.
   */
  static public Key newKey(){return newKey(Key.make());}

  // Layout: VEC byte, home byte, group-index int, chunk# int
  public static final int KEY_PREFIX_LEN = 4+4+1+1;

  /** Make a new Key that fits the requirements for a Vec key, based on the
   *  passed-in key.  Used to make Vecs that back over e.g. disk files. */
  static Key newKey(Key k) {
    byte [] kb = k._kb;
    byte [] bits = MemoryManager.malloc1(kb.length+KEY_PREFIX_LEN);
    bits[0] = Key.VEC;
    bits[1] = -1;               // Not homed
    UDP.set4(bits,2,0);         // new group, so we're the first vector
    UDP.set4(bits,6,-1);        // 0xFFFFFFFF in the chunk# area
    System.arraycopy(kb, 0, bits, 4+4+1+1, kb.length);
    return Key.make(bits);
  }

  /** Make a Vector-group key.  */
  public Key groupKey(){
    byte [] bits = _key._kb.clone();
    bits[0] = Key.VGROUP;
    UDP.set4(bits, 2, -1);
    UDP.set4(bits, 6, -1);
    return Key.make(bits);
  }

  /**
   * Get the group this vector belongs to.
   * In case of a group with only one vector, the object actually does not
   * exist in KV store.
   * @return VectorGroup this vector belongs to.
   */
  public final VectorGroup group() {
    Key gKey = groupKey();
    Value v = DKV.get(gKey);
    if(v != null)return v.get(VectorGroup.class);
    // no group exists so we have to create one
    return new VectorGroup(gKey,1);
  }

  /** The Chunk for a chunk#.  Warning: this loads the data locally!  */
  public Chunk chunkForChunkIdx(int cidx) {
    long start = chunk2StartElem(cidx); // Chunk# to chunk starting element#
    Value dvec = chunkIdx(cidx);        // Chunk# to chunk data
    Chunk c = dvec.get();               // Chunk data to compression wrapper
    long cstart = c._start;             // Read once, since racily filled in
    Vec v = c._vec;
    if( cstart == start && v != null) return c;   // Already filled-in
    assert cstart == -1 || v == null;   // Was not filled in (everybody racily writes the same start value)
    c._vec = this;                      // Fields not filled in by unpacking from Value
    c._start = start;                   // Fields not filled in by unpacking from Value
    return c;
  }

  /** The Chunk for a row#.  Warning: this loads the data locally!
   */
  private Chunk chunkForRow_impl(long i) { return chunkForChunkIdx(elem2ChunkIdx(i)); }

  // Cache of last Chunk accessed via at/set api
  transient Chunk _cache;

  /** The Chunk for a row#.  Warning: this loads the data locally!  */
  public final Chunk chunkForRow(long i) {
    Chunk c = _cache;
    // Hit only if the cached chunk is not being written (_chk2==null) and
    // covers row i; otherwise do the binary search and refill the cache.
    return (c != null && c._chk2==null && c._start <= i && i < c._start+c._len) ? c : (_cache = chunkForRow_impl(i));
  }

  /** Fetch element the slow way, as a long.  Floating point values are
   *  silently rounded to an integer.  Throws if the value is missing. */
  public final long  at8( long i ) { return chunkForRow(i).at8(i); }
  /** Fetch element the slow way, as a double.  Missing values are
   *  returned as Double.NaN instead of throwing. */
  public final double at( long i ) { return chunkForRow(i).at(i); }
  /** Fetch the missing-status the slow way. */
  public final boolean isNA(long row){ return chunkForRow(row).isNA(row); }

  /** Fetch element the slow way, as a long.  Throws if the value is missing
   *  or not a UUID. */
  public final long  at16l( long i ) { return chunkForRow(i).at16l(i); }
  public final long  at16h( long i ) { return chunkForRow(i).at16h(i); }

  /** Write element the VERY slow way, as a long.  There is no way to write a
   *  missing value with this call.  Under rare circumstances this can throw:
   *  if the long does not fit in a double (value is larger magnitude than
   *  2^52), AND float values are stored in Vector.  In this case, there is no
   *  common compatible data representation.
   *
   *  NOTE: For a faster way, but still slow, use the Vec.Writer below.
   * */
  public final long set( long i, long l) {
    Chunk ck = chunkForRow(i);
    long ret = ck.set(i,l);
    Futures fs = new Futures();
    ck.close(ck.cidx(), fs); //slow to do this for every set -> use Writer if writing many values
    fs.blockForPending();
    postWrite();
    return ret;
  }

  /** Write element the VERY slow way, as a double.  Double.NaN will be treated as
   *  a set of a missing element.
   * */
  public final double set( long i, double d) {
    Chunk ck = chunkForRow(i);
    double ret = ck.set(i,d);
    Futures fs = new Futures();
    ck.close(ck.cidx(), fs); //slow to do this for every set -> use Writer if writing many values
    fs.blockForPending();
    postWrite();
    return ret;
  }

  /** Write element the VERY slow way, as a float.  Float.NaN will be treated as
   *  a set of a missing element.
   * */
  public final float set( long i, float f) {
    Chunk ck = chunkForRow(i);
    float ret = ck.set(i, f);
    Futures fs = new Futures();
    ck.close(ck.cidx(), fs); //slow to do this for every set -> use Writer if writing many values
    fs.blockForPending();
    postWrite();
    return ret;
  }

  /** Set the element as missing the VERY slow way.  */
  public final boolean setNA( long i ) {
    Chunk ck = chunkForRow(i);
    boolean ret = ck.setNA(i);
    Futures fs = new Futures();
    ck.close(ck.cidx(), fs); //slow to do this for every set -> use Writer if writing many values
    fs.blockForPending();
    postWrite();
    return ret;
  }

  /**
   * More efficient way to write randomly to a Vec - still slow, but much
   * faster than Vec.set().  Chunks are closed once, at close(), instead of
   * after every single write.
   *
   * Usage:
   * Vec.Writer vw = vec.open();
   * vw.set(0, 3.32);
   * vw.set(1, 4.32);
   * vw.set(2, 5.32);
   * vw.close();
   */
  public final static class Writer {
    Vec _vec;
    private Writer(Vec v){ _vec=v; _vec.preWriting(); }
    public final long    set( long i, long   l) { return _vec.chunkForRow(i).set(i,l); }
    public final double  set( long i, double d) { return _vec.chunkForRow(i).set(i,d); }
    public final float   set( long i, float  f) { return _vec.chunkForRow(i).set(i,f); }
    public final boolean setNA( long i        ) { return _vec.chunkForRow(i).setNA(i); }
    public void close() {
      Futures fs = new Futures();
      _vec.close(fs);           // Flush every locally-cached chunk
      fs.blockForPending();
      _vec.postWrite();         // Re-enable rollups
    }
  }
  public final Writer open() { return new Writer(this); }

  /** Close all chunks that are local (not just the ones that are homed).
   *  This should only be called from a Writer object. */
  private void close(Futures fs) {
    int nc = nChunks();
    for( int i=0; i<nc; i++ ) {
      if (H2O.get(chunkKey(i)) != null) { // Only chunks cached on this node
        chunkForChunkIdx(i).close(i, fs);
      }
    }
  }

  /** Pretty print the Vec: [#elems, min/mean/max]{chunks,...} */
  @Override public String toString() {
    String s = "["+length()+(_naCnt<0 ? ", {" : ","+_min+"/"+_mean+"/"+_max+", "+PrettyPrint.bytes(_size)+", {");
    int nc = nChunks();
    for( int i=0; i<nc; i++ ) {
      s += chunkKey(i).home_node()+":"+chunk2StartElem(i)+":";
      // CNC: Bad plan to load remote data during a toString... messes up debug printing
      // Stupidly chunkForChunkIdx loads all data locally
      // s += chunkForChunkIdx(i).getClass().getSimpleName().replaceAll("Chunk","")+", ";
    }
    return s+"}]";
  }

  /** Remove this Vec and all its chunks from the K/V store. */
  public Futures remove( Futures fs ) {
    for( int i=0; i<nChunks(); i++ )
      UKV.remove(chunkKey(i),fs);
    DKV.remove(_key,fs);
    return fs;
  }

  @Override public boolean equals( Object o ) { return o instanceof Vec && ((Vec)o)._key.equals(_key); }
  @Override public int hashCode() { return _key.hashCode(); }

  /** Always makes a copy of the given vector which shares the same
   * group.
   *
   * The user is responsible for deleting the returned vector.
   *
   * This can be expensive operation since it can force copy of data
   * among nodes.
   *
   * @param vec vector which is intended to be copied
   * @return a copy of vec which shared the same {@link VectorGroup} with this vector
   */
  public Vec align(final Vec vec) {
    assert ! this.group().equals(vec.group()) : "Vector align expects a vector from different vector group";
    assert this.length()== vec.length() : "Trying to align vectors with different length!";
    Vec avec = makeZero();      // aligned vector
    new MRTask2() {
      @Override public void map(Chunk c0) {
        long srow = c0._start;
        // Copy row-by-row; vec.at() may fetch remote chunks (the expense noted above)
        for (int r = 0; r < c0._len; r++)
          c0.set0(r, vec.at(srow + r));
      }
    }.doAll(avec);
    avec._domain = _domain;
    return avec;
  }

  /**
   * Class representing the group of vectors.
   *
   * Vectors from the same group have same distribution of chunks among nodes.
   * Each vector is member of exactly one group.  Default group of one vector
   * is created for each vector.
   * Group of each vector can be retrieved by calling group() method;
   *
   * The expected mode of operation is that user wants to add new vectors
   * matching the source.  E.g. parse creates several vectors (one for each
   * column) which are all colocated and are colocated with the original
   * bytevector.
   *
   * To do this, user should first ask for the set of keys for the new vectors
   * by calling addVecs method on the target group.
   *
   * Vectors in the group will have the same keys except for the prefix which
   * specifies index of the vector inside the group.  The only information the
   * group object carries is it's own key and the number of vectors it
   * contains (deleted vectors still count).
   *
   * Because vectors (and chunks) share the same key-pattern with the group,
   * default group with only one vector does not have to be actually created,
   * it is implicit.
   *
   * @author tomasnykodym
   */
  public static class VectorGroup extends Iced {
    public static VectorGroup newVectorGroup(){
      return new Vec(Vec.newKey(),(long[])null).group();
    }
    // The common shared vector group for length==1 vectors
    public static VectorGroup VG_LEN1 = new VectorGroup();
    final int _len;             // Number of vector ids handed out so far
    final Key _key;
    private VectorGroup(Key key, int len){_key = key;_len = len;}
    public VectorGroup() {
      // Fresh random group key: VGROUP tag + sentinel ints + random UUID bits
      byte[] bits = new byte[26];
      bits[0] = Key.VGROUP;
      bits[1] = -1;
      UDP.set4(bits, 2, -1);
      UDP.set4(bits, 6, -1);
      UUID uu = UUID.randomUUID();
      UDP.set8(bits,10,uu.getLeastSignificantBits());
      UDP.set8(bits,18,uu.getMostSignificantBits());
      _key = Key.make(bits);
      _len = 0;
    }

    /** Key of the vector with index {@code vecId} inside this group. */
    public Key vecKey(int vecId){
      byte [] bits = _key._kb.clone();
      bits[0] = Key.VEC;
      UDP.set4(bits,2,vecId);
      return Key.make(bits);
    }

    /**
     * Task to atomically add vectors into existing group.
     * @author tomasnykodym
     */
    private static class AddVecs2GroupTsk extends TAtomic<VectorGroup>{
      final Key _key;
      int _n;                   // INPUT: Keys to allocate; OUTPUT: start of run of keys
      private AddVecs2GroupTsk(Key key, int n){_key = key; _n = n;}
      @Override public VectorGroup atomic(VectorGroup old) {
        int n = _n;             // how many
        // If the old group is missing, assume it is the default group-of-self
        // (having 1 ID already allocated for self), not a new group with
        // zero prior vectors.
        _n = old==null ? 1 : old._len; // start of allocated key run
        return new VectorGroup(_key, n+_n);
      }
    }

    /**
     * Task to atomically roll the group counter back to {@code _newCnt}, but
     * only if nobody else allocated in the meantime (counter still _oldCnt).
     * NOTE(review): the ctor's {@code key} parameter is unused; atomic() reads
     * the {@code _key} inherited from the TAtomic/Atomic base — TODO confirm.
     * @author tomasnykodym
     */
    private static class ReturnKeysTsk extends TAtomic<VectorGroup>{
      final int _newCnt;        // INPUT: Keys to allocate; OUTPUT: start of run of keys
      final int _oldCnt;
      private ReturnKeysTsk(Key key, int oldCnt, int newCnt){_newCnt = newCnt; _oldCnt = oldCnt;}
      @Override public VectorGroup atomic(VectorGroup old) {
        return (old._len == _oldCnt)? new VectorGroup(_key, _newCnt):old;
      }
    }
    public Future tryReturnKeys(final int oldCnt, int newCnt) { return new ReturnKeysTsk(_key,oldCnt,newCnt).fork(_key);}

    // reserve range of keys and return index of first new available key
    public int reserveKeys(final int n){
      AddVecs2GroupTsk tsk = new AddVecs2GroupTsk(_key, n);
      tsk.invoke(_key);
      return tsk._n;
    }

    /**
     * Gets the next n keys of this group.
     * Performs atomic update of the group object to assure we get unique keys.
     * The group size will be updated by adding n.
     *
     * @param n number of keys to make
     * @return arrays of unique keys belonging to this group.
     */
    public Key [] addVecs(final int n){
      AddVecs2GroupTsk tsk = new AddVecs2GroupTsk(_key, n);
      tsk.invoke(_key);         // tsk._n now holds the start of our reserved run
      Key [] res = new Key[n];
      for(int i = 0; i < n; ++i)
        res[i] = vecKey(i + tsk._n);
      return res;
    }

    /**
     * Shortcut for addVecs(1).
* @see #addVecs(int) */
    public Key addVec() { return addVecs(1)[0]; }

    @Override public String toString() { return "VecGrp "+_key.toString()+", next free="+_len; }
    /** VectorGroups are equal iff they have the same key. */
    @Override public boolean equals( Object o ) { return o instanceof VectorGroup && ((VectorGroup)o)._key.equals(_key); }
    @Override public int hashCode() { return _key.hashCode(); }
  }

  /**
   * Method to change the domain of the Vec.
   *
   * Can only be applied to factors (Vec with non-null domain) and the domain
   * can only be set to a domain of the same or greater length.
   *
   * Updating the domain requires updating the Vec header in the K/V and,
   * since chunks cache Vec header references, we need to execute a
   * distributed task to flush (null) those references.
   *
   * @param newDomain replacement domain; must be at least as long as the current one
   */
  public void changeDomain(String [] newDomain){
    if(_domain == null)throw new RuntimeException("Setting a domain to a non-factor Vector, call as.Factor() instead.");
    if(newDomain == null)throw new RuntimeException("Can not set domain to null. You have to convert the vec to numbers explicitly");
    if(newDomain.length < _domain.length)
      // NOTE: this message was split across two physical lines (an invalid,
      // broken string literal) by the file's line-collapse; rejoined here.
      throw new RuntimeException("Setting domain to incompatible size. New domain must be at least the same length!");
    _domain = newDomain;
    // update the vec header in the K/V
    DKV.put(_key,this);
    // now flush the cached vec header references (still pointing to the old guy)
    new MRTask2(){ @Override public void map(Chunk c){c._vec = null;} }.doAll(this);
  }

  /** Collect the exact numeric domain (set of distinct long values) of the
   *  given vector, via a distributed pass over all of its chunks. */
  public static class CollectDomain extends MRTask2<CollectDomain> {
    // Distinct values seen so far; the "" payload is a dummy (set semantics).
    transient NonBlockingHashMapLong<Object> _uniques;
    @Override protected void setupLocal() { _uniques = new NonBlockingHashMapLong<Object>(); } // was a raw type
    public CollectDomain(Vec v) { }
    @Override public void map(Chunk ys) {
      for( int row=0; row<ys._len; row++ )
        if( !ys.isNA0(row) )
          _uniques.put(ys.at80(row),"");
    }
    @Override public void reduce(CollectDomain mrt) {
      if( _uniques == mrt._uniques ) return; // same-node reduce; nothing to merge
      _uniques.putAll(mrt._uniques);
    }
    // Serialize only the key-set; the dummy values are not worth shipping.
    @Override public AutoBuffer write( AutoBuffer ab ) {
      super.write(ab);
      return ab.putA8(_uniques==null ? null : _uniques.keySetLong());
    }
    @Override public Freezable read( AutoBuffer ab ) {
      super.read(ab);
      assert _uniques == null || _uniques.size()==0;
      long ls[] = ab.getA8();
      _uniques = new NonBlockingHashMapLong<Object>(); // was a raw type
      if( ls != null )
        for( long l : ls )
          _uniques.put(l,"");
      return this;
    }
    @Override public void copyOver(Freezable that) {
      super.copyOver(that);
      _uniques = ((CollectDomain)that)._uniques;
    }
    /** Returns exact numeric domain of given vector computed by this task.
     *  The domain is always sorted.  Hence:
     *    domain()[0]                  - minimal domain value
     *    domain()[domain().length-1]  - maximal domain value */
    public long[] domain() {
      long[] dom = _uniques.keySetLong();
      Arrays.sort(dom);
      return dom;
    }
  }
}