package water.fvec;

import water.*;
import water.util.Utils;

import java.util.Arrays;

/**
 * A NEW single distributed vector column.
 *
 * The NEW vector has no data, and takes no space.  It supports distributed
 * parallel writes to it, via calls to append2.  Such writes happen in parallel
 * and all writes are ordered.  Writes *will* be local to the node doing them,
 * specifically to allow control over locality.  By default, writes will go
 * local-homed chunks with no compression; there is a final 'close' to the NEW
 * vector which may do compression; the final 'close' will return some other
 * Vec type.  NEW Vectors do NOT support reads!
 */
public class AppendableVec extends Vec {
  // Elements-per-chunk: slot i holds the row count of chunk (i + _chunkOff).
  // Grown on demand by closeChunk(); trailing zero slots are trimmed in close().
  long [] _espc;

  // Bit-flags describing what kinds of values a chunk held; OR'd together in
  // _chunkTypes below and inspected in close() to pick the final column type.
  public static final byte NA = 1;
  public static final byte ENUM = 2;
  public static final byte NUMBER = 4;
  public static final byte TIME = 8;
  public static final byte UUID =16;

  // Per-chunk OR of the type flags above, parallel to _espc.
  byte [] _chunkTypes;
  // Running roll-up counts across all closed chunks.
  long _naCnt;                  // Count of missing values seen
  long _strCnt;                 // Count of string/enum tokens seen
  // Per-time-format hit counts; one slot per known time-parse pattern.
  final long _timCnt[] = new long[ParseTime.TIME_PARSE.length];
  long _totalCnt;               // Total rows appended so far

  // First chunk index this AppendableVec writes; non-zero when this AV covers
  // a sub-range of a larger vec (see setSubRange, used by multi-file parse).
  int _chunkOff;

  /** Make a NEW vec with a small default chunk-count guess and no offset. */
  public AppendableVec( Key key){ this(key, new long[4],0); }

  /**
   * Make a NEW vec.
   * @param key      key the final Vec will live under
   * @param espc     initial elements-per-chunk array (grown as needed)
   * @param chunkOff index of the first chunk this AV is responsible for
   */
  public AppendableVec( Key key, long [] espc, int chunkOff) {
    super(key, (long[])null);   // No real row layout yet; computed at close()
    _espc = espc;
    _chunkTypes = MemoryManager.malloc1(espc.length);
    _chunkOff = chunkOff;
  }

  // A NewVector chunk was "closed" - completed.  Add it's info to the roll-up.
  // This call is made in parallel across all node-local created chunks, but is
  // not called distributed.  Synchronized because local chunk-writers race on
  // the shared arrays and counters.
  synchronized void closeChunk( NewChunk chk) {
    final int cidx = chk._cidx - _chunkOff;  // Translate to local slot index
    if( cidx >= _espc.length ) {
      // Grow geometrically, but always far enough to hold this chunk.
      int newlen = Math.max(_espc.length * 2, cidx + 1);
      _espc = Arrays.copyOf(_espc,newlen);
    }
    // Keep the type array at least as long as the espc array.
    if(_chunkTypes.length < _espc.length)
      _chunkTypes = Arrays.copyOf(_chunkTypes,_espc.length);
    _espc[cidx] = chk._len;         // Record this chunk's row count
    _chunkTypes[cidx] = chk.type(); // ... and its observed value kinds
    // Fold the chunk's counters into the vec-wide roll-ups.
    _naCnt += chk._naCnt;
    _strCnt += chk._strCnt;
    for( int i=0; i<_timCnt.length; i++ ) _timCnt[i] += chk._timCnt[i];
    _totalCnt += chk._len;
  }

  // What kind of data did we find?  NA's?  Strings-only?  Floats or Ints?
boolean shouldBeEnum() { // We declare column to be string/enum only if it does not have ANY numbers in it. return _strCnt > 0 && (_strCnt + _naCnt) == _totalCnt; } /** * Add AV build over sub-range of this vec (used e.g. by multifile parse where each file produces its own AV which represents sub-range of the resulting vec) * @param av */ public void setSubRange(AppendableVec av) { assert _key.equals(av._key):"mismatched keys " + _key + ", " + av._key; System.arraycopy(av._espc, 0, _espc, av._chunkOff, av._espc.length); System.arraycopy(av._chunkTypes, 0, _chunkTypes, av._chunkOff, av._chunkTypes.length); _strCnt += av._strCnt; _naCnt += av._naCnt; Utils.add(_timCnt, av._timCnt); _totalCnt += av._totalCnt; } public static Vec[] closeAll(AppendableVec [] avs) { Futures fs = new Futures(); Vec [] res = closeAll(avs,fs); fs.blockForPending(); return res; } public static Vec[] closeAll(AppendableVec [] avs, Futures fs) { Vec [] res = new Vec[avs.length]; for(int i = 0; i < avs.length; ++i) res[i] = avs[i].close(fs); return res; } // Class 'reduce' call on new vectors; to combine the roll-up info. // Called single-threaded from the M/R framework. public void reduce( AppendableVec nv ) { if( this == nv ) return; // Trivially done // Combine arrays of elements-per-chunk long e1[] = nv._espc; // Shorter array of longs? byte t1[] = nv._chunkTypes; if( e1.length > _espc.length ) { e1 = _espc; // Keep the shorter one in e1 t1 = _chunkTypes; _espc = nv._espc; // Keep longer in the object _chunkTypes = nv._chunkTypes; } for( int i=0; i<e1.length; i++ ){ // Copy non-zero elements over assert _chunkTypes[i] == 0 || t1[i] == 0; if( e1[i] != 0 && _espc[i]==0 ) _espc[i] = e1[i]; _chunkTypes[i] |= t1[i]; } _naCnt += nv._naCnt; _strCnt += nv._strCnt; Utils.add(_timCnt,nv._timCnt); _totalCnt += nv._totalCnt; } // "Close" out a NEW vector - rewrite it to a plain Vec that supports random // reads, plus computes rows-per-chunk, min/max/mean, etc. 
  public Vec close(Futures fs) {
    // Compute #chunks: trim trailing empty chunks, removing their DKV keys.
    int nchunk = _espc.length;
    DKV.remove(chunkKey(nchunk),fs); // remove potential trailing key
    while( nchunk > 0 && _espc[nchunk-1] == 0 ) {
      nchunk--;
      DKV.remove(chunkKey(nchunk),fs); // remove potential trailing key
    }
    // Scan chunk types to see what value kinds appeared anywhere in the column.
    boolean hasNumber = false, hasEnum = false, hasTime=false, hasUUID=false;
    for( int i = 0; i < nchunk; ++i ) {
      if( (_chunkTypes[i] & TIME  ) != 0 ) { hasNumber = true; hasTime=true; } // times are numbers too
      if( (_chunkTypes[i] & NUMBER) != 0 ) hasNumber = true;
      if( (_chunkTypes[i] & ENUM  ) != 0 ) hasEnum   = true;
      if( (_chunkTypes[i] & UUID  ) != 0 ) hasUUID   = true;
    }
    // number wins, we need to go through the enum chunks and declare them all
    // NAs (chunk is considered enum iff it has only enums + possibly some nas)
    if( hasNumber && hasEnum ) {
      for(int i = 0; i < nchunk; ++i)
        if(_chunkTypes[i] == ENUM)
          DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs); // all-NA constant chunk
    }
    // UUID wins over enum & number: NA-out every chunk that held no UUIDs.
    if( hasUUID && (hasEnum || hasNumber) ) {
      hasEnum=hasNumber=false;
      for(int i = 0; i < nchunk; ++i)
        if((_chunkTypes[i] & UUID)==0)
          DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs);
    }
    // Make sure time is consistent: t ends as the single common time-parse
    // index, or stays negative (-1 no time, -2 inconsistent formats).
    int t = -1;
    if( hasTime ) {
      // Find common time parse, and all zeros - or inconsistent time parse
      for( int i=0; i<_timCnt.length; i++ )
        if( _timCnt[i] != 0 )
          if( t== -1 ) t=i;     // common time parse
          else t = -2;          // inconsistent parse
      if( t < 0 )               // blow off time parse: NA-out time-only chunks
        for(int i = 0; i < nchunk; ++i)
          if(_chunkTypes[i] == TIME)
            DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs);
    }
    // A time column must not also carry an enum domain.
    assert t<0 || _domain == null;
    // Compute elems-per-chunk.
    // Roll-up elem counts, so espc[i] is the starting element# of chunk i.
    long espc[] = new long[nchunk+1]; // Shorter array
    long x=0;                         // Total row count so far
    for( int i=0; i<nchunk; i++ ) {
      espc[i] = x;                    // Start elem# for chunk i
      x += _espc[i];                  // Raise total elem count
    }
    espc[nchunk]=x;                   // Total element count in last
    // Replacement plain Vec for AppendableVec.
    Vec vec = new Vec(_key, espc, _domain, hasUUID, (byte)t);
    DKV.put(_key,vec,fs);             // Inject the header
    return vec;
  }

  // Default read/write behavior for AppendableVecs: write-only while building.
  @Override public boolean readable() { return false; }
  @Override public boolean writable() { return true ; }
  // Writers get a fresh NewChunk to append into.
  @Override public Chunk chunkForChunkIdx(int cidx) { return new NewChunk(this,cidx); }

  // None of these are supposed to be called while building the new vector
  @Override public Value chunkIdx( int cidx ) { throw H2O.fail(); }
  @Override public long length() { throw H2O.fail(); }
  @Override public int nChunks() { throw H2O.fail(); }
  @Override public int elem2ChunkIdx(long i) { throw H2O.fail(); }
  @Override public long chunk2StartElem( int cidx ) { throw H2O.fail(); }
  @Override public long byteSize() { return 0; } // Takes no space until closed
  @Override public String toString() { return "[AppendableVec, unknown size]"; }
}