package water.fvec;

import water.*;
import water.util.Utils;

import java.util.Arrays;

/**
 * A NEW single distributed vector column.
 *
 * The NEW vector has no data, and takes no space.  It supports distributed
 * parallel writes to it, via calls to append2.  Such writes happen in parallel
 * and all writes are ordered.  Writes *will* be local to the node doing them,
 * specifically to allow control over locality.  By default, writes will go
 * local-homed chunks with no compression; there is a final 'close' to the NEW
 * vector which may do compression; the final 'close' will return some other
 * Vec type.  NEW Vectors do NOT support reads!
 */
public class AppendableVec extends Vec {
  // Elements-per-chunk: slot i holds the row count of chunk (i + _chunkOff).
  // Grown on demand by closeChunk(); trailing zero slots are trimmed in close().
  long [] _espc;

  // Bit-flags describing what kinds of values a chunk held; OR'd together in
  // _chunkTypes below and inspected in close() to pick the final column type.
  public static final byte NA = 1;
  public static final byte ENUM = 2;
  public static final byte NUMBER = 4;
  public static final byte TIME = 8;
  public static final byte UUID =16;

  // Per-chunk OR of the type flags above, parallel to _espc.
  byte [] _chunkTypes;
  // Running roll-up counts across all closed chunks.
  long _naCnt;                  // Count of missing values seen
  long _strCnt;                 // Count of string/enum tokens seen
  // Per-time-format hit counts; one slot per known time-parse pattern.
  final long _timCnt[] = new long[ParseTime.TIME_PARSE.length];
  long _totalCnt;               // Total rows appended so far

  // First chunk index this AppendableVec writes; non-zero when this AV covers
  // a sub-range of a larger vec (see setSubRange, used by multi-file parse).
  int _chunkOff;

  /** Make a NEW vec with a small default chunk-count guess and no offset. */
  public AppendableVec( Key key){ this(key, new long[4],0); }

  /**
   * Make a NEW vec.
   * @param key      key the final Vec will live under
   * @param espc     initial elements-per-chunk array (grown as needed)
   * @param chunkOff index of the first chunk this AV is responsible for
   */
  public AppendableVec( Key key, long [] espc, int chunkOff) {
    super(key, (long[])null);   // No real row layout yet; computed at close()
    _espc = espc;
    _chunkTypes = MemoryManager.malloc1(espc.length);
    _chunkOff = chunkOff;
  }

  // A NewVector chunk was "closed" - completed.  Add it's info to the roll-up.
  // This call is made in parallel across all node-local created chunks, but is
  // not called distributed.  Synchronized because local chunk-writers race on
  // the shared arrays and counters.
  synchronized void closeChunk( NewChunk chk) {
    final int cidx = chk._cidx - _chunkOff;  // Translate to local slot index
    if( cidx >= _espc.length ) {
      // Grow geometrically, but always far enough to hold this chunk.
      int newlen = Math.max(_espc.length * 2, cidx + 1);
      _espc = Arrays.copyOf(_espc,newlen);
    }
    // Keep the type array at least as long as the espc array.
    if(_chunkTypes.length < _espc.length)
      _chunkTypes = Arrays.copyOf(_chunkTypes,_espc.length);
    _espc[cidx] = chk._len;         // Record this chunk's row count
    _chunkTypes[cidx] = chk.type(); // ... and its observed value kinds
    // Fold the chunk's counters into the vec-wide roll-ups.
    _naCnt += chk._naCnt;
    _strCnt += chk._strCnt;
    for( int i=0; i<_timCnt.length; i++ ) _timCnt[i] += chk._timCnt[i];
    _totalCnt += chk._len;
  }

  // What kind of data did we find?  NA's?  Strings-only?  Floats or Ints?
boolean shouldBeEnum() { // We declare column to be string/enum only if it does not have ANY numbers in it. return _strCnt > 0 && (_strCnt + _naCnt) == _totalCnt; } /** * Add AV build over sub-range of this vec (used e.g. by multifile parse where each file produces its own AV which represents sub-range of the resulting vec) * @param av */ public void setSubRange(AppendableVec av) { assert _key.equals(av._key):"mismatched keys " + _key + ", " + av._key; System.arraycopy(av._espc, 0, _espc, av._chunkOff, av._espc.length); System.arraycopy(av._chunkTypes, 0, _chunkTypes, av._chunkOff, av._chunkTypes.length); _strCnt += av._strCnt; _naCnt += av._naCnt; Utils.add(_timCnt, av._timCnt); _totalCnt += av._totalCnt; } public static Vec[] closeAll(AppendableVec [] avs) { Futures fs = new Futures(); Vec [] res = closeAll(avs,fs); fs.blockForPending(); return res; } public static Vec[] closeAll(AppendableVec [] avs, Futures fs) { Vec [] res = new Vec[avs.length]; for(int i = 0; i < avs.length; ++i) res[i] = avs[i].close(fs); return res; } // Class 'reduce' call on new vectors; to combine the roll-up info. // Called single-threaded from the M/R framework. public void reduce( AppendableVec nv ) { if( this == nv ) return; // Trivially done // Combine arrays of elements-per-chunk long e1[] = nv._espc; // Shorter array of longs? byte t1[] = nv._chunkTypes; if( e1.length > _espc.length ) { e1 = _espc; // Keep the shorter one in e1 t1 = _chunkTypes; _espc = nv._espc; // Keep longer in the object _chunkTypes = nv._chunkTypes; } for( int i=0; i<e1.length; i++ ){ // Copy non-zero elements over assert _chunkTypes[i] == 0 || t1[i] == 0; if( e1[i] != 0 && _espc[i]==0 ) _espc[i] = e1[i]; _chunkTypes[i] |= t1[i]; } _naCnt += nv._naCnt; _strCnt += nv._strCnt; Utils.add(_timCnt,nv._timCnt); _totalCnt += nv._totalCnt; } // "Close" out a NEW vector - rewrite it to a plain Vec that supports random // reads, plus computes rows-per-chunk, min/max/mean, etc. 
  public Vec close(Futures fs) {
    // Compute #chunks: trim trailing empty chunks, removing their DKV keys.
    int nchunk = _espc.length;
    DKV.remove(chunkKey(nchunk),fs); // remove potential trailing key
    while( nchunk > 0 && _espc[nchunk-1] == 0 ) {
      nchunk--;
      DKV.remove(chunkKey(nchunk),fs); // remove potential trailing key
    }
    // Scan chunk types to see what value kinds appeared anywhere in the column.
    boolean hasNumber = false, hasEnum = false, hasTime=false, hasUUID=false;
    for( int i = 0; i < nchunk; ++i ) {
      if( (_chunkTypes[i] & TIME  ) != 0 ) { hasNumber = true; hasTime=true; } // times are numbers too
      if( (_chunkTypes[i] & NUMBER) != 0 ) hasNumber = true;
      if( (_chunkTypes[i] & ENUM  ) != 0 ) hasEnum   = true;
      if( (_chunkTypes[i] & UUID  ) != 0 ) hasUUID   = true;
    }
    // number wins, we need to go through the enum chunks and declare them all
    // NAs (chunk is considered enum iff it has only enums + possibly some nas)
    if( hasNumber && hasEnum ) {
      for(int i = 0; i < nchunk; ++i)
        if(_chunkTypes[i] == ENUM)
          DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs); // all-NA constant chunk
    }
    // UUID wins over enum & number: NA-out every chunk that held no UUIDs.
    if( hasUUID && (hasEnum || hasNumber) ) {
      hasEnum=hasNumber=false;
      for(int i = 0; i < nchunk; ++i)
        if((_chunkTypes[i] & UUID)==0)
          DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs);
    }
    // Make sure time is consistent: t ends as the single common time-parse
    // index, or stays negative (-1 no time, -2 inconsistent formats).
    int t = -1;
    if( hasTime ) {
      // Find common time parse, and all zeros - or inconsistent time parse
      for( int i=0; i<_timCnt.length; i++ )
        if( _timCnt[i] != 0 )
          if( t== -1 ) t=i;     // common time parse
          else t = -2;          // inconsistent parse
      if( t < 0 )               // blow off time parse: NA-out time-only chunks
        for(int i = 0; i < nchunk; ++i)
          if(_chunkTypes[i] == TIME)
            DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs);
    }
    // A time column must not also carry an enum domain.
    assert t<0 || _domain == null;
    // Compute elems-per-chunk.
    // Roll-up elem counts, so espc[i] is the starting element# of chunk i.
    long espc[] = new long[nchunk+1]; // Shorter array
    long x=0;                         // Total row count so far
    for( int i=0; i<nchunk; i++ ) {
      espc[i] = x;                    // Start elem# for chunk i
      x += _espc[i];                  // Raise total elem count
    }
    espc[nchunk]=x;                   // Total element count in last
    // Replacement plain Vec for AppendableVec.
    Vec vec = new Vec(_key, espc, _domain, hasUUID, (byte)t);
    DKV.put(_key,vec,fs);             // Inject the header
    return vec;
  }

  // Default read/write behavior for AppendableVecs: write-only while building.
  @Override public boolean readable() { return false; }
  @Override public boolean writable() { return true ; }
  // Writers get a fresh NewChunk to append into.
  @Override public Chunk chunkForChunkIdx(int cidx) { return new NewChunk(this,cidx); }

  // None of these are supposed to be called while building the new vector
  @Override public Value chunkIdx( int cidx ) { throw H2O.fail(); }
  @Override public long length() { throw H2O.fail(); }
  @Override public int nChunks() { throw H2O.fail(); }
  @Override public int elem2ChunkIdx(long i) { throw H2O.fail(); }
  @Override public long chunk2StartElem( int cidx ) { throw H2O.fail(); }
  @Override public long byteSize() { return 0; } // Takes no space until closed
  @Override public String toString() { return "[AppendableVec, unknown size]"; }
}