package water.fvec;

import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.exec.Flow;
import water.util.Log;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.IllegalFormatException;
import java.util.Random;

/**
 * A collection of named Vecs.  Essentially an R-like data-frame.  Multiple
 * Frames can reference the same Vecs.  A Frame is a lightweight object, it is
 * meant to be cheaply created and discarded for data munging purposes.
 * E.g. to exclude a Vec from a computation on a Frame, create a new Frame that
 * references all the Vecs but this one.
 */
public class Frame extends Lockable<Frame> {
  public String[] _names;        // Column names, parallel to _keys/_vecs
  Key[] _keys;                   // Keys for the vectors
  private transient Vec[] _vecs; // The Vectors (transient to avoid network traffic)
  private transient Vec _col0;   // First readable vec; fast access to the VectorGroup's Chunk layout
  private final UniqueId uniqueId;

  public Frame(Key k) {
    super(k);
    uniqueId = new UniqueFrameId(k, this);
  }

  // Copy-constructor: shares the Vecs, clones only the name/vec arrays.
  public Frame( Frame fr ) {
    this(fr._key, fr._names.clone(), fr.vecs().clone());
    _col0 = null;
  }
  public Frame( Vec... vecs ) { this(null, vecs); }
  public Frame( String[] names, Vec[] vecs ) { this(null, names, vecs); }
  public Frame( Key key, String[] names, Vec[] vecs ) {
    super(key);
    this.uniqueId = new UniqueFrameId(_key, this);
    if( names == null ) {       // Auto-generate default column names C1..Cn
      names = new String[vecs.length];
      for( int i = 0; i < vecs.length; i++ ) names[i] = "C" + (i + 1);
    }
    assert names.length == vecs.length : "Number of columns does not match to number of cols' names.";
    _names = names;
    _vecs = vecs;
    _keys = new Key[vecs.length];
    for( int i = 0; i < vecs.length; i++ ) _keys[i] = vecs[i]._key;
    assert checkCompatible();   // Verify all Vecs share chunk layout (debug builds only)
  }

  /**
   * Task to compare the two frames, returns true if they are identical.
   * We can't in general expect frames to be bit-compatible so we compare the numbers,
   * integers are compared exactly, doubles only with given precision (1e-8 is default).
 * (compression scheme may be altered by the way they were parsed and by rebalancing)
 * The frames are expected to be compatible.
 * @param f frame to compare this frame against
 * @return true if both frames hold the same data (doubles within precision)
 */
public final boolean isIdentical(Frame f){
  FrameIdenticalTask fbt = new FrameIdenticalTask(this,f);
  H2O.submitTask(fbt);
  fbt.join(); // block until all per-column comparisons have completed
  return fbt._res;
}

/** Fork/join task comparing two frames column-by-column via Vec.VecIdenticalTask. */
public static class FrameIdenticalTask extends H2OCountedCompleter {
  final Frame _f1;
  final Frame _f2;
  public FrameIdenticalTask(Frame f1, Frame f2){_f1 = f1; _f2 = f2;}
  boolean _res;                        // comparison result; valid only after join()
  double _fpointPrecision = 1e-8;      // tolerance applied to floating-point columns
  private Vec.VecIdenticalTask[] _vts; // one sub-task per column; null when names differ
  @Override public void compute2() {
    if(_f1 == _f2){
      _res = true; // same object: trivially identical
    } else if(Arrays.deepEquals(_f1.names(), _f2.names())){
      // Names match: launch one async per-column comparison, tracked via pending count.
      _vts = new Vec.VecIdenticalTask[_f1.numCols()];
      addToPendingCount(_vts.length);
      for(int i = 0; i < _vts.length; ++i) {
        _vts[i] = new Vec.VecIdenticalTask(this,_fpointPrecision);
        _vts[i].asyncExec(_f1.vec(i),_f2.vec(i));
      }
    } // else: column names differ -> _res stays false
    tryComplete();
  }
  @Override public void onCompletion(CountedCompleter cc){
    // All sub-tasks done: AND together the per-column results.
    if(_vts != null){
      _res = _vts[0]._res;
      for(int i = 1; i < _vts.length; ++i)
        _res = _res && _vts[i]._res;
    }
  }
}

public UniqueId getUniqueId() {
  return this.uniqueId;
}

/** 64-bit checksum of the checksums of the vecs.  SHA-256 checksums of the chunks are XORed
 *  together.  Since parse always parses the same pieces of files into the same offsets
 *  in some chunk this checksum will be consistent across reparses. */
public long checksum() {
  Vec [] vecs = vecs();
  long _checksum = 0;
  for(int i = 0; i < _names.length; ++i) {
    long vec_checksum = vecs[i].checksum();
    _checksum ^= vec_checksum;
    // Mix column position into the hash.  NOTE(review): this is 32-bit int
    // arithmetic (may overflow) and contributes 0 for column 0 — presumably
    // intentional, but worth confirming.
    _checksum ^= (2147483647 * i);
  }
  return _checksum;
}

/** Finds a vector by column name; returns null when the name is missing. */
public Vec vec(String name){
  Vec [] vecs = vecs();
  for(int i = 0; i < _names.length; ++i)
    if(_names[i].equals(name))return vecs[i];
  return null;
}

/** Returns the vector by given index.
 * <p>The call is direct equivalent to call <code>vecs()[i]</code> and
 * it does not do any array bounds checking.</p>
 * @param idx idx of column
 * @return this frame idx-th vector, never returns <code>null</code>
 */
public Vec vec(int idx) {
  Vec[] vecs = vecs();
  return vecs[idx];
}

/** Returns a subframe of this frame containing only vectors with desired names.
 *
 * @param names list of vector names
 * @return a new frame which collects vectors from this frame with desired names.
 * @throws IllegalArgumentException if there is no vector with desired name in this frame.
 */
public Frame subframe(String[] names) { return subframe(names, false, 0)[0]; }

/** Returns a new frame composed of vectors of this frame selected by given names.
 *  The method replaces missing vectors by a constant column filled by given value.
 * @param names names of vector to compose a subframe
 * @param c value to fill missing columns.
 * @return two frames, the first contains subframe, the second contains newly created constant vectors or null
 */
public Frame[] subframe(String[] names, double c) { return subframe(names, true, c); }

/** Create a subframe from this frame based on desired names.
 *  Throws an exception if desired column is not in this frame and <code>replaceBy</code> is <code>false</code>.
 *  Else replace a missing column by a constant column with given value.
 *
 * @param names list of column names to extract
 * @param replaceBy should be missing column replaced by a constant column
 * @param c value for constant column
 * @return array of 2 frames, the first is containing a desired subframe, the second one contains newly created columns or null
 * @throws IllegalArgumentException if <code>replaceBy</code> is false and there is a missing column in this frame
 */
private Frame[] subframe(String[] names, boolean replaceBy, double c){
  Vec [] vecs = new Vec[names.length];
  Vec [] cvecs = replaceBy ? new Vec [names.length] : null;
  String[] cnames = replaceBy ?
new String[names.length] : null; int ccv = 0; // counter of constant columns vecs(); // Preload the vecs HashMap<String, Integer> map = new HashMap<String, Integer>((int) ((names.length/0.75f)+1)); // avoid rehashing by set up initial capacity for(int i = 0; i < _names.length; ++i) map.put(_names[i], i); for(int i = 0; i < names.length; ++i) if(map.containsKey(names[i])) vecs[i] = _vecs[map.get(names[i])]; else if (replaceBy) { Log.warn("Column " + names[i] + " is missing, filling it in with " + c); cnames[ccv] = names[i]; vecs[i] = cvecs[ccv++] = anyVec().makeCon(c); } return new Frame[] { new Frame(names,vecs), ccv>0 ? new Frame(Arrays.copyOf(cnames, ccv), Arrays.copyOf(cvecs,ccv)) : null }; } public final Vec[] vecs(int [] idxs) { Vec [] all = vecs(); Vec [] res = new Vec[idxs.length]; for(int i = 0; i < idxs.length; ++i) res[i] = all[idxs[i]]; return res; } // Return (and cache) vectors public final Vec[] vecs() { Vec[] tvecs = _vecs; // read the content return tvecs == null ? (_vecs=vecs_impl()) : tvecs; } // Compute vectors for caching private Vec[] vecs_impl() { // Load all Vec headers; load them all in parallel by spawning F/J tasks. final Vec [] vecs = new Vec[_keys.length]; Futures fs = new Futures(); for( int i=0; i<_keys.length; i++ ) { final int ii = i; final Key k = _keys[i]; H2OCountedCompleter t = new H2OCountedCompleter() { // We need higher priority here as there is a danger of deadlock in // case of many calls from MRTask2 at once (e.g. frame with many // vectors invokes rollup tasks for all vectors in parallel). 
Should // probably be done in CPS style in the future @Override public byte priority(){return H2O.MIN_HI_PRIORITY;} @Override public void compute2() { Value v = DKV.get(k); if( v==null ) Log.err("Missing vector #" + ii + " (" + _names[ii] + ") during Frame fetch: "+k); vecs[ii] = v.get(); tryComplete(); } }; H2O.submitTask(t); fs.add(t); } fs.blockForPending(); return vecs; } // Force a cache-flush & reload, assuming vec mappings were altered remotely public final Vec[] reloadVecs() { _vecs=null; return vecs(); } /** Finds the first column with a matching name. */ public int find( String name ) { if (_names!=null) for( int i=0; i<_names.length; i++ ) if( name.equals(_names[i]) ) return i; return -1; } public int find( Vec vec ) { Vec[] vecs = vecs(); for( int i=0; i<vecs.length; i++ ) if( vec.equals(vecs[i]) ) return i; return -1; } // Return Frame 'f' if 'f' is compatible with 'this'. // Return a new Frame compatible with 'this' and a copy of 'f's data otherwise. public Frame makeCompatible( Frame f) { // Small data frames are always "compatible" if( anyVec()==null) // Or it is small return f; // Then must be compatible // Same VectorGroup is also compatible if( f.anyVec() == null || f.anyVec().group().equals(anyVec().group()) && Arrays.equals(f.anyVec()._espc,anyVec()._espc)) return f; // Ok, here make some new Vecs with compatible layout Key k = Key.make(); H2O.submitTask(new RebalanceDataSet(this, f, k)).join(); Frame f2 = DKV.get(k).get(); DKV.remove(k); return f2; } /** Appends a named column, keeping the last Vec as the response */ public Frame add( String name, Vec vec ) { if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame"); if( _vecs.length != 0 ) { if( !anyVec().group().equals(vec.group()) && !Arrays.equals(anyVec()._espc,vec._espc) ) throw new IllegalArgumentException("Vector groups differs - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); if( numRows() != vec.length() ) throw new 
IllegalArgumentException("Vector lengths differ - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); } final int len = _names != null ? _names.length : 0; _names = _names != null ? Arrays.copyOf(_names,len+1) : new String[len+1]; _vecs = _names != null ? Arrays.copyOf(_vecs ,len+1) : new Vec [len+1]; _keys = _names != null ? Arrays.copyOf(_keys ,len+1) : new Key [len+1]; _names[len] = name; _vecs [len] = vec ; _keys [len] = vec._key; return this; } /** Insert a named column as the first column */ public Frame prepend( String name, Vec vec ) { if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame"); if( _vecs.length != 0 ) { if( !anyVec().group().equals(vec.group()) && !Arrays.equals(anyVec()._espc,vec._espc) ) throw new IllegalArgumentException("Vector groups differs - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); if( numRows() != vec.length() ) throw new IllegalArgumentException("Vector lengths differ - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); } final int len = _names != null ? _names.length : 0; String[] _names2 = new String[len+1]; Vec[] _vecs2 = new Vec [len+1]; Key[] _keys2 = new Key [len+1]; _names2[0] = name; _vecs2 [0] = vec ; _keys2 [0] = vec._key; System.arraycopy(_names, 0, _names2, 1, len); System.arraycopy(_vecs, 0, _vecs2, 1, len); System.arraycopy(_keys, 0, _keys2, 1, len); _names = _names2; _vecs = _vecs2; _keys = _keys2; return this; } /** Appends an entire Frame */ public Frame add( Frame fr, String names[] ) { assert _vecs.length==0 || (anyVec().group().equals(fr.anyVec().group()) || Arrays.equals(anyVec()._espc,fr.anyVec()._espc)): "Adding a vector from different vector group. Current frame contains "+Arrays.toString(_names)+ " vectors. 
New frame contains "+Arrays.toString(fr.names()) + " vectors."; if( _names != null && fr._names != null ) for( String name : names ) if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame"); final int len0= _names!=null ? _names.length : 0; final int len1= names!=null ? names.length : 0; final int len = len0+len1; // Note: _names==null <=> _vecs==null <=> _keys==null _names = _names != null ? Arrays.copyOf(_names,len) : new String[len]; _vecs = _vecs != null ? Arrays.copyOf(_vecs ,len) : new Vec [len]; _keys = _keys != null ? Arrays.copyOf(_keys ,len) : new Key [len]; System.arraycopy( names,0,_names,len0,len1); System.arraycopy(fr._vecs ,0,_vecs ,len0,len1); System.arraycopy(fr._keys ,0,_keys ,len0,len1); return this; } public Frame add( Frame fr, boolean rename ) { if( !rename ) return add(fr,fr._names); String names[] = new String[fr._names.length]; for( int i=0; i<names.length; i++ ) { String name = fr._names[i]; int cnt=0; while( find(name) != -1 ) name = fr._names[i]+"_"+(cnt++); names[i] = name; } return add(fr,names); } /** Removes the first column with a matching name. */ public Vec remove( String name ) { return remove(find(name)); } /** Removes a numbered column. */ public Vec [] remove( int [] idxs ) { for(int i :idxs)if(i < 0 || i > _vecs.length) throw new ArrayIndexOutOfBoundsException(); Arrays.sort(idxs); Vec [] res = new Vec[idxs.length]; Vec [] rem = new Vec[_vecs.length-idxs.length]; String [] names = new String[rem.length]; Key [] keys = new Key [rem.length]; int j = 0; int k = 0; int l = 0; for(int i = 0; i < _vecs.length; ++i) { if(j < idxs.length && i == idxs[j]) { ++j; res[k++] = _vecs[i]; } else { rem [l] = _vecs [i]; names[l] = _names[i]; keys [l] = _keys [i]; ++l; } } _vecs = rem; _names = names; _keys = keys; assert l == rem.length && k == idxs.length; return res; } /** Removes a numbered column. 
*/ public Vec remove( int idx ) { int len = _names.length; if( idx < 0 || idx >= len ) return null; Vec v = vecs()[idx]; System.arraycopy(_names,idx+1,_names,idx,len-idx-1); System.arraycopy(_vecs ,idx+1,_vecs ,idx,len-idx-1); System.arraycopy(_keys ,idx+1,_keys ,idx,len-idx-1); _names = Arrays.copyOf(_names,len-1); _vecs = Arrays.copyOf(_vecs ,len-1); _keys = Arrays.copyOf(_keys ,len-1); if( v == _col0 ) _col0 = null; return v; } /** * Remove given interval of columns from frame. Motivated by R intervals. * @param startIdx - start index of column (inclusive) * @param endIdx - end index of column (exclusive) * @return an array of remove columns */ public Vec[] remove(int startIdx, int endIdx) { int len = _names.length; int nlen = len - (endIdx-startIdx); String[] names = new String[nlen]; Key[] keys = new Key[nlen]; Vec[] vecs = new Vec[nlen]; reloadVecs(); // force vecs reload if (startIdx > 0) { System.arraycopy(_names, 0, names, 0, startIdx); System.arraycopy(_vecs, 0, vecs, 0, startIdx); System.arraycopy(_keys, 0, keys, 0, startIdx); } nlen -= startIdx; if (endIdx < _names.length+1) { System.arraycopy(_names, endIdx, names, startIdx, nlen); System.arraycopy(_vecs, endIdx, vecs, startIdx, nlen); System.arraycopy(_keys, endIdx, keys, startIdx, nlen); } Vec[] vec = Arrays.copyOfRange(vecs(),startIdx,endIdx); _names = names; _vecs = vecs; _keys = keys; _col0 = null; return vec; } public Vec replace(int col, Vec nv) { if (col >= numCols()) throw new IllegalArgumentException("Trying to select column "+(col+1)+" but only "+numCols()+" present."); Vec rv = vecs()[col]; assert rv.group().equals(nv.group()); _vecs[col] = nv; _keys[col] = nv._key; if( DKV.get(nv._key)==null ) // If not already in KV, put it there DKV.put(nv._key, nv); return rv; } public Vec factor(int col) { Vec nv = vecs()[col].toEnum(); return replace(col, nv); } public Frame extractFrame(int startIdx, int endIdx) { Frame f = subframe(startIdx, endIdx); remove(startIdx, endIdx); return f; } /** Create 
a subframe from given interval of columns. * * @param startIdx index of first column (inclusive) * @param endIdx index of the last column (exclusive) * @return a new frame containing specified interval of columns */ public Frame subframe(int startIdx, int endIdx) { Frame result = new Frame(Arrays.copyOfRange(_names,startIdx,endIdx),Arrays.copyOfRange(vecs(),startIdx,endIdx)); return result; } public final String[] names() { return _names; } public int numCols() { return vecs().length; } public long numRows() { return anyVec()==null ? 0 : anyVec().length(); } public boolean isRawData() { // Right now there is only one Vec for raw data, but imagine a Parse after a JDBC import or such. for (Vec v : vecs()) { if (v.isByteVec()) return true; } return false; } // Number of columns when categoricals expanded. // Note: One level is dropped in each categorical col. public int numExpCols() { int ncols = 0; for(int i = 0; i < vecs().length; i++) ncols += vecs()[i].domain() == null ? 1 : (vecs()[i].domain().length - 1); return ncols; } /** All the domains for enum columns; null for non-enum columns. 
*/ public String[][] domains() { String ds[][] = new String[vecs().length][]; for( int i=0; i<vecs().length; i++ ) ds[i] = vecs()[i].domain(); return ds; } /** true/false every Vec is a UUID */ public boolean[] uuids() { boolean bs[] = new boolean[vecs().length]; for( int i=0; i<vecs().length; i++ ) bs[i] = vecs()[i].isUUID(); return bs; } /** Time status for every Vec */ public byte[] times() { byte bs[] = new byte[vecs().length]; for( int i=0; i<vecs().length; i++ ) bs[i] = vecs()[i]._time; return bs; } private String[][] domains(int [] cols){ Vec [] vecs = vecs(); String [][] res = new String[cols.length][]; for(int i = 0; i < cols.length; ++i) res[i] = vecs[cols[i]]._domain; return res; } private String [] names(int [] cols){ if(_names == null)return null; String [] res = new String[cols.length]; for(int i = 0; i < cols.length; ++i) res[i] = _names[cols[i]]; return res; } public Vec lastVec() { final Vec [] vecs = vecs(); return vecs[vecs.length-1]; } /** Returns the first readable vector. */ public Vec anyVec() { Vec c0 = _col0; // single read if( c0 != null ) return c0; for( Vec v : vecs() ) if( v.readable() ) return (_col0 = v); return null; } /* Returns the only Vector, or tosses IAE */ public final Vec theVec(String err) { if( _keys.length != 1 ) throw new IllegalArgumentException(err); if( _vecs == null ) _vecs = new Vec[]{_col0 = DKV.get(_keys[0]).get() }; return _vecs[0]; } /** Check that the vectors are all compatible. All Vecs have their content * sharded using same number of rows per chunk. 
 */
public boolean checkCompatible( ) {
  Vec v0 = anyVec();
  if( v0 == null ) return true;   // no readable vec: vacuously compatible
  int nchunks = v0.nChunks();
  // All vecs must have the same chunk count...
  for( Vec vec : vecs() ) {
    if( vec instanceof AppendableVec ) continue; // New Vectors are endlessly compatible
    if( vec.nChunks() != nchunks )
      throw new IllegalArgumentException("Vectors different numbers of chunks, "+nchunks+" and "+vec.nChunks());
  }
  // Also check each chunk has same rows
  for( int i=0; i<nchunks; i++ ) {
    long es = v0.chunk2StartElem(i);
    for(int j = 1; j < numCols(); ++j) {
      Vec vec = vec(j);
      if (!(vec instanceof AppendableVec) && vec.chunk2StartElem(i) != es)
        throw new IllegalArgumentException("Vector chunks have different numbers of rows, " + es + " and " + vec.chunk2StartElem(i) + " at vec " + j + " and chunk " + i);
    }
  }
  // For larger Frames, verify that the layout is compatible - else we'll be
  // endlessly cache-missing the data around the cluster, pulling copies
  // local everywhere.
  if( v0.length() > 1e4 ) {
    Key gk = v0.groupKey();
    for( Vec vec : vecs() )
      assert gk.equals(vec.groupKey()) : "Vector " + vec + " has different vector group!";
  }
  return true;
}

public void closeAppendables() {closeAppendables(new Futures()).blockForPending(); }

// Close all AppendableVec
public Futures closeAppendables(Futures fs) {
  _col0 = null;   // Reset cache
  int len = vecs().length;
  for( int i=0; i<len; i++ ) {
    Vec v = _vecs[i];
    // Closing converts the appendable into a regular Vec; publish it under the same key.
    if( v instanceof AppendableVec )
      DKV.put(_keys[i],_vecs[i] = ((AppendableVec)v).close(fs),fs);
  }
  return fs;
}

/** Actually remove/delete all Vecs from memory, not just from the Frame.
*/ @Override public Futures delete_impl(Futures fs) { for( Key k : _keys ) UKV.remove(k,fs); _names = new String[0]; _vecs = new Vec[0]; _keys = new Key[0]; return fs; } @Override public String errStr() { return "Dataset"; } public long byteSize() { long sum=0; for( int i=0; i<vecs().length; i++ ) sum += _vecs[i].byteSize(); return sum; } // Allow sorting of columns based on some function public void swap( int lo, int hi ) { assert 0 <= lo && lo < _keys.length; assert 0 <= hi && hi < _keys.length; if( lo==hi ) return; Vec vecs[] = vecs(); Vec v = vecs [lo]; vecs [lo] = vecs [hi]; vecs [hi] = v; Key k = _keys[lo]; _keys [lo] = _keys [hi]; _keys [hi] = k; String n=_names[lo]; _names[lo] = _names[hi]; _names[hi] = n; } @Override public String toString() { // Across Vec vecs[] = _vecs; // Do Not Cache _vecs in toString lest IdeaJ variable display cause side-effects if( vecs == null ) vecs = vecs_impl(); if( vecs.length==0 ) return "{}"; String s="{"+(_names==null?"C0":_names[0]); long bs=vecs[0].byteSize(); for( int i=1; i<vecs.length; i++ ) { s += ","+(_names==null?"C"+i:_names[i]); bs+= vecs[i].byteSize(); } s += "}, "+PrettyPrint.bytes(bs)+"\n"; // Down Vec v0 = vecs[0]; // Do Not Cache, no side-effects if( v0 == null ) return s; int nc = v0.nChunks(); s += "Chunk starts: {"; for( int c=0; c<nc; c++ ) s += v0.chunk2StartElem(c)+","; s += "}"; return s; } public String toStringNames() { return Arrays.toString(_names); } // Print a row with headers inlined private String toStr( long idx, int col ) { return _names[col]+"="+(_vecs[col].isNA(idx) ? 
"NA" : _vecs[col].at(idx)); } public String toString( long idx ) { String s="{"+toStr(idx,0); for( int i=1; i<_names.length; i++ ) s += ","+toStr(idx,i); return s+"}"; } public void replaceVecs(Vec [] vecs){ if(vecs.length != _vecs.length) throw new IllegalArgumentException("Incompatible number of vecs"); _vecs = vecs; _col0 = _vecs[0]; for(int i = 0; i < _keys.length; ++i) _keys[i] = vecs[i]._key; } // Print fixed-width row & fixed-width headers (more compressed print // format). Returns the column formats. public String[] toStringHdr( StringBuilder sb ) { String[] fs = new String[numCols()]; for( int c=0; c<fs.length; c++ ) { String n = (_names != null && c < _names.length) ? _names[c] : ("C"+c); int nlen = n.length(); if( numRows()==0 ) { sb.append(n).append(' '); continue; } int w=0; if( _vecs[c].isEnum() ) { String ss[] = _vecs[c]._domain; for( int i=0; i<ss.length; i++ ) w = Math.max(w,ss[i].length()); w = Math.min(w,10); fs[c] = "%"+w+"."+w+"s"; } else { Chunk C = _vecs[c].chunkForChunkIdx(0); // 1st Chunk // Possible situation: 1) vec is INT - C is has no floats => OK // 2) vec is INT - C has floats => IMPOSSIBLE, // 3) vec is FLOAT - C has floats => OK, // 4) vec is FLOAT - C has no floats => find the first chunk with floats if (!_vecs[c].isInt() && !C.hasFloat()) { for (int i=1; i<_vecs[c].nChunks(); i++) { C=_vecs[c].chunkForChunkIdx(i); if (C.hasFloat()) break; } } String f = fs[c] = C.pformat(); // Printable width for( int x=0; x<f.length(); x++ )// Get printable width from format if( Character.isDigit(f.charAt(x)) ) w = w*10+(f.charAt(x)-'0'); else if( w>0 ) break; if( f.charAt(1)==' ' ) w++; // Leading blank is not in print-width } int len = sb.length(); if( nlen>1 && w==1 ) { fs[c]=" "+fs[c]; w=2; } if( nlen <= w ) { // Short name, big digits sb.append(n); for( int i=nlen; i<w; i++ ) sb.append(' '); } else if( w==1 ) { // First char only sb.append(n.charAt(0)); } else if( w==2 ) { // First 2 chars only sb.append(n.charAt(0)).append(n.charAt(1)); } 
else { // First char dot lastchars; e.g. Compress "Interval" to "I.val" sb.append(n.charAt(0)).append('.'); for( int i=nlen-(w-2); i<nlen; i++ ) sb.append(n.charAt(i)); } assert len+w==sb.length(); sb.append(' '); // Column seperator } sb.append('\n'); return fs; } public StringBuilder toString( StringBuilder sb, String[] fs, long idx ) { Vec vecs[] = vecs(); for( int c=0; c<fs.length; c++ ) { Vec vec = vecs[c]; if( vec.isEnum() ) { String s = "----------"; if( !vec.isNA(idx) ) { int x = (int)vec.at8(idx); if( x >= 0 && x < vec._domain.length ) s = vec._domain[x]; } sb.append(String.format(fs[c],s)); } else if( vec.isInt() ) { if( vec.isNA(idx) ) { Chunk C = vec.chunkForChunkIdx(0); // 1st Chunk int len = C.pformat_len0(); // Printable width for( int i=0; i<len; i++ ) sb.append('-'); } else { try { if( vec.isUUID() ) sb.append(PrettyPrint.UUID(vec.at16l(idx),vec.at16h(idx))); else sb.append(String.format(fs[c],vec.at8(idx))); } catch( IllegalFormatException ife ) { System.out.println("Format: "+fs[c]+" col="+c+" not for ints"); ife.printStackTrace(); } } } else { sb.append(String.format(fs[c],vec.at (idx))); if( vec.isNA(idx) ) sb.append(' '); } sb.append(' '); // Column seperator } sb.append('\n'); return sb; } public String toStringAll() { StringBuilder sb = new StringBuilder(); String[] fs = toStringHdr(sb); for( int i=0; i<numRows(); i++ ) toString(sb,fs,i); return sb.toString(); } // Return the entire Frame as a CSV stream public InputStream toCSV(boolean headers) { return new CSVStream(headers, false); } public InputStream toCSV(boolean headers, boolean hex_string) { return new CSVStream(headers, hex_string); } private class CSVStream extends InputStream { private final boolean _hex_string; byte[] _line; int _position; long _row; CSVStream(boolean headers, boolean hex_string) { _hex_string = hex_string; StringBuilder sb = new StringBuilder(); Vec vs[] = vecs(); if( headers ) { sb.append('"' + _names[0] + '"'); for(int i = 1; i < vs.length; i++) 
sb.append(',').append('"' + _names[i] + '"'); sb.append('\n'); } _line = sb.toString().getBytes(); } @Override public int available() throws IOException { if(_position == _line.length) { if(_row == numRows()) return 0; StringBuilder sb = new StringBuilder(); Vec vs[] = vecs(); for( int i = 0; i < vs.length; i++ ) { if(i > 0) sb.append(','); if(!vs[i].isNA(_row)) { if( vs[i].isEnum() ) sb.append('"' + vs[i]._domain[(int) vs[i].at8(_row)] + '"'); else if( vs[i].isUUID() ) sb.append(PrettyPrint.UUID(vs[i].at16l(_row),vs[i].at16h(_row))); else if( vs[i].isInt() ) sb.append(vs[i].at8(_row)); else { // R 3.1 unfortunately changed the behavior of read.csv(). // (Really type.convert()). // // Numeric values with too much precision now trigger a type conversion in R 3.1 into a factor. // // See these discussions: // https://bugs.r-project.org/bugzilla/show_bug.cgi?id=15751 // https://stat.ethz.ch/pipermail/r-devel/2014-April/068778.html // http://stackoverflow.com/questions/23072988/preserve-old-pre-3-1-0-type-convert-behavior double d = vs[i].at(_row); String s; if (_hex_string) { // Used by R's as.data.frame(). s = Double.toHexString(d); } else { // To emit CSV files that can be read by R 3.1, limit the number of significant digits. // s = String.format("%.15g", d); s = Double.toString(d); } sb.append(s); } } } sb.append('\n'); _line = sb.toString().getBytes(); _position = 0; _row++; } return _line.length - _position; } @Override public void close() throws IOException { super.close(); _line = null; } @Override public int read() throws IOException { return available() == 0 ? -1 : _line[_position++]; } @Override public int read(byte[] b, int off, int len) throws IOException { int n = available(); if(n > 0) { n = Math.min(n, len); System.arraycopy(_line, _position, b, off, n); _position += n; } return n; } } // -------------------------------------------------------------------------- // In support of R, a generic Deep Copy & Slice. 
  // Semantics are a little odd, to match R's.
  // Each dimension spec can be:
  //   null - all of them
  //   a sorted list of negative numbers (no dups) - all BUT these
  //   an unordered list of positive - just these, allowing dups
  // The numbering is 1-based; zero's are not allowed in the lists, nor are out-of-range.
  final int MAX_EQ2_COLS = 100000; // FIXME. Put this in a better spot.

  /** Deep-copy slice of this Frame, with R-style row/column selectors.
   *  @param orows row spec: null (all rows), a 1-based long[] (negative = exclude),
   *               or a Frame-of-1-Vec used as a boolean/predicate selector.
   *  @param ocols column spec: null (all cols), a 1-based long[] (negative = exclude),
   *               or a Frame-of-1-Vec listing the column numbers.
   *  @return a new Frame holding freshly-copied Vecs for the selected rows/cols.
   *  @throws IllegalArgumentException on malformed column specs or out-of-range columns. */
  public Frame deepSlice( Object orows, Object ocols ) {
    // ocols is either a long[] or a Frame-of-1-Vec
    long[] cols = null;
    if( ocols == null ) cols = null;
    else if (ocols instanceof long[]) cols = (long[])ocols;
    else if (ocols instanceof Frame) {
      // A single-column Frame enumerates the requested column numbers.
      Frame fr = (Frame) ocols;
      if (fr.numCols() != 1)
        throw new IllegalArgumentException("Columns Frame must have only one column (actually has " + fr.numCols() + " columns)");
      long n = fr.anyVec().length();
      if (n > MAX_EQ2_COLS)
        throw new IllegalArgumentException("Too many requested columns (requested " + n +", max " + MAX_EQ2_COLS + ")");
      cols = new long[(int)n];
      Vec v = fr.anyVec();
      for (long i = 0; i < v.length(); i++) cols[(int)i] = v.at8(i);
    } else
      throw new IllegalArgumentException("Columns is specified by an unsupported data type (" + ocols.getClass().getName() + ")");

    // Since cols is probably short convert to a positive list.
    // c2 is always the zero-based list of columns to KEEP.
    int c2[] = null;
    if( cols==null ) {
      // null selector: keep every column.
      c2 = new int[numCols()];
      for( int i=0; i<c2.length; i++ ) c2[i]=i;
    } else if( cols.length==0 ) {
      c2 = new int[0];
    } else if( cols[0] > 0 ) {
      // Positive list: keep exactly these (dups allowed).
      c2 = new int[cols.length];
      for( int i=0; i<cols.length; i++ ) c2[i] = (int)cols[i]-1; // Convert 1-based cols to zero-based
    } else {
      // Negative list (assumed sorted, no dups per the contract above): keep all BUT these.
      c2 = new int[numCols()-cols.length];
      int j=0;
      for( int i=0; i<numCols(); i++ ) {
        if( j >= cols.length || i < (-cols[j]-1) ) c2[i-j] = i;
        else j++;
      }
    }
    for( int i=0; i<c2.length; i++ )
      if( c2[i] >= numCols() )
        throw new IllegalArgumentException("Trying to select column "+(c2[i]+1)+" but only "+numCols()+" present.");
    if( c2.length==0 )
      throw new IllegalArgumentException("No columns selected (did you try to select column 0 instead of column 1?)");

    // Do Da Slice
    // orows is either a long[] or a Vec
    if (orows == null)
      // All rows: a straight column copy, so rollup stats can be copied over (isACopy=true).
      return copyRollups(new DeepSlice(null,c2,vecs()).doAll(c2.length,this).outputFrame(names(c2),domains(c2)),true);
    else if (orows instanceof long[]) {
      final long CHK_ROWS=1000000;
      final long[] rows = (long[])orows;
      if (this.numRows() == 0) {
        return this;
      }
      if( rows.length==0 || rows[0] < 0 ) {
        if (rows.length != 0 && rows[0] < 0) {
          // Negative (exclusion) row list: build a temporary 0/1 "select" Vec marking
          // the rows to drop, append it to this frame, slice, then clean up.
          Vec v = new MRTask2() {
            @Override public void map(Chunk cs) {
              for (long er : rows) {
                if (er >= 0) continue;
                er = Math.abs(er) - 1; // 1-based -> 0-based
                // Only mark rows that land in this chunk's local range.
                if (er < cs._start || er > (cs._len + cs._start - 1)) continue;
                cs.set0((int) (er - cs._start), 1);
              }
            }
          }.doAll(this.anyVec().makeZero()).getResult()._fr.anyVec();
          // NOTE(review): this temporarily mutates `this` via add(...)/remove(...);
          // presumably safe because the column is removed before returning — confirm no
          // concurrent readers of this Frame exist while the slice runs.
          Frame slicedFrame = new DeepSlice(rows, c2, vecs()).doAll(c2.length, this.add("select_vec", v)).outputFrame(names(c2), domains(c2));
          UKV.remove(v._key);                                 // Drop the temp select Vec
          UKV.remove(this.remove(this.numCols()-1)._key);     // Un-append it from this Frame
          return copyRollups(slicedFrame, false);
        } else {
          // Empty row list means "all rows" (pass null selector); only then is it a clean copy.
          return copyRollups(new DeepSlice(rows.length == 0 ? null : rows, c2, vecs()).doAll(c2.length, this).outputFrame(names(c2), domains(c2)), rows.length == 0);
        }
      }
      // Vec'ize the index array: spill the (positive, possibly unordered, dups allowed)
      // row numbers into a temporary Vec of CHK_ROWS-sized chunks, then gather with Slice.
      Futures fs = new Futures();
      AppendableVec av = new AppendableVec(Vec.newKey(Key.make("rownames")));
      int r = 0;
      int c = 0;
      while (r < rows.length) {
        NewChunk nc = new NewChunk(av, c);
        long end = Math.min(r+CHK_ROWS, rows.length);
        for (; r < end; r++) {
          nc.addNum(rows[r]);
        }
        nc.close(c++, fs);
      }
      Vec c0 = av.close(fs);   // c0 is the row index vec
      fs.blockForPending();
      Frame fr2 = new Slice(c2, this).doAll(c2.length,new Frame(new String[]{"rownames"}, new Vec[]{c0})).outputFrame(names(c2), domains(c2));
      UKV.remove(c0._key); // Remove hidden vector
      return fr2;
    }
    Frame frows = (Frame)orows;
    Vec vrows = frows.anyVec();
    // It's a compatible Vec; use it as boolean selector.
    // Build column names for the result.
    Vec [] vecs = new Vec[c2.length+1];
    String [] names = new String[c2.length+1];
    for(int i = 0; i < c2.length; ++i){
      vecs[i] = _vecs[c2[i]];
      names[i] = _names[c2[i]];
    }
    // Predicate vec rides along as the LAST column; DeepSelect keeps rows where it is non-zero.
    vecs[c2.length] = vrows;
    names[c2.length] = "predicate";
    return new DeepSelect().doAll(c2.length,new Frame(names,vecs)).outputFrame(names(c2),domains(c2));
  }

  // Slice and return in the form of new chunks.
  // Gather task: the single input chunk holds 1-based row numbers; for each one, copy
  // that row of the base frame's selected columns into the output NewChunks.
  private static class Slice extends MRTask2<Slice> {
    final Frame _base;  // the base frame to slice from
    final int[] _cols;  // zero-based columns of _base to copy
    Slice(int[] cols, Frame base) { _cols = cols; _base = base; }
    @Override public void map(Chunk[] ix, NewChunk[] ncs) {
      final Vec[] vecs = new Vec[_cols.length];
      final Vec anyv = _base.anyVec();
      final long nrow = anyv.length();
      long r = ix[0].at80(0);
      // Random row gather: cache the chunk window [last_c0,last_c1) so consecutive
      // requested rows that land in the same chunk avoid repeated chunk lookups.
      int last_ci = anyv.elem2ChunkIdx(r<nrow?r:0); // memoize the last chunk index
      long last_c0 = anyv._espc[last_ci];           // ... last chunk start
      long last_c1 = anyv._espc[last_ci + 1];       // ... last chunk end
      Chunk[] last_cs = new Chunk[vecs.length];     // ... last chunks
      for (int c = 0; c < _cols.length; c++) {
        vecs[c] = _base.vecs()[_cols[c]];
        last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
      }
      for (int i = 0; i < ix[0]._len; i++) {
        // select one row
        r = ix[0].at80(i) - 1; // next row to select (1-based -> 0-based)
        if (r < 0) continue;
        if (r >= nrow) {
          // Out-of-range row number: emit NA for every selected column.
          for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
        } else {
          if (r < last_c0 || r >= last_c1) {
            // Row falls outside the cached chunk window; re-resolve all column chunks.
            last_ci = anyv.elem2ChunkIdx(r);
            last_c0 = anyv._espc[last_ci];
            last_c1 = anyv._espc[last_ci + 1];
            for (int c = 0; c < vecs.length; c++)
              last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
          }
          for (int c = 0; c < vecs.length; c++)
            if( vecs[c].isUUID() ) ncs[c].addUUID(last_cs[c],r);
            else ncs[c].addNum (last_cs[c].at(r));
        }
      }
    }
  }

  // Bulk (expensive) copy from 2nd cols into 1st cols.
  // Sliced by the given cols & rows
  private static class DeepSlice extends MRTask2<DeepSlice> {
    final int  _cols[];   // zero-based columns to copy
    final long _rows[];   // 1-based row selectors; null = all rows; negative = exclusion list
    final byte _isInt[];  // 1 if the column is integer-typed, else 0 (decided at construction)
    boolean _ex = true;
    DeepSlice( long rows[], int cols[], Vec vecs[] ) {
      _cols=cols;
      _rows=rows;
      _isInt = new byte[cols.length];
      for( int i=0; i<cols.length; i++ )
        _isInt[i] = (byte)(vecs[cols[i]].isInt() ? 1 : 0);
    }
    @Override public boolean logVerbose() { return false; }
    @Override public void map( Chunk chks[], NewChunk nchks[] ) {
      long rstart = chks[0]._start;
      int rlen = chks[0]._len;  // Total row count
      int rx = 0;               // Which row to in/ex-clude
      int rlo = 0;              // Lo/Hi for this block of rows
      int rhi = rlen;
      if (_rows != null && _rows[0] < 0) {
        // Exclusion mode: the caller appended a 0/1 "select" vec as the LAST column;
        // skip any rows that have 1 in the last column!
        Chunk select_vec = chks[chks.length-1];
        for (int i = 0; i < _cols.length; i++) {
          Chunk oc = chks[_cols[i]];
          NewChunk nc = nchks[i];
          if (_isInt[i] == 1) { // Slice on integer columns
            for (int j = 0; j < oc._len; j++) {
              if (select_vec.at80(j) == 1) continue;
              if (oc._vec.isUUID()) nc.addUUID(oc, j);
              else if (oc.isNA0(j)) nc.addNA();
              else nc.addNum(oc.at80(j), 0);
            }
          } else { // Slice on double columns
            for (int j = 0; j < oc._len; j++) {
              if (select_vec.at80(j) == 1) continue;
              nc.addNum(oc.at0(j));
            }
          }
        }
      } else {
        // Inclusion mode: copy either all rows (_rows==null) or the runs of
        // requested rows that fall inside this chunk's [rstart,rstart+rlen) range.
        while (true) {
          // Still got rows to include?
          if (_rows != null) { // Got a row selector?
            if (rx >= _rows.length) break; // All done with row selections
            long r = _rows[rx++] - 1;      // Next row selector (1-based -> 0-based)
            if (r < rstart) continue;      // Before this chunk; skip
            rlo = (int) (r - rstart);
            rhi = rlo + 1; // Stop at the next row
            // Coalesce sequential selectors into one [rlo,rhi) run for bulk copy.
            while (rx < _rows.length && (_rows[rx] - 1 - rstart) == rhi && rhi < rlen) {
              rx++;
              rhi++; // Grab sequential rows
            }
          }
          // Process this next set of rows
          // For all cols in the new set
          for (int i = 0; i < _cols.length; i++) {
            Chunk oc = chks[_cols[i]];
            NewChunk nc = nchks[i];
            if (_isInt[i] == 1) { // Slice on integer columns
              for (int j = rlo; j < rhi; j++)
                if (oc._vec.isUUID()) nc.addUUID(oc, j);
                else if (oc.isNA0(j)) nc.addNA();
                else nc.addNum(oc.at80(j), 0);
            } else { // Slice on double columns
              for (int j = rlo; j < rhi; j++)
                nc.addNum(oc.at0(j));
            }
          }
          rlo = rhi;
          if (_rows == null) break; // No selector: single pass over the whole chunk
        }
      }
    }
  }

  /** Split a Frame into two Frames by a per-row uniform random draw:
   *  rows with draw <= threshold go left, the rest go right.
   *  @param f frame to split (unchanged)
   *  @param threshold split point in [0,1]
   *  @param seed RNG seed; -1 means pick a random seed
   *  @return {left, right} pair of new Frames sharing f's names/domains. */
  public static Frame[] runifSplit(Frame f, float threshold, long seed) {
    if (seed == -1) seed = new Random().nextLong();
    // Build a new Vec in f's VectorGroup, zero-filled chunk by chunk.
    Vec rv = new Vec(f.anyVec().group().addVecs(1)[0],f.anyVec()._espc);
    Futures fs = new Futures();
    DKV.put(rv._key,rv, fs);
    for(int i = 0; i < rv._espc.length-1; ++i)
      DKV.put(rv.chunkKey(i),new C0DChunk(0,(int)(rv._espc[i+1]-rv._espc[i])),fs);
    fs.blockForPending();
    final long zeed = seed;
    // Fill rv with per-row uniform draws, seeded per chunk for determinism.
    // NOTE(review): zeed*c.cidx() is 0 for chunk 0, so chunk 0's draws ignore the
    // user seed entirely — looks unintended; confirm before relying on seed control.
    new MRTask2() {
      @Override public void map(Chunk c){
        Random rng = new Random(zeed*c.cidx());
        for(int i = 0; i < c._len; ++i)
          c.set0(i, (float)rng.nextDouble());
      }
    }.doAll(rv);
    // Append the random vec as the LAST column of a working frame; the selectors below
    // read it from that position.
    Vec[] vecs = new Vec[f.numCols()+1];
    System.arraycopy(f.vecs(), 0, vecs,0, f.numCols());
    vecs[f.numCols()] = rv;
    Frame doAllFr = new Frame(null, vecs);
    // it would be great if there was a map call for NewChunk[][] multi frame output
    Frame left = new DeepSelectThresh(threshold, true).doAll(f.numCols(),doAllFr).outputFrame(Key.make(), f.names(), f.domains());
    Frame rite = new DeepSelectThresh(threshold, false).doAll(f.numCols(),doAllFr).outputFrame(Key.make(), f.names(), f.domains());
    UKV.remove(rv._key); // Temp random vec no longer needed
    return new Frame[]{left,rite};
  }

  // Row filter: keep rows whose predicate (LAST column) is non-zero; copies all other
  // columns of those rows into the output NewChunks.
  private static class DeepSelect extends MRTask2<DeepSelect> {
    @Override public void map( Chunk chks[], NewChunk nchks[] ) {
      Chunk pred = chks[chks.length-1];
      for(int i = 0; i < pred._len; ++i) {
        if(pred.at0(i) != 0) {
          for( int j = 0; j < chks.length - 1; j++ ) {
            Chunk chk = chks[j];
            if( chk._vec.isUUID() ) nchks[j].addUUID(chk,i);
            else nchks[j].addNum(chk.at0(i));
          }
        }
      }
    }
  }

  // Threshold filter for runifSplit: the LAST column holds the per-row random draw.
  // _left==true keeps rows with draw <= _threshold; _left==false keeps draw > _threshold,
  // so the two passes partition the rows exactly.
  private static class DeepSelectThresh extends MRTask2<DeepSelectThresh> {
    private final float _threshold;
    private final boolean _left;
    DeepSelectThresh(float threshold, boolean left) { _threshold = threshold; _left = left; }
    // Copy row i of every column except the trailing random/selector column.
    private void addRow(Chunk[] cs, NewChunk[] ncs, int i) {
      for (int j = 0; j < cs.length -1; ++j) {
        Chunk c = cs[j];
        if (c._vec.isUUID()) ncs[j].addUUID(c,i);
        else ncs[j].addNum(c.at0(i)); // NewChunk will compress later ... not set0s
      }
    }
    @Override public void map(Chunk cs[], NewChunk ncs[]) {
      Chunk rv = cs[cs.length-1];
      for (int i = 0; i < rv._len; ++i) {
        if (_left) {
          if (rv.at0(i) <= _threshold) addRow(cs, ncs, i);
        } else {
          if (rv.at0(i) > _threshold) addRow(cs, ncs, i);
        }
      }
    }
  }

  // If fr is a clean per-column copy of this Frame, copy this Frame's already-computed
  // rollup stats onto fr's new Vecs (by matching column names) to avoid recomputing them.
  private Frame copyRollups( Frame fr, boolean isACopy ) {
    if( !isACopy ) return fr; // Not a clean copy, do not copy rollups (will do rollups "the hard way" on first ask)
    Vec vecs0[] = vecs();
    Vec vecs1[] = fr.vecs();
    for( int i=0; i<fr._names.length; i++ ) {
      assert vecs1[i]._naCnt== -1; // not computed yet, right after slice
      Vec v0 = vecs0[find(fr._names[i])];
      Vec v1 = vecs1[i];
      v1.setRollupStats(v0);
    }
    return fr;
  }

  // ------------------------------------------------------------------------------
  // Flow API adapters: wrap this Frame so per-row/filter/group-by flows can run over it.
  public <Y extends Flow.PerRow<Y>> // Type parameter
    Flow.FlowPerRow<Y>              // Return type of with()
    with                            // The method name
    ( Flow.PerRow<Y> pr )           // Arguments for with()
  {
    return new Flow.FlowPerRow<Y>(pr,new Flow.FlowFrame(this));
  }

  public Flow.FlowFilter with( Flow.Filter fr ) {
    return new Flow.FlowFilter(fr,new Flow.FlowFrame(this));
  }

  public Flow.FlowGroupBy with( Flow.GroupBy fr ) {
    return new Flow.FlowGroupBy(fr,new Flow.FlowFrame(this));
  }
}