package water.fvec;

import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.exec.Flow;
import water.util.Log;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.IllegalFormatException;
import java.util.Random;

/**
 * A collection of named Vecs.  Essentially an R-like data-frame.  Multiple
 * Frames can reference the same Vecs.  A Frame is a lightweight object, it is
 * meant to be cheaply created and discarded for data munging purposes.
 * E.g. to exclude a Vec from a computation on a Frame, create a new Frame that
 * references all the Vecs but this one.
 */
public class Frame extends Lockable<Frame> {
  public String[] _names;        // Column names, parallel to _keys/_vecs
  Key[] _keys;                   // Keys for the vectors
  private transient Vec[] _vecs; // The Vectors (transient to avoid network traffic)
  private transient Vec _col0;   // First readable vec; fast access to the VectorGroup's Chunk layout
  private final UniqueId uniqueId;

  public Frame(Key k) {
    super(k);
    uniqueId = new UniqueFrameId(k, this);
  }

  // Copy-constructor: shares the Vecs, clones only the name/vec arrays.
  public Frame( Frame fr ) {
    this(fr._key, fr._names.clone(), fr.vecs().clone());
    _col0 = null;
  }
  public Frame( Vec... vecs ) { this(null, vecs); }
  public Frame( String[] names, Vec[] vecs ) { this(null, names, vecs); }
  public Frame( Key key, String[] names, Vec[] vecs ) {
    super(key);
    this.uniqueId = new UniqueFrameId(_key, this);
    if( names == null ) {       // Auto-generate default column names C1..Cn
      names = new String[vecs.length];
      for( int i = 0; i < vecs.length; i++ ) names[i] = "C" + (i + 1);
    }
    assert names.length == vecs.length : "Number of columns does not match to number of cols' names.";
    _names = names;
    _vecs = vecs;
    _keys = new Key[vecs.length];
    for( int i = 0; i < vecs.length; i++ ) _keys[i] = vecs[i]._key;
    assert checkCompatible();   // Verify all Vecs share chunk layout (debug builds only)
  }

  /**
   * Task to compare the two frames, returns true if they are identical.
   * We can't in general expect frames to be bit-compatible so we compare the numbers,
   * integers are compared exactly, doubles only with given precision (1e-8 is default).
 * (compression scheme may be altered by the way they were parsed and by rebalancing)
 * The frames are expected to be compatible.
 * @param f frame to compare this frame against
 * @return true if both frames hold the same data (doubles within precision)
 */
public final boolean isIdentical(Frame f){
  FrameIdenticalTask fbt = new FrameIdenticalTask(this,f);
  H2O.submitTask(fbt);
  fbt.join(); // block until all per-column comparisons have completed
  return fbt._res;
}

/** Fork/join task comparing two frames column-by-column via Vec.VecIdenticalTask. */
public static class FrameIdenticalTask extends H2OCountedCompleter {
  final Frame _f1;
  final Frame _f2;
  public FrameIdenticalTask(Frame f1, Frame f2){_f1 = f1; _f2 = f2;}
  boolean _res;                        // comparison result; valid only after join()
  double _fpointPrecision = 1e-8;      // tolerance applied to floating-point columns
  private Vec.VecIdenticalTask[] _vts; // one sub-task per column; null when names differ
  @Override public void compute2() {
    if(_f1 == _f2){
      _res = true; // same object: trivially identical
    } else if(Arrays.deepEquals(_f1.names(), _f2.names())){
      // Names match: launch one async per-column comparison, tracked via pending count.
      _vts = new Vec.VecIdenticalTask[_f1.numCols()];
      addToPendingCount(_vts.length);
      for(int i = 0; i < _vts.length; ++i) {
        _vts[i] = new Vec.VecIdenticalTask(this,_fpointPrecision);
        _vts[i].asyncExec(_f1.vec(i),_f2.vec(i));
      }
    } // else: column names differ -> _res stays false
    tryComplete();
  }
  @Override public void onCompletion(CountedCompleter cc){
    // All sub-tasks done: AND together the per-column results.
    if(_vts != null){
      _res = _vts[0]._res;
      for(int i = 1; i < _vts.length; ++i)
        _res = _res && _vts[i]._res;
    }
  }
}

public UniqueId getUniqueId() {
  return this.uniqueId;
}

/** 64-bit checksum of the checksums of the vecs.  SHA-256 checksums of the chunks are XORed
 *  together.  Since parse always parses the same pieces of files into the same offsets
 *  in some chunk this checksum will be consistent across reparses. */
public long checksum() {
  Vec [] vecs = vecs();
  long _checksum = 0;
  for(int i = 0; i < _names.length; ++i) {
    long vec_checksum = vecs[i].checksum();
    _checksum ^= vec_checksum;
    // Mix column position into the hash.  NOTE(review): this is 32-bit int
    // arithmetic (may overflow) and contributes 0 for column 0 — presumably
    // intentional, but worth confirming.
    _checksum ^= (2147483647 * i);
  }
  return _checksum;
}

/** Finds a vector by column name; returns null when the name is missing. */
public Vec vec(String name){
  Vec [] vecs = vecs();
  for(int i = 0; i < _names.length; ++i)
    if(_names[i].equals(name))return vecs[i];
  return null;
}

/** Returns the vector by given index.
 * <p>The call is direct equivalent to call <code>vecs()[i]</code> and
 * it does not do any array bounds checking.</p>
 * @param idx idx of column
 * @return this frame idx-th vector, never returns <code>null</code>
 */
public Vec vec(int idx) {
  Vec[] vecs = vecs();
  return vecs[idx];
}

/** Returns a subframe of this frame containing only vectors with desired names.
 *
 * @param names list of vector names
 * @return a new frame which collects vectors from this frame with desired names.
 * @throws IllegalArgumentException if there is no vector with desired name in this frame.
 */
public Frame subframe(String[] names) { return subframe(names, false, 0)[0]; }

/** Returns a new frame composed of vectors of this frame selected by given names.
 *  The method replaces missing vectors by a constant column filled by given value.
 * @param names names of vector to compose a subframe
 * @param c value to fill missing columns.
 * @return two frames, the first contains subframe, the second contains newly created constant vectors or null
 */
public Frame[] subframe(String[] names, double c) { return subframe(names, true, c); }

/** Create a subframe from this frame based on desired names.
 *  Throws an exception if desired column is not in this frame and <code>replaceBy</code> is <code>false</code>.
 *  Else replace a missing column by a constant column with given value.
 *
 * @param names list of column names to extract
 * @param replaceBy should be missing column replaced by a constant column
 * @param c value for constant column
 * @return array of 2 frames, the first is containing a desired subframe, the second one contains newly created columns or null
 * @throws IllegalArgumentException if <code>replaceBy</code> is false and there is a missing column in this frame
 */
private Frame[] subframe(String[] names, boolean replaceBy, double c){
  Vec [] vecs = new Vec[names.length];
  Vec [] cvecs = replaceBy ? new Vec [names.length] : null;
  String[] cnames = replaceBy ?
new String[names.length] : null; int ccv = 0; // counter of constant columns vecs(); // Preload the vecs HashMap<String, Integer> map = new HashMap<String, Integer>((int) ((names.length/0.75f)+1)); // avoid rehashing by set up initial capacity for(int i = 0; i < _names.length; ++i) map.put(_names[i], i); for(int i = 0; i < names.length; ++i) if(map.containsKey(names[i])) vecs[i] = _vecs[map.get(names[i])]; else if (replaceBy) { Log.warn("Column " + names[i] + " is missing, filling it in with " + c); cnames[ccv] = names[i]; vecs[i] = cvecs[ccv++] = anyVec().makeCon(c); } return new Frame[] { new Frame(names,vecs), ccv>0 ? new Frame(Arrays.copyOf(cnames, ccv), Arrays.copyOf(cvecs,ccv)) : null }; } public final Vec[] vecs(int [] idxs) { Vec [] all = vecs(); Vec [] res = new Vec[idxs.length]; for(int i = 0; i < idxs.length; ++i) res[i] = all[idxs[i]]; return res; } // Return (and cache) vectors public final Vec[] vecs() { Vec[] tvecs = _vecs; // read the content return tvecs == null ? (_vecs=vecs_impl()) : tvecs; } // Compute vectors for caching private Vec[] vecs_impl() { // Load all Vec headers; load them all in parallel by spawning F/J tasks. final Vec [] vecs = new Vec[_keys.length]; Futures fs = new Futures(); for( int i=0; i<_keys.length; i++ ) { final int ii = i; final Key k = _keys[i]; H2OCountedCompleter t = new H2OCountedCompleter() { // We need higher priority here as there is a danger of deadlock in // case of many calls from MRTask2 at once (e.g. frame with many // vectors invokes rollup tasks for all vectors in parallel). 
Should // probably be done in CPS style in the future @Override public byte priority(){return H2O.MIN_HI_PRIORITY;} @Override public void compute2() { Value v = DKV.get(k); if( v==null ) Log.err("Missing vector #" + ii + " (" + _names[ii] + ") during Frame fetch: "+k); vecs[ii] = v.get(); tryComplete(); } }; H2O.submitTask(t); fs.add(t); } fs.blockForPending(); return vecs; } // Force a cache-flush & reload, assuming vec mappings were altered remotely public final Vec[] reloadVecs() { _vecs=null; return vecs(); } /** Finds the first column with a matching name. */ public int find( String name ) { if (_names!=null) for( int i=0; i<_names.length; i++ ) if( name.equals(_names[i]) ) return i; return -1; } public int find( Vec vec ) { Vec[] vecs = vecs(); for( int i=0; i<vecs.length; i++ ) if( vec.equals(vecs[i]) ) return i; return -1; } // Return Frame 'f' if 'f' is compatible with 'this'. // Return a new Frame compatible with 'this' and a copy of 'f's data otherwise. public Frame makeCompatible( Frame f) { // Small data frames are always "compatible" if( anyVec()==null) // Or it is small return f; // Then must be compatible // Same VectorGroup is also compatible if( f.anyVec() == null || f.anyVec().group().equals(anyVec().group()) && Arrays.equals(f.anyVec()._espc,anyVec()._espc)) return f; // Ok, here make some new Vecs with compatible layout Key k = Key.make(); H2O.submitTask(new RebalanceDataSet(this, f, k)).join(); Frame f2 = DKV.get(k).get(); DKV.remove(k); return f2; } /** Appends a named column, keeping the last Vec as the response */ public Frame add( String name, Vec vec ) { if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame"); if( _vecs.length != 0 ) { if( !anyVec().group().equals(vec.group()) && !Arrays.equals(anyVec()._espc,vec._espc) ) throw new IllegalArgumentException("Vector groups differs - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); if( numRows() != vec.length() ) throw new 
IllegalArgumentException("Vector lengths differ - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); } final int len = _names != null ? _names.length : 0; _names = _names != null ? Arrays.copyOf(_names,len+1) : new String[len+1]; _vecs = _names != null ? Arrays.copyOf(_vecs ,len+1) : new Vec [len+1]; _keys = _names != null ? Arrays.copyOf(_keys ,len+1) : new Key [len+1]; _names[len] = name; _vecs [len] = vec ; _keys [len] = vec._key; return this; } /** Insert a named column as the first column */ public Frame prepend( String name, Vec vec ) { if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame"); if( _vecs.length != 0 ) { if( !anyVec().group().equals(vec.group()) && !Arrays.equals(anyVec()._espc,vec._espc) ) throw new IllegalArgumentException("Vector groups differs - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); if( numRows() != vec.length() ) throw new IllegalArgumentException("Vector lengths differ - adding vec '"+name+"' into the frame " + Arrays.toString(_names)); } final int len = _names != null ? _names.length : 0; String[] _names2 = new String[len+1]; Vec[] _vecs2 = new Vec [len+1]; Key[] _keys2 = new Key [len+1]; _names2[0] = name; _vecs2 [0] = vec ; _keys2 [0] = vec._key; System.arraycopy(_names, 0, _names2, 1, len); System.arraycopy(_vecs, 0, _vecs2, 1, len); System.arraycopy(_keys, 0, _keys2, 1, len); _names = _names2; _vecs = _vecs2; _keys = _keys2; return this; } /** Appends an entire Frame */ public Frame add( Frame fr, String names[] ) { assert _vecs.length==0 || (anyVec().group().equals(fr.anyVec().group()) || Arrays.equals(anyVec()._espc,fr.anyVec()._espc)): "Adding a vector from different vector group. Current frame contains "+Arrays.toString(_names)+ " vectors. 
New frame contains "+Arrays.toString(fr.names()) + " vectors."; if( _names != null && fr._names != null ) for( String name : names ) if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame"); final int len0= _names!=null ? _names.length : 0; final int len1= names!=null ? names.length : 0; final int len = len0+len1; // Note: _names==null <=> _vecs==null <=> _keys==null _names = _names != null ? Arrays.copyOf(_names,len) : new String[len]; _vecs = _vecs != null ? Arrays.copyOf(_vecs ,len) : new Vec [len]; _keys = _keys != null ? Arrays.copyOf(_keys ,len) : new Key [len]; System.arraycopy( names,0,_names,len0,len1); System.arraycopy(fr._vecs ,0,_vecs ,len0,len1); System.arraycopy(fr._keys ,0,_keys ,len0,len1); return this; } public Frame add( Frame fr, boolean rename ) { if( !rename ) return add(fr,fr._names); String names[] = new String[fr._names.length]; for( int i=0; i<names.length; i++ ) { String name = fr._names[i]; int cnt=0; while( find(name) != -1 ) name = fr._names[i]+"_"+(cnt++); names[i] = name; } return add(fr,names); } /** Removes the first column with a matching name. */ public Vec remove( String name ) { return remove(find(name)); } /** Removes a numbered column. */ public Vec [] remove( int [] idxs ) { for(int i :idxs)if(i < 0 || i > _vecs.length) throw new ArrayIndexOutOfBoundsException(); Arrays.sort(idxs); Vec [] res = new Vec[idxs.length]; Vec [] rem = new Vec[_vecs.length-idxs.length]; String [] names = new String[rem.length]; Key [] keys = new Key [rem.length]; int j = 0; int k = 0; int l = 0; for(int i = 0; i < _vecs.length; ++i) { if(j < idxs.length && i == idxs[j]) { ++j; res[k++] = _vecs[i]; } else { rem [l] = _vecs [i]; names[l] = _names[i]; keys [l] = _keys [i]; ++l; } } _vecs = rem; _names = names; _keys = keys; assert l == rem.length && k == idxs.length; return res; } /** Removes a numbered column. 
*/ public Vec remove( int idx ) { int len = _names.length; if( idx < 0 || idx >= len ) return null; Vec v = vecs()[idx]; System.arraycopy(_names,idx+1,_names,idx,len-idx-1); System.arraycopy(_vecs ,idx+1,_vecs ,idx,len-idx-1); System.arraycopy(_keys ,idx+1,_keys ,idx,len-idx-1); _names = Arrays.copyOf(_names,len-1); _vecs = Arrays.copyOf(_vecs ,len-1); _keys = Arrays.copyOf(_keys ,len-1); if( v == _col0 ) _col0 = null; return v; } /** * Remove given interval of columns from frame. Motivated by R intervals. * @param startIdx - start index of column (inclusive) * @param endIdx - end index of column (exclusive) * @return an array of remove columns */ public Vec[] remove(int startIdx, int endIdx) { int len = _names.length; int nlen = len - (endIdx-startIdx); String[] names = new String[nlen]; Key[] keys = new Key[nlen]; Vec[] vecs = new Vec[nlen]; reloadVecs(); // force vecs reload if (startIdx > 0) { System.arraycopy(_names, 0, names, 0, startIdx); System.arraycopy(_vecs, 0, vecs, 0, startIdx); System.arraycopy(_keys, 0, keys, 0, startIdx); } nlen -= startIdx; if (endIdx < _names.length+1) { System.arraycopy(_names, endIdx, names, startIdx, nlen); System.arraycopy(_vecs, endIdx, vecs, startIdx, nlen); System.arraycopy(_keys, endIdx, keys, startIdx, nlen); } Vec[] vec = Arrays.copyOfRange(vecs(),startIdx,endIdx); _names = names; _vecs = vecs; _keys = keys; _col0 = null; return vec; } public Vec replace(int col, Vec nv) { if (col >= numCols()) throw new IllegalArgumentException("Trying to select column "+(col+1)+" but only "+numCols()+" present."); Vec rv = vecs()[col]; assert rv.group().equals(nv.group()); _vecs[col] = nv; _keys[col] = nv._key; if( DKV.get(nv._key)==null ) // If not already in KV, put it there DKV.put(nv._key, nv); return rv; } public Vec factor(int col) { Vec nv = vecs()[col].toEnum(); return replace(col, nv); } public Frame extractFrame(int startIdx, int endIdx) { Frame f = subframe(startIdx, endIdx); remove(startIdx, endIdx); return f; } /** Create 
a subframe from given interval of columns. * * @param startIdx index of first column (inclusive) * @param endIdx index of the last column (exclusive) * @return a new frame containing specified interval of columns */ public Frame subframe(int startIdx, int endIdx) { Frame result = new Frame(Arrays.copyOfRange(_names,startIdx,endIdx),Arrays.copyOfRange(vecs(),startIdx,endIdx)); return result; } public final String[] names() { return _names; } public int numCols() { return vecs().length; } public long numRows() { return anyVec()==null ? 0 : anyVec().length(); } public boolean isRawData() { // Right now there is only one Vec for raw data, but imagine a Parse after a JDBC import or such. for (Vec v : vecs()) { if (v.isByteVec()) return true; } return false; } // Number of columns when categoricals expanded. // Note: One level is dropped in each categorical col. public int numExpCols() { int ncols = 0; for(int i = 0; i < vecs().length; i++) ncols += vecs()[i].domain() == null ? 1 : (vecs()[i].domain().length - 1); return ncols; } /** All the domains for enum columns; null for non-enum columns. 
*/ public String[][] domains() { String ds[][] = new String[vecs().length][]; for( int i=0; i<vecs().length; i++ ) ds[i] = vecs()[i].domain(); return ds; } /** true/false every Vec is a UUID */ public boolean[] uuids() { boolean bs[] = new boolean[vecs().length]; for( int i=0; i<vecs().length; i++ ) bs[i] = vecs()[i].isUUID(); return bs; } /** Time status for every Vec */ public byte[] times() { byte bs[] = new byte[vecs().length]; for( int i=0; i<vecs().length; i++ ) bs[i] = vecs()[i]._time; return bs; } private String[][] domains(int [] cols){ Vec [] vecs = vecs(); String [][] res = new String[cols.length][]; for(int i = 0; i < cols.length; ++i) res[i] = vecs[cols[i]]._domain; return res; } private String [] names(int [] cols){ if(_names == null)return null; String [] res = new String[cols.length]; for(int i = 0; i < cols.length; ++i) res[i] = _names[cols[i]]; return res; } public Vec lastVec() { final Vec [] vecs = vecs(); return vecs[vecs.length-1]; } /** Returns the first readable vector. */ public Vec anyVec() { Vec c0 = _col0; // single read if( c0 != null ) return c0; for( Vec v : vecs() ) if( v.readable() ) return (_col0 = v); return null; } /* Returns the only Vector, or tosses IAE */ public final Vec theVec(String err) { if( _keys.length != 1 ) throw new IllegalArgumentException(err); if( _vecs == null ) _vecs = new Vec[]{_col0 = DKV.get(_keys[0]).get() }; return _vecs[0]; } /** Check that the vectors are all compatible. All Vecs have their content * sharded using same number of rows per chunk. 
 */
public boolean checkCompatible( ) {
  Vec v0 = anyVec();
  if( v0 == null ) return true;   // no readable vec: vacuously compatible
  int nchunks = v0.nChunks();
  // All vecs must have the same chunk count...
  for( Vec vec : vecs() ) {
    if( vec instanceof AppendableVec ) continue; // New Vectors are endlessly compatible
    if( vec.nChunks() != nchunks )
      throw new IllegalArgumentException("Vectors different numbers of chunks, "+nchunks+" and "+vec.nChunks());
  }
  // Also check each chunk has same rows
  for( int i=0; i<nchunks; i++ ) {
    long es = v0.chunk2StartElem(i);
    for(int j = 1; j < numCols(); ++j) {
      Vec vec = vec(j);
      if (!(vec instanceof AppendableVec) && vec.chunk2StartElem(i) != es)
        throw new IllegalArgumentException("Vector chunks have different numbers of rows, " + es + " and " + vec.chunk2StartElem(i) + " at vec " + j + " and chunk " + i);
    }
  }
  // For larger Frames, verify that the layout is compatible - else we'll be
  // endlessly cache-missing the data around the cluster, pulling copies
  // local everywhere.
  if( v0.length() > 1e4 ) {
    Key gk = v0.groupKey();
    for( Vec vec : vecs() )
      assert gk.equals(vec.groupKey()) : "Vector " + vec + " has different vector group!";
  }
  return true;
}

public void closeAppendables() {closeAppendables(new Futures()).blockForPending(); }

// Close all AppendableVec
public Futures closeAppendables(Futures fs) {
  _col0 = null;   // Reset cache
  int len = vecs().length;
  for( int i=0; i<len; i++ ) {
    Vec v = _vecs[i];
    // Closing converts the appendable into a regular Vec; publish it under the same key.
    if( v instanceof AppendableVec )
      DKV.put(_keys[i],_vecs[i] = ((AppendableVec)v).close(fs),fs);
  }
  return fs;
}

/** Actually remove/delete all Vecs from memory, not just from the Frame.
*/ @Override public Futures delete_impl(Futures fs) { for( Key k : _keys ) UKV.remove(k,fs); _names = new String[0]; _vecs = new Vec[0]; _keys = new Key[0]; return fs; } @Override public String errStr() { return "Dataset"; } public long byteSize() { long sum=0; for( int i=0; i<vecs().length; i++ ) sum += _vecs[i].byteSize(); return sum; } // Allow sorting of columns based on some function public void swap( int lo, int hi ) { assert 0 <= lo && lo < _keys.length; assert 0 <= hi && hi < _keys.length; if( lo==hi ) return; Vec vecs[] = vecs(); Vec v = vecs [lo]; vecs [lo] = vecs [hi]; vecs [hi] = v; Key k = _keys[lo]; _keys [lo] = _keys [hi]; _keys [hi] = k; String n=_names[lo]; _names[lo] = _names[hi]; _names[hi] = n; } @Override public String toString() { // Across Vec vecs[] = _vecs; // Do Not Cache _vecs in toString lest IdeaJ variable display cause side-effects if( vecs == null ) vecs = vecs_impl(); if( vecs.length==0 ) return "{}"; String s="{"+(_names==null?"C0":_names[0]); long bs=vecs[0].byteSize(); for( int i=1; i<vecs.length; i++ ) { s += ","+(_names==null?"C"+i:_names[i]); bs+= vecs[i].byteSize(); } s += "}, "+PrettyPrint.bytes(bs)+"\n"; // Down Vec v0 = vecs[0]; // Do Not Cache, no side-effects if( v0 == null ) return s; int nc = v0.nChunks(); s += "Chunk starts: {"; for( int c=0; c<nc; c++ ) s += v0.chunk2StartElem(c)+","; s += "}"; return s; } public String toStringNames() { return Arrays.toString(_names); } // Print a row with headers inlined private String toStr( long idx, int col ) { return _names[col]+"="+(_vecs[col].isNA(idx) ? 
"NA" : _vecs[col].at(idx)); } public String toString( long idx ) { String s="{"+toStr(idx,0); for( int i=1; i<_names.length; i++ ) s += ","+toStr(idx,i); return s+"}"; } public void replaceVecs(Vec [] vecs){ if(vecs.length != _vecs.length) throw new IllegalArgumentException("Incompatible number of vecs"); _vecs = vecs; _col0 = _vecs[0]; for(int i = 0; i < _keys.length; ++i) _keys[i] = vecs[i]._key; } // Print fixed-width row & fixed-width headers (more compressed print // format). Returns the column formats. public String[] toStringHdr( StringBuilder sb ) { String[] fs = new String[numCols()]; for( int c=0; c<fs.length; c++ ) { String n = (_names != null && c < _names.length) ? _names[c] : ("C"+c); int nlen = n.length(); if( numRows()==0 ) { sb.append(n).append(' '); continue; } int w=0; if( _vecs[c].isEnum() ) { String ss[] = _vecs[c]._domain; for( int i=0; i<ss.length; i++ ) w = Math.max(w,ss[i].length()); w = Math.min(w,10); fs[c] = "%"+w+"."+w+"s"; } else { Chunk C = _vecs[c].chunkForChunkIdx(0); // 1st Chunk // Possible situation: 1) vec is INT - C is has no floats => OK // 2) vec is INT - C has floats => IMPOSSIBLE, // 3) vec is FLOAT - C has floats => OK, // 4) vec is FLOAT - C has no floats => find the first chunk with floats if (!_vecs[c].isInt() && !C.hasFloat()) { for (int i=1; i<_vecs[c].nChunks(); i++) { C=_vecs[c].chunkForChunkIdx(i); if (C.hasFloat()) break; } } String f = fs[c] = C.pformat(); // Printable width for( int x=0; x<f.length(); x++ )// Get printable width from format if( Character.isDigit(f.charAt(x)) ) w = w*10+(f.charAt(x)-'0'); else if( w>0 ) break; if( f.charAt(1)==' ' ) w++; // Leading blank is not in print-width } int len = sb.length(); if( nlen>1 && w==1 ) { fs[c]=" "+fs[c]; w=2; } if( nlen <= w ) { // Short name, big digits sb.append(n); for( int i=nlen; i<w; i++ ) sb.append(' '); } else if( w==1 ) { // First char only sb.append(n.charAt(0)); } else if( w==2 ) { // First 2 chars only sb.append(n.charAt(0)).append(n.charAt(1)); } 
else { // First char dot lastchars; e.g. Compress "Interval" to "I.val" sb.append(n.charAt(0)).append('.'); for( int i=nlen-(w-2); i<nlen; i++ ) sb.append(n.charAt(i)); } assert len+w==sb.length(); sb.append(' '); // Column seperator } sb.append('\n'); return fs; } public StringBuilder toString( StringBuilder sb, String[] fs, long idx ) { Vec vecs[] = vecs(); for( int c=0; c<fs.length; c++ ) { Vec vec = vecs[c]; if( vec.isEnum() ) { String s = "----------"; if( !vec.isNA(idx) ) { int x = (int)vec.at8(idx); if( x >= 0 && x < vec._domain.length ) s = vec._domain[x]; } sb.append(String.format(fs[c],s)); } else if( vec.isInt() ) { if( vec.isNA(idx) ) { Chunk C = vec.chunkForChunkIdx(0); // 1st Chunk int len = C.pformat_len0(); // Printable width for( int i=0; i<len; i++ ) sb.append('-'); } else { try { if( vec.isUUID() ) sb.append(PrettyPrint.UUID(vec.at16l(idx),vec.at16h(idx))); else sb.append(String.format(fs[c],vec.at8(idx))); } catch( IllegalFormatException ife ) { System.out.println("Format: "+fs[c]+" col="+c+" not for ints"); ife.printStackTrace(); } } } else { sb.append(String.format(fs[c],vec.at (idx))); if( vec.isNA(idx) ) sb.append(' '); } sb.append(' '); // Column seperator } sb.append('\n'); return sb; } public String toStringAll() { StringBuilder sb = new StringBuilder(); String[] fs = toStringHdr(sb); for( int i=0; i<numRows(); i++ ) toString(sb,fs,i); return sb.toString(); } // Return the entire Frame as a CSV stream public InputStream toCSV(boolean headers) { return new CSVStream(headers, false); } public InputStream toCSV(boolean headers, boolean hex_string) { return new CSVStream(headers, hex_string); } private class CSVStream extends InputStream { private final boolean _hex_string; byte[] _line; int _position; long _row; CSVStream(boolean headers, boolean hex_string) { _hex_string = hex_string; StringBuilder sb = new StringBuilder(); Vec vs[] = vecs(); if( headers ) { sb.append('"' + _names[0] + '"'); for(int i = 1; i < vs.length; i++) 
sb.append(',').append('"' + _names[i] + '"'); sb.append('\n'); } _line = sb.toString().getBytes(); } @Override public int available() throws IOException { if(_position == _line.length) { if(_row == numRows()) return 0; StringBuilder sb = new StringBuilder(); Vec vs[] = vecs(); for( int i = 0; i < vs.length; i++ ) { if(i > 0) sb.append(','); if(!vs[i].isNA(_row)) { if( vs[i].isEnum() ) sb.append('"' + vs[i]._domain[(int) vs[i].at8(_row)] + '"'); else if( vs[i].isUUID() ) sb.append(PrettyPrint.UUID(vs[i].at16l(_row),vs[i].at16h(_row))); else if( vs[i].isInt() ) sb.append(vs[i].at8(_row)); else { // R 3.1 unfortunately changed the behavior of read.csv(). // (Really type.convert()). // // Numeric values with too much precision now trigger a type conversion in R 3.1 into a factor. // // See these discussions: // https://bugs.r-project.org/bugzilla/show_bug.cgi?id=15751 // https://stat.ethz.ch/pipermail/r-devel/2014-April/068778.html // http://stackoverflow.com/questions/23072988/preserve-old-pre-3-1-0-type-convert-behavior double d = vs[i].at(_row); String s; if (_hex_string) { // Used by R's as.data.frame(). s = Double.toHexString(d); } else { // To emit CSV files that can be read by R 3.1, limit the number of significant digits. // s = String.format("%.15g", d); s = Double.toString(d); } sb.append(s); } } } sb.append('\n'); _line = sb.toString().getBytes(); _position = 0; _row++; } return _line.length - _position; } @Override public void close() throws IOException { super.close(); _line = null; } @Override public int read() throws IOException { return available() == 0 ? -1 : _line[_position++]; } @Override public int read(byte[] b, int off, int len) throws IOException { int n = available(); if(n > 0) { n = Math.min(n, len); System.arraycopy(_line, _position, b, off, n); _position += n; } return n; } } // -------------------------------------------------------------------------- // In support of R, a generic Deep Copy & Slice. 
  // Semantics are a little odd, to match R's.
  // Each dimension spec can be:
  //   null - all of them
  //   a sorted list of negative numbers (no dups) - all BUT these
  //   an unordered list of positive - just these, allowing dups
  // The numbering is 1-based; zero's are not allowed in the lists, nor are out-of-range.
  final int MAX_EQ2_COLS = 100000; // FIXME. Put this in a better spot.

  /** Deep-copy slice of this Frame, with R-style row/column selectors.
   *  @param orows row spec: null (all rows), a 1-based long[] (negative = exclude),
   *               or a Frame-of-1-Vec used as a boolean/predicate selector.
   *  @param ocols column spec: null (all cols), a 1-based long[] (negative = exclude),
   *               or a Frame-of-1-Vec listing the column numbers.
   *  @return a new Frame holding freshly-copied Vecs for the selected rows/cols.
   *  @throws IllegalArgumentException on malformed column specs or out-of-range columns. */
  public Frame deepSlice( Object orows, Object ocols ) {
    // ocols is either a long[] or a Frame-of-1-Vec
    long[] cols = null;
    if( ocols == null ) cols = null;
    else if (ocols instanceof long[]) cols = (long[])ocols;
    else if (ocols instanceof Frame) {
      // A single-column Frame enumerates the requested column numbers.
      Frame fr = (Frame) ocols;
      if (fr.numCols() != 1)
        throw new IllegalArgumentException("Columns Frame must have only one column (actually has " + fr.numCols() + " columns)");
      long n = fr.anyVec().length();
      if (n > MAX_EQ2_COLS)
        throw new IllegalArgumentException("Too many requested columns (requested " + n +", max " + MAX_EQ2_COLS + ")");
      cols = new long[(int)n];
      Vec v = fr.anyVec();
      for (long i = 0; i < v.length(); i++) cols[(int)i] = v.at8(i);
    } else
      throw new IllegalArgumentException("Columns is specified by an unsupported data type (" + ocols.getClass().getName() + ")");

    // Since cols is probably short convert to a positive list.
    // c2 is always the zero-based list of columns to KEEP.
    int c2[] = null;
    if( cols==null ) {
      // null selector: keep every column.
      c2 = new int[numCols()];
      for( int i=0; i<c2.length; i++ ) c2[i]=i;
    } else if( cols.length==0 ) {
      c2 = new int[0];
    } else if( cols[0] > 0 ) {
      // Positive list: keep exactly these (dups allowed).
      c2 = new int[cols.length];
      for( int i=0; i<cols.length; i++ ) c2[i] = (int)cols[i]-1; // Convert 1-based cols to zero-based
    } else {
      // Negative list (assumed sorted, no dups per the contract above): keep all BUT these.
      c2 = new int[numCols()-cols.length];
      int j=0;
      for( int i=0; i<numCols(); i++ ) {
        if( j >= cols.length || i < (-cols[j]-1) ) c2[i-j] = i;
        else j++;
      }
    }
    for( int i=0; i<c2.length; i++ )
      if( c2[i] >= numCols() )
        throw new IllegalArgumentException("Trying to select column "+(c2[i]+1)+" but only "+numCols()+" present.");
    if( c2.length==0 )
      throw new IllegalArgumentException("No columns selected (did you try to select column 0 instead of column 1?)");

    // Do Da Slice
    // orows is either a long[] or a Vec
    if (orows == null)
      // All rows: a straight column copy, so rollup stats can be copied over (isACopy=true).
      return copyRollups(new DeepSlice(null,c2,vecs()).doAll(c2.length,this).outputFrame(names(c2),domains(c2)),true);
    else if (orows instanceof long[]) {
      final long CHK_ROWS=1000000;
      final long[] rows = (long[])orows;
      if (this.numRows() == 0) {
        return this;
      }
      if( rows.length==0 || rows[0] < 0 ) {
        if (rows.length != 0 && rows[0] < 0) {
          // Negative (exclusion) row list: build a temporary 0/1 "select" Vec marking
          // the rows to drop, append it to this frame, slice, then clean up.
          Vec v = new MRTask2() {
            @Override public void map(Chunk cs) {
              for (long er : rows) {
                if (er >= 0) continue;
                er = Math.abs(er) - 1; // 1-based -> 0-based
                // Only mark rows that land in this chunk's local range.
                if (er < cs._start || er > (cs._len + cs._start - 1)) continue;
                cs.set0((int) (er - cs._start), 1);
              }
            }
          }.doAll(this.anyVec().makeZero()).getResult()._fr.anyVec();
          // NOTE(review): this temporarily mutates `this` via add(...)/remove(...);
          // presumably safe because the column is removed before returning — confirm no
          // concurrent readers of this Frame exist while the slice runs.
          Frame slicedFrame = new DeepSlice(rows, c2, vecs()).doAll(c2.length, this.add("select_vec", v)).outputFrame(names(c2), domains(c2));
          UKV.remove(v._key);                                 // Drop the temp select Vec
          UKV.remove(this.remove(this.numCols()-1)._key);     // Un-append it from this Frame
          return copyRollups(slicedFrame, false);
        } else {
          // Empty row list means "all rows" (pass null selector); only then is it a clean copy.
          return copyRollups(new DeepSlice(rows.length == 0 ? null : rows, c2, vecs()).doAll(c2.length, this).outputFrame(names(c2), domains(c2)), rows.length == 0);
        }
      }
      // Vec'ize the index array: spill the (positive, possibly unordered, dups allowed)
      // row numbers into a temporary Vec of CHK_ROWS-sized chunks, then gather with Slice.
      Futures fs = new Futures();
      AppendableVec av = new AppendableVec(Vec.newKey(Key.make("rownames")));
      int r = 0;
      int c = 0;
      while (r < rows.length) {
        NewChunk nc = new NewChunk(av, c);
        long end = Math.min(r+CHK_ROWS, rows.length);
        for (; r < end; r++) {
          nc.addNum(rows[r]);
        }
        nc.close(c++, fs);
      }
      Vec c0 = av.close(fs);   // c0 is the row index vec
      fs.blockForPending();
      Frame fr2 = new Slice(c2, this).doAll(c2.length,new Frame(new String[]{"rownames"}, new Vec[]{c0})).outputFrame(names(c2), domains(c2));
      UKV.remove(c0._key); // Remove hidden vector
      return fr2;
    }
    Frame frows = (Frame)orows;
    Vec vrows = frows.anyVec();
    // It's a compatible Vec; use it as boolean selector.
    // Build column names for the result.
    Vec [] vecs = new Vec[c2.length+1];
    String [] names = new String[c2.length+1];
    for(int i = 0; i < c2.length; ++i){
      vecs[i] = _vecs[c2[i]];
      names[i] = _names[c2[i]];
    }
    // Predicate vec rides along as the LAST column; DeepSelect keeps rows where it is non-zero.
    vecs[c2.length] = vrows;
    names[c2.length] = "predicate";
    return new DeepSelect().doAll(c2.length,new Frame(names,vecs)).outputFrame(names(c2),domains(c2));
  }

  // Slice and return in the form of new chunks.
  // Gather task: the single input chunk holds 1-based row numbers; for each one, copy
  // that row of the base frame's selected columns into the output NewChunks.
  private static class Slice extends MRTask2<Slice> {
    final Frame _base;  // the base frame to slice from
    final int[] _cols;  // zero-based columns of _base to copy
    Slice(int[] cols, Frame base) { _cols = cols; _base = base; }
    @Override public void map(Chunk[] ix, NewChunk[] ncs) {
      final Vec[] vecs = new Vec[_cols.length];
      final Vec anyv = _base.anyVec();
      final long nrow = anyv.length();
      long r = ix[0].at80(0);
      // Random row gather: cache the chunk window [last_c0,last_c1) so consecutive
      // requested rows that land in the same chunk avoid repeated chunk lookups.
      int last_ci = anyv.elem2ChunkIdx(r<nrow?r:0); // memoize the last chunk index
      long last_c0 = anyv._espc[last_ci];           // ... last chunk start
      long last_c1 = anyv._espc[last_ci + 1];       // ... last chunk end
      Chunk[] last_cs = new Chunk[vecs.length];     // ... last chunks
      for (int c = 0; c < _cols.length; c++) {
        vecs[c] = _base.vecs()[_cols[c]];
        last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
      }
      for (int i = 0; i < ix[0]._len; i++) {
        // select one row
        r = ix[0].at80(i) - 1; // next row to select (1-based -> 0-based)
        if (r < 0) continue;
        if (r >= nrow) {
          // Out-of-range row number: emit NA for every selected column.
          for (int c = 0; c < vecs.length; c++) ncs[c].addNum(Double.NaN);
        } else {
          if (r < last_c0 || r >= last_c1) {
            // Row falls outside the cached chunk window; re-resolve all column chunks.
            last_ci = anyv.elem2ChunkIdx(r);
            last_c0 = anyv._espc[last_ci];
            last_c1 = anyv._espc[last_ci + 1];
            for (int c = 0; c < vecs.length; c++)
              last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
          }
          for (int c = 0; c < vecs.length; c++)
            if( vecs[c].isUUID() ) ncs[c].addUUID(last_cs[c],r);
            else ncs[c].addNum (last_cs[c].at(r));
        }
      }
    }
  }

  // Bulk (expensive) copy from 2nd cols into 1st cols.
  // Sliced by the given cols & rows
  private static class DeepSlice extends MRTask2<DeepSlice> {
    final int  _cols[];   // zero-based columns to copy
    final long _rows[];   // 1-based row selectors; null = all rows; negative = exclusion list
    final byte _isInt[];  // 1 if the column is integer-typed, else 0 (decided at construction)
    boolean _ex = true;
    DeepSlice( long rows[], int cols[], Vec vecs[] ) {
      _cols=cols;
      _rows=rows;
      _isInt = new byte[cols.length];
      for( int i=0; i<cols.length; i++ )
        _isInt[i] = (byte)(vecs[cols[i]].isInt() ? 1 : 0);
    }
    @Override public boolean logVerbose() { return false; }
    @Override public void map( Chunk chks[], NewChunk nchks[] ) {
      long rstart = chks[0]._start;
      int rlen = chks[0]._len;  // Total row count
      int rx = 0;               // Which row to in/ex-clude
      int rlo = 0;              // Lo/Hi for this block of rows
      int rhi = rlen;
      if (_rows != null && _rows[0] < 0) {
        // Exclusion mode: the caller appended a 0/1 "select" vec as the LAST column;
        // skip any rows that have 1 in the last column!
        Chunk select_vec = chks[chks.length-1];
        for (int i = 0; i < _cols.length; i++) {
          Chunk oc = chks[_cols[i]];
          NewChunk nc = nchks[i];
          if (_isInt[i] == 1) { // Slice on integer columns
            for (int j = 0; j < oc._len; j++) {
              if (select_vec.at80(j) == 1) continue;
              if (oc._vec.isUUID()) nc.addUUID(oc, j);
              else if (oc.isNA0(j)) nc.addNA();
              else nc.addNum(oc.at80(j), 0);
            }
          } else { // Slice on double columns
            for (int j = 0; j < oc._len; j++) {
              if (select_vec.at80(j) == 1) continue;
              nc.addNum(oc.at0(j));
            }
          }
        }
      } else {
        // Inclusion mode: copy either all rows (_rows==null) or the runs of
        // requested rows that fall inside this chunk's [rstart,rstart+rlen) range.
        while (true) {
          // Still got rows to include?
          if (_rows != null) { // Got a row selector?
            if (rx >= _rows.length) break; // All done with row selections
            long r = _rows[rx++] - 1;      // Next row selector (1-based -> 0-based)
            if (r < rstart) continue;      // Before this chunk; skip
            rlo = (int) (r - rstart);
            rhi = rlo + 1; // Stop at the next row
            // Coalesce sequential selectors into one [rlo,rhi) run for bulk copy.
            while (rx < _rows.length && (_rows[rx] - 1 - rstart) == rhi && rhi < rlen) {
              rx++;
              rhi++; // Grab sequential rows
            }
          }
          // Process this next set of rows
          // For all cols in the new set
          for (int i = 0; i < _cols.length; i++) {
            Chunk oc = chks[_cols[i]];
            NewChunk nc = nchks[i];
            if (_isInt[i] == 1) { // Slice on integer columns
              for (int j = rlo; j < rhi; j++)
                if (oc._vec.isUUID()) nc.addUUID(oc, j);
                else if (oc.isNA0(j)) nc.addNA();
                else nc.addNum(oc.at80(j), 0);
            } else { // Slice on double columns
              for (int j = rlo; j < rhi; j++)
                nc.addNum(oc.at0(j));
            }
          }
          rlo = rhi;
          if (_rows == null) break; // No selector: single pass over the whole chunk
        }
      }
    }
  }

  /** Split a Frame into two Frames by a per-row uniform random draw:
   *  rows with draw <= threshold go left, the rest go right.
   *  @param f frame to split (unchanged)
   *  @param threshold split point in [0,1]
   *  @param seed RNG seed; -1 means pick a random seed
   *  @return {left, right} pair of new Frames sharing f's names/domains. */
  public static Frame[] runifSplit(Frame f, float threshold, long seed) {
    if (seed == -1) seed = new Random().nextLong();
    // Build a new Vec in f's VectorGroup, zero-filled chunk by chunk.
    Vec rv = new Vec(f.anyVec().group().addVecs(1)[0],f.anyVec()._espc);
    Futures fs = new Futures();
    DKV.put(rv._key,rv, fs);
    for(int i = 0; i < rv._espc.length-1; ++i)
      DKV.put(rv.chunkKey(i),new C0DChunk(0,(int)(rv._espc[i+1]-rv._espc[i])),fs);
    fs.blockForPending();
    final long zeed = seed;
    // Fill rv with per-row uniform draws, seeded per chunk for determinism.
    // NOTE(review): zeed*c.cidx() is 0 for chunk 0, so chunk 0's draws ignore the
    // user seed entirely — looks unintended; confirm before relying on seed control.
    new MRTask2() {
      @Override public void map(Chunk c){
        Random rng = new Random(zeed*c.cidx());
        for(int i = 0; i < c._len; ++i)
          c.set0(i, (float)rng.nextDouble());
      }
    }.doAll(rv);
    // Append the random vec as the LAST column of a working frame; the selectors below
    // read it from that position.
    Vec[] vecs = new Vec[f.numCols()+1];
    System.arraycopy(f.vecs(), 0, vecs,0, f.numCols());
    vecs[f.numCols()] = rv;
    Frame doAllFr = new Frame(null, vecs);
    // it would be great if there was a map call for NewChunk[][] multi frame output
    Frame left = new DeepSelectThresh(threshold, true).doAll(f.numCols(),doAllFr).outputFrame(Key.make(), f.names(), f.domains());
    Frame rite = new DeepSelectThresh(threshold, false).doAll(f.numCols(),doAllFr).outputFrame(Key.make(), f.names(), f.domains());
    UKV.remove(rv._key); // Temp random vec no longer needed
    return new Frame[]{left,rite};
  }

  // Row filter: keep rows whose predicate (LAST column) is non-zero; copies all other
  // columns of those rows into the output NewChunks.
  private static class DeepSelect extends MRTask2<DeepSelect> {
    @Override public void map( Chunk chks[], NewChunk nchks[] ) {
      Chunk pred = chks[chks.length-1];
      for(int i = 0; i < pred._len; ++i) {
        if(pred.at0(i) != 0) {
          for( int j = 0; j < chks.length - 1; j++ ) {
            Chunk chk = chks[j];
            if( chk._vec.isUUID() ) nchks[j].addUUID(chk,i);
            else nchks[j].addNum(chk.at0(i));
          }
        }
      }
    }
  }

  // Threshold filter for runifSplit: the LAST column holds the per-row random draw.
  // _left==true keeps rows with draw <= _threshold; _left==false keeps draw > _threshold,
  // so the two passes partition the rows exactly.
  private static class DeepSelectThresh extends MRTask2<DeepSelectThresh> {
    private final float _threshold;
    private final boolean _left;
    DeepSelectThresh(float threshold, boolean left) { _threshold = threshold; _left = left; }
    // Copy row i of every column except the trailing random/selector column.
    private void addRow(Chunk[] cs, NewChunk[] ncs, int i) {
      for (int j = 0; j < cs.length -1; ++j) {
        Chunk c = cs[j];
        if (c._vec.isUUID()) ncs[j].addUUID(c,i);
        else ncs[j].addNum(c.at0(i)); // NewChunk will compress later ... not set0s
      }
    }
    @Override public void map(Chunk cs[], NewChunk ncs[]) {
      Chunk rv = cs[cs.length-1];
      for (int i = 0; i < rv._len; ++i) {
        if (_left) {
          if (rv.at0(i) <= _threshold) addRow(cs, ncs, i);
        } else {
          if (rv.at0(i) > _threshold) addRow(cs, ncs, i);
        }
      }
    }
  }

  // If fr is a clean per-column copy of this Frame, copy this Frame's already-computed
  // rollup stats onto fr's new Vecs (by matching column names) to avoid recomputing them.
  private Frame copyRollups( Frame fr, boolean isACopy ) {
    if( !isACopy ) return fr; // Not a clean copy, do not copy rollups (will do rollups "the hard way" on first ask)
    Vec vecs0[] = vecs();
    Vec vecs1[] = fr.vecs();
    for( int i=0; i<fr._names.length; i++ ) {
      assert vecs1[i]._naCnt== -1; // not computed yet, right after slice
      Vec v0 = vecs0[find(fr._names[i])];
      Vec v1 = vecs1[i];
      v1.setRollupStats(v0);
    }
    return fr;
  }

  // ------------------------------------------------------------------------------
  // Flow API adapters: wrap this Frame so per-row/filter/group-by flows can run over it.
  public <Y extends Flow.PerRow<Y>> // Type parameter
    Flow.FlowPerRow<Y>              // Return type of with()
    with                            // The method name
    ( Flow.PerRow<Y> pr )           // Arguments for with()
  {
    return new Flow.FlowPerRow<Y>(pr,new Flow.FlowFrame(this));
  }

  public Flow.FlowFilter with( Flow.Filter fr ) {
    return new Flow.FlowFilter(fr,new Flow.FlowFrame(this));
  }

  public Flow.FlowGroupBy with( Flow.GroupBy fr ) {
    return new Flow.FlowGroupBy(fr,new Flow.FlowFrame(this));
  }
}