package water.fvec; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import water.*; import water.api.schemas3.KeyV3; import water.exceptions.H2OIllegalArgumentException; import water.parser.BufferedString; import water.rapids.Merge; import water.util.*; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; import java.util.HashMap; /** A collection of named {@link Vec}s, essentially an R-like Distributed Data Frame. * * <p>Frames represent a large distributed 2-D table with named columns * ({@link Vec}s) and numbered rows. A reasonable <em>column</em> limit is * 100K columns, but there's no hard-coded limit. There's no real <em>row</em> * limit except memory; Frames (and Vecs) with many billions of rows are used * routinely. * * <p>A Frame is a collection of named Vecs; a Vec is a collection of numbered * {@link Chunk}s. A Frame is small, cheaply and easily manipulated, it is * commonly passed-by-Value. It exists on one node, and <em>may</em> be * stored in the {@link DKV}. Vecs, on the other hand, <em>must</em> be stored in the * {@link DKV}, as they represent the shared common management state for a collection * of distributed Chunks. * * <p>Multiple Frames can reference the same Vecs, although this sharing can * make Vec lifetime management complex. Commonly temporary Frames are used * to work with a subset of some other Frame (often during algorithm * execution, when some columns are dropped from the modeling process). The * temporary Frame can simply be ignored, allowing the normal GC process to * reclaim it. Such temp Frames usually have a {@code null} key. * * <p>All the Vecs in a Frame belong to the same {@link Vec.VectorGroup} which * then enforces {@link Chunk} row alignment across Vecs (or at least enforces * a low-cost access model). Parallel and distributed execution touching all * the data in a Frame relies on this alignment to get good performance. 
* * <p>Example: Make a Frame from a CSV file:<pre> * File file = ... * NFSFileVec nfs = NFSFileVec.make(file); // NFS-backed Vec, lazily read on demand * Frame fr = water.parser.ParseDataset.parse(Key.make("myKey"),nfs._key); * </pre> * * <p>Example: Find and remove the Vec called "unique_id" from the Frame, * since modeling with a unique_id can lead to overfitting: * <pre> * Vec uid = fr.remove("unique_id"); * </pre> * * <p>Example: Move the response column to the last position: * <pre> * fr.add("response",fr.remove("response")); * </pre> * */ public class Frame extends Lockable<Frame> { /** Vec names */ public String[] _names; private boolean _lastNameBig; // Last name is "Cxxx" and has largest number private Key<Vec>[] _keys; // Keys for the vectors private transient Vec[] _vecs; // The Vectors (transient to avoid network traffic) private transient Vec _col0; // First readable vec; fast access to the VectorGroup's Chunk layout public boolean hasNAs(){ for(Vec v:bulkRollups()) if(v.naCnt() > 0) return true; return false; } public boolean hasInfs() { // return if frame contains positive infinity for (Vec v : bulkRollups()) if (v.pinfs() > 0 || v.ninfs() > 0) return true; return false; } private long _naCnt = -1; synchronized public long naCount() { if (_naCnt !=- 1) return _naCnt; _naCnt = 0; for(Vec v: vecs()) _naCnt += v.naCnt(); return _naCnt; } public double naFraction() { return naCount() / (numCols() * numRows()); } /** Creates an internal frame composed of the given Vecs and default names. The frame has no key. */ public Frame(Vec... vecs){ this(null, vecs); } /** Creates an internal frame composed of the given Vecs and names. The frame has no key. */ public Frame(String names[], Vec vecs[]) { this(null, names, vecs); } /** Creates an empty frame with given key. */ public Frame(Key<Frame> key) { this(key, null, new Vec[0]); } /** * Special constructor for data with unnamed columns (e.g. svmlight) bypassing *all* checks. 
*/ public Frame(Key<Frame> key, Vec vecs[], boolean noChecks) { super(key); assert noChecks; _vecs = vecs; String[] names = new String[vecs.length]; _keys = makeVecKeys(vecs.length); for (int i = 0; i < vecs.length; i++) { names[i] = defaultColName(i); _keys[i] = vecs[i]._key; } setNames(names); } /** Creates a frame with given key, names and vectors. */ public Frame(Key<Frame> key, String names[], Vec vecs[] ) { super(key); // Require all Vecs already be installed in the K/V store for( Vec vec : vecs ) DKV.prefetch(vec._key); for( Vec vec : vecs ) { assert DKV.get(vec._key) != null : " null vec: "+vec._key; } // Always require names if( names==null ) { // Make default names, all known to be unique setNames(new String[vecs.length]); _keys = makeVecKeys(vecs.length); _vecs = vecs; for( int i=0; i<vecs.length; i++ ) _names[i] = defaultColName(i); for( int i=0; i<vecs.length; i++ ) _keys [i] = vecs[i]._key; for( int i=0; i<vecs.length; i++ ) checkCompatibility(_names[i],vecs[i]); _lastNameBig = true; } else { // Make empty to dodge asserts, then "add()" them all which will check // for compatible Vecs & names. _names = new String[0]; _keys = makeVecKeys(0); _vecs = new Vec [0]; add(names,vecs); } assert _names.length == vecs.length; } void setNamesNoCheck(String[] columns){ _names = columns; } public final void setNames(String[] columns){ if (_vecs != null && columns.length != _vecs.length) { throw new IllegalArgumentException("Number of column names=" + columns.length + " must be the number of vecs=" + _vecs.length); } _names = columns; } /** Deep copy of Vecs and Keys and Names (but not data!) to a new random Key. * The resulting Frame does not share with the original, so the set of Vecs * can be freely hacked without disturbing the original Frame. 
*/ public Frame( Frame fr ) { super( Key.<Frame>make() ); setNames(fr._names.clone()); _keys = fr._keys .clone(); _vecs = fr.vecs().clone(); _lastNameBig = fr._lastNameBig; } /** Default column name maker */ public static String defaultColName( int col ) { return "C"+(1+col); } /** * Helper method to initialize `_keys` array (which requires an unchecked cast). * @param size number of elements in the array that will be created. */ @SuppressWarnings("unchecked") private Key<Vec>[] makeVecKeys(int size) { return new Key[size]; } // Make unique names. Efficient for the special case of appending endless // versions of "C123" style names where the next name is +1 over the prior // name. All other names take the O(n^2) lookup. private int pint( String name ) { try { return Integer.valueOf(name.substring(1)); } catch(NumberFormatException ignored) { } return 0; } public String uniquify( String name ) { String n = name; int lastName = 0; if( name.length() > 0 && name.charAt(0)=='C' ) lastName = pint(name); if( _lastNameBig && _names.length > 0 ) { String last = _names[_names.length-1]; if( !last.equals("") && last.charAt(0)=='C' && lastName == pint(last)+1 ) return name; } int cnt=0, again, max=0; do { again = cnt; for( String s : _names ) { if( lastName > 0 && s.charAt(0)=='C' ) max = Math.max(max,pint(s)); if( n.equals(s) ) n = name+(cnt++); } } while( again != cnt ); if( lastName == max+1 ) _lastNameBig = true; return n; } /** Check that the vectors are all compatible. All Vecs have their content * sharded using same number of rows per chunk, and all names are unique. * Throw an IAE if something does not match. */ private void checkCompatibility(String name, Vec vec ) { if( vec instanceof AppendableVec ) return; // New Vectors are endlessly compatible Vec v0 = anyVec(); if( v0 == null ) return; // No fixed-size Vecs in the Frame // Vector group has to be the same, or else the layout has to be the same, // or else the total length has to be small. 
if( !v0.isCompatibleWith(vec) ) { if(!Vec.VectorGroup.sameGroup(v0,vec)) Log.err("Unexpected incompatible vector group, " + v0.group() + " != " + vec.group()); if(!Arrays.equals(v0.espc(), vec.espc())) Log.err("Unexpected incompatible espc, " + Arrays.toString(v0.espc()) + " != " + Arrays.toString(vec.espc())); throw new IllegalArgumentException("Vec " + name + " is not compatible with the rest of the frame"); } } /** Frames are compatible if they have the same layout (number of rows and chunking) and the same vector group (chunk placement).. */ public boolean isCompatible( Frame fr ) { if( numRows() != fr.numRows() ) return false; for( int i=0; i<vecs().length; i++ ) if( !vecs()[i].isCompatibleWith(fr.vecs()[i]) ) return false; return true; } /** Number of columns * @return Number of columns */ public int numCols() { return _keys == null? 0 : _keys.length; } /** Number of rows * @return Number of rows */ public long numRows() { Vec v = anyVec(); return v==null ? 0 : v.length(); } /** Returns the first readable vector. * @return the first readable Vec */ public final Vec anyVec() { Vec c0 = _col0; // single read if( c0 != null ) return c0; for( Vec v : vecs() ) if( v.readable() ) return (_col0 = v); return null; } /** The array of column names. * @return the array of column names */ public String[] names() { return _names; } /** A single column name. * @return the column name */ public String name(int i) { return _names[i]; } /** The array of keys. * @return the array of keys for each vec in the frame. */ public Key<Vec>[] keys() { return _keys; } public Iterable<Key<Vec>> keysList() { return Arrays.asList(_keys); } /** The internal array of Vecs. For efficiency Frames contain an array of * Vec Keys - and the Vecs themselves are lazily loaded from the {@link DKV}. * @return the internal array of Vecs */ public final Vec[] vecs() { Vec[] tvecs = _vecs; // read the content return tvecs == null ? 
(_vecs=vecs_impl()) : tvecs; } public final Vec[] vecs(int [] idxs) { Vec [] all = vecs(); Vec [] res = new Vec[idxs.length]; for(int i = 0; i < idxs.length; ++i) res[i] = all[idxs[i]]; return res; } public Vec[] vecs(String[] names) { Vec [] res = new Vec[names.length]; for(int i = 0; i < names.length; ++i) res[i] = vec(names[i]); return res; } // Compute vectors for caching private Vec[] vecs_impl() { // Load all Vec headers; load them all in parallel by starting prefetches for( Key<Vec> key : _keys ) DKV.prefetch(key); Vec [] vecs = new Vec[_keys.length]; for( int i=0; i<_keys.length; i++ ) vecs[i] = _keys[i].get(); return vecs; } /** Convenience to accessor for last Vec * @return last Vec */ public Vec lastVec() { vecs(); return _vecs [_vecs.length -1]; } /** Convenience to accessor for last Vec name * @return last Vec name */ public String lastVecName() { return _names[_names.length-1]; } /** Force a cache-flush and reload, assuming vec mappings were altered * remotely, or that the _vecs array was shared and now needs to be a * defensive copy. * @return the new instance of the Frame's Vec[] */ public final Vec[] reloadVecs() { _vecs=null; return vecs(); } /** Returns the Vec by given index, implemented by code: {@code vecs()[idx]}. * @param idx idx of column * @return this frame idx-th vector, never returns <code>null</code> */ public final Vec vec(int idx) { return vecs()[idx]; } /** Return a Vec by name, or null if missing * @return a Vec by name, or null if missing */ public Vec vec(String name) { int idx = find(name); return idx==-1 ? null : vecs()[idx]; } /** Finds the column index with a matching name, or -1 if missing * @return the column index with a matching name, or -1 if missing */ public int find( String name ) { if( name == null ) return -1; assert _names != null; // TODO: add a hashtable: O(n) is just stupid. 
for( int i=0; i<_names.length; i++ ) if( name.equals(_names[i]) ) return i; return -1; } /** Finds the matching column index, or -1 if missing * @return the matching column index, or -1 if missing */ public int find( Vec vec ) { Vec[] vecs = vecs(); //warning: side-effect if (vec == null) return -1; for( int i=0; i<vecs.length; i++ ) if( vec.equals(vecs[i]) ) return i; return -1; } /** Finds the matching column index, or -1 if missing * @return the matching column index, or -1 if missing */ public int find( Key key ) { for( int i=0; i<_keys.length; i++ ) if( key.equals(_keys[i]) ) return i; return -1; } /** Bulk {@link #find(String)} api * @return An array of column indices matching the {@code names} array */ public int[] find(String[] names) { if( names == null ) return null; int[] res = new int[names.length]; for(int i = 0; i < names.length; ++i) res[i] = find(names[i]); return res; } public void insertVec(int i, String name, Vec vec) { String [] names = new String[_names.length+1]; Vec [] vecs = new Vec[_vecs.length+1]; Key<Vec>[] keys = makeVecKeys(_keys.length + 1); System.arraycopy(_names,0,names,0,i); System.arraycopy(_vecs,0,vecs,0,i); System.arraycopy(_keys,0,keys,0,i); names[i] = name; vecs[i] = vec; keys[i] = vec._key; System.arraycopy(_names,i,names,i+1,_names.length-i); System.arraycopy(_vecs,i,vecs,i+1,_vecs.length-i); System.arraycopy(_keys,i,keys,i+1,_keys.length-i); _vecs = vecs; setNames(names); _keys = keys; } /** Pair of (column name, Frame key). 
*/ public static class VecSpecifier extends Iced implements Vec.Holder { public Key<Frame> _frame; public String _column_name; public Vec vec() { Value v = DKV.get(_frame); if (null == v) return null; Frame f = v.get(); if (null == f) return null; return f.vec(_column_name); } } /** Type for every Vec */ public byte[] types() { Vec[] vecs = vecs(); byte bs[] = new byte[vecs.length]; for( int i=0; i<vecs.length; i++ ) bs[i] = vecs[i]._type; return bs; } /** String name for each Vec type */ public String[] typesStr() { // typesStr not strTypes since shows up in intelliJ next to types Vec[] vecs = vecs(); String s[] = new String[vecs.length]; for(int i=0;i<vecs.length;++i) s[i] = vecs[i].get_type_str(); return s; } /** All the domains for categorical columns; null for non-categorical columns. * @return the domains for categorical columns */ public String[][] domains() { Vec[] vecs = vecs(); String ds[][] = new String[vecs.length][]; for( int i=0; i<vecs.length; i++ ) ds[i] = vecs[i].domain(); return ds; } /** Number of categorical levels for categorical columns; -1 for non-categorical columns. * @return the number of levels for categorical columns */ public int[] cardinality() { Vec[] vecs = vecs(); int[] card = new int[vecs.length]; for( int i=0; i<vecs.length; i++ ) card[i] = vecs[i].cardinality(); return card; } public Vec[] bulkRollups() { Futures fs = new Futures(); Vec[] vecs = vecs(); for(Vec v : vecs) v.startRollupStats(fs); fs.blockForPending(); return vecs; } /** Majority class for categorical columns; -1 for non-categorical columns. * @return the majority class for categorical columns */ public int[] modes() { Vec[] vecs = bulkRollups(); int[] modes = new int[vecs.length]; for( int i = 0; i < vecs.length; i++ ) { modes[i] = vecs[i].isCategorical() ? vecs[i].mode() : -1; } return modes; } /** All the column means. 
* @return the mean of each column */ public double[] means() { Vec[] vecs = bulkRollups(); double[] means = new double[vecs.length]; for( int i = 0; i < vecs.length; i++ ) means[i] = vecs[i].mean(); return means; } /** One over the standard deviation of each column. * @return Reciprocal the standard deviation of each column */ public double[] mults() { Vec[] vecs = bulkRollups(); double[] mults = new double[vecs.length]; for( int i = 0; i < vecs.length; i++ ) { double sigma = vecs[i].sigma(); mults[i] = standardize(sigma) ? 1.0 / sigma : 1.0; } return mults; } private static boolean standardize(double sigma) { // TODO unify handling of constant columns return sigma > 1e-6; } /** The {@code Vec.byteSize} of all Vecs * @return the {@code Vec.byteSize} of all Vecs */ public long byteSize() { try { Vec[] vecs = bulkRollups(); long sum = 0; for (Vec vec : vecs) sum += vec.byteSize(); return sum; } catch(RuntimeException ex) { Log.debug("Failure to obtain byteSize() - missing chunks?"); return -1; } } /** 64-bit checksum of the checksums of the vecs. SHA-265 checksums of the * chunks are XORed together. Since parse always parses the same pieces of * files into the same offsets in some chunk this checksum will be * consistent across reparses. * @return 64-bit Frame checksum */ @Override protected long checksum_impl() { Vec[] vecs = vecs(); long _checksum = 0; for( int i = 0; i < _names.length; ++i ) { long vec_checksum = vecs[i].checksum(); _checksum ^= vec_checksum; long tmp = (2147483647L * i); _checksum ^= tmp; } _checksum *= (0xBABE + Arrays.hashCode(_names)); // TODO: include column types? Vec.checksum() should include type? 
return _checksum; } // Add a bunch of vecs public void add( String[] names, Vec[] vecs) { bulkAdd(names, vecs); } public void add( String[] names, Vec[] vecs, int cols ) { if (null == vecs || null == names) return; if (cols == names.length && cols == vecs.length) { bulkAdd(names, vecs); } else { for (int i = 0; i < cols; i++) add(names[i], vecs[i]); } } /** Append multiple named Vecs to the Frame. Names are forced unique, by appending a * unique number if needed. */ private void bulkAdd(String[] names, Vec[] vecs) { String[] tmpnames = names.clone(); int N = names.length; assert(names.length == vecs.length):"names = " + Arrays.toString(names) + ", vecs len = " + vecs.length; for (int i=0; i<N; ++i) { vecs[i] = vecs[i] != null ? makeCompatible(new Frame(vecs[i]))[0] : null; checkCompatibility(tmpnames[i]=uniquify(tmpnames[i]),vecs[i]); // Throw IAE is mismatch } int ncols = _keys.length; // make temp arrays and don't assign them back until they are fully filled - otherwise vecs() can cache null's and NPE. String[] tmpnam = Arrays.copyOf(_names, ncols+N); Key<Vec>[] tmpkeys = Arrays.copyOf(_keys, ncols+N); Vec[] tmpvecs = Arrays.copyOf(_vecs, ncols+N); for (int i=0; i<N; ++i) { tmpnam[ncols+i] = tmpnames[i]; tmpkeys[ncols+i] = vecs[i]._key; tmpvecs[ncols+i] = vecs[i]; } _keys = tmpkeys; _vecs = tmpvecs; setNames(tmpnam); } /** Append a named Vec to the Frame. Names are forced unique, by appending a * unique number if needed. * @return the added Vec, for flow-coding */ public Vec add( String name, Vec vec ) { vec = makeCompatible(new Frame(vec))[0]; checkCompatibility(name=uniquify(name),vec); // Throw IAE is mismatch int ncols = _keys.length; String[] names = Arrays.copyOf(_names,ncols+1); names[ncols] = name; Key<Vec>[] keys = Arrays.copyOf(_keys ,ncols+1); keys [ncols] = vec._key; Vec[] vecs = Arrays.copyOf(_vecs ,ncols+1); vecs [ncols] = vec; _keys = keys; _vecs = vecs; setNames(names); return vec; } /** Append a Frame onto this Frame. 
      Names are forced unique, by appending
   *  unique numbers if needed.
   *  @return the expanded Frame, for flow-coding */
  public Frame add( Frame fr ) {
    // Clone fr's vec array so bulkAdd's in-place makeCompatible rewrites
    // do not alter the source frame's cached Vec[].
    add(fr._names, fr.vecs().clone(), fr.numCols());
    return this;
  }

  /** Insert a named column as the first column */
  public Frame prepend( String name, Vec vec ) {
    // Reject duplicate column names up front.
    if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame");
    if( _vecs.length != 0 ) {
      // The new vec must share chunk layout with the frame: either the same
      // vector group or an identical espc (elements-per-chunk) layout.
      if( !anyVec().group().equals(vec.group()) && !Arrays.equals(anyVec().espc(),vec.espc()) )
        throw new IllegalArgumentException("Vector groups differs - adding vec '"+name+"' into the frame " + Arrays.toString(_names));
      if( numRows() != vec.length() )
        throw new IllegalArgumentException("Vector lengths differ - adding vec '"+name+"' into the frame " + Arrays.toString(_names));
    }
    final int len = _names != null ? _names.length : 0;
    // Build fresh parallel arrays with the new column in slot 0.
    String[] _names2 = new String[len + 1];
    Vec[] _vecs2 = new Vec[len + 1];
    Key<Vec>[] _keys2 = makeVecKeys(len + 1);
    _names2[0] = name;
    _vecs2 [0] = vec;
    _keys2 [0] = vec._key;
    if (_names != null) {
      System.arraycopy(_names, 0, _names2, 1, len);
      System.arraycopy(_vecs, 0, _vecs2, 1, len);
      System.arraycopy(_keys, 0, _keys2, 1, len);
    }
    _vecs = _vecs2;
    _keys = _keys2;
    setNames(_names2);
    return this;
  }

  /** Swap two Vecs in-place; useful for sorting columns by some criteria */
  public void swap( int lo, int hi ) {
    assert 0 <= lo && lo < _keys.length;
    assert 0 <= hi && hi < _keys.length;
    if( lo==hi ) return;
    Vec vecs[] = vecs();
    // Swap the entries in all three parallel arrays (vecs, keys, names).
    Vec v = vecs [lo]; vecs [lo] = vecs [hi]; vecs [hi] = v;
    Key<Vec> k = _keys[lo]; _keys[lo] = _keys[hi]; _keys[hi] = k;
    String n = _names[lo]; _names[lo] = _names[hi]; _names[hi] = n;
  }

  /** move the provided columns to be first, in-place.
For Merge currently since method='hash' was coded like that */ public void moveFirst( int cols[] ) { boolean colsMoved[] = new boolean[_keys.length]; Vec tmpvecs[] = vecs().clone(); Key<Vec> tmpkeys[] = _keys.clone(); String tmpnames[] = _names.clone(); // Move the desired ones first for (int i=0; i<cols.length; i++) { int w = cols[i]; if (colsMoved[w]) throw new IllegalArgumentException("Duplicates in column numbers passed in"); if (w<0 || w>=_keys.length) throw new IllegalArgumentException("column number out of 0-based range"); colsMoved[w] = true; tmpvecs[i] = _vecs[w]; tmpkeys[i] = _keys[w]; tmpnames[i] = _names[w]; } // Put the other ones afterwards int w = cols.length; for (int i=0; i<_keys.length; i++) { if (!colsMoved[i]) { tmpvecs[w] = _vecs[i]; tmpkeys[w] = _keys[i]; tmpnames[w] = _names[i]; w++; } } // Copy back over the original in-place for (int i=0; i<_keys.length; i++) { _vecs[i] = tmpvecs[i]; _keys[i] = tmpkeys[i]; _names[i] = tmpnames[i]; } } /** Returns a subframe of this frame containing only vectors with desired names. * * @param names list of vector names * @return a new frame which collects vectors from this frame with desired names. * @throws IllegalArgumentException if there is no vector with desired name in this frame. */ public Frame subframe(String[] names) { return subframe(names, false, 0)[0]; } /** Create a subframe from this frame based on desired names. * Throws an exception if desired column is not in this frame and <code>replaceBy</code> is <code>false</code>. * Else replace a missing column by a constant column with given value. 
* * @param names list of column names to extract * @param replaceBy should be missing column replaced by a constant column * @param c value for constant column * @return array of 2 frames, the first is containing a desired subframe, the second one contains newly created columns or null * @throws IllegalArgumentException if <code>replaceBy</code> is false and there is a missing column in this frame */ private Frame[] subframe(String[] names, boolean replaceBy, double c){ Vec [] vecs = new Vec[names.length]; Vec [] cvecs = replaceBy ? new Vec [names.length] : null; String[] cnames = replaceBy ? new String[names.length] : null; int ccv = 0; // counter of constant columns vecs(); // Preload the vecs HashMap<String, Integer> map = new HashMap<>((int) ((names.length/0.75f)+1)); // avoid rehashing by set up initial capacity for(int i = 0; i < _names.length; ++i) map.put(_names[i], i); for(int i = 0; i < names.length; ++i) if(map.containsKey(names[i])) vecs[i] = _vecs[map.get(names[i])]; else if (replaceBy) { Log.warn("Column " + names[i] + " is missing, filling it in with " + c); cnames[ccv] = names[i]; vecs[i] = cvecs[ccv++] = anyVec().makeCon(c); } return new Frame[] { new Frame(Key.<Frame>make("subframe" + Key.make().toString()), names, vecs), ccv > 0? new Frame(Key.<Frame>make("subframe" + Key.make().toString()), Arrays.copyOf(cnames, ccv), Arrays.copyOf(cvecs,ccv)) : null }; } /** Allow rollups for all written-into vecs; used by {@link MRTask} once * writing is complete. * @return the original Futures, for flow-coding */ public Futures postWrite(Futures fs) { for( Vec v : vecs() ) v.postWrite(fs); return fs; } /** Actually remove/delete all Vecs from memory, not just from the Frame. * @return the original Futures, for flow-coding */ @Override protected Futures remove_impl(Futures fs) { final Key[] keys = _keys; if( keys.length==0 ) return fs; // Get the nChunks without calling anyVec - which loads all Vecs eagerly, // only to delete them. 
Supports Frames with some Vecs already deleted, as // a Scope cleanup action might delete Vecs out of order. Vec v = _col0; if (v == null) { Vec[] vecs = _vecs; // Read once, in case racily being cleared if (vecs != null) for (Vec vec : vecs) if ((v = vec) != null) // Stop on finding the 1st Vec break; } if (v == null) // Ok, now do DKV gets for (Key<Vec> _key1 : _keys) if ((v = _key1.get()) != null) break; // Stop on finding the 1st Vec if (v == null) return fs; _vecs = new Vec[0]; setNames(new String[0]); _keys = makeVecKeys(0); // Bulk dumb local remove - no JMM, no ordering, no safety. Vec.bulk_remove(keys, v.nChunks()); return fs; } /** Write out K/V pairs, in this case Vecs. */ @Override protected AutoBuffer writeAll_impl(AutoBuffer ab) { for( Key k : _keys ) ab.putKey(k); return super.writeAll_impl(ab); } @Override protected Keyed readAll_impl(AutoBuffer ab, Futures fs) { for( Key k : _keys ) ab.getKey(k,fs); return super.readAll_impl(ab,fs); } /** Replace one column with another. Caller must perform global update (DKV.put) on * this updated frame. * @return The old column, for flow-coding */ public Vec replace(int col, Vec nv) { Vec rv = vecs()[col]; nv = ((new Frame(rv)).makeCompatible(new Frame(nv)))[0]; DKV.put(nv); assert DKV.get(nv._key)!=null; // Already in DKV assert rv.isCompatibleWith(nv); _vecs[col] = nv; _keys[col] = nv._key; return rv; } /** Create a subframe from given interval of columns. * @param startIdx index of first column (inclusive) * @param endIdx index of the last column (exclusive) * @return a new Frame containing specified interval of columns */ public Frame subframe(int startIdx, int endIdx) { return new Frame(Arrays.copyOfRange(_names,startIdx,endIdx),Arrays.copyOfRange(vecs(),startIdx,endIdx)); } /** Split this Frame; return a subframe created from the given column interval, and * remove those columns from this Frame. 
* @param startIdx index of first column (inclusive) * @param endIdx index of the last column (exclusive) * @return a new Frame containing specified interval of columns */ public Frame extractFrame(int startIdx, int endIdx) { Frame f = subframe(startIdx, endIdx); remove(startIdx, endIdx); return f; } /** Removes the column with a matching name. * @return The removed column */ public Vec remove( String name ) { return remove(find(name)); } public Frame remove( String[] names ) { for( String name : names ) remove(find(name)); return this; } /** Removes a list of columns by index; the index list must be sorted * @return an array of the removed columns */ public Vec[] remove( int[] idxs ) { for( int i : idxs ) if(i < 0 || i >= vecs().length) throw new ArrayIndexOutOfBoundsException(); Arrays.sort(idxs); Vec[] res = new Vec[idxs.length]; Vec[] rem = new Vec[_vecs.length-idxs.length]; String[] names = new String[rem.length]; Key<Vec>[] keys = makeVecKeys(rem.length); int j = 0; int k = 0; int l = 0; for(int i = 0; i < _vecs.length; ++i) { if(j < idxs.length && i == idxs[j]) { ++j; res[k++] = _vecs[i]; } else { rem [l] = _vecs [i]; names[l] = _names[i]; keys [l] = _keys [i]; ++l; } } _vecs = rem; setNames(names); _keys = keys; assert l == rem.length && k == idxs.length; return res; } /** Removes a numbered column. * @return the removed column */ public final Vec remove( int idx ) { int len = _names.length; if( idx < 0 || idx >= len ) return null; Vec v = vecs()[idx]; if( v == _col0 ) _col0 = null; _vecs = ArrayUtils.remove(_vecs, idx); setNames(ArrayUtils.remove(_names, idx)); _keys = ArrayUtils.remove(_keys, idx); return v; } /** * Remove all the vecs from frame. */ public Vec[] removeAll() { return remove(0, _names.length); } /** Remove given interval of columns from frame. Motivated by R intervals. 
* @param startIdx - start index of column (inclusive) * @param endIdx - end index of column (exclusive) * @return array of removed columns */ Vec[] remove(int startIdx, int endIdx) { int len = _names.length; int nlen = len - (endIdx-startIdx); String[] names = new String[nlen]; Key<Vec>[] keys = makeVecKeys(nlen); Vec[] vecs = new Vec[nlen]; vecs(); if (startIdx > 0) { System.arraycopy(_names, 0, names, 0, startIdx); System.arraycopy(_vecs, 0, vecs, 0, startIdx); System.arraycopy(_keys, 0, keys, 0, startIdx); } nlen -= startIdx; if (endIdx < _names.length+1) { System.arraycopy(_names, endIdx, names, startIdx, nlen); System.arraycopy(_vecs, endIdx, vecs, startIdx, nlen); System.arraycopy(_keys, endIdx, keys, startIdx, nlen); } Vec[] vecX = Arrays.copyOfRange(_vecs,startIdx,endIdx); _vecs = vecs; _keys = keys; setNames(names); _col0 = null; return vecX; } /** Restructure a Frame completely */ public void restructure( String[] names, Vec[] vecs) { restructure(names, vecs, vecs.length); } /** Restructure a Frame completely, but only for a specified number of columns (counting up) */ public void restructure( String[] names, Vec[] vecs, int cols) { // Make empty to dodge asserts, then "add()" them all which will check for // compatible Vecs & names. _keys = makeVecKeys(0); _vecs = new Vec [0]; setNames(new String[0]); add(names,vecs,cols); } // -------------------------------------------- // Utilities to help external Frame constructors, e.g. Spark. // Make an initial Frame & lock it for writing. Build Vec Keys. void preparePartialFrame( String[] names ) { // Nuke any prior frame (including freeing storage) & lock this one if( _keys != null ) delete_and_lock(); else write_lock(); _keys = new Vec.VectorGroup().addVecs(names.length); setNamesNoCheck(names); // No Vectors tho!!! These will be added *after* the import } // Only serialize strings, not H2O internal structures // Make NewChunks to for holding data from e.g. Spark. 
Once per set of // Chunks in a Frame, before filling them. This can be called in parallel // for different Chunk#'s (cidx); each Chunk can be filled in parallel. static NewChunk[] createNewChunks(String name, byte[] type, int cidx) { Frame fr = (Frame) Key.make(name).get(); NewChunk[] nchks = new NewChunk[fr.numCols()]; for (int i = 0; i < nchks.length; i++) { nchks[i] = new NewChunk(new AppendableVec(fr._keys[i], type[i]), cidx); } return nchks; } // Compress & DKV.put NewChunks. Once per set of Chunks in a Frame, after // filling them. Can be called in parallel for different sets of Chunks. static void closeNewChunks(NewChunk[] nchks) { Futures fs = new Futures(); for (NewChunk nchk : nchks) { nchk.close(fs); } fs.blockForPending(); } // Build real Vecs from loose Chunks, and finalize this Frame. Called once // after any number of [create,close]NewChunks. void finalizePartialFrame( long[] espc, String[][] domains, byte[] types ) { // Compute elems-per-chunk. // Roll-up elem counts, so espc[i] is the starting element# of chunk i. int nchunk = espc.length; long espc2[] = new long[nchunk+1]; // Shorter array long x=0; // Total row count so far for( int i=0; i<nchunk; i++ ) { espc2[i] = x; // Start elem# for chunk i x += espc[i]; // Raise total elem count } espc2[nchunk]=x; // Total element count in last // For all Key/Vecs - insert Vec header Futures fs = new Futures(); _vecs = new Vec[_keys.length]; for( int i=0; i<_keys.length; i++ ) { // Insert Vec header Vec vec = _vecs[i] = new Vec( _keys[i], Vec.ESPC.rowLayout(_keys[i],espc2), domains!=null ? domains[i] : null, types[i]); // Here we have to save vectors since // saving during unlock will invoke Frame vector // refresh DKV.put(_keys[i],vec,fs); } fs.blockForPending(); unlock(); } // -------------------------------------------------------------------------- static final int MAX_EQ2_COLS = 100000; // Limit of columns user is allowed to request /** In support of R, a generic Deep Copy and Slice. 
*
 * <p>Semantics are a little odd, to match R's. Each dimension spec can be:<ul>
 * <li><em>null</em> - all of them
 * <li><em>a sorted list of negative numbers (no dups)</em> - all BUT these
 * <li><em>an unordered list of positive</em> - just these, allowing dups
 * </ul>
 *
 * <p>The numbering is 1-based; zero's are not allowed in the lists, nor are out-of-range values.
 * @param orows row spec: null, a long[] of row numbers, or a Frame with a boolean selection Vec
 * @param ocols col spec: null, a long[] of column numbers, or a 1-column Frame of column numbers
 * @return the sliced Frame
 */
public Frame deepSlice( Object orows, Object ocols ) {
  // ocols is either a long[] or a Frame-of-1-Vec
  long[] cols;
  if( ocols == null ) cols = null;
  else if (ocols instanceof long[]) cols = (long[])ocols;
  else if (ocols instanceof Frame) {
    Frame fr = (Frame) ocols;
    if (fr.numCols() != 1)
      throw new IllegalArgumentException("Columns Frame must have only one column (actually has " + fr.numCols() + " columns)");
    long n = fr.anyVec().length();
    if (n > MAX_EQ2_COLS)
      throw new IllegalArgumentException("Too many requested columns (requested " + n + ", max " + MAX_EQ2_COLS + ")");
    cols = new long[(int)n];
    // Materialize the column-number Vec into a local long[]
    Vec.Reader v = fr.anyVec().new Reader();
    for (long i = 0; i < v.length(); i++) cols[(int)i] = v.at8(i);
  } else
    throw new IllegalArgumentException("Columns is specified by an unsupported data type (" + ocols.getClass().getName() + ")");

  // Since cols is probably short convert to a positive list.
  int c2[];
  if( cols==null ) {            // null: select every column
    c2 = new int[numCols()];
    for( int i=0; i<c2.length; i++ ) c2[i]=i;
  } else if( cols.length==0 ) { // empty: no columns (rejected below)
    c2 = new int[0];
  } else if( cols[0] >= 0 ) {   // positive list: exactly these columns
    c2 = new int[cols.length];
    for( int i=0; i<cols.length; i++ )
      c2[i] = (int)cols[i];     // Conversion of 1-based cols to 0-based is handled by a 1-based front-end!
  } else {                      // negative (sorted) list: all columns BUT these
    c2 = new int[numCols()-cols.length];
    int j=0;
    for( int i=0; i<numCols(); i++ ) {
      if( j >= cols.length || i < (-(1+cols[j])) ) c2[i-j] = i;
      else j++;
    }
  }
  for (int aC2 : c2)
    if (aC2 >= numCols())
      throw new IllegalArgumentException("Trying to select column " + (aC2 + 1) + " but only " + numCols() + " present.");
  if( c2.length==0 )
    throw new IllegalArgumentException("No columns selected (did you try to select column 0 instead of column 1?)");

  // Do Da Slice
  // orows is either a long[] or a Vec
  if (numRows() == 0) {
    // Empty frame: emit a single all-NA row per selected column.
    return new MRTask() {
      @Override public void map(Chunk[] chks, NewChunk[] nchks) {
        for (NewChunk nc : nchks) nc.addNA();
      }
    }.doAll(types(c2), this).outputFrame(names(c2), domains(c2));
  }
  if (orows == null)
    return new DeepSlice(null,c2,vecs()).doAll(types(c2),this).outputFrame(names(c2),domains(c2));
  else if (orows instanceof long[]) {
    final long CHK_ROWS=1000000;
    final long[] rows = (long[])orows;
    if (this.numRows() == 0) { return this; }
    if( rows.length==0 || rows[0] < 0 ) {
      if (rows.length != 0 && rows[0] < 0) {
        // Negative row numbers mean exclusion: mark each excluded row with a 1 in a
        // temporary "select_vec" column, then let DeepSlice skip the marked rows.
        Vec v0 = this.anyVec().makeZero();
        Vec v = new MRTask() {
          @Override public void map(Chunk cs) {
            for (long er : rows) {
              if (er >= 0) continue;
              er = Math.abs(er);
              if (er < cs._start || er > (cs._len + cs._start - 1)) continue; // not in this chunk
              cs.set((int) (er - cs._start), 1);
            }
          }
        }.doAll(v0).getResult()._fr.anyVec();
        Keyed.remove(v0._key);
        Frame slicedFrame = new DeepSlice(rows, c2, vecs()).doAll(types(c2), this.add("select_vec", v)).outputFrame(names(c2), domains(c2));
        // Clean up the temporary selection vec (added above as the last column).
        Keyed.remove(v._key);
        Keyed.remove(this.remove(this.numCols() - 1)._key);
        return slicedFrame;
      } else {
        return new DeepSlice(rows.length == 0 ? null : rows, c2, vecs()).doAll(types(c2), this).outputFrame(names(c2), domains(c2));
      }
    }
    // Vec'ize the index array
    Futures fs = new Futures();
    AppendableVec av = new AppendableVec(Vec.newKey(),Vec.T_NUM);
    int r = 0;
    int c = 0;
    while (r < rows.length) {
      NewChunk nc = new NewChunk(av, c);
      long end = Math.min(r+CHK_ROWS, rows.length); // at most CHK_ROWS indices per chunk
      for (; r < end; r++) {
        nc.addNum(rows[r]);
      }
      nc.close(c++, fs);
    }
    Vec c0 = av.layout_and_close(fs); // c0 is the row index vec
    fs.blockForPending();
    Frame ff = new Frame(new String[]{"rownames"}, new Vec[]{c0});
    Frame fr2 = new Slice(c2, this).doAll(types(c2),ff).outputFrame(names(c2), domains(c2));
    Keyed.remove(c0._key);
    Keyed.remove(av._key);
    ff.delete();
    return fr2;
  }
  Frame frows = (Frame)orows; // It's a compatible Vec; use it as boolean selector.
  // Build column names for the result.
  Vec [] vecs = new Vec[c2.length];
  String [] names = new String[c2.length];
  for(int i = 0; i < c2.length; ++i){
    vecs[i] = _vecs[c2[i]];
    names[i] = _names[c2[i]];
  }
  Frame ff = new Frame(names, vecs);
  ff.add("predicate", frows.anyVec());
  return new DeepSelect().doAll(types(c2),ff).outputFrame(names(c2),domains(c2));
}

// Slice and return in the form of new chunks.
private static class Slice extends MRTask<Slice> {
  final Frame _base;  // the base frame to slice from
  final int[] _cols;  // columns (in _base) to copy
  Slice(int[] cols, Frame base) { _cols = cols; _base = base; }
  // ix[0] holds the requested row numbers; emit one output row per index
  // (negative indices are skipped, out-of-range indices become NA rows).
  @Override public void map(Chunk[] ix, NewChunk[] ncs) {
    final Vec[] vecs = new Vec[_cols.length];
    final Vec anyv = _base.anyVec();
    final long nrow = anyv.length();
    long r = ix[0].at8(0);
    int last_ci = anyv.elem2ChunkIdx(r<nrow?r:0); // memoize the last chunk index
    long last_c0 = anyv.espc()[last_ci];          // ... last chunk start
    long last_c1 = anyv.espc()[last_ci + 1];      // ... last chunk end
    Chunk[] last_cs = new Chunk[vecs.length];     // ... last chunks
    for (int c = 0; c < _cols.length; c++) {
      vecs[c] = _base.vecs()[_cols[c]];
      last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
    }
    for (int i = 0; i < ix[0]._len; i++) {
      // select one row
      r = ix[0].at8(i); // next row to select
      if (r < 0) continue;
      if (r >= nrow) {  // out-of-range row: emit NA in every column
        for (int c = 0; c < vecs.length; c++) ncs[c].addNA();
      } else {
        if (r < last_c0 || r >= last_c1) { // cache miss: refresh the memoized chunk window
          last_ci = anyv.elem2ChunkIdx(r);
          last_c0 = anyv.espc()[last_ci];
          last_c1 = anyv.espc()[last_ci + 1];
          for (int c = 0; c < vecs.length; c++) last_cs[c] = vecs[c].chunkForChunkIdx(last_ci);
        }
        int ir = (int)(r - last_cs[0].start());
        for (int c = 0; c < vecs.length; c++) last_cs[c].extractRows(ncs[c],ir);
      }
    }
  }
}

// Convert len rows starting at off to a 2-d ascii table
@Override public String toString( ) {
  return ("Frame key: " + _key + "\n")
      + " cols: " + numCols() + "\n"
      + " rows: " + numRows() + "\n"
      + " chunks: " + (anyVec() == null ? "N/A" : anyVec().nChunks()) + "\n"
      + " size: " + byteSize() + "\n";
}

/** Render rows [off,off+len) as an ascii table (with rollups). */
public String toString(long off, int len) { return toTwoDimTable(off, len).toString(); }
public String toString(long off, int len, boolean rollups) { return toTwoDimTable(off, len, rollups).toString(); }

/** First 10 rows, with rollups. */
public TwoDimTable toTwoDimTable() { return toTwoDimTable(0,10); }
public TwoDimTable toTwoDimTable(long off, int len ) { return toTwoDimTable(off,len,true); }

/** Render rows [off, off+len) (clamped to the frame size) as a TwoDimTable;
 *  when {@code rollups} is set, 5 extra header rows (min/mean/stddev/max/missing)
 *  are prepended. */
public TwoDimTable toTwoDimTable(long off, int len, boolean rollups ) {
  if( off > numRows() ) off = numRows();               // clamp the window to the frame
  if( off+len > numRows() ) len = (int)(numRows()-off);
  String[] rowHeaders = new String[len];
  int H=0;
  if( rollups ) {
    H = 5;
    rowHeaders = new String[len+H];
    rowHeaders[0] = "min";
    rowHeaders[1] = "mean";
    rowHeaders[2] = "stddev";
    rowHeaders[3] = "max";
    rowHeaders[4] = "missing";
    for( int i=0; i<len; i++ ) rowHeaders[i+H]=""+(off+i); // data rows labeled by absolute row#
  }
  final int ncols = numCols();
  final Vec[] vecs = vecs();
  String[] coltypes = new String[ncols];
  String[][] strCells = new String[len+H][ncols];
  double[][] dblCells = new double[len+H][ncols];
  final
BufferedString tmpStr = new BufferedString(); // reused scratch buffer for string cells
  for( int i=0; i<ncols; i++ ) {
    if( DKV.get(_keys[i]) == null ) { // deleted Vec in Frame
      coltypes[i] = "string";
      for( int j=0; j<len+H; j++ ) dblCells[j][i] = TwoDimTable.emptyDouble;
      for( int j=0; j<len; j++ ) strCells[j+H][i] = "NO_VEC";
      continue;
    }
    Vec vec = vecs[i];
    if( rollups ) { // fill the 5 summary rows from the Vec's rollup stats
      dblCells[0][i] = vec.min();
      dblCells[1][i] = vec.mean();
      dblCells[2][i] = vec.sigma();
      dblCells[3][i] = vec.max();
      dblCells[4][i] = vec.naCnt();
    }
    // Per-type rendering: exactly one of strCells/dblCells is used per cell;
    // the other side is filled with null / emptyDouble.
    switch( vec.get_type() ) {
    case Vec.T_BAD:
      coltypes[i] = "string";
      for( int j=0; j<len; j++ ) { strCells[j+H][i] = null; dblCells[j+H][i] = TwoDimTable.emptyDouble; }
      break;
    case Vec.T_STR :
      coltypes[i] = "string";
      for( int j=0; j<len; j++ ) { strCells[j+H][i] = vec.isNA(off+j) ? "" : vec.atStr(tmpStr,off+j).toString(); dblCells[j+H][i] = TwoDimTable.emptyDouble; }
      break;
    case Vec.T_CAT:
      coltypes[i] = "string";
      for( int j=0; j<len; j++ ) { strCells[j+H][i] = vec.isNA(off+j) ? "" : vec.factor(vec.at8(off+j)); dblCells[j+H][i] = TwoDimTable.emptyDouble; }
      break;
    case Vec.T_TIME:
      coltypes[i] = "string";
      DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss");
      for( int j=0; j<len; j++ ) { strCells[j+H][i] = vec.isNA(off+j) ? "" : fmt.print(vec.at8(off+j)); dblCells[j+H][i] = TwoDimTable.emptyDouble; }
      break;
    case Vec.T_NUM:
      coltypes[i] = vec.isInt() ? "long" : "double";
      for( int j=0; j<len; j++ ) { dblCells[j+H][i] = vec.isNA(off+j) ? TwoDimTable.emptyDouble : vec.at(off + j); strCells[j+H][i] = null; }
      break;
    case Vec.T_UUID:
      throw H2O.unimpl(); // UUID rendering not implemented here
    default:
      System.err.println("bad vector type during debug print: "+vec.get_type());
      throw H2O.fail();
    }
  }
  return new TwoDimTable("Frame "+_key,numRows()+" rows and "+numCols()+" cols",rowHeaders,/* clone the names, the TwoDimTable will replace nulls with ""*/_names.clone(),coltypes,null, "", strCells, dblCells);
}

// Bulk (expensive) copy from 2nd cols into 1st cols.
// Sliced by the given cols & rows private static class DeepSlice extends MRTask<DeepSlice> { final int _cols[]; final long _rows[]; final byte _isInt[]; DeepSlice( long rows[], int cols[], Vec vecs[] ) { _cols=cols; _rows=rows; _isInt = new byte[cols.length]; for( int i=0; i<cols.length; i++ ) _isInt[i] = (byte)(vecs[cols[i]].isInt() ? 1 : 0); } @Override public boolean logVerbose() { return false; } @Override public void map( Chunk chks[], NewChunk nchks[] ) { long rstart = chks[0]._start; int rlen = chks[0]._len; // Total row count int rx = 0; // Which row to in/ex-clude int rlo = 0; // Lo/Hi for this block of rows int rhi = rlen; while (true) { // Still got rows to include? if (_rows != null) { // Got a row selector? if (rx >= _rows.length) break; // All done with row selections long r = _rows[rx++];// Next row selector if (r < rstart) continue; rlo = (int) (r - rstart); rhi = rlo + 1; // Stop at the next row while (rx < _rows.length && (_rows[rx] - rstart) == rhi && rhi < rlen) { rx++; rhi++; // Grab sequential rows } } // Process this next set of rows // For all cols in the new set; BufferedString tmpStr = new BufferedString(); for (int i = 0; i < _cols.length; i++) chks[_cols[i]].extractRows(nchks[i], rlo,rhi); rlo = rhi; if (_rows == null) break; } } } /** * Create a copy of the input Frame and return that copied Frame. All Vecs in this are copied in parallel. * Caller must do the DKV.put * @param keyName Key for resulting frame. If null, no key will be given. * @return The fresh copy of fr. 
*/
public Frame deepCopy(String keyName) {
  final Vec [] vecs = vecs().clone();
  // New Vec keys in the same VectorGroup, so chunk layout stays aligned.
  Key [] ks = anyVec().group().addVecs(vecs.length);
  Futures fs = new Futures();
  // Publish the new (empty) Vec headers, reusing this frame's row layout, domains and types.
  for(int i = 0; i < vecs.length; ++i)
    DKV.put(vecs[i] = new Vec(ks[i], anyVec()._rowLayout, vecs[i].domain(),vecs()[i]._type),fs);
  new MRTask() {
    @Override public void map(Chunk[] cs) {
      int cidx = cs[0].cidx();
      // Deep-copy each chunk directly into the DKV under the new Vec's chunk key.
      for(int i = 0; i < cs.length; ++i)
        DKV.put(vecs[i].chunkKey(cidx),cs[i].deepCopy(),_fs);
    }
  }.doAll(this);//.outputFrame(keyName==null?null:Key.make(keyName),this.names(),this.domains());
  fs.blockForPending();
  return new Frame((keyName==null?null:Key.<Frame>make(keyName)),this.names(),vecs);
}

/**
 * Last column is a bit vec indicating whether or not to take the row.
 */
public static class DeepSelect extends MRTask<DeepSelect> {
  @Override public void map( Chunk[] chks, NewChunk [] nchks ) {
    Chunk pred = chks[chks.length - 1];
    // Decompress the predicate column into an int[].
    int[] ids = pred.getIntegers(new int[pred._len],0,pred._len,0);
    // Compact in place: keep the row index of every 1-valued entry, counting dropped entries.
    int zeros = 0;
    for(int i = 0; i < ids.length; ++i)
      if(ids[i] == 1){ ids[i-zeros] = i; } else zeros++;
    ids = Arrays.copyOf(ids,ids.length-zeros);
    // Copy just the selected rows of every data column (predicate column excluded).
    for (int c = 0; c < chks.length-1; ++c)
      chks[c].extractRows(nchks[c], ids);
  }
}

// Per-column categorical domains for the given column subset.
private String[][] domains(int [] cols){
  Vec[] vecs = vecs();
  String[][] res = new String[cols.length][];
  for(int i = 0; i < cols.length; ++i)
    res[i] = vecs[cols[i]].domain();
  return res;
}

// Column names for the given column subset; null if this frame has no names.
private String [] names(int [] cols){
  if(_names == null)return null;
  String [] res = new String[cols.length];
  for(int i = 0; i < cols.length; ++i)
    res[i] = _names[cols[i]];
  return res;
}

// Column Vec types for the given column subset.
private byte[] types(int [] cols){
  Vec[] vecs = vecs();
  byte[] res = new byte[cols.length];
  for(int i = 0; i < cols.length; ++i)
    res[i] = vecs[cols[i]]._type;
  return res;
}

public Vec[] makeCompatible( Frame f) {return makeCompatible(f,false);}

/** Return array of Vectors if 'f' is compatible with 'this', else return a new
 *  array of Vectors compatible with 'this' and a copy of 'f's data otherwise.
Note * that this can, in the worst case, copy all of {@code this}s' data. * @return This Frame's data in an array of Vectors that is compatible with {@code f}. */ public Vec[] makeCompatible( Frame f, boolean force) { // Small data frames are always "compatible" if (anyVec() == null) // Or it is small return f.vecs(); // Then must be compatible Vec v1 = anyVec(); Vec v2 = f.anyVec(); if (v1 != null && v2 != null && v1.length() != v2.length()) throw new IllegalArgumentException("Can not make vectors of different length compatible!"); if (v1 == null || v2 == null || (!force && v1.isCompatibleWith(v2))) return f.vecs(); // Ok, here make some new Vecs with compatible layout Key k = Key.make(); H2O.submitTask(new RebalanceDataSet(this, f, k)).join(); Frame f2 = (Frame)k.get(); DKV.remove(k); for (Vec v : f2.vecs()) Scope.track(v); return f2.vecs(); } public static Job export(Frame fr, String path, String frameName, boolean overwrite, int nParts) { boolean forceSingle = nParts == 1; // Validate input if (forceSingle) { boolean fileExists = H2O.getPM().exists(path); if (overwrite && fileExists) { Log.warn("File " + path + " exists, but will be overwritten!"); } else if (!overwrite && fileExists) { throw new H2OIllegalArgumentException(path, "exportFrame", "File " + path + " already exists!"); } } else { if (! H2O.getPM().isEmptyDirectoryAllNodes(path)) { throw new H2OIllegalArgumentException(path, "exportFrame", "Cannot use path " + path + " to store part files! The target needs to be either an existing empty directory or not exist yet."); } } Job job = new Job<>(fr._key, "water.fvec.Frame", "Export dataset"); FrameUtils.ExportTaskDriver t = new FrameUtils.ExportTaskDriver(fr, path, frameName, overwrite, job, nParts); return job.start(t, fr.anyVec().nChunks()); } /** Convert this Frame to a CSV (in an {@link InputStream}), that optionally * is compatible with R 3.1's recent change to read.csv()'s behavior. 
*
 * WARNING: Note that the end of a file is denoted by the read function
 * returning 0 instead of -1.
 *
 * @return An InputStream containing this Frame as a CSV
 */
public InputStream toCSV(boolean headers, boolean hex_string) {
  return new CSVStream(this, headers, hex_string);
}

/** Streams this Frame row-by-row as CSV bytes.  NOTE: end-of-data is signaled
 *  by available() returning 0, not by read() semantics alone. */
public static class CSVStream extends InputStream {
  private final boolean _hex_string; // print doubles in hex to survive R 3.1's read.csv()
  byte[] _line;                      // bytes of the current (possibly partially consumed) line
  int _position;                     // read cursor within _line
  int _chkRow;                       // current row within the current set of chunks
  Chunk[] _curChks;                  // one chunk per column, all at the same chunk index
  int _lastChkIdx;                   // last chunk index to stream
  public volatile int _curChkIdx;    // used only for progress reporting

  public CSVStream(Frame fr, boolean headers, boolean hex_string) {
    this(firstChunks(fr), headers ? fr.names() : null, fr.anyVec().nChunks(), hex_string);
  }

  // First chunk of every column, or null if the frame has no rows/chunks.
  private static Chunk[] firstChunks(Frame fr) {
    Vec anyvec = fr.anyVec();
    if (anyvec == null || anyvec.nChunks() == 0 || anyvec.length() == 0) {
      return null;
    }
    Chunk[] chks = new Chunk[fr.vecs().length];
    for (int i = 0; i < fr.vecs().length; i++) {
      chks[i] = fr.vec(i).chunkForRow(0);
    }
    return chks;
  }

  // Stream nChunks chunks starting from the chunk index of chks[0].
  public CSVStream(Chunk[] chks, String[] names, int nChunks, boolean hex_string) {
    if (chks == null) nChunks = 0;
    _lastChkIdx = (chks != null) ?
chks[0].cidx() + nChunks - 1 : -1; // index of the final chunk to stream (-1: nothing)
    _hex_string = hex_string;
    StringBuilder sb = new StringBuilder();
    if (names != null) {
      // Header line: quoted, comma-separated column names.
      sb.append('"').append(names[0]).append('"');
      for(int i = 1; i < names.length; i++)
        sb.append(',').append('"').append(names[i]).append('"');
      sb.append('\n');
    }
    _line = StringUtils.bytesOf(sb);
    _chkRow = -1; // first process the header line
    _curChks = chks;
  }

  // Byte length of the current row's CSV line; call only when data is available.
  public int getCurrentRowSize() throws IOException {
    int av = available();
    assert av > 0;
    return _line.length;
  }

  // Format row _chkRow of the current chunks as one CSV line (with trailing newline).
  // NAs render as empty cells; strings and categoricals are double-quoted.
  byte[] getBytesForRow() {
    StringBuilder sb = new StringBuilder();
    BufferedString tmpStr = new BufferedString();
    for (int i = 0; i < _curChks.length; i++ ) {
      Vec v = _curChks[i]._vec;
      if(i > 0) sb.append(',');
      if(!_curChks[i].isNA(_chkRow)) {
        if( v.isCategorical() ) sb.append('"').append(v.factor(_curChks[i].at8(_chkRow))).append('"');
        else if( v.isUUID() ) sb.append(PrettyPrint.UUID(_curChks[i].at16l(_chkRow), _curChks[i].at16h(_chkRow)));
        else if( v.isInt() ) sb.append(_curChks[i].at8(_chkRow));
        else if (v.isString()) sb.append('"').append(_curChks[i].atStr(tmpStr, _chkRow)).append('"');
        else {
          double d = _curChks[i].atd(_chkRow);
          // R 3.1 unfortunately changed the behavior of read.csv().
          // (Really type.convert()).
          //
          // Numeric values with too much precision now trigger a type conversion in R 3.1 into a factor.
          //
          // See these discussions:
          //   https://bugs.r-project.org/bugzilla/show_bug.cgi?id=15751
          //   https://stat.ethz.ch/pipermail/r-devel/2014-April/068778.html
          //   http://stackoverflow.com/questions/23072988/preserve-old-pre-3-1-0-type-convert-behavior
          String s = _hex_string ? Double.toHexString(d) : Double.toString(d);
          sb.append(s);
        }
      }
    }
    sb.append('\n');
    return StringUtils.bytesOf(sb);
  }

  @Override public int available() throws IOException {
    // Case 1: There is more data left to read from the current line.
    if (_position != _line.length) {
      return _line.length - _position;
    }
    // Case 2: There are no chunks to work with (eg. the whole Frame was empty).
if (_curChks == null) {
      return 0;
    }
    _chkRow++;
    Chunk anyChunk = _curChks[0];
    // Case 3: Out of data.
    if (anyChunk._start + _chkRow == anyChunk._vec.length()) {
      return 0;
    }
    // Case 4: Out of data in the current chunks => fast-forward to the next set of non-empty chunks.
    if (_chkRow == anyChunk.len()) {
      _curChkIdx = anyChunk._vec.elem2ChunkIdx(anyChunk._start + _chkRow); // skips empty chunks
      // Case 4a: Processed all requested chunks.
      if (_curChkIdx > _lastChkIdx) {
        return 0;
      }
      // Case 4b: fetch the next non-empty chunks
      Chunk[] newChks = new Chunk[_curChks.length];
      for (int i = 0; i < _curChks.length; i++) {
        newChks[i] = _curChks[i]._vec.chunkForChunkIdx(_curChkIdx);
        // flush the remote chunk so streaming doesn't accumulate cached chunks on this node
        Key oldKey = _curChks[i]._vec.chunkKey(_curChks[i]._cidx);
        if (! oldKey.home()) {
          H2O.raw_remove(oldKey);
        }
      }
      _curChks = newChks;
      _chkRow = 0;
    }
    // Case 5: Return data for the current row.
    _line = getBytesForRow();
    _position = 0;
    return _line.length;
  }

  @Override public void close() throws IOException {
    super.close();
    _line = null; // release the last line's buffer
  }

  @Override public int read() throws IOException {
    // End-of-data is detected via available()==0 (see class comment).
    return available() == 0 ? -1 : _line[_position++];
  }

  @Override public int read(byte[] b, int off, int len) throws IOException {
    int n = available();
    if(n > 0) {
      n = Math.min(n, len);
      System.arraycopy(_line, _position, b, off, n);
      _position += n;
    }
    return n;
  }
}

@Override public Class<KeyV3.FrameKeyV3> makeSchema() { return KeyV3.FrameKeyV3.class; }

/** Sort rows of a frame, using the set of columns as keys.
 *  @return Copy of frame, sorted */
public Frame sort( int[] cols ) { return Merge.sort(this,cols); }
}