package water.fvec; import jsr166y.CountedCompleter; import water.Futures; import water.H2O; import water.Key; import water.MRTask; import java.util.Arrays; /** * Created by tomasnykodym on 3/28/14. * * Utility to rebalance dataset so that it has requested number of chunks and * each chunk has the same number of rows +/-1. * * It *does not* guarantee even chunk-node placement. (This can not currently * be done in H2O, since the placement of chunks is governed only by key-hash * /vector group/ for Vecs) */ public class RebalanceDataSet extends H2O.H2OCountedCompleter { final Frame _in; final int _nchunks; Key _okey; Frame _out; final Key _jobKey; final transient Vec.VectorGroup _vg; transient long[] _espc; /** * Constructor for make-compatible task. * * To be used to make frame compatible with other frame (i.e. make all vecs compatible with other vector group and rows-per-chunk). */ public RebalanceDataSet(Frame modelFrame, Frame srcFrame, Key dstKey) {this(modelFrame,srcFrame,dstKey,null,null);} public RebalanceDataSet(Frame modelFrame, Frame srcFrame, Key dstKey, H2O.H2OCountedCompleter cmp, Key jobKey) { super(cmp); _in = srcFrame; _jobKey = jobKey; _okey = dstKey; _espc = modelFrame.anyVec().espc(); // Get prior layout _vg = modelFrame.anyVec().group(); _nchunks = modelFrame.anyVec().nChunks(); } public RebalanceDataSet(Frame srcFrame, Key dstKey, int nchunks) { this(srcFrame, dstKey,nchunks,null,null);} public RebalanceDataSet(Frame srcFrame, Key dstKey, int nchunks, H2O.H2OCountedCompleter cmp, Key jobKey) { super(cmp); _in = srcFrame; _nchunks = nchunks; _jobKey = jobKey; _okey = dstKey; _vg = new Vec.VectorGroup(); } public Frame getResult(){join(); return _out;} @Override public void compute2() { // Simply create a bogus new vector (don't even put it into KV) with // appropriate number of lines per chunk and then use it as a source to do // multiple makeZero calls to create empty vecs and than call RebalanceTask // on each one of them. RebalanceTask will fetch the appropriate training_frame // chunks and fetch the data from them. long[] espc; if (_espc != null) espc = _espc; else { int rpc = (int) (_in.numRows() / _nchunks); int rem = (int) (_in.numRows() % _nchunks); espc = new long[_nchunks + 1]; Arrays.fill(espc, rpc); for (int i = 0; i < rem; ++i) ++espc[i]; long sum = 0; for (int i = 0; i < espc.length; ++i) { long s = espc[i]; espc[i] = sum; sum += s; } assert espc[espc.length - 1] == _in.numRows() : "unexpected number of rows, expected " + _in.numRows() + ", got " + espc[espc.length - 1]; } final int rowLayout = Vec.ESPC.rowLayout(_vg._key,espc); final Vec[] srcVecs = _in.vecs(); _out = new Frame(_okey,_in.names(), new Vec(_vg.addVec(),rowLayout).makeCons(srcVecs.length,0L,_in.domains(),_in.types())); _out.delete_and_lock(_jobKey); new RebalanceTask(this,srcVecs).dfork(_out); } @Override public void onCompletion(CountedCompleter caller) { assert _out.numRows() == _in.numRows(); Vec vec = _out.anyVec(); assert vec.nChunks() == _nchunks; _out.update(_jobKey); _out.unlock(_jobKey); } @Override public boolean onExceptionalCompletion(Throwable t, CountedCompleter caller) { t.printStackTrace(); if( _out != null ) _out.delete(_jobKey,new Futures()).blockForPending(); return true; } public static class RebalanceTask extends MRTask<RebalanceTask> { final Vec [] _srcVecs; public RebalanceTask(H2O.H2OCountedCompleter cmp, Vec... srcVecs){super(cmp);_srcVecs = srcVecs;} @Override public boolean logVerbose() { return false; } private void rebalanceChunk(int i, Chunk c, NewChunk nc){ int N = c._len; int len = 0; int lastId = -1; while(N > len) { Chunk srcRaw = _srcVecs[i].chunkForRow(c._start+len); assert lastId == -1 || lastId == srcRaw.cidx()-1; lastId = srcRaw.cidx(); int off = (int)((c._start+len) - srcRaw._start); assert off >=0 && off < srcRaw._len; int x = Math.min(N-len,srcRaw._len-off); srcRaw.extractRows(nc, off,off+x); len += x; } nc.close(_fs); } @Override public void map(Chunk [] chks){ for(int c = 0; c < chks.length; ++c){ rebalanceChunk(c,chks[c],new NewChunk(chks[c])); } } } }