package hex;
import java.util.Arrays;
import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;
/**
* Frame splitter function to divide given frame into
* multiple partitions based on given ratios.
*
 * <p>The task creates <code>ratios.length+1</code> output frames, each containing a
 * demanded fraction of rows from the source dataset</p>
*
 * <p>The task internally extracts data from the source chunks and creates output chunks, preserving the order of parts.
 * I.e., the 1st partition contains the first P1 rows, the 2nd partition contains the following P2 rows, ...
* </p>
*
* <p>Assumptions and invariants</p>
* <ul>
 * <li>the number of demanded split parts is a reasonable number, i.e., &lt;10. The task is not designed to split into many small parts.</li>
 * <li>the worker DOES NOT preserve the distribution of new chunks over the cloud according to the source dataset chunks.</li>
* <li>rows inside one output chunk are not shuffled, they are extracted deterministically in the same order as they appear in source chunk.</li>
* <li>workers can enforce data transfers if they need to obtain data from remote chunks.</li>
* </ul>
*
 * <p>NOTE: the implementation is data-transfer expensive and in some cases it would be beneficial to use the original
 * implementation from <a href="https://github.com/0xdata/h2o/commits/9af3f4e">9af3f4e</a>.</p>
*/
public class FrameSplitter extends H2OCountedCompleter<FrameSplitter> {
  /** Dataset to split */
  final Frame dataset;
  /** Split ratios - the resulting number of splits is ratios.length+1 */
  final double[] ratios;
  /** Destination keys for each output frame split. */
  final Key<Frame>[] destKeys;
  /** Optional job key, used for read/write locking of the involved frames. */
  final Key<Job> jobKey;
  /** Output frames for each output split part (filled in by {@link #compute2()}). */
  private Frame[] splits;

  /** Convenience constructor with no parent completer. */
  public FrameSplitter(Frame dataset, double[] ratios, Key<Frame>[] destKeys, Key<Job> jobKey) {
    this(null, dataset, ratios, destKeys, jobKey);
  }

  /**
   * @param cc       optional parent completer (may be null)
   * @param dataset  source frame to split
   * @param ratios   fractions of rows per split; the last split takes the remainder
   * @param destKeys destination keys, one per output split (ratios.length+1 of them)
   * @param jobKey   optional job key used for frame locking
   */
  public FrameSplitter(H2OCountedCompleter cc, Frame dataset, double[] ratios, Key<Frame>[] destKeys, Key<Job> jobKey) {
    super(cc);
    assert ratios.length > 0 : "No ratio specified!";
    assert ratios.length < 100 : "Too many frame splits demanded!";
    assert destKeys != null : "Destination keys are not specified!";
    assert destKeys.length == ratios.length+1 : "Unexpected number of destination keys.";
    this.dataset  = dataset;
    this.ratios   = ratios;
    this.jobKey   = jobKey;
    this.destKeys = destKeys;
  }

  @Override public void compute2() {
    // Lock the source dataset for reading for the duration of the split
    dataset.read_lock(jobKey);
    // Create a template vector (chunk layout) for each output split
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length+1 : "Unexpected number of split templates!";
    // Materialize and write-lock one output frame per split
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s = 0; s < nsplits; s++) {
      Frame split = new Frame(destKeys[s], dataset.names(), templates[s]);
      split.delete_and_lock(jobKey);
      splits[s] = split;
    }
    // Launch one distributed FJ task per split part; each child task will
    // tick down the pending count registered here when it completes.
    setPendingCount(nsplits);
    for (int s = 0; s < nsplits; s++)
      new FrameSplitTask(this, datasetVecs, ratios, s).dfork(splits[s]);
    tryComplete(); // complete the computation of thrown tasks
  }

  /** Blocking call to obtain the result of the computation.
   *  @return the output frames, one per split part */
  public Frame[] getResult() {
    join();
    return splits;
  }

  /** Normal completion: release all frame locks and persist the outputs. */
  @Override public void onCompletion(CountedCompleter caller) {
    dataset.unlock(jobKey);
    if (splits != null)
      for (Frame s : splits)
        if (s != null)
          s.update(jobKey).unlock(jobKey);
  }

  /** Exceptional completion: release locks and delete any partially built outputs. */
  @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
    dataset.unlock(jobKey);
    Futures fs = new Futures();
    if (splits != null)
      for (Frame s : splits)
        if (s != null)
          s.unlock(jobKey).delete(jobKey, fs);
    fs.blockForPending();
    return true;
  }

  /** Make vector templates for all output frame vectors: each split gets a set of
   *  zero-filled vectors with the chunk layout computed by {@link #computeEspcPerSplit}. */
  private Vec[][] makeTemplates(Frame dataset, double[] ratios) {
    Vec anyVec = dataset.anyVec();
    final long[][] espcPerSplit = computeEspcPerSplit(anyVec.espc(), anyVec.length(), ratios);
    final int num = dataset.numCols();           // number of columns in input frame
    final int nsplits = espcPerSplit.length;     // number of splits
    final String[][] domains = dataset.domains();// domains of categorical columns
    final byte[] types = new byte[num];
    int j = 0;
    for (Vec v : dataset.vecs()) types[j++] = v.get_type();
    Vec[][] t = new Vec[nsplits][/*num*/];       // resulting vectors for all splits
    for (int i = 0; i < nsplits; i++) {
      // vectors for i-th split
      Key vkey = Vec.newKey();
      int rowLayout = Vec.ESPC.rowLayout(vkey, espcPerSplit[i]);
      t[i] = new Vec(vkey, rowLayout).makeCons(num, 0, domains, types);
    }
    return t;
  }

  /** Compute the per-split ESPC (cumulative rows-per-chunk) layout.
   *
   *  <p>Each split reuses the source chunk boundaries where possible: whole source
   *  chunks are consumed while they fit, and a partition boundary that falls inside
   *  a chunk produces one final partial chunk.</p>
   *
   *  @param espc   source vector ESPC; espc[0]==0 and espc[espc.length-1]==len
   *  @param len    total number of rows in the source vector
   *  @param ratios split ratios (see {@link #partitione})
   *  @return one ESPC array per split part
   */
  static long[/*nsplits*/][/*nchunks*/] computeEspcPerSplit(long[] espc, long len, double[] ratios) {
    assert espc.length > 0 && espc[0] == 0;
    assert espc[espc.length-1] == len;
    long[] partSizes = partitione(len, ratios); // Number of rows per split of the whole vector
    int nparts = ratios.length+1;
    long[][] r = new long[nparts][espc.length]; // espc for each partition (trimmed below)
    long start = 0;                             // absolute row at which the current partition begins
    for (int p = 0, c = 0; p < nparts; p++) {
      int nc = 0; // number of chunks for this partition
      // Consume whole source chunks while they still fit into this partition
      for (; c < espc.length-1 && (espc[c+1]-start) <= partSizes[p]; c++) r[p][++nc] = espc[c+1]-start;
      // Partition boundary falls inside a source chunk: close with a partial chunk
      if (r[p][nc] < partSizes[p]) r[p][++nc] = partSizes[p]; // last item in espc contains number of rows
      r[p] = Arrays.copyOf(r[p], nc+1);
      start += partSizes[p];
    }
    return r;
  }

  /** MR task extracting the specified part of <code>_srcVecs</code>
   *  into the output chunks of one split frame. */
  private static class FrameSplitTask extends MRTask<FrameSplitTask> {
    final Vec[]    _srcVecs; // a source frame given by the list of its columns
    final double[] _ratios;  // split ratios
    final int      _partIdx; // index of the part this task extracts
    transient int  _pcidx;   // Start chunk index for this partition
    transient int  _psrow;   // Start row in chunk for this partition

    public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, double[] ratios, int partIdx) {
      super(completer);
      _srcVecs = srcVecs;
      _ratios  = ratios;
      _partIdx = partIdx;
    }

    @Override protected void setupLocal() {
      // Precompute the first input chunk index and the start row inside that chunk for this partition
      Vec anyInVec = _srcVecs[0];
      long[] partSizes = partitione(anyInVec.length(), _ratios);
      long pnrows = 0;
      for (int p = 0; p < _partIdx; p++) pnrows += partSizes[p];
      long[] espc = anyInVec.espc();
      // Walk chunks until pnrows goes negative: that chunk contains the partition start
      while (_pcidx < espc.length-1 && (pnrows -= (espc[_pcidx+1]-espc[_pcidx])) >= 0) _pcidx++;
      assert pnrows <= 0;
      _psrow = (int) (pnrows + espc[_pcidx+1]-espc[_pcidx]);
    }

    @Override public void map(Chunk[] cs) { // Output chunks
      int coutidx = cs[0].cidx();                 // Index of output Chunk
      int cinidx  = _pcidx + coutidx;             // Matching input chunk index
      int startRow = coutidx > 0 ? 0 : _psrow;    // where to start extracting
      int nrows = cs[0]._len;
      // For each output chunk extract the appropriate rows of the partIdx-th part
      for (int i = 0; i < cs.length; i++) {
        // WARNING: this implementation does not preserve co-location of chunks so we are forcing here network transfer!
        ChunkSplitter.extractChunkPart(_srcVecs[i].chunkForChunkIdx(cinidx), cs[i], startRow, nrows, _fs);
      }
    }
  }

  /** Partition <code>len</code> rows according to the given ratios.
   *
   *  <p>Each ratio yields <code>floor(ratio[i]*len)</code> rows. If the ratios sum
   *  to less than 1, the remainder forms the last part; if they sum to (at least) 1,
   *  any rounding leftover is folded into the last ratio-defined part.</p>
   *
   *  @param len   total number of rows
   *  @param ratio split ratios, each in (0,1)
   *  @return ratio.length+1 part sizes summing to len
   */
  static final long[] partitione(long len, double[] ratio) {
    long[] r = new long[ratio.length+1];
    long sum = 0;
    int i = 0;
    float sr = 0;
    for (i = 0; i < ratio.length; i++) {
      // FIX: was cast to (int), which silently truncated part sizes for
      // vectors longer than Integer.MAX_VALUE rows; the result is a long.
      r[i] = (long) (ratio[i]*len);
      sum += r[i];
      sr  += ratio[i];
    }
    if (sr < 1f) r[i] = len - sum;   // ratios do not cover everything: remainder is the last part
    else r[i-1] += (len-sum);        // ratios sum to 1: fold the rounding leftover into the last part
    return r;
  }
}