package hex;
import java.util.Arrays;
import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;
import water.util.Utils;
/**
* Frame splitter function to divide given frame into
* multiple partitions based on given ratios.
*
* <p>The task creates <code>ratios.length+1</code> output frames, each containing the
* demanded fraction of rows from the source dataset.</p>
*
* <p>The task internally extracts data from source chunks and creates output chunks, preserving the order of parts.
* I.e., the 1st partition contains the first P1-rows, the 2nd partition contains the following P2-rows, ...
* </p>
*
* <p>Assumptions and invariants</p>
* <ul>
* <li>the number of demanded split parts is reasonably small, i.e., &lt;10. The task is not designed to split into many small parts.</li>
* <li>the worker DOES NOT preserve the distribution of new chunks over the cloud according to the source dataset chunks.</li>
* <li>rows inside one output chunk are not shuffled, they are extracted deterministically in the same order as they appear in source chunk.</li>
* <li>workers can enforce data transfers if they need to obtain data from remote chunks.</li>
* </ul>
*
* <p>NOTE: the implementation is data-transfer expensive and in some cases it would be beneficial to use the original
* implementation from <a href="https://github.com/0xdata/h2o/commits/9af3f4e">9af3f4e</a>.</p>
*/
public class FrameSplitter extends H2OCountedCompleter {
/** Dataset to split; read-locked in compute2() and unlocked in onCompletion(). */
final Frame dataset;
/** Split ratios - the resulting number of splits is ratios.length+1. */
final float[] ratios;
/** Destination keys, one per output split (length is asserted to be ratios.length+1 in the constructor). */
final Key[] destKeys;
/** Optional job key (may be null); passed to every lock/unlock/update/delete call made by this task. */
final Key jobKey;
/** Output frames, one per split part; allocated in compute2(), so null until then. */
private Frame[] splits;
/**
 * Exceptions reported by split workers; stays null as long as no worker failed.
 * Grown by one element per failure under <code>synchronized(FrameSplitter.this)</code> in compute2().
 */
private Throwable[] workersExceptions;
/**
 * Creates a splitter without explicit destination keys and without a job key.
 * Delegates to the four-argument constructor with <code>null</code> for both.
 *
 * @param dataset frame to split
 * @param ratios  split ratios; <code>ratios.length+1</code> parts are produced
 */
public FrameSplitter(Frame dataset, float[] ratios) {
  this(dataset, ratios, null, null);
}
/**
 * Creates a splitter.
 *
 * @param dataset  frame to split
 * @param ratios   split ratios; asserted to contain at least 1 and fewer than 100 entries
 * @param destKeys keys of the output frames, or <code>null</code> to have them generated by
 *                 <code>Utils.generateNumKeys(dataset._key, ratios.length+1)</code>;
 *                 asserted to hold exactly <code>ratios.length+1</code> keys
 * @param jobKey   optional job key, stored as given
 */
public FrameSplitter(Frame dataset, float[] ratios, Key[] destKeys, Key jobKey) {
  assert ratios.length > 0 : "No ratio specified!";
  assert ratios.length < 100 : "Too many frame splits demanded!";
  this.dataset = dataset;
  this.ratios = ratios;
  this.jobKey = jobKey;
  if (destKeys == null) {
    // No keys supplied by the caller - derive one key per output part from the dataset key
    this.destKeys = Utils.generateNumKeys(dataset._key, ratios.length+1);
  } else {
    this.destKeys = destKeys;
  }
  assert this.destKeys.length == this.ratios.length+1 : "Unexpected number of destination keys.";
}
/**
 * Entry point of the split computation.
 *
 * <p>Read-locks the source dataset, builds one template vector group per split via
 * makeTemplates(), creates and locks one output frame per destination key, and then submits a
 * driver task that launches one FrameSplitTask per split. Worker failures are not rethrown here;
 * they are collected into <code>workersExceptions</code> by the per-split completer.</p>
 *
 * <p>NOTE(review): if anything between read_lock() and submitTask() throws, no code visible here
 * releases the locks already taken - confirm how the framework handles that case.</p>
 */
@Override public void compute2() {
// Read-lock the source dataset for the duration of the split (released in onCompletion)
dataset.read_lock(jobKey);
// Create a template vector for each segment
final Vec[][] templates = makeTemplates(dataset, ratios);
final int nsplits = templates.length;
assert nsplits == ratios.length+1 : "Unexpected number of split templates!";
// Create and lock the output frame of every split, then launch a distributed task per split part
final Vec[] datasetVecs = dataset.vecs();
splits = new Frame[nsplits];
for (int s=0; s<nsplits; s++) {
Frame split = new Frame(destKeys[s], dataset.names(), templates[s] );
split.delete_and_lock(jobKey);
splits[s] = split;
}
// One pending completion: the driver task submitted below, which is created with this task as its completer
setPendingCount(1);
H2O.submitTask(new H2OCountedCompleter(FrameSplitter.this) {
@Override public void compute2() {
// One pending completion per split task launched in the loop below
setPendingCount(nsplits);
for (int s=0; s<nsplits; s++) {
new FrameSplitTask(new H2OCountedCompleter(this) { // Completer for this task
@Override public void compute2() { }
@Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
synchronized( FrameSplitter.this ) { // synchronized on the enclosing FrameSplitter since this can be called from different workers
// Append the exception: grow the array by one slot, or allocate it on the first failure
workersExceptions = workersExceptions!=null ? Arrays.copyOf(workersExceptions, workersExceptions.length+1) : new Throwable[1];
workersExceptions[workersExceptions.length-1] = ex;
}
tryComplete(); // the exception is handled (recorded) here, so perform a normal completion instead
// Returning false presumably tells CountedCompleter not to propagate the exception to the parent completer - TODO confirm against jsr166y
return false;
}
}, datasetVecs, ratios, s).asyncExec(splits[s]);
}
tryComplete(); // complete the driver task that launched the nsplits tasks
}
});
tryComplete(); // complete this task's own computation; the launched tasks are tracked via the pending count
}
/**
 * Blocking call to obtain the result of the computation.
 *
 * @return the output frames, one per split part
 * @throws RuntimeException wrapping the first recorded worker exception, if any worker failed
 */
public Frame[] getResult() {
  join();
  final Throwable[] failures = workersExceptions;
  if (failures == null) {
    return splits;
  }
  throw new RuntimeException(failures[0]);
}
/**
 * Completion hook: releases the lock on the source dataset and finalizes the output frames.
 * When no worker exception was recorded each split is updated and unlocked; otherwise each
 * split is unlocked and deleted.
 */
@Override public void onCompletion(CountedCompleter caller) {
  final boolean failed = workersExceptions != null;
  dataset.unlock(jobKey);
  if (splits == null) return; // no output frame was created
  for (int i = 0; i < splits.length; i++) {
    final Frame part = splits[i];
    if (part == null) continue;
    if (failed) {
      // A worker failed: the frame has to be unlocked before it can be deleted
      part.unlock(jobKey);
      part.delete(jobKey, 3.14f); // delete all splits
    } else {
      part.update(jobKey);
      part.unlock(jobKey);
    }
  }
}
/**
 * Makes vector templates for all output frame vectors.
 *
 * @param dataset source frame; supplies the chunk layout (via anyVec()), the column count,
 *                domains, uuid flags and time flags
 * @param ratios  split ratios
 * @return for each split, the vectors returned by makeZeros() on a fresh Vec built from that split's ESPC
 */
private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
  final Vec layoutVec = dataset.anyVec();
  final long[][] espcs = computeEspcPerSplit(layoutVec._espc, layoutVec.length(), ratios);
  final int ncols = dataset.numCols();            // number of columns in input frame
  final String[][] domains = dataset.domains();   // domains
  final boolean[] uuids = dataset.uuids();
  final byte[] times = dataset.times();
  final Vec[][] templates = new Vec[espcs.length][/*ncols*/]; // resulting vectors for all splits
  int split = 0;
  for (long[] espc : espcs) {
    // A fresh vector carrying this split's chunk layout is the seed for the split's columns
    final Vec seed = new Vec(Vec.newKey(), espc);
    templates[split++] = seed.makeZeros(ncols, domains, uuids, times);
  }
  return templates;
}
/**
 * Computes the ESPC (array of chunk start offsets, terminated by the total row count) of every split.
 *
 * <p>Source chunks are walked once, in order. A source chunk whose end lies within the current
 * part closes an output chunk of that part; a part that ends inside a source chunk gets one
 * additional, shorter trailing chunk.</p>
 *
 * @param espc   ESPC of the source vector; asserted to start with 0 and end with <code>len</code>
 * @param len    number of rows of the source vector
 * @param ratios split ratios
 * @return one ESPC per part (<code>ratios.length+1</code> of them), each starting at 0
 */
static long[/*nsplits*/][/*nchunks*/] computeEspcPerSplit(long[] espc, long len, float[] ratios) {
  assert espc.length>0 && espc[0] == 0;
  assert espc[espc.length-1] == len;
  // Presumably the number of rows of each part - verify against Utils.partitione
  long[] partSizes = Utils.partitione(len, ratios);
  int nparts = ratios.length+1;
  long[][] r = new long[nparts][]; // espc for each partition
  long start = 0; // global row at which the current part starts
  int c = 0;      // source chunk cursor, shared by all parts
  for (int p=0; p<nparts; p++) {
    // A part can never span more chunks than the source has, so espc.length slots suffice
    long[] pespc = new long[espc.length];
    int nc = 0; // number of chunks for this partition
    // Take every source chunk that ends inside this part
    while (c<espc.length-1 && (espc[c+1]-start) <= partSizes[p]) {
      pespc[++nc] = espc[c+1]-start;
      c++;
    }
    // Part ends in the middle of a source chunk: close it with a partial chunk
    if (pespc[nc] < partSizes[p]) pespc[++nc] = partSizes[p]; // last item in espc contains number of rows
    r[p] = Arrays.copyOf(pespc, nc+1);
    start += partSizes[p];
  }
  return r;
}
/** MR task extracting the <code>_partIdx</code>-th part of <code>_srcVecs</code>
 * into the output chunks it is executed over. */
private static class FrameSplitTask extends MRTask2<FrameSplitTask> {
  final Vec [] _srcVecs; // a source frame given by list of its columns
  final float[] _ratios; // split ratios
  final int _partIdx; // part index
  transient int _pcidx; // Start chunk index for this partition
  transient int _psrow; // Start row in chunk for this partition
  /**
   * @param completer completer passed on to the MRTask2 super constructor
   * @param srcVecs   columns of the source frame
   * @param ratios    split ratios
   * @param partIdx   index of the part this task extracts
   */
  public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, float[] ratios, int partIdx) {
    super(completer);
    _srcVecs = srcVecs;
    _ratios = ratios;
    _partIdx = partIdx;
  }
  /**
   * Precomputes the first input chunk index (<code>_pcidx</code>) and the start row inside that
   * chunk (<code>_psrow</code>) for this partition.
   */
  @Override protected void setupLocal() {
    Vec anyInVec = _srcVecs[0];
    long[] partSizes = Utils.partitione(anyInVec.length(), _ratios);
    // Global row at which this partition starts = total size of all preceding partitions
    long offset = 0;
    for (int p=0; p<_partIdx; p++) offset += partSizes[p];
    long[] espc = anyInVec._espc;
    // Skip every chunk that ends at or before the partition start. The comparison has to be >=:
    // a partition starting exactly on a chunk boundary begins at row 0 of the NEXT chunk, which is
    // also how computeEspcPerSplit lays out the output chunks. Stopping one chunk early would make
    // map() read past the end of the previous chunk.
    final int lastCidx = espc.length-2; // index of the last chunk; keeps espc[cidx+1] in bounds
    int cidx = 0;
    while (cidx < lastCidx && offset >= espc[cidx+1]-espc[cidx]) {
      offset -= espc[cidx+1]-espc[cidx];
      cidx++;
    }
    assert lastCidx < 0 || offset <= espc[cidx+1]-espc[cidx] : "Partition starts behind the end of the source vector!";
    _pcidx = cidx;
    _psrow = (int) offset; // what is left of the offset is the row inside chunk cidx
  }
  /**
   * Fills one output chunk per column. Output chunk <code>i</code> of this partition is taken
   * from input chunk <code>_pcidx + i</code>; only the first output chunk starts at a non-zero
   * row (<code>_psrow</code>) of its input chunk.
   */
  @Override public void map(Chunk[] cs) { // Output chunks
    int coutidx = cs[0].cidx(); // Index of output Chunk
    int cinidx = _pcidx + coutidx;
    int startRow = coutidx > 0 ? 0 : _psrow; // where to start extracting
    int nrows = cs[0]._len;
    // For each output chunk extract appropriate rows for partIdx-th part
    for (int i=0; i<cs.length; i++) {
      // WARNING: this implementation does not preserve co-location of chunks so we are forcing here network transfer!
      ChunkSplitter.extractChunkPart(_srcVecs[i].chunkForChunkIdx(cinidx), cs[i], startRow, nrows, _fs);
    }
  }
}
}