FrameSplitter.java example

Explorer
h2o-3-master
package hex;

import java.util.Arrays;

import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;

/**
 * Frame splitter function to divide given frame into
 * multiple partitions based on given ratios.
 *
 * <p>The task creates <code>ratios.length+1</code> output frame each containing a
 * demanded fraction of rows from source dataset</p>
 *
 * <p>The tasks internally extract data from source chunks and create output chunks in preserving order of parts.
 * I.e., the 1st partition contains the first P1-rows, the 2nd partition contains following P2-rows, ...
 * </p>
 *
 * <p>Assumptions and invariants</p>
 * <ul>
 * <li>number of demanding split parts is reasonable number, i.e., <10. The task is not designed to split into many small parts.</li>
 * <li>the worker DOES NOT preserves distribution of new chunks over the cloud according to source dataset chunks.</li>
 * <li>rows inside one output chunk are not shuffled, they are extracted deterministically in the same order as they appear in source chunk.</li>
 * <li>workers can enforce data transfers if they need to obtain data from remote chunks.</li>
 * </ul>
 *
 * <p>NOTE: the implementation is data-transfer expensive and in some cases it would be beneficial to use original
 * implementation from <a href="https://github.com/0xdata/h2o/commits/9af3f4e">9af3f4e</a>.</p>.
 */
public class FrameSplitter extends H2OCountedCompleter<FrameSplitter> {
  /** Dataset to split */
  final Frame   dataset;
  /** Split ratios - resulting number of split is ratios.length+1 */
  final double[] ratios;
  /** Destination keys for each output frame split. */
  final Key<Frame>[]   destKeys;
  /** Optional job key */
  final Key<Job>     jobKey;

  /** Output frames for each output split part */
  private Frame[] splits;

  public FrameSplitter(Frame dataset, double[] ratios, Key<Frame>[] destKeys, Key<Job> jobKey) {
    this(null, dataset, ratios,destKeys,jobKey);
  }
  public FrameSplitter(H2OCountedCompleter cc, Frame dataset, double[] ratios, Key<Frame>[] destKeys, Key<Job> jobKey) {
    super(cc);
    assert ratios.length > 0 : "No ratio specified!";
    assert ratios.length < 100 : "Too many frame splits demanded!";
    assert destKeys!=null : "Destination keys are not specified!";
    assert destKeys.length == ratios.length+1 : "Unexpected number of destination keys.";
    this.dataset  = dataset;
    this.ratios   = ratios;
    this.jobKey   = jobKey;
    this.destKeys = destKeys;
  }

  @Override public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);

    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);

    final int nsplits = templates.length;
    assert nsplits == ratios.length+1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s=0; s<nsplits; s++) {
      Frame split = new Frame(destKeys[s], dataset.names(), templates[s] );
      split.delete_and_lock(jobKey);
      splits[s] = split;
    }
    setPendingCount(nsplits);
    for (int s=0; s<nsplits; s++)
      new FrameSplitTask(this,datasetVecs, ratios, s).dfork(splits[s]);
    tryComplete(); // complete the computation of thrown tasks
  }

  /** Blocking call to obtain a result of computation. */
  public Frame[] getResult() {
    join();
    return splits;
  }

  @Override public void onCompletion(CountedCompleter caller) {
    dataset.unlock(jobKey);
    if (splits!=null)
      for (Frame s : splits)
        if (s!=null)
          s.update(jobKey).unlock(jobKey);
  }
  @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
    dataset.unlock(jobKey);
    Futures fs = new Futures();
    if (splits!=null)
      for (Frame s : splits)
        if (s!=null)
          s.unlock(jobKey).delete(jobKey,fs);
    fs.blockForPending();
    return true;
  }

  // Make vector templates for all output frame vectors
  private Vec[][] makeTemplates(Frame dataset, double[] ratios) {
    Vec anyVec = dataset.anyVec();
    final long[][] espcPerSplit = computeEspcPerSplit(anyVec.espc(), anyVec.length(), ratios);
    final int num = dataset.numCols(); // number of columns in input frame
    final int nsplits = espcPerSplit.length; // number of splits
    final String[][] domains = dataset.domains(); // domains
    final byte[] types = new byte[num];
    int j=0;
    for (Vec v : dataset.vecs()) types[j++] = v.get_type();

    Vec[][] t = new Vec[nsplits][/*num*/]; // resulting vectors for all
    for (int i=0; i<nsplits; i++) {
      // vectors for j-th split
      Key vkey = Vec.newKey();
      int rowLayout = Vec.ESPC.rowLayout(vkey,espcPerSplit[i]);
      t[i] = new Vec(vkey,rowLayout).makeCons(num, 0, domains, types);
    }
    return t;
  }

  // The task computes ESPC per split
  static long[/*nsplits*/][/*nchunks*/] computeEspcPerSplit(long[] espc, long len, double[] ratios) {
    assert espc.length>0 && espc[0] == 0;
    assert espc[espc.length-1] == len;
    long[] partSizes = partitione(len, ratios); // Split of whole vector
    int nparts = ratios.length+1;
    long[][] r = new long[nparts][espc.length]; // espc for each partition
    long nrows = 0;
    long start = 0;
    for (int p=0,c=0; p<nparts; p++) {
      int nc = 0; // number of chunks for this partition
      for(;c<espc.length-1 && (espc[c+1]-start) <= partSizes[p];c++) r[p][++nc] = espc[c+1]-start;
      if (r[p][nc] < partSizes[p]) r[p][++nc] = partSizes[p]; // last item in espc contains number of rows
      r[p] = Arrays.copyOf(r[p], nc+1);
      // Transfer rest of lines to the next part
      nrows = nrows-partSizes[p];
      start += partSizes[p];
    }
    return r;
  }


  /** MR task extract specified part of <code>_srcVecs</code>
   * into output chunk.*/
  private static class FrameSplitTask extends MRTask<FrameSplitTask> {
    final Vec  [] _srcVecs; // a source frame given by list of its columns
    final double[] _ratios;  // split ratios
    final int     _partIdx; // part index

    transient int _pcidx; // Start chunk index for this partition
    transient int _psrow; // Start row in chunk for this partition

    public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, double[] ratios, int partIdx) {
      super(completer);
      _srcVecs = srcVecs;
      _ratios  = ratios;
      _partIdx = partIdx;
    }
    @Override protected void setupLocal() {
      // Precompute the first input chunk index and start row inside that chunk for this partition
      Vec anyInVec = _srcVecs[0];
      long[] partSizes = partitione(anyInVec.length(), _ratios);
      long pnrows = 0;
      for (int p=0; p<_partIdx; p++) pnrows += partSizes[p];
      long[] espc = anyInVec.espc();
      while (_pcidx < espc.length-1 && (pnrows -= (espc[_pcidx+1]-espc[_pcidx])) >= 0 ) _pcidx++;
      assert pnrows <= 0;
      _psrow = (int) (pnrows + espc[_pcidx+1]-espc[_pcidx]);
    }
    @Override public void map(Chunk[] cs) { // Output chunks
      int coutidx = cs[0].cidx(); // Index of output Chunk
      int cinidx = _pcidx + coutidx;
      int startRow = coutidx > 0 ? 0 : _psrow; // where to start extracting
      int nrows = cs[0]._len;
      // For each output chunk extract appropriate rows for partIdx-th part
      for (int i=0; i<cs.length; i++) {
        // WARNING: this implementation does not preserve co-location of chunks so we are forcing here network transfer!
        ChunkSplitter.extractChunkPart(_srcVecs[i].chunkForChunkIdx(cinidx), cs[i], startRow, nrows, _fs);
      }
    }
  }
  static final long[] partitione(long len, double[] ratio) {
    long[] r = new long[ratio.length+1];
    long sum = 0;
    int i = 0;
    float sr = 0;
    for (i=0; i<ratio.length; i++) {
      r[i] = (int) (ratio[i]*len);
      sum += r[i];
      sr  += ratio[i];
    }
    if (sr<1f) r[i] = len - sum;
    else r[i-1] += (len-sum);
    return r;
  }
}