FrameSplitter.java example

Explorer
h2o-2-master
package hex;

import java.util.Arrays;

import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;
import water.util.Utils;

/**
 * Frame splitter function to divide given frame into
 * multiple partitions based on given ratios.
 *
 * <p>The task creates <code>ratios.length+1</code> output frame each containing a
 * demanded fraction of rows from source dataset</p>
 *
 * <p>The tasks internally extract data from source chunks and create output chunks in preserving order of parts.
 * I.e., the 1st partition contains the first P1-rows, the 2nd partition contains following P2-rows, ...
 * </p>
 *
 * <p>Assumptions and invariants</p>
 * <ul>
 * <li>number of demanding split parts is reasonable number, i.e., <10. The task is not designed to split into many small parts.</li>
 * <li>the worker DOES NOT preserves distribution of new chunks over the cloud according to source dataset chunks.</li>
 * <li>rows inside one output chunk are not shuffled, they are extracted deterministically in the same order as they appear in source chunk.</li>
 * <li>workers can enforce data transfers if they need to obtain data from remote chunks.</li>
 * </ul>
 *
 * <p>NOTE: the implementation is data-transfer expensive and in some cases it would be beneficial to use original
 * implementation from <a href="https://github.com/0xdata/h2o/commits/9af3f4e">9af3f4e</a>.</p>.
 */
public class FrameSplitter extends H2OCountedCompleter {
  /** Dataset to split */
  final Frame   dataset;
  /** Split ratios - resulting number of split is ratios.length+1 */
  final float[] ratios;
  /** Destination keys for each output frame split. */
  final Key[]   destKeys;
  /** Optional job key */
  final Key     jobKey;

  /** Output frames for each output split part */
  private Frame[] splits;
  /** Temporary variable holding exceptions of workers */
  private Throwable[] workersExceptions;

  public FrameSplitter(Frame dataset, float[] ratios) {
    this(dataset, ratios, null, null);
  }
  public FrameSplitter(Frame dataset, float[] ratios, Key[] destKeys, Key jobKey) {
    assert ratios.length > 0 : "No ratio specified!";
    assert ratios.length < 100 : "Too many frame splits demanded!";
    this.dataset  = dataset;
    this.ratios    = ratios;
    this.destKeys = destKeys!=null ? destKeys : Utils.generateNumKeys(dataset._key, ratios.length+1);
    assert this.destKeys.length == this.ratios.length+1 : "Unexpected number of destination keys.";
    this.jobKey   = jobKey;
  }

  @Override public void compute2() {
    // Lock all possible data
    dataset.read_lock(jobKey);
    // Create a template vector for each segment
    final Vec[][] templates = makeTemplates(dataset, ratios);
    final int nsplits = templates.length;
    assert nsplits == ratios.length+1 : "Unexpected number of split templates!";
    // Launch number of distributed FJ for each split part
    final Vec[] datasetVecs = dataset.vecs();
    splits = new Frame[nsplits];
    for (int s=0; s<nsplits; s++) {
      Frame split = new Frame(destKeys[s], dataset.names(), templates[s] );
      split.delete_and_lock(jobKey);
      splits[s] = split;
    }
    setPendingCount(1);
    H2O.submitTask(new H2OCountedCompleter(FrameSplitter.this) {
      @Override public void compute2() {
        setPendingCount(nsplits);
        for (int s=0; s<nsplits; s++) {
          new FrameSplitTask(new H2OCountedCompleter(this) { // Completer for this task
            @Override public void compute2() { }
            @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
              synchronized( FrameSplitter.this ) { // synchronized on this since can be accessed from different workers
                workersExceptions = workersExceptions!=null ? Arrays.copyOf(workersExceptions, workersExceptions.length+1) : new Throwable[1];
                workersExceptions[workersExceptions.length-1] = ex;
              }
              tryComplete(); // we handle the exception so wait perform normal completion
              return false;
            }
          }, datasetVecs, ratios, s).asyncExec(splits[s]);
        }
        tryComplete(); // complete the computation of nsplits-tasks
      }
    });
    tryComplete(); // complete the computation of thrown tasks
  }

  /** Blocking call to obtain a result of computation. */
  public Frame[] getResult() {
    join();
    if (workersExceptions!=null) throw new RuntimeException(workersExceptions[0]);
    return splits;
  }

  @Override public void onCompletion(CountedCompleter caller) {
    boolean exceptional = workersExceptions!=null;
    dataset.unlock(jobKey);
    if (splits!=null) {
      for (Frame s : splits) {
        if (s!=null) {
          if (!exceptional) {
            s.update(jobKey);
            s.unlock(jobKey);
          } else { // Have to unlock and delete here
            s.unlock(jobKey);
            s.delete(jobKey, 3.14f); // delete all splits
          }
        }
      }
    }
  }

  // Make vector templates for all output frame vectors
  private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
    Vec anyVec = dataset.anyVec();
    final long[][] espcPerSplit = computeEspcPerSplit(anyVec._espc, anyVec.length(), ratios);
    final int num = dataset.numCols(); // number of columns in input frame
    final int nsplits = espcPerSplit.length; // number of splits
    final String[][] domains = dataset.domains(); // domains
    final boolean[] uuids = dataset.uuids();
    final byte   [] times = dataset.times();
    Vec[][] t = new Vec[nsplits][/*num*/]; // resulting vectors for all
    for (int i=0; i<nsplits; i++) {
      // vectors for j-th split
      t[i] = new Vec(Vec.newKey(),espcPerSplit[i/*-th split*/]).makeZeros(num, domains, uuids, times);
    }
    return t;
  }

  // The task computes ESPC per split
  static long[/*nsplits*/][/*nchunks*/] computeEspcPerSplit(long[] espc, long len, float[] ratios) {
    assert espc.length>0 && espc[0] == 0;
    assert espc[espc.length-1] == len;
    long[] partSizes = Utils.partitione(len, ratios); // Split of whole vector
    int nparts = ratios.length+1;
    long[][] r = new long[nparts][espc.length]; // espc for each partition
    long nrows = 0;
    long start = 0;
    for (int p=0,c=0; p<nparts; p++) {
      int nc = 0; // number of chunks for this partition
      for(;c<espc.length-1 && (espc[c+1]-start) <= partSizes[p];c++) r[p][++nc] = espc[c+1]-start;
      if (r[p][nc] < partSizes[p]) r[p][++nc] = partSizes[p]; // last item in espc contains number of rows
      r[p] = Arrays.copyOf(r[p], nc+1);
      // Transfer rest of lines to the next part
      nrows = nrows-partSizes[p];
      start += partSizes[p];
    }
    return r;
  }

  /** MR task extract specified part of <code>_srcVecs</code>
   * into output chunk.*/
  private static class FrameSplitTask extends MRTask2<FrameSplitTask> {
    final Vec  [] _srcVecs; // a source frame given by list of its columns
    final float[] _ratios;  // split ratios
    final int     _partIdx; // part index

    transient int _pcidx; // Start chunk index for this partition
    transient int _psrow; // Start row in chunk for this partition

    public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, float[] ratios, int partIdx) {
      super(completer);
      _srcVecs = srcVecs;
      _ratios  = ratios;
      _partIdx = partIdx;
    }
    @Override protected void setupLocal() {
      // Precompute the first input chunk index and start row inside that chunk for this partition
      Vec anyInVec = _srcVecs[0];
      long[] partSizes = Utils.partitione(anyInVec.length(), _ratios);
      long pnrows = 0;
      for (int p=0; p<_partIdx; p++) pnrows += partSizes[p];
      long[] espc = anyInVec._espc;
      while (_pcidx < espc.length-1 && (pnrows -= (espc[_pcidx+1]-espc[_pcidx])) > 0 ) _pcidx++;
      assert pnrows <= 0;
      _psrow = (int) (pnrows + espc[_pcidx+1]-espc[_pcidx]);
    }
    @Override public void map(Chunk[] cs) { // Output chunks
      int coutidx = cs[0].cidx(); // Index of output Chunk
      int cinidx = _pcidx + coutidx;
      int startRow = coutidx > 0 ? 0 : _psrow; // where to start extracting
      int nrows = cs[0]._len;
      // For each output chunk extract appropriate rows for partIdx-th part
      for (int i=0; i<cs.length; i++) {
        // WARNING: this implementation does not preserve co-location of chunks so we are forcing here network transfer!
        ChunkSplitter.extractChunkPart(_srcVecs[i].chunkForChunkIdx(cinidx), cs[i], startRow, nrows, _fs);
      }
    }
  }
}