MRTask.java example

Explorer
h2o-3-master
package water;

import jsr166y.CountedCompleter;
import jsr166y.ForkJoinPool;
import water.fvec.*;
import water.util.DistributedException;
import water.util.PrettyPrint;
import water.fvec.Vec.VectorGroup;

import java.util.Arrays;

/**
 * Map/Reduce style distributed computation.
 * <p>
 * MRTask provides several <code>map</code> and <code>reduce</code> methods
 * that can be overridden to specify a computation. Several instances of this
 * class will be created to distribute the computation over F/J threads and
 * machines.  Non-transient fields are copied and serialized to instances
 * created for map invocations.  Reduce methods can store their results in
 * fields.  Results are serialized and reduced all the way back to the invoking
 * node.  When the last reduce method has been called, fields of the initial
 * MRTask instance contains the computation results.</p>
 * <p>
 * Apart from small reduced POJO returned to the calling node, MRTask can
 * produce output vector(s) as a result.  These will have chunks co-located
 * with the input dataset, however, their number of lines will generally differ
 * so they won't be strictly compatible with the original.  To produce output
 * vectors, call doAll.dfork version with required number of outputs and
 * override appropriate <code>map</code> call taking required number of
 * NewChunks.  MRTask will automatically close the new Appendable vecs and a
 * call to <code>outputFrame</code> will make a frame with newly created Vecs.
 * </p>
 *
 * <p><b>Overview</b></p>
 * <p>
 * Distributed computation starts by calling <code>doAll</code> or
 * <code>dfork</code>.  <code>doAll</code> simply calls <code>dfork</code>
 * and then blocks until the result is available; <code>dfork</code> itself
 * is non-blocking.  The main paradigm is divide-conquer-combine using
 * ForkJoin. </p>
 * <p>
 * If <code>doAll</code> is called with Keys, then one <code>map</code> call is
 * made per Key, on the Key's home node.  If MRTask is invoked on a Frame (or
 * equivalently a Vec[]), then one <code>map</code> call is made per Chunk for
 * all Vecs at once, on the Chunk's home node.  In both modes,
 * <code>reduce</code> is called between each pair of calls to
 * <code>map</code>.  </p>
 * <p>
 * MRTask can also be called with <code>doAllNodes</code>, in which case only
 * the setupLocal call is made once per node; neither map nor reduce are
 * called.</p>
 * <p>

 * Computation is tailored primarily by overriding.  The main method is the
 * <code>map</code> call, coupled sometimes with a <code>reduce</code> call.
 * <code>setupLocal</code> is called once per node before any map calls are
 * made on that node (but perhaps other nodes have already started); in reverse
 * <code>closeLocal</code> is called after the last map call completes on a
 * node (but perhaps other nodes are still computing maps).
 * <code>postGlobal</code> is called once only after all maps, reduces and
 * closeLocals, and only on the home node.</p>
 */
public abstract class MRTask<T extends MRTask<T>> extends DTask<T> implements ForkJoinPool.ManagedBlocker {

  /*
  * Technical note to developers:
  *
  *   There are several internal flags and counters used throughout. They are gathered in
  *   this note to help you reason about the execution of an MRTask.
  *
  *    internal "top-level" fields
  *    ---------------------------
  *     - RPC<T> _nleft, _nrite: "child" node/JVMs that are doing work
  *     - boolean _topLocal    : "root" MRTask on a local machine
  *     - boolean _topGlobal   : "root" MRTask on the "root" node
  *     - T _left, _rite       : "child" MRTasks on a local machine
  *     - T _res               : "result" MRTask (everything reduced into here)
  *     - int _nlo,_nhi        : range of nodes to do remote work on (divide-conquer; see Diagram 2)
  *     - Futures _fs          : _topLocal task blocks on _fs for _left and _rite to complete
  *
  *       Diagram 1: N is for Node; T is for Task
  *       -------------------------------------
  *              3 node cloud              Inside one of the 'N' nodes:
  *                   N1                               T  _topLocal**
  *                 /   \                            /  \
  *         N2 (_nleft)  N3 (_nrite)         T (_left)   T (_rite)
  *
  *                  **: T is also _topGlobal if N==N1
  *
  *    These fields get set in the <code>setupLocal0</code> call. Let's see what it does:
  *
  *     Diagram 2:
  *     ----------
  *       dfork on N1
  *         - _topGlobal=true
  *         - _nlo=0
  *         - _nhi=CLOUD_SIZE
  *                ||
  *                ||
  *                ||
  *                ==>       setupLocal0 on N1
  *                            - topLocal=true
  *                            - _fs = new Futures()
  *                            - nmid = (_nlo + _nhi) >> 1 => split the range of nodes (divide-conquer)
  *                            - _nleft = remote_compute(_nlo,nmid) => chooses a node in range and does new RPC().call()
  *                            - _nrite = remote_compute(nmid,_nhi)    serializing MRTask and call dinvoke on remote.
  *                           /                                 \
  *                         /                                     \
  *                       /                                         \
  *                  dinvoke on N2                              dinvoke on N3
  *                   setupLocal0 on N2                           setupLocal0 on N3
  *                     - topLocal=true                             - topLocal=true
  *                     - _fs = new Futures()                       - _fs = new Futures()
  *                     - (continue splitting)                      - (continue splitting)
  *                   H2O.submitTask(this) => compute2            H2O.submitTask(this) => compute2
  *
  */

  /** Default constructor; delegates to the DTask no-arg constructor. */
  public MRTask() { super(); }
  /** Creates a task, handing the given completer to the DTask constructor.
   *  @param cmp completer passed through to the superclass */
  protected MRTask(H2O.H2OCountedCompleter cmp) {super(cmp); }
  /** Creates a task, handing <code>prior</code> to the DTask constructor.
   *  @param prior presumably the task priority -- TODO confirm against DTask */
  protected MRTask(byte prior) { super(prior); }

  /**
   * This Frame instance is the handle for computation over a set of Vec instances. Recall
   * that a Frame is a collection Vec instances, so this includes any invocation of
   * <code>doAll</code> with Frame and Vec[] instances. Top-level calls to
   * <code>doAll</code> wrap Vec instances into a new Frame instance and set this into
   * <code>_fr</code> during a call to <code>dfork</code>.
   */
  public Frame _fr;

  /** This <code>Key[]</code> instance is the handle used for computation when
   *  an MRTask is invoked over an array of <code>Key</code>instances. */
  public Key[] _keys;

  /** The number and type of output Vec instances produced by an MRTask.  If
   *  null then there are no outputs, _appendables will be null, and calls to
   *  <code>outputFrame</code> will return null. */
  private byte _output_types[];

  /** First reserved VectorGroup key index for all output Vecs */
  private int _vid;

  /** Accessor for the output AppendableVec instances collected by this task.
   *  @return the <code>_appendables</code> array; may be null */
  public AppendableVec[] appendables() {
    return _appendables;
  }

  /** Appendables are treated separately (roll-ups computed in map/reduce
   *  style, can not be passed via K/V store).*/
  protected AppendableVec[] _appendables;

  /** Internal field to track the left & right remote nodes/JVMs to work on */
  transient protected RPC<T> _nleft, _nrite;

  /** Internal field to track if this is a top-level local call */
  transient protected boolean _topLocal; // Top-level local call, returning results over the wire

  /** Internal field to track if this is a top-level call. */
  transient boolean _topGlobal = false;

  /** Internal field to track the left & right sub-range of chunks to work on */
  transient protected T _left, _rite; // In-progress execution tree

  /** Internal field upon which all reduces occur. */
  transient private T _res;           // Result

  /** The range of Nodes to work on remotely */
  protected short _nlo, _nhi;

  /** Internal field to track a range of local Chunks to work on */
  transient protected int _lo, _hi;

  /** We can add more things to block on - in case we want a bunch of lazy
   *  tasks produced by children to all end before this top-level task ends.
   *  Semantically, these will all complete before we return from the top-level
   *  task.  Pragmatically, we block on a finer grained basis. */
  transient protected Futures _fs; // More things to block on

  /** If true, run entirely local - which will pull all the data locally. */
  protected boolean _run_local;

  /** Renders the collected profile as text.
   *  @return <code>_profile.toString()</code> when a profile is attached,
   *          otherwise the fixed text "Profiling turned off" */
  public String profString() {
    if( _profile == null ) return "Profiling turned off";
    return _profile.toString();
  }
  MRProfile _profile;

  /** Turns profiling on for this task by attaching a fresh MRProfile.
   *  Call as: <code>new MRTask().profile().doAll();</code>
   *  @return this, to allow call chaining */
  public T profile() {
    _profile = new MRProfile(this);
    return (T)this;
  }

  /** Builds the output Frame of this invoked MRTask without a key, column
   *  names or domains.  <b>This Frame is not in the DKV.</b>  AppendableVec
   *  instances are closed into Vec instances, which then appear in the DKV.
   *
   *  @return null if the task has no outputs, otherwise the resulting Frame
   */
  public Frame outputFrame() {
    return outputFrame(null, null, null);
  }

  /** Builds the output Frame of this invoked MRTask with the given column
   *  names and domains, but without a key.  <b>This Frame is not in the
   *  DKV.</b>  AppendableVec instances are closed into Vec instances, which
   *  then appear in the DKV.
   *
   *  @param names   column names for the resulting Frame
   *  @param domains column domains for the resulting Frame
   *  @return the result Frame, or null if the task has no outputs
   */
  public Frame outputFrame(String [] names, String [][] domains){
    return outputFrame(null, names, domains);
  }

  /**
   * Get the resulting Frame from this invoked MRTask. If the passed in <code>key</code>
   * is not null and the task produced outputs, then the resulting Frame is also put
   * into the DKV. AppendableVec instances are closed into Vec instances, which then
   * appear in the DKV.
   *
   * @param key If null, then the Frame will not appear in the DKV. Otherwise, this result
   *            will appear in the DKV under this key.
   * @param names The names of the columns in the resulting Frame.
   * @param domains The domains of the columns in the resulting Frame.
   * @return null if the task has no outputs (<code>_output_types</code> is null),
   *         otherwise returns a Frame.
   */
  public Frame outputFrame(Key<Frame> key, String [] names, String [][] domains){
    Futures fs = new Futures();
    Frame res = closeFrame(key, names, domains, fs);
    // closeFrame returns null when the task declared no outputs; in that case
    // there is nothing to publish, so a null Frame must not reach DKV.put.
    if( key != null && res != null ) DKV.put(res,fs);
    fs.blockForPending();       // Wait for every pending write queued on fs
    return res;
  }

  // Shared implementation behind all outputFrame overloads: turns the
  // collected AppendableVecs into Vecs and wraps them in a new Frame.
  // Returns null when the task declared no outputs.
  private Frame closeFrame(Key key, String[] names, String[][] domains, Futures fs) {
    if( _output_types == null ) return null;
    final int noutputs = _output_types.length;
    final Vec[] outs = new Vec[noutputs];
    final boolean noAppendables = _appendables == null || _appendables.length == 0;
    if( noAppendables ) {
      // Zero rows: fall back to makeZero() on a Vec of the input Frame
      for( int col = 0; col < noutputs; col++ )
        outs[col] = _fr.anyVec().makeZero();
    } else {
      // The row layout computed from the first appendable is used to close all of them
      final int rowLayout = _appendables[0].compute_rowLayout();
      for( int col = 0; col < noutputs; col++ ) {
        final AppendableVec av = _appendables[col];
        av.setDomain(domains == null ? null : domains[col]);
        outs[col] = av.close(rowLayout, fs);
      }
    }
    return new Frame(key, names, outs);
  }

  /** Override with your map implementation.  This overload is given a single
   *  <strong>local</strong> input Chunk.  It is meant for map/reduce jobs that use a
   *  single column in an input Frame.  compute2 invokes it when no outputs were
   *  requested and the Frame has exactly one Vec.  All applicable map variants are
   *  called, but only one is expected to be overridden. */
  public void map( Chunk c ) { }
  /** One input Chunk, one output NewChunk.  compute2 invokes it when exactly one
   *  output type was requested and the Frame has exactly one Vec.
   *  @param c  the local input Chunk
   *  @param nc the output NewChunk */
  public void map( Chunk c, NewChunk nc ) { }

  /** Override with your map implementation.  This overload is given two
   *  <strong>local</strong> Chunks.  compute2 invokes it when no outputs were
   *  requested and the Frame has exactly two Vecs.  All applicable map variants
   *  are called, but only one is expected to be overridden. */
  public void map( Chunk c0, Chunk c1 ) { }
  /** Two input Chunks, one output NewChunk.  compute2 invokes it when exactly one
   *  output type was requested and the Frame has exactly two Vecs. */
  public void map( Chunk c0, Chunk c1, NewChunk nc) { }
  /** One input Chunk, two output NewChunks.  compute2 invokes it when exactly two
   *  output types were requested and the Frame has exactly one Vec. */
  public void map( Chunk c0, NewChunk nc0, NewChunk nc1) { }
  //public void map( Chunk c0, Chunk c1, NewChunk nc1, NewChunk nc2 ) { }

  /** Override with your map implementation.  This overload is given three
   * <strong>local</strong> input Chunks.  compute2 invokes it when no outputs were
   * requested and the Frame has exactly three Vecs.  All applicable map variants
   * are called, but only one is expected to be overridden. */
  public void map( Chunk c0, Chunk c1, Chunk c2 ) { }
  //public void map( Chunk c0, Chunk c1, Chunk c2, NewChunk nc ) { }
  //public void map( Chunk c0, Chunk c1, Chunk c2, NewChunk nc1, NewChunk nc2 ) { }

  /** Override with your map implementation.  This overload is given an array
   *  of <strong>local</strong> input Chunks, for Frames with arbitrary column
   *  numbers.  compute2 invokes it whenever no outputs were requested, regardless
   *  of the number of Vecs.  All applicable map variants are called, but only one
   *  is expected to be overridden. */
  public void map( Chunk cs[] ) { }

  /** The handy method to generate a new vector based on existing vectors.
   *  compute2 invokes it whenever exactly one output type was requested.
   *
   * Note: This method is used by Sparkling Water examples.
   *
   * @param cs  input vectors
   * @param nc  output vector
   */
  public void map( Chunk cs[], NewChunk nc ) { }
  /** Array of input Chunks, two output NewChunks.  compute2 invokes it whenever
   *  exactly two output types were requested. */
  public void map( Chunk cs[], NewChunk nc1, NewChunk nc2 ) { }
  /** Array of input Chunks, array of output NewChunks.  compute2 invokes it for
   *  every processed chunk; <code>ncs</code> is null when no outputs were requested. */
  public void map( Chunk cs[], NewChunk [] ncs ) { }

  /** Override with your map implementation.  Used when doAll is called with
   *  an array of Keys, and called once-per-Key on the Key's Home node */
  public void map( Key key ) { }

  /** Override to combine results from 'mrt' into 'this' MRTask.  Both 'this'
   *  and 'mrt' are guaranteed to either have map() run on them, or be the
   *  results of a prior reduce().  Reduce is optional if, e.g., the result is
   *  some output vector.  The default implementation does nothing.
   *  @param mrt the other partial result to fold into this one */
  public void reduce( T mrt ) { }

  /** Override to do any remote initialization on the 1st remote instance of
   *  this object, for initializing node-local shared data structures.
   *  Invoked as the last step of setupLocal0, i.e. after any remote RPCs for
   *  this node's sub-range have already been launched.  The default
   *  implementation does nothing. */
  protected void setupLocal() {}
  /** Override to do any remote cleaning on the last remote instance of
   *  this object, for disposing of node-local shared data structures.
   *  The default implementation does nothing. */
  protected void closeLocal() { }

  /** Compute a permissible node index on which to launch remote work: offsets
   *  <code>x</code> by <code>_nlo</code> and wraps around once at the cloud size. */
  private int addShift( int x ) {
    final int shifted = x + _nlo;
    final int cloudSize = H2O.CLOUD.size();
    return shifted < cloudSize ? shifted : shifted - cloudSize;
  }
  /** Inverse of addShift: removes the <code>_nlo</code> offset from
   *  <code>x</code>, adding the cloud size back once if the result is negative. */
  private int subShift( int x ) {
    final int shifted = x - _nlo;
    final int cloudSize = H2O.CLOUD.size();
    return shifted < 0 ? shifted + cloudSize : shifted;
  }
  /** Index of this node as a short.  A negative index is only expected for a
   *  client node (asserted), which is reported as index 0. */
  private short selfidx() {
    final int idx = H2O.SELF.index();
    if( idx < 0 ) {
      assert H2O.SELF._heartbeat._client;
      return 0;
    }
    return (short)idx;
  }

  // Profiling support.  Holds timing information for each phase of a single
  // M/R task, plus the profile of nested MRTasks.  Every number is either a
  // System.currentTimeMillis() stamp or a duration in milliseconds.
  private static class MRProfile extends Iced {
    String _clz;
    public MRProfile(MRTask mrt) {
      _clz = mrt.getClass().toString();
      _localdone = System.currentTimeMillis();
    }
    // The meaning of each stamp is best understood from where it is set.  When
    // the job is split, _lstart & _rstart are the starts of the left & right
    // jobs; when it is NOT split, _rstart is 0 and _lstart covers the user map.
    long _localstart, _rpcLstart, _rpcRstart, _rpcRdone, _localdone; // Local setup, RPC network i/o times
    long _mapstart, _userstart, _closestart, _mapdone; // MAP phase
    long _onCstart, _reducedone, _closeLocalDone, _remoteBlkDone, _localBlkDone, _onCdone; // REDUCE phase
    // For a left/right split we keep the full record of the job finishing
    // last, and only exec time & completion stamp of the job finishing first.
    long _time1st, _done1st;
    int _size_rez0, _size_rez1; // i/o size in bytes during reduce
    MRProfile _last;

    // Elapsed time from the first recorded start stamp to the end of onCompletion.
    long sumTime() {
      final long begin = (_localstart == 0) ? _mapstart : _localstart;
      return _onCdone - begin;
    }

    // Folds a child's profile into this one, tracking which child finished last.
    void gather( MRProfile p, int size_rez ) {
      p._clz = null;
      if( _last == null ) {
        _last = p;
        _time1st = p.sumTime();
        _done1st = p._onCdone;
      } else {
        // Decide both roles from the current _last before overwriting it
        final MRProfile earlier = _last._onCdone <= p._onCdone ? _last : p;
        _last = _last._onCdone > p._onCdone ? _last : p;
        if( earlier._onCdone > _done1st ) {
          _time1st = earlier.sumTime();
          _done1st = earlier._onCdone;
        }
      }
      if( size_rez != 0 ) {     // Record i/o result size: first slot, then second
        if( _size_rez0 == 0 ) {
          _size_rez0 = size_rez;
        } else {
          _size_rez1 = size_rez;
        }
      }
      assert _userstart !=0 || _last != null;
      assert _last._onCdone >= _done1st;
    }

    @Override public String toString() {
      final StringBuilder sb = new StringBuilder();
      return print(sb, 0).toString();
    }

    // Appends this profile at nesting depth d (two spaces of indent per level).
    private StringBuilder print(StringBuilder sb, int d) {
      if( d == 0 ) {
        sb.append(_clz).append("\n");
      }
      for( int lvl = 0; lvl < d; lvl++ ) sb.append("  ");
      if( _localstart != 0 ) {
        sb.append("Node local ").append(_localdone - _localstart).append("ms, ");
      }
      if( _last != null ) {     // This task forked children
        sb.append("Slow wait ").append(_mapstart - _localdone);
        sb.append("ms + work ").append(_last.sumTime()).append("ms, ");
        sb.append("Fast work ").append(_time1st);
        sb.append("ms + wait ").append(_onCstart - _done1st).append("ms\n");
        _last.print(sb, d + 1); // Recurse into the child that finished last
        for( int lvl = 0; lvl < d; lvl++ ) sb.append("  ");
        sb.append("join-i/o ").append(_onCstart - _last._onCdone).append("ms, ");
      }
      if( _userstart != 0 ) {   // A user map call was timed on this task
        sb.append("Map ").append(_mapdone - _mapstart);
        sb.append("ms (prep ").append(_userstart - _mapstart);
        sb.append("ms, user ").append(_closestart - _userstart);
        sb.append("ms, closeChk ").append(_mapdone - _closestart).append("ms), ");
      }
      sb.append("Red ").append(_onCdone - _onCstart);
      sb.append("ms (locRed ").append(_reducedone - _onCstart).append("ms");
      if( _remoteBlkDone != 0 ) {
        sb.append(", close ").append(_closeLocalDone - _reducedone);
        sb.append("ms, remBlk ").append(_remoteBlkDone - _closeLocalDone);
        sb.append("ms, locBlk ").append(_localBlkDone - _remoteBlkDone);
        sb.append("ms, close ").append(_onCdone - _localBlkDone);
        sb.append("ms, size ").append(PrettyPrint.bytes(_size_rez0));
        sb.append("+").append(PrettyPrint.bytes(_size_rez1));
      }
      sb.append(")\n");
      return sb;
    }
  }

  // Strongly-typed 'this', so fluent call chains keep the concrete task type
  protected T self() {
    return (T)this;
  }

  /** Invokes the map/reduce computation over the given Vecs, with no output
   *  Vecs.  This call is blocking. */
  public final T doAll( Vec... vecs ) {
    return doAll(null, vecs);
  }
  /** Blocking run over the given Vecs, wrapped in a new Frame, producing
   *  outputs of the given types; not forced to run locally. */
  public final T doAll(byte[] types, Vec... vecs ) {
    return doAll(types, new Frame(vecs), false);
  }
  /** Blocking run over the given Vecs producing a single output of the given type. */
  public final T doAll(byte type, Vec... vecs ) {
    final byte[] types = new byte[]{type};
    return doAll(types, new Frame(vecs), false);
  }
  /** Blocking run over one Vec with no outputs, optionally forced to run locally. */
  public final T doAll( Vec vec, boolean run_local ) {
    return doAll(null, vec, run_local);
  }
  /** Blocking run over one Vec, wrapped in a new Frame, producing outputs of
   *  the given types, optionally forced to run locally. */
  public final T doAll(byte[] types, Vec vec, boolean run_local ) {
    return doAll(types, new Frame(vec), run_local);
  }

  /** Invokes the map/reduce computation over the given Frame with no output
   *  Vecs, optionally forced to run locally.  This call is blocking.  */
  public final T doAll( Frame fr, boolean run_local) {
    return doAll(null, fr, run_local);
  }
  /** Blocking run over the Frame with no outputs; not forced to run locally. */
  public final T doAll( Frame fr ) {
    return doAll(null, fr, false);
  }
  /** Blocking run over the Frame producing outputs of the given types. */
  public final T doAll( byte[] types, Frame fr) {
    return doAll(types, fr, false);
  }
  /** Blocking run over the Frame producing a single output of the given type. */
  public final T doAll( byte type, Frame fr) {
    final byte[] types = new byte[]{type};
    return doAll(types, fr, false);
  }
  /** The blocking work-horse: forks the task via dfork, then waits in
   *  getResult for it to finish. */
  public final T doAll( byte[] types, Frame fr, boolean run_local) {
    dfork(types, fr, run_local);
    return getResult();
  }
  // Blocking run whose outputs are 'nouts' Vecs, all of the same 'type'
  public final T doAll( int nouts, byte type, Frame fr) {
    final byte[] sameTypes = new byte[nouts];
    Arrays.fill(sameTypes, type);
    return doAll(sameTypes, fr, false);
  }

  // Special mode doing 1 map per key.  No frame.  Blocking: forks the task
  // through dfork(Key...) and then waits for the result.
  public T doAll( Key... keys ) {
    dfork(keys);
    return getResult();         // Block For All
  }
  // Non-blocking launch of the 1-map-per-key mode (no Frame).  Marks this
  // task as the global root, covers the node range from this node's index up
  // to the cloud size, runs local setup and submits the task for execution.
  public void dfork(Key... keys ) {
    _topGlobal = true;
    _keys = keys;
    _nlo = selfidx();                     // Node range starts at this node ...
    _nhi = (short)H2O.CLOUD.size();       // ... and spans the whole cloud
    setupLocal0();
    H2O.submitTask(this);
  }

  // Once-per-node mode: a blocking run with a null Key array
  public T doAllNodes() {
    return doAll((Key[])null);
  }

  // Non-blocking counterpart of doAllNodes: forks with a null Key array
  public void asyncExecOnAllNodes() {
    dfork((Key[]) null);
  }

  /**
   * Starts the map/reduce computation over the given Vec instances, wrapped in a
   * new Frame, producing output Vec instances of the given types. This call is
   * asynchronous; invoke <code>getResult</code> on the returned task to block
   * until the computation completes.
   *
   * @param types The types of the output Vec instances to create.
   * @param vecs The input Vec instances upon which computation is performed.
   * @return this
   */
  public final T dfork( byte[] types, Vec... vecs) {
    return dfork(types, new Frame(vecs), false);
  }

  /** Asynchronous run over the given Vec instances, wrapped in a new Frame,
   *  with no outputs.
   *  @param vecs the input Vec instances
   *  @return this */
  public final T dfork(Vec... vecs){
    return dfork(null, new Frame(vecs), false);
  }
  /**
   * Starts the map/reduce computation over the given Frame instance. This call is
   * asynchronous; invoke <code>getResult</code> on the returned task to block
   * until the computation completes. No output Vec or Frame instances are
   * produced.
   *
   * @param fr The Frame instance to compute over.
   * @return this
   */
  public final T dfork(Frame fr){
    return dfork(null, fr, false);
  }

  /** Fork the task without blocking; every other Frame/Vec flavour of dfork
   *  and doAll ends up here.  Marks this task as the global root, records the
   *  requested output types and input Frame, reserves VectorGroup keys for the
   *  outputs (if any), covers the node range from this node's index up to the
   *  cloud size, runs local setup and submits the task for execution.
   *
   *  @param types     types of the output Vecs; null or empty for no outputs
   *  @param fr        the Frame to compute over
   *  @param run_local if true, run entirely on this node
   *  @return this
   */
  public final T dfork( byte[] types, Frame fr, boolean run_local) {
    _topGlobal = true;
    _output_types = types;
    final boolean wantsOutputs = types != null && types.length > 0;
    if( wantsOutputs ) {
      // Reserve one key per output in the input's VectorGroup; _vid is the returned index
      _vid = fr.anyVec().group().reserveKeys(types.length);
    }
    _fr = fr;
    _nlo = selfidx();                     // Node range starts at this node ...
    _nhi = (short)H2O.CLOUD.size();       // ... and spans the whole cloud
    _run_local = run_local;
    setupLocal0();
    H2O.submitTask(this);
    return self();
  }

  /** Block for and get any final results from a dfork'd MRTask.
   *  Note: the desired name 'get' is final in ForkJoinTask.
   *
   *  @param fjManagedBlock if true, block through ForkJoinPool.managedBlock;
   *         if false, call block() directly on the current thread
   *  @return this, once the task is done
   *  @throws DistributedException wrapping any Throwable other than
   *          InterruptedException raised while waiting for the task
   */
  public final T getResult(boolean fjManagedBlock) {
    assert getCompleter()==null; // No completer allowed here; FJ never awakens threads with completers
    // Keep waiting until the task reports done; an interrupt just retries the wait
    do {
      try {
        if(fjManagedBlock)
          ForkJoinPool.managedBlock(this);
        else
          // For the cases when we really want to block this thread without FJ framework scheduling a new worker thread.
          // Model use is in MultifileParseTask - we want to be parsing at most cluster ncores files in parallel.
          block();
        join(); // Throw any exception the map call threw
      } catch (InterruptedException ignore) {
        // do nothing
        // NOTE(review): the interrupt is swallowed and the thread's interrupt
        // status is not restored -- confirm this is intended.
      } catch (Throwable re) {
        onExceptionalCompletion(re,null); // block for left and rite
        // A DistributedException is re-created from its message and cause; anything else is wrapped as-is
        throw (re instanceof DistributedException)?new DistributedException(re.getMessage(),re.getCause()):new DistributedException(re);
      }
    } while( !isReleasable());
    assert _topGlobal:"lost top global flag";
    return self();
  }
  /** Block for and get any final results from a dfork'd MRTask, blocking
   *  through ForkJoinPool.managedBlock.
   *  Note: the desired name 'get' is final in ForkJoinTask.
   *  @return this, once the task is done */
  public final T getResult() {
    return getResult(true);
  }

  // Blocking is unnecessary exactly when the task is done.
  public boolean isReleasable() {
    return isDone();
  }
  // Possibly blocks the current thread until the task is done.  Returns true
  // once isReleasable would return true.  Used by the FJ Pool management to
  // spawn threads to prevent deadlock if otherwise all threads would block on
  // waits.
  public boolean block() throws InterruptedException {
    while( !isDone() ) {
      join();
    }
    return true;
  }

  /** Called once on remote at top level, probably with a subset of the cloud.
   *  Called internal by D/F/J.  Not expected to be user-called.  Runs the same
   *  local setup as dfork and then submits this task for execution.
   *  @param sender the node that sent this task; not used here */
  @Override public final void dinvoke(H2ONode sender) {
    setupLocal0();              // Local setup
    H2O.submitTask(this);
  }

  /** Controls whether setupLocal0 calls <code>preWriting()</code> on every
   *  volatile Vec of the input Frame before the task runs.
   *  @return true by default; override to return false to skip that step */
  protected boolean modifiesVolatileVecs(){
    return true;
  }
  /*
   * Set top-level fields and fire off remote work (if there is any to do) to 2 selected
   * child JVM/nodes. Setup for local work: fire off any global work to cloud neighbors; do all
   * chunks; call user's init.
   *
   * Called from both dfork variants (on the initiating node) and from dinvoke (on a
   * remote node), always before the task is submitted for execution.
   */
  private void setupLocal0() {
    // If profiling was requested, start over with a fresh profile for this node
    if(_profile != null)
      (_profile = new MRProfile(this))._localstart = System.currentTimeMillis();
    // Make a blockable Futures for both internal and user work to block on.
    _fs = new Futures();
    // Give volatile input Vecs a chance to prepare for writing, unless the task opted out
    if(modifiesVolatileVecs() && _fr != null){
      for(Vec v:_fr.vecs())
        if(v.isVolatile())v.preWriting();
    }
    _topLocal = true;
    // Check for global vs local work
    int selfidx = selfidx();
    int nlo = subShift(selfidx); // This node's position relative to _nlo
    assert nlo < _nhi;
    final int nmid = (nlo+_nhi)>>>1; // Mid-point

    // Run remote IF:
    // - Not forced to run local (no remote jobs allowed) AND
    // - - There's remote work, or Client mode (always remote work)
    if( (!_run_local) && ((nlo+1 < _nhi) || H2O.ARGS.client) ) {
      if(_profile!=null) _profile._rpcLstart = System.currentTimeMillis();
      // Left half: a client starts at nlo itself, a regular node skips itself (nlo+1)
      _nleft = remote_compute(H2O.ARGS.client ? nlo : nlo+1,nmid);
      if(_profile!=null) _profile._rpcRstart = System.currentTimeMillis();
      _nrite = remote_compute( nmid,_nhi);
      if(_profile!=null) _profile._rpcRdone  = System.currentTimeMillis();
    } else {
      // No remote work: all three RPC stamps collapse to the same instant
      if(_profile!=null)
        _profile._rpcLstart = _profile._rpcRstart = _profile._rpcRdone = System.currentTimeMillis();
    }

    if( _fr != null ) {                       // Doing a Frame
      _lo = 0;  _hi = _fr.numCols()==0 ? 0 : _fr.anyVec().nChunks(); // Do All Chunks
      // get the Vecs from the K/V store, to avoid racing fetches from the map calls
      _fr.vecs();
    } else if( _keys != null ) {    // Else doing a set of Keys
      _lo = 0;  _hi = _keys.length; // Do All Keys
    }
    // Setup any user's shared local structures for both normal cluster nodes
    // and any client; want this for possible reduction ONTO client
    setupLocal();
    if(_profile!=null) _profile._localdone = System.currentTimeMillis();
  }

  // Launch this task on the node at the low end of the given (shifted) range
  // via an RPC, handing it the range's upper bound.  One pending completion is
  // added to self, so that we complete only when the RPC completes.  Returns
  // null when the range is empty.
  private RPC<T> remote_compute( int nlo, int nhi ) {
    if( nlo >= nhi ) return null;   // Empty range => no remote work

    final int target = addShift(nlo);
    assert target != H2O.SELF.index(); // Not the same as selfidx() if this is a client
    final T remote = copyAndInit();
    remote._nhi = (short) nhi;
    addToPendingCount(1);           // Not complete until the RPC returns
    // Register self as the completer of this RPC: the returning ACK wakes us.
    // Two different completions are in play here:
    // - on the remote side, the RPCCall (NOT RPC!) is completed by the last
    //   map/compute2 call, which signals the end of the remote work and ACK's
    //   back the result. i.e., last-map calls RPCCall.onCompletion.
    // - on the local side (the line just below), the completed RPC calls our
    //   self completion.  i.e. the completed RPC calls MRTask.onCompletion
    return new RPC<>(H2O.CLOUD._memary[target], remote).addCompleter(this).call();
  }

  /** Called from FJ threads to do local work.  The first called Task (which is
   *  also the last one to Complete) also reduces any global work.  Called
   *  internal by F/J.  Not expected to be user-called.
   *  <p>
   *  A range of two or more chunks/keys is split in half: the left half is
   *  forked, the right half is computed in the calling thread.  A range of at
   *  most one chunk/key is processed directly by invoking the applicable
   *  <code>map</code> overloads, after which <code>tryComplete</code> is called.
   */
  @Override public final void compute2() {
    assert _left == null && _rite == null && _res == null;
    if(_profile!=null) _profile._mapstart = System.currentTimeMillis();
    if( (_hi-_lo) >= 2 ) { // Multi-chunk case: just divide-and-conquer to 1 chunk
      final int mid = (_lo+_hi)>>>1; // Mid-point
      _left = copyAndInit();
      _rite = copyAndInit();
      _left._hi = mid;          // Reset mid-point
      _rite._lo = mid;          // Also set self mid-point
      addToPendingCount(1);     // One fork awaiting completion
      if( !isCompletedAbnormally() ) _left.fork();     // Runs in another thread/FJ instance
      if( !isCompletedAbnormally() ) _rite.compute2(); // Runs in THIS F/J thread
      if(_profile!=null) _profile._mapdone = System.currentTimeMillis();
      return;                   // Not complete until the fork completes
    }
    // Zero or 1 chunks, and further chunk might not be homed here
    if( _fr==null ) {           // No Frame, so doing Keys?
      if( _keys == null ||     // Once-per-node mode
          _hi > _lo && _keys[_lo].home() ) {
        assert(_keys == null || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";
        if(_profile!=null) _profile._userstart = System.currentTimeMillis();
        if( _keys != null ) map(_keys[_lo]);
        _res = self();        // Save results since called map() at least once!
        if(_profile!=null) _profile._closestart = System.currentTimeMillis();
      }
    } else if( _hi > _lo ) {    // Frame, Single chunk?
      Vec v0 = _fr.anyVec();
      if( _run_local || v0.chunkKey(_lo).home() ) { // And chunk is homed here?
        assert(_run_local || !H2O.ARGS.client) : "Client node should not process any keys in MRTask!";

        // Make decompression chunk headers for these chunks
        Vec vecs[] = _fr.vecs();
        Chunk bvs[] = new Chunk[vecs.length];
        NewChunk [] appendableChunks = null;
        for( int i=0; i<vecs.length; i++ )
          if( vecs[i] != null ) {
            assert _run_local || vecs[i].chunkKey(_lo).home()
              : "Chunk="+_lo+" v0="+v0+", k="+v0.chunkKey(_lo)+"   v["+i+"]="+vecs[i]+", k="+vecs[i].chunkKey(_lo);
            bvs[i] = vecs[i].chunkForChunkIdx(_lo);
          }

        // One AppendableVec (keyed from the reserved _vid range) and one
        // NewChunk for this chunk index per requested output type
        if(_output_types != null) {
          final VectorGroup vg = vecs[0].group();
          _appendables = new AppendableVec[_output_types.length];
          appendableChunks = new NewChunk[_output_types.length];
          for(int i = 0; i < _appendables.length; ++i) {
            _appendables[i] = new AppendableVec(vg.vecKey(_vid+i),_output_types[i]);
            appendableChunks[i] = _appendables[i].chunkForChunkIdx(_lo);
          }
        }
        // Call all the various map() calls that apply
        if(_profile!=null)
          _profile._userstart = System.currentTimeMillis();

        // Dispatch on (number of outputs, number of input Vecs): first the
        // fixed-arity convenience overload, if one exists, then the array form.
        int num_fr_vecs = _fr.vecs().length;
        int num_outputs = _output_types == null? 0 : _output_types.length;
        if (num_outputs == 0) {
          if (num_fr_vecs == 1) map(bvs[0]);
          else if (num_fr_vecs == 2) map(bvs[0], bvs[1]);
          else if (num_fr_vecs == 3) map(bvs[0], bvs[1], bvs[2]);
          map(bvs);
        }
        else if (num_outputs == 1) {  // convenience versions for cases with single output.
          assert appendableChunks != null;
          if (num_fr_vecs == 1) map(bvs[0], appendableChunks[0]);
          else if (num_fr_vecs == 2) map(bvs[0], bvs[1], appendableChunks[0]);
          // else if (fr_vecs_length == 3) map(bvs[0], bvs[1], bvs[2], appendableChunks[0]);
          map(bvs, appendableChunks[0]);
        }
        else if (num_outputs == 2) {  // convenience versions for cases with 2 outputs (e.g split).
          assert appendableChunks != null;
          if (num_fr_vecs == 1) map(bvs[0], appendableChunks[0], appendableChunks[1]);
          // else if (fr_vecs_length == 2) map(bvs[0], bvs[1], appendableChunks[0], appendableChunks[1]);
          // else if (fr_vecs_length == 3) map(bvs[0], bvs[1], bvs[2], appendableChunks[0], appendableChunks[1]);
          map(bvs, appendableChunks[0], appendableChunks[1]);
        }
        // NOTE(review): this condition is always true, so the array/array
        // overload runs for every chunk, with a null NewChunk[] when there are
        // no outputs -- confirm that is intended.
        if (num_outputs >= 0)
          map(bvs, appendableChunks);

        _res = self();          // Save results since called map() at least once!
        // Further D/K/V put any new vec results.
        if(_profile!=null)
          _profile._closestart = System.currentTimeMillis();
        // NOTE(review): bvs[i] is left null when vecs[i] is null (see the loop
        // above), which would make this loop throw -- confirm Frame.vecs()
        // never contains null entries.
        for( Chunk bv : bvs )  bv.close(_lo,_fs);
        if( _output_types != null) for(NewChunk nch:appendableChunks)nch.close(_lo, _fs);
      }
    }
    if(_profile!=null)
      _profile._mapdone = System.currentTimeMillis();
    tryComplete();
  }

  /** Fork/Join completion hook: folds the results of the left and right local
   *  children into this task and, when this is the top local task, runs the
   *  post-local phase.  Invoked internally by F/J; users are not expected to
   *  call it. */
  @Override public final void onCompletion( CountedCompleter caller ) {
    if( _profile != null ) {
      _profile._onCstart = System.currentTimeMillis();
    }
    // Collapse partial results upward through the execution tree.  Each child
    // reference is dropped right after it is reduced so that a large subtree
    // (and the partial results it holds) is not kept alive by accident.
    reduce2(_left);
    _left = null;
    reduce2(_rite);
    _rite = null;
    if( _profile != null ) {
      _profile._reducedone = System.currentTimeMillis();
    }
    // Extra completion work exists only for the top local call.
    if( _topLocal ) {
      postLocal0();
    }
    if( _profile != null ) {
      _profile._onCdone = System.currentTimeMillis();
    }
  }


  // Fold one locally-mapped child MRTask into this task.  A null child is a
  // no-op.  The child's profile (if profiling) is gathered, and the child's
  // result is either adopted (no local result yet) or merged via reduce4.
  private void reduce2( MRTask<T> mrt ) {
    if( mrt != null ) {
      if( _profile != null ) {
        _profile.gather(mrt._profile, 0);
      }
      final T childResult = mrt._res;
      if( _res == null ) {
        _res = childResult;             // Nothing local yet: take the child's result as-is
      } else if( childResult != null ) {
        _res.reduce4(childResult);      // Both sides have results: merge them
      }
      // Futures are shared on the local node and transient (so there are no
      // remote updates); parent and child must reference the same instance.
      assert _fs == mrt._fs;
    }
  }

  /** Continuation hook for user code.  It is invoked at the very end of the
   *  post-local phase, and only on the task flagged as top-global.  The
   *  default implementation does nothing. */
  protected void postGlobal() {
  }

  // Runs once all of the main local work has finished.  In order:
  //   1. user's node-local cleanup,
  //   2. gather/reduce of the remote work,
  //   3. block for other queued pending tasks,
  //   4. copy the final results into 'this', so that returning 'this' returns
  //      the results,
  //   5. on the top-global task only: frame post-write work and the user's
  //      continuation.
  private void postLocal0() {
    // User's node-local cleanup
    closeLocal();
    if( _profile != null ) {
      _profile._closeLocalDone = System.currentTimeMillis();
    }

    // Reduce global results from the neighbors
    reduce3(_nleft);
    reduce3(_nrite);
    if( _profile != null ) {
      _profile._remoteBlkDone = System.currentTimeMillis();
    }

    // Block on any pending user tasks
    _fs.blockForPending();
    if( _profile != null ) {
      _profile._localBlkDone = System.currentTimeMillis();
    }

    // The API contract is that all results end up in 'this', because that is
    // what the user expects.
    if( _res != null ) {
      if( _res != this ) {              // A local result exists and it is not self
        _res._profile = _profile;       // Keep my profile, not the child's
        copyOver(_res);                 // Copy the result into self
      }
    } else {
      _nhi = -1;                        // Flag: no local results *at all*
    }

    if( _topGlobal ) {
      // Post-writing work (zap rollup fields, etc) when a frame is present
      if( _fr != null ) {
        _fr.postWrite(_fs).blockForPending();
      }
      // User's continuation work
      postGlobal();
    }
  }

  // Wait for a remote RPC to finish, then fold its global result into this
  // task's result.  A null RPC is a no-op.
  private void reduce3( RPC<T> rpc ) {
    if( rpc == null ) {
      return;
    }
    // Blocking remote call
    final T remote = rpc.get();
    // Note: _fs is transient, so the RPC neither sets nor clears it.  Since
    // the returned object is a clone of 'self' it likely still points at the
    // self _fs, which will be non-null and may still have local pending
    // blocks.  Not much can be asserted about it here.
    if( _profile != null ) {
      _profile.gather(remote._profile, rpc.size_rez());
    }
    // Unlike reduce2, the results live directly in the returned task rather
    // than in its _res field.  An _nhi of -1 flags "no results at all".
    if( remote._nhi == -1L ) {
      return;
    }
    if( _res == null ) {
      _res = remote;
    } else {
      _res.reduce4(remote);
    }
  }

  /** Merges another task into this one: first any new AppendableVecs are
   *  reduced pairwise, then a missing exception is inherited from the other
   *  task, and finally the user's {@code reduce} is invoked.  Called
   *  internally by F/J; users are not expected to call it. */
  void reduce4( T mrt ) {
    // Pairwise reduction of the output AppendableVecs, when outputs exist
    if( _output_types != null ) {
      for( int idx = 0; idx < _appendables.length; idx++ ) {
        _appendables[idx].reduce(mrt._appendables[idx]);
      }
    }
    // Keep an already-recorded exception; otherwise take the other task's
    if( _ex == null ) {
      _ex = mrt._ex;
    }
    // Hand over to the user's reduction
    reduce(mrt);
  }

  // Full local work-tree cancellation: unless this task is already done,
  // cancel it and then descend into its local children.
  void self_cancel2() {
    if( isDone() ) {
      return;
    }
    cancel(true);
    self_cancel1();
  }
  // Cancel the work trees hanging off the left and right local children.
  // Each child field is read once into a local before the null check.
  private void self_cancel1() {
    final T leftChild = _left;
    if( leftChild != null ) {
      leftChild.self_cancel2();
    }
    final T rightChild = _rite;
    if( rightChild != null ) {
      rightChild.self_cancel2();
    }
  }

  /** Exceptional-completion hook: cancels as much local work as possible,
   *  waits for outstanding remote work to stop, and always returns
   *  {@code true} rather than invisibly swallowing the exception (which is
   *  the F/J default).  Called internally by F/J; users are not expected to
   *  call it. */
  @Override public final boolean onExceptionalCompletion( Throwable ex, CountedCompleter caller ) {
    self_cancel1();

    // Wait for the remote work to finish.  The results are unwanted, but all
    // workers must be stopped before this task completes.  Otherwise this
    // task would quit early and start its post-task processing (generally
    // cleanup after the exception) while work is still on-going - often
    // touching the very same Keys that are being cleaned up!
    //
    // Blocking may itself throw (generally the same exception, again and
    // again), so anything thrown here is caught and dropped; only the first
    // exception, which we already have, is kept.
    final RPC<T> remoteLeft = _nleft;
    if( remoteLeft != null ) {
      try {
        remoteLeft.get();
      } catch( Throwable ignore ) {
        // Intentionally ignored, see above
      }
    }
    _nleft = null;

    final RPC<T> remoteRight = _nrite;
    if( remoteRight != null ) {
      try {
        remoteRight.get();
      } catch( Throwable ignore ) {
        // Intentionally ignored, see above
      }
    }
    _nrite = null;

    return true;
  }

  // Clone this task for use as a child: the clone completes into 'this',
  // is neither a top-global nor a top-local task, starts with no remote or
  // local children, and shares this task's Futures.
  private T copyAndInit() {
    final T child = clone();
    child._topGlobal = false;
    child.setCompleter(this);           // Completer used to be a final field; set it explicitly
    child._topLocal = false;            // Not a top job
    // No remote children yet
    child._nrite = null;
    child._nleft = null;
    // No local children yet
    child._rite = null;
    child._left = null;
    child._fs = _fs;
    // When profiling, the clone needs a profile of its own; otherwise none
    child._profile = ( _profile != null ) ? new MRProfile(this) : null;
    // Volatile write for the completer field; also resets the pending count
    child.setPendingCount(0);
    return child;
  }
}