package hex.gbm;

import water.api.AUCData;
import static water.util.MRUtils.sampleFrameStratified;
import static water.util.ModelUtils.getPrediction;
import hex.ConfusionMatrix;
import hex.VarImp;
import hex.drf.DRF;
import hex.rng.MersenneTwisterRNG;
import jsr166y.CountedCompleter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.Job.ValidatedJob;
import water.api.AUC;
import water.api.DocGen;
import water.api.ParamImportance;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;
import water.util.Log.Tag.Sys;
import water.util.MRUtils;
import water.util.ModelUtils;
import water.util.Utils;

import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import java.util.Random;

/**
 * Shared (distributed) tree builder.
 *
 * <p>Used for both <em>Gradient Boosted Machine</em> (see {@link GBM}) and <em>Random
 * Forest</em> (see {@link DRF}), and really could be used for any decision-tree builder.</p>
 *
 * <p>While this is a wholly H<sub>2</sub>O design, we found these papers afterwards that
 * describe our design fairly well:</p>
 * <ul>
 * <li><a href="http://www.cse.wustl.edu/~kilian/papers/fr819-tyreeA.pdf">Parallel GBRT</a></li>
 * <li><a href="http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf">Streaming parallel decision tree</a></li>
 * </ul>
 *
 * <p>Note that our <em>dynamic histogram</em> technique is different (surely faster, and
 * probably less mathematically clean). I'm sure a host of other smaller details
 * differ also - but in the Big Picture the papers and our algorithm are similar.</p>
 */
public abstract class SharedTreeModelBuilder<TM extends DTree.TreeModel> extends ValidatedJob {
  static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
  static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.

  @API(help = "Number of trees. Grid Search, comma sep values:50,100,150,200", filter = Default.class, lmin=1, lmax=1000000, json=true, importance=ParamImportance.CRITICAL)
  public int ntrees = 50;

  @API(help = "Maximum tree depth. Grid Search, comma sep values:5,7", filter = Default.class, lmin=1, lmax=10000, json=true, importance=ParamImportance.CRITICAL)
  public int max_depth = 5;

  @API(help = "Fewest allowed observations in a leaf (in R called 'nodesize'). Grid Search, comma sep values", filter = Default.class, lmin=1, json=true, importance=ParamImportance.SECONDARY)
  public int min_rows = 10;

  @API(help = "Build a histogram of this many bins, then split at the best point", filter = Default.class, lmin=2, lmax=10000, json=true, importance=ParamImportance.SECONDARY)
  public int nbins = 20;

  @API(help = "Perform scoring after each iteration (can be slow)", filter = Default.class, json=true)
  public boolean score_each_iteration = false;

  @API(help = "Compute variable importance (true/false).", filter = Default.class )
  protected boolean importance = true; // compute variable importance

  /**
   * For imbalanced data, balance training data class counts via
   * over/under-sampling. This can result in improved predictive accuracy.
   */
  @API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
  public boolean balance_classes = false;

  /**
   * Desired over/under-sampling ratios per class (lexicographic order). Only when balance_classes is enabled.
If not specified, they will be automatically computed to obtain class balance during training. */ @API(help = "Desired over/under-sampling ratios per class (lexicographic order).", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.SECONDARY) public float[] class_sampling_factors; /** * When classes are balanced, limit the resulting dataset size to the * specified multiple of the original dataset size. */ @API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, importance = ParamImportance.EXPERT) public float max_after_balance_size = Float.POSITIVE_INFINITY; @API(help = "Model checkpoint to start building a new model from", filter = Default.class, json = true, required = false) public Key checkpoint; @API(help = "Overwrite checkpoint", filter = Default.class, json = true, required = false) public boolean overwrite_checkpoint = true; // @API(help = "Active feature columns") protected int _ncols; // @API(help = "Rows in training dataset") protected long _nrows; // @API(help = "Number of classes") protected int _nclass; @API(help = "Class distribution") protected long _distribution[]; // Distribution of classes in response protected float[] _priorClassDist = null; // New distribution of classes if input frame was modified (resampled, balanced) protected float[] _modelClassDist = null; // Number of trees inherited from checkpoint protected int _ntreesFromCheckpoint; /** Maximal number of supported levels in response. */ public static final int MAX_SUPPORTED_LEVELS = 1000; /** Marker for already decided row. */ static public final int DECIDED_ROW = -1; /** Marker for sampled out rows */ static public final int OUT_OF_BAG = -2; @Override public float progress(){ Value value = DKV.get(dest()); DTree.TreeModel m = value != null ? (DTree.TreeModel) value.get() : null; return m == null ? 0 : cv_progress(m.ntrees() / (float) m.N); } // Verify input parameters @Override protected void init() { super.init(); // Check parameters assert 0 <= ntrees && ntrees < 1000000; // Sanity check // Should be handled by input //assert response.isEnum() : "Response is not enum"; assert (classification && (response.isInt() || response.isEnum())) || // Classify Int or Enums (!classification && !response.isEnum()) : "Classification="+classification + " and response="+response.isInt(); // Regress Int or Float if (source.numRows() - response.naCnt() <=0) throw new IllegalArgumentException("Dataset contains too many NAs!"); _ncols = _train.length; _nrows = source.numRows() - response.naCnt(); assert (_nrows>0) : "Dataset contains no rows - validation of input parameters is probably broken!"; // Transform response to enum // TODO: moved to shared model job if( !response.isEnum() && classification ) { response = response.toEnum(); gtrash(response); //_gen_enum = true; } _nclass = response.isEnum() ? 
      (char)(response.domain().length) : 1;
    if (classification && _nclass <= 1)
      throw new IllegalArgumentException("Constant response column!");
    if (_nclass > MAX_SUPPORTED_LEVELS)
      throw new IllegalArgumentException("Too many levels in response column!");

    int usableColumns = 0;
    assert _ncols == _train.length : "Number of selected train columns does not correspond to the number of columns!";
    for (int i = 0; i < _ncols; i++) {
      Vec v = _train[i];
      if (v.isBad() || v.isConst()) continue;
      usableColumns++;
    }
    if (usableColumns==0) throw new IllegalArgumentException("There are no usable columns to generate a model!");
    if (checkpoint!=null && DKV.get(checkpoint)==null) throw new IllegalArgumentException("Checkpoint "+checkpoint.toString() + " does not exist!");
  }

  @Override protected Key defaultDestKey() {
    if (checkpoint!=null && overwrite_checkpoint)
      return checkpoint;
    else
      return super.defaultDestKey();
  }

  // --------------------------------------------------------------------------
  // Driver for model-building.
  public void buildModel(long seed) {
    final Key outputKey = dest();
    final Key dataKey = source != null ? source._key : null;
    final Key testKey = validation != null ? validation._key : dataKey;
    // Lock the input datasets against deletes
    source.read_lock(self());
    if( validation != null && !source._key.equals(validation._key) )
      validation.read_lock(self());
    // Prepare a frame for this tree algorithm run
    Frame fr = new Frame(_names, _train);
    fr.add(_responseName,response);
    final Frame frm = new Frame(fr); // Model-Frame; no extra columns
    String names[] = frm.names();
    String domains[][] = frm.domains();
    // For doing classification on Integer (not Enum) columns, we want some
    // handy names in the Model. This really should be in the Model code.
    String[] domain = response.domain();
    if( domain == null && _nclass > 1 ) // No names? Something is wrong, since we converted the response to an enum already!
assert false : "Response domain' names should be always presented in case of classification"; if( domain == null ) domain = new String[] {"r"}; // For regression, give a name to class 0 // Compute class distribution if (classification) { MRUtils.ClassDist cdmt = new MRUtils.ClassDist(_nclass).doAll(response); _distribution = cdmt.dist(); _priorClassDist = cdmt.rel_dist(); } // Handle imbalanced classes by stratified over/under-sampling // initWorkFrame sets the modeled class distribution, and model.score() corrects the probabilities back using the distribution ratios float[] trainSamplingFactors; if (class_sampling_factors != null && !balance_classes) { Log.info("Ignoring class_sampling_factors since balance_classes is not enabled."); } if (classification && balance_classes) { int response_idx = fr.find(_responseName); trainSamplingFactors = new float[domain.length]; //leave initialized to 0 -> will be filled up below if (class_sampling_factors != null) { if (class_sampling_factors.length != fr.vecs()[response_idx].domain().length) throw new IllegalArgumentException("class_sampling_factors must have " + fr.vecs()[response_idx].domain().length + " elements"); trainSamplingFactors = class_sampling_factors.clone(); //clone: don't modify the original } Frame stratified = sampleFrameStratified( fr, fr.lastVec(), trainSamplingFactors, (long)(max_after_balance_size*fr.numRows()), seed, true, false); if (stratified != fr) { fr = stratified; _nrows = fr.numRows(); response = fr.vecs()[response_idx]; // Recompute distribution since the input frame was modified MRUtils.ClassDist cdmt = new MRUtils.ClassDist(_nclass).doAll(response); _distribution = cdmt.dist(); _modelClassDist = cdmt.rel_dist(); gtrash(stratified); } } Log.info(logTag(), "Prior class distribution: " + Arrays.toString(_priorClassDist)); Log.info(logTag(), "Model class distribution: " + Arrays.toString(_modelClassDist)); // Also add to the basic working Frame these sets: // nclass Vecs of current forest results (sum across all trees) // nclass Vecs of working/temp data // nclass Vecs of NIDs, allowing 1 tree per class // Current forest values: results of summing the prior M trees for( int i=0; i<_nclass; i++ ) fr.add("Tree_"+domain[i], response.makeZero()); // Initial work columns. Set-before-use in the algos. for( int i=0; i<_nclass; i++ ) fr.add("Work_"+domain[i], response.makeZero()); // One Tree per class, each tree needs a NIDs. For empty classes use a -1 // NID signifying an empty regression tree. for( int i=0; i<_nclass; i++ ) fr.add("NIDs_"+domain[i], response.makeCon(_distribution==null ? 0 : (_distribution[i]==0?-1:0))); // Timer for model building Timer bm_timer = new Timer(); long before = System.currentTimeMillis(); // Fetch checkpoint assert checkpoint==null || (!(overwrite_checkpoint && checkpoint!=null) || outputKey==checkpoint): "If checkpoint is to be overwritten then outputkey has to equal to checkpoint key"; TM checkpointModel = checkpoint!=null ? (TM) UKV.get(checkpoint) : null; // Create an INITIAL MODEL based on given parameters TM model = makeModel(outputKey, dataKey, testKey, checkpointModel!=null?ntrees+checkpointModel.ntrees():ntrees,names, domains, getCMDomain(), _priorClassDist, _modelClassDist); // Update the model by a checkpoint if (checkpointModel!=null) { checkpointModel.read_lock(self()); // lock it for read to avoid any other job to start working on it try { // Create a new initial model based on given checkpoint // TODO: check compatibility of parameters ! 
model = updateModel(model, checkpointModel, overwrite_checkpoint); _ntreesFromCheckpoint = checkpointModel.ntrees(); } finally { checkpointModel.unlock(self()); } } // Save the model ! (delete_and_lock has side-effect of saving model into DKV) if (checkpoint!=null && overwrite_checkpoint) model.write_lock(self()); // do not delete previous model since it would trigger delete of stored trees which we need else model.delete_and_lock(self()); // we can safely delete any previous model since this one should be the first one // Prepare and cache adapted validation dataset if it is necessary prepareValidationWithModel(model); try { // Initialized algorithm initAlgo(model); // Init working frame initWorkFrame(model, fr); // Compute the model model = buildModel(model, fr, names, domains, bm_timer); //} catch (Throwable t) { t.printStackTrace(); } finally { model.unlock(self()); // Update and unlock model cleanUp(fr,bm_timer); // Shared cleanup model.start_training(before); model.stop_training(); } } // Tree model cleanup protected void cleanUp(Frame fr, Timer t_build) { //super.cleanUp(fr, t_build); Log.info(logTag(),"Modeling done in "+t_build); // Remove temp vectors; cleanup the Frame while( fr.numCols() > _ncols+1/*Do not delete the response vector*/ ) UKV.remove(fr.remove(fr.numCols()-1)._key); // Unlock the input datasets against deletes source.unlock(self()); if( validation != null && !source._key.equals(validation._key) ) validation.unlock(self()); } transient long _timeLastScoreStart, _timeLastScoreEnd, _firstScore; protected TM doScoring(TM model, Frame fTrain, DTree[] ktrees, int tid, DTree.TreeModel.TreeStats tstats, boolean finalScoring, boolean oob, boolean build_tree_one_node ) { long now = System.currentTimeMillis(); if( _firstScore == 0 ) _firstScore=now; long sinceLastScore = now-_timeLastScoreStart; Score sc = null; // If validation is specified we use a model for scoring, so we need to update it! // First we save model with trees (i.e., make them available for scoring) // and then update it with resulting error model = makeModel(model, ktrees, tstats); model.update(self()); // Now model already contains tid-trees in serialized form if( score_each_iteration || finalScoring || (now-_firstScore < 4000) || // Score every time for 4 secs // Throttle scoring to keep the cost sane; limit to a 10% duty cycle & every 4 secs (sinceLastScore > 4000 && // Limit scoring updates to every 4sec (double)(_timeLastScoreEnd-_timeLastScoreStart)/sinceLastScore < 0.1) ) { // 10% duty cycle _timeLastScoreStart = now; // Perform scoring - first get adapted validation response Response2CMAdaptor vadaptor = getValidAdaptor(); sc = new Score().doIt(model, fTrain, vadaptor, oob, build_tree_one_node).report(logTag(),tid,ktrees); _timeLastScoreEnd = System.currentTimeMillis(); } // Compute variable importance for this tree if necessary VarImp varimp = null; if (importance && ktrees!=null) { // compute this tree votes but skip the first scoring call which is done over empty forest Timer vi_timer = new Timer(); varimp = doVarImpCalc(model, ktrees, tid-1, fTrain, false); Log.info(logTag(), "Computation of variable importance with "+tid+"th-tree took: " + vi_timer.toString()); } // Double update - after scoring model = makeModel(model, sc==null ? Double.NaN : sc.mse(), sc==null ? null : (_nclass>1? new ConfusionMatrix(sc._cm):null), varimp, sc==null ? null : (_nclass==2 ? 
makeAUC(toCMArray(sc._cms), ModelUtils.DEFAULT_THRESHOLDS) : null) ); model.update(self()); return model; } protected abstract VarImp doVarImpCalc(TM model, DTree[] ktrees, int tid, Frame validationFrame, boolean scale); private ConfusionMatrix[] toCMArray(long[][][] cms) { int n = cms.length; ConfusionMatrix[] res = new ConfusionMatrix[n]; for (int i = 0; i < n; i++) res[i] = new ConfusionMatrix(cms[i]); return res; } public boolean supportsBagging() { return false; } // -------------------------------------------------------------------------- // Convenvience accessor for a complex chunk layout. // Wish I could name the array elements nicer... protected Chunk chk_resp( Chunk chks[] ) { return chks[_ncols]; } protected Chunk chk_tree( Chunk chks[], int c ) { return chks[_ncols+1+c]; } protected Chunk chk_work( Chunk chks[], int c ) { return chks[_ncols+1+_nclass+c]; } protected Chunk chk_nids( Chunk chks[], int t ) { return chks[_ncols+1+_nclass+_nclass+t]; } // Out-of-bag trees counter - only one since it is shared via k-trees protected Chunk chk_oobt(Chunk chks[]) { return chks[_ncols+1+_nclass+_nclass+_nclass]; } protected final Vec vec_nids( Frame fr, int t) { return fr.vecs()[_ncols+1+_nclass+_nclass+t]; } protected final Vec vec_resp( Frame fr, int t) { return fr.vecs()[_ncols]; } protected final Vec vec_tree( Frame fr, int c ) { return fr.vecs()[_ncols+1+c]; } protected double[] data_row( Chunk chks[], int row, double[] data) { assert data.length == _ncols; for(int f=0; f<_ncols; f++) data[f] = chks[f].at0(row); return data; } // -------------------------------------------------------------------------- // Fuse 2 conceptual passes into one: // // Pass 1: Score a prior partially-built tree model, and make new Node // assignments to every row. This involves pulling out the current // assigned DecidedNode, "scoring" the row against that Node's // decision criteria, and assigning the row to a new child // UndecidedNode (and giving it an improved prediction). // // Pass 2: Build new summary DHistograms on the new child UndecidedNodes // every row got assigned into. Collect counts, mean, variance, min, // max per bin, per column. // // The result is a set of DHistogram arrays; one DHistogram array for // each unique 'leaf' in the tree being histogramed in parallel. These have // node ID's (nids) from 'leaf' to 'tree._len'. Each DHistogram array is // for all the columns in that 'leaf'. // // The other result is a prediction "score" for the whole dataset, based on // the previous passes' DHistograms. public class ScoreBuildHistogram extends MRTask2<ScoreBuildHistogram> { final int _k; // Which tree final DTree _tree; // Read-only, shared (except at the histograms in the Nodes) final int _leaf; // Number of active leaves (per tree) // Histograms for every tree, split & active column final DHistogram _hcs[/*tree-relative node-id*/][/*column*/]; final boolean _subset; // True if working a subset of cols public ScoreBuildHistogram(H2OCountedCompleter cc, int k, DTree tree, int leaf, DHistogram hcs[][], boolean subset) { super(cc); _k = k; _tree= tree; _leaf= leaf; _hcs = hcs; _subset = subset; } // Once-per-node shared init @Override public void setupLocal( ) { // Init all the internal tree fields after shipping over the wire _tree.init_tree(); // Allocate local shared memory histograms for( int l=_leaf; l<_tree._len; l++ ) { DTree.UndecidedNode udn = _tree.undecided(l); DHistogram hs[] = _hcs[l-_leaf]; int sCols[] = udn._scoreCols; if( sCols != null ) { // Sub-selecting just some columns? 
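        // _scoreCols is typically non-null when the concrete builder samples a per-node column
        // subset (e.g. DRF-style column sampling); when every column is scored it stays null
        // and the else-branch below initializes all tracked histograms instead.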
for( int j=0; j<sCols.length; j++) // For tracked cols hs[sCols[j]].init(); } else { // Else all columns for( int j=0; j<_ncols; j++) // For all columns if( hs[j] != null ) // Tracking this column? hs[j].init(); } } } @Override public void map( Chunk[] chks ) { assert chks.length==_ncols+4; final Chunk tree = chks[_ncols+1]; final Chunk wrks = chks[_ncols+2]; final Chunk nids = chks[_ncols+3]; // Pass 1: Score a prior partially-built tree model, and make new Node // assignments to every row. This involves pulling out the current // assigned DecidedNode, "scoring" the row against that Node's decision // criteria, and assigning the row to a new child UndecidedNode (and // giving it an improved prediction). int nnids[] = new int[nids._len]; if( _leaf > 0) // Prior pass exists? score_decide(chks,nids,wrks,tree,nnids); else // Just flag all the NA rows for( int row=0; row<nids._len; row++ ) if( isDecidedRow((int)nids.at0(row)) ) nnids[row] = -1; // Pass 2: accumulate all rows, cols into histograms if( _subset ) accum_subset(chks,nids,wrks,nnids); else accum_all (chks, wrks,nnids); } @Override public void reduce( ScoreBuildHistogram sbh ) { // Merge histograms if( sbh._hcs == _hcs ) return; // Local histograms all shared; free to merge // Distributed histograms need a little work for( int i=0; i<_hcs.length; i++ ) { DHistogram hs1[] = _hcs[i], hs2[] = sbh._hcs[i]; if( hs1 == null ) _hcs[i] = hs2; else if( hs2 != null ) for( int j=0; j<hs1.length; j++ ) if( hs1[j] == null ) hs1[j] = hs2[j]; else if( hs2[j] != null ) hs1[j].add(hs2[j]); } } // Pass 1: Score a prior partially-built tree model, and make new Node // assignments to every row. This involves pulling out the current // assigned DecidedNode, "scoring" the row against that Node's decision // criteria, and assigning the row to a new child UndecidedNode (and // giving it an improved prediction). private void score_decide(Chunk chks[], Chunk nids, Chunk wrks, Chunk tree, int nnids[]) { for( int row=0; row<nids._len; row++ ) { // Over all rows int nid = (int)nids.at80(row); // Get Node to decide from if( isDecidedRow(nid)) { // already done nnids[row] = (nid-_leaf); continue; } // Score row against current decisions & assign new split boolean oob = isOOBRow(nid); if( oob ) nid = oob2Nid(nid); // sampled away - we track the position in the tree DTree.DecidedNode dn = _tree.decided(nid); if (dn._split._col == -1 && DTree.isRootNode(dn)) { nnids[row] = (nid-_leaf); continue; } if( dn._split._col == -1 ) { // Might have a leftover non-split nid = dn._pid; // Use the parent split decision then int xnid = oob ? nid2Oob(nid) : nid; nids.set0(row, xnid); nnids[row] = xnid-_leaf; dn = _tree.decided(nid); // Parent steers us } assert !isDecidedRow(nid); nid = dn.ns(chks,row); // Move down the tree 1 level if( !isDecidedRow(nid) ) { int xnid = oob ? 
nid2Oob(nid) : nid; nids.set0(row, xnid); nnids[row] = xnid-_leaf; } else { nnids[row] = nid-_leaf; } } } // All rows, some cols, accumulate histograms private void accum_subset(Chunk chks[], Chunk nids, Chunk wrks, int nnids[]) { for( int row=0; row<nnids.length; row++ ) { // Over all rows int nid = nnids[row]; // Get Node to decide from if( nid >= 0 ) { // row already predicts perfectly or OOB assert !Double.isNaN(wrks.at0(row)); // Already marked as sampled-away DHistogram nhs[] = _hcs[nid]; int sCols[] = _tree.undecided(nid+_leaf)._scoreCols; // Columns to score (null, or a list of selected cols) for( int j=0; j<sCols.length; j++) { // For tracked cols final int c = sCols[j]; nhs[c].incr((float)chks[c].at0(row),wrks.at0(row)); // Histogram row/col } } } } // All rows, all cols, accumulate histograms. This is the hot hot inner // loop of GBM, so we do some non-standard optimizations. The rows in this // chunk are spread out amongst a modest set of NodeIDs/splits. Normally // we would visit the rows in row-order, but this visits the NIDs in random // order. The hot-part of this code updates the histograms racily (via // atomic updates) - once-per-row. This optimized version updates the // histograms once-per-NID, but requires pre-sorting the rows by NID. private void accum_all(Chunk chks[], Chunk wrks, int nnids[]) { final DHistogram hcs[][] = _hcs; // Sort the rows by NID, so we visit all the same NIDs in a row // Find the count of unique NIDs in this chunk int nh[] = new int[hcs.length+1]; for( int i : nnids ) if( i >= 0 ) nh[i+1]++; // Rollup the histogram of rows-per-NID in this chunk for( int i=0; i<hcs.length; i++ ) nh[i+1] += nh[i]; // Splat the rows into NID-groups int rows[] = new int[nnids.length]; for( int row=0; row<nnids.length; row++ ) if( nnids[row] >= 0 ) rows[nh[nnids[row]]++] = row; // rows[] has Chunk-local ROW-numbers now, in-order, grouped by NID. // nh[] lists the start of each new NID, and is indexed by NID+1. accum_all2(chks,wrks,nh,rows); } // For all columns, for all NIDs, for all ROWS... private void accum_all2(Chunk chks[], Chunk wrks, int nh[], int[] rows) { final DHistogram hcs[][] = _hcs; // Local temp arrays, no atomic updates. int bins[] = new int [nbins]; double sums[] = new double[nbins]; double ssqs[] = new double[nbins]; // For All Columns for( int c=0; c<_ncols; c++) { // for all columns Chunk chk = chks[c]; // For All NIDs for( int n=0; n<hcs.length; n++ ) { final DRealHistogram rh = ((DRealHistogram)hcs[n][c]); if( rh==null ) continue; // Ignore untracked columns in this split final int lo = n==0 ? 0 : nh[n-1]; final int hi = nh[n]; float min = rh._min2; float max = rh._maxIn; // While most of the time we are limited to nbins, we allow more bins // in a few cases (top-level splits have few total bins across all // the (few) splits) so it's safe to bin more; also categoricals want // to split one bin-per-level no matter how many levels). if( rh._bins.length >= bins.length ) { // Grow bins if needed bins = new int [rh._bins.length]; sums = new double[rh._bins.length]; ssqs = new double[rh._bins.length]; } // Gather all the data for this set of rows, for 1 column and 1 split/NID // Gather min/max, sums and sum-squares. 
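        // rh.bin() in the loop below places a value by plain linear interpolation over the
        // histogram's current range - roughly (a sketch, not the exact DHistogram code):
        //   bin = (int)((col_data - range_min) * rh._bins.length / (range_max - range_min)), clamped to a valid index
        // The per-chunk min/max gathered here are folded back via setMin()/setMax(), which is
        // what lets the next level's histograms use a tighter, data-driven range (the
        // "dynamic histogram" trick mentioned in the class javadoc).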
for( int xrow=lo; xrow<hi; xrow++ ) { int row = rows[xrow]; float col_data = (float)chk.at0(row); if( col_data < min ) min = col_data; if( col_data > max ) max = col_data; int b = rh.bin(col_data); // Compute bin# via linear interpolation bins[b]++; // Bump count in bin double resp = wrks.at0(row); sums[b] += resp; ssqs[b] += resp*resp; } // Add all the data into the Histogram (atomically add) rh.setMin(min); // Track actual lower/upper bound per-bin rh.setMax(max); for( int b=0; b<rh._bins.length; b++ ) { // Bump counts in bins if( bins[b] != 0 ) { Utils.AtomicIntArray.add(rh._bins,b,bins[b]); bins[b]=0; } if( ssqs[b] != 0 ) { rh.incr1(b,sums[b],ssqs[b]); sums[b]=ssqs[b]=0; } } } } } } // -------------------------------------------------------------------------- // Build an entire layer of all K trees protected DHistogram[][][] buildLayer(final Frame fr, final DTree ktrees[], final int leafs[], final DHistogram hcs[][][], boolean subset, boolean build_tree_one_node) { // Build K trees, one per class. // Build up the next-generation tree splits from the current histograms. // Nearly all leaves will split one more level. This loop nest is // O( #active_splits * #bins * #ncols ) // but is NOT over all the data. H2OCountedCompleter sb1ts[] = new H2OCountedCompleter[_nclass]; Vec vecs[] = fr.vecs(); for( int k=0; k<_nclass; k++ ) { final DTree tree = ktrees[k]; // Tree for class K if( tree == null ) continue; // Build a frame with just a single tree (& work & nid) columns, so the // nested MRTask2 ScoreBuildHistogram in ScoreBuildOneTree does not try // to close other tree's Vecs when run in parallel. Frame fr2 = new Frame(Arrays.copyOf(fr._names,_ncols+1), Arrays.copyOf(vecs,_ncols+1)); fr2.add(fr._names[_ncols+1+k],vecs[_ncols+1+k]); fr2.add(fr._names[_ncols+1+_nclass+k],vecs[_ncols+1+_nclass+k]); fr2.add(fr._names[_ncols+1+_nclass+_nclass+k],vecs[_ncols+1+_nclass+_nclass+k]); // Start building one of the K trees in parallel H2O.submitTask(sb1ts[k] = new ScoreBuildOneTree(k,tree,leafs,hcs,fr2, subset, build_tree_one_node)); } // Block for all K trees to complete. boolean did_split=false; for( int k=0; k<_nclass; k++ ) { final DTree tree = ktrees[k]; // Tree for class K if( tree == null ) continue; sb1ts[k].join(); if( ((ScoreBuildOneTree)sb1ts[k])._did_split ) did_split=true; } // The layer is done. return did_split ? hcs : null; } private class ScoreBuildOneTree extends H2OCountedCompleter { final int _k; // The tree final DTree _tree; final int _leafs[/*nclass*/]; final DHistogram _hcs[/*nclass*/][][]; final Frame _fr2; final boolean _build_tree_one_node; final boolean _subset; // True if working a subset of cols boolean _did_split; ScoreBuildOneTree( int k, DTree tree, int leafs[], DHistogram hcs[][][], Frame fr2, boolean subset, boolean build_tree_one_node ) { _k = k; _tree = tree; _leafs= leafs; _hcs = hcs; _fr2 = fr2; _subset = subset; _build_tree_one_node = build_tree_one_node; } @Override public void compute2() { // Fuse 2 conceptual passes into one: // Pass 1: Score a prior DHistogram, and make new Node assignments // to every row. This involves pulling out the current assigned Node, // "scoring" the row against that Node's decision criteria, and assigning // the row to a new child Node (and giving it an improved prediction). // Pass 2: Build new summary DHistograms on the new child Nodes every row // got assigned into. Collect counts, mean, variance, min, max per bin, // per column. 
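      // This task was handed 'this' as its CountedCompleter parent (see the ScoreBuildHistogram
      // constructor), so dfork() below runs asynchronously over _fr2 and onCompletion() is
      // invoked once the score+histogram pass has finished on every node.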
new ScoreBuildHistogram(this,_k,_tree,_leafs[_k],_hcs[_k],_subset).dfork(0,_fr2,_build_tree_one_node); } @Override public void onCompletion(CountedCompleter caller) { ScoreBuildHistogram sbh = (ScoreBuildHistogram)caller; //System.out.println(sbh.profString()); final int leafk = _leafs[_k]; int tmax = _tree.len(); // Number of total splits in tree K for( int leaf=leafk; leaf<tmax; leaf++ ) { // Visit all the new splits (leaves) DTree.UndecidedNode udn = _tree.undecided(leaf); //System.out.println((_nclass==1?"Regression":("Class "+_fr2.vecs()[_ncols]._domain[_k]))+",\n Undecided node:"+udn); // Replace the Undecided with the Split decision DTree.DecidedNode dn = makeDecided(udn,sbh._hcs[leaf-leafk]); //System.out.println("--> Decided node: " + dn + // " > Split: " + dn._split + " L/R:" + dn._split.rowsLeft()+" + "+dn._split.rowsRight()); if( dn._split.col() == -1 ) udn.do_not_split(); else _did_split = true; } _leafs[_k]=tmax; // Setup leafs for next tree level int new_leafs = _tree.len()-tmax; _hcs[_k] = new DHistogram[new_leafs][/*ncol*/]; for( int nl = tmax; nl<_tree.len(); nl ++ ) _hcs[_k][nl-tmax] = _tree.undecided(nl)._hs; if (new_leafs>0) _tree.depth++; // Next layer done but update tree depth only if new leaves are generated } } // Builder-specific decision node protected abstract DTree.DecidedNode makeDecided( DTree.UndecidedNode udn, DHistogram hs[] ); // -------------------------------------------------------------------------- // Read the 'tree' columns, do model-specific math and put the results in the // fs[] array, and return the sum. Dividing any fs[] element by the sum // turns the results into a probability distribution. protected abstract float score1( Chunk chks[], float fs[/*nclass*/], int row ); // Call builder specific score code and then correct probabilities // if it is necessary. private float score2(Chunk chks[], float fs[/*nclass*/], int row ) { float sum = score1(chks, fs, row); if (/*false &&*/ classification && _priorClassDist!=null && _modelClassDist!=null && !Float.isInfinite(sum) && sum>0f) { Utils.div(fs, sum); ModelUtils.correctProbabilities(fs, _priorClassDist, _modelClassDist); sum = 1.0f; } return sum; } // Score the *tree* columns, and produce a confusion matrix public class Score extends MRTask2<Score> { /* @OUT */ long _cm[/*actual*/][/*predicted*/]; // Confusion matrix /* @OUT */ double _sum; // Sum-squared-error /* @OUT */ long _snrows; // Count of voted-on rows /* @OUT */ long _cms[/*threshold*/][/*actual*/][/*predicted*/]; // Compute CM per threshold for binary classifiers /* @IN */ boolean _oob; /* @IN */ boolean _validation; /* @IN */ int _cmlen; /* @IN */ boolean _cavr; // true if validation response needs to be adapted to CM domain public double sum() { return _sum; } public long[][] cm () { return _cm; } public long nrows() { return _snrows; } public double mse() { return sum() / nrows(); } /** * Compute CM and MSE on either the training or testing dataset. * * It expect already adapted validation dataset which is adapted to a model * and contains a response which is adapted to confusion matrix domain. Uff :) * * @param model a model which is used to perform computation * @param fr a model training frame * @param vadaptor an adaptor which helps to adapt model/validation response to confusion matrix domain. 
* @param oob perform out-of-bag validation on training frame * @param build_tree_one_node * @return this score object */ public Score doIt(Model model, Frame fr, Response2CMAdaptor vadaptor, boolean oob, boolean build_tree_one_node) { assert !oob || vadaptor.getValidation()==null : "Validation frame cannot be specified if oob validation is demanded!"; // oob => validation==null assert _nclass == 1 || vadaptor.getCMDomain() != null : "CM domain has to be configured from classification!"; _cmlen = _nclass > 1 ? vadaptor.getCMDomain().length : 1; _oob = oob; // Validation frame adapted to a model Frame adaptedValidation = vadaptor.getValidation(); // No validation frame is specified, so perform computation on training data if( adaptedValidation == null ) return doAll(fr, build_tree_one_node); _validation = true; _cavr = false; // Validation: need to score the set, getting a probability distribution for each class // Frame has nclass vectors (nclass, or 1 for regression), for classification it Frame res = model.score(adaptedValidation, false); // For classification: predicted values (~ values in res[0]) are in interval 0..domain().length-1, for regression just single column. Frame adapValidation = new Frame(adaptedValidation); // adapted validation dataset // All columns including response of validation frame are already adapted to model if (_nclass>1) { // Only for Classification for( int i=0; i<_nclass; i++ ) // Distribution of response classes adapValidation.add("ClassDist"+i,res.vecs()[i+1]); if (vadaptor.needsAdaptation2CM()) { Vec ar = vadaptor.adaptModelResponse2CM(res.vecs()[0]); // perform transformation of model results to be consistent with expected confusion matrix domain adapValidation.add("Prediction", ar); // add as a prediction adapValidation.add("ActualValidationResponse", vadaptor.getAdaptedValidationResponse2CM()); _cavr = true; // signal that we have two predictions vectors in the frame. res.add("__dummyx__", ar); // add the vector to clean up list } else adapValidation.add("Prediction",res.vecs()[0]); // Predicted values } else { // Regression adapValidation.add("Prediction",res.vecs()[0]); } // Compute a CM & MSE try { doAll(adapValidation, build_tree_one_node); } finally { // Perform clean-up: remove temporary result res.delete(); } return this; } @Override public void map( Chunk chks[] ) { Chunk ys = chk_resp(chks); // Response Chunk ays = _cavr ? chks[_ncols+1+_nclass+1] : ys; // Remember adapted response _cm = new long[_cmlen][_cmlen]; float fs[] = new float[_nclass+1]; // Array to hold prediction and distribution given by the model. // For binary classifier allocate cms for individual thresholds _cms = new long[ModelUtils.DEFAULT_THRESHOLDS.length][2][2]; // Score all Rows for( int row=0; row<ys._len; row++ ) { if( ays.isNA0(row) ) continue; // Ignore missing response vars only if it was actual NA float sum; if( _validation ) { // Passed in a class distribution from scoring for( int i=0; i<_nclass; i++ ) fs[i+1] = (float)chk_tree(chks,i).at0(row); // Get the class distros if (_nclass > 1 ) sum = 1.0f; // Sum of a distribution is 1.0 for classification else sum = fs[1]; // Sum is the same as prediction for regression. 
} else { // Passed in the model-specific columns sum = score2(chks,fs,row); } float err; int yact=0; // actual response from dataset int yact_orig = 0; // actual response from dataset before potential scaling if (_oob && inBagRow(chks, row)) continue; // score only on out-of-bag rows if( _nclass > 1 ) { // Classification // Compute error if( sum == 0 ) { // This tree does not predict this row *at all* ! In prediction we will make random decision, but here compute error based on number of classes yact = yact_orig = (int) ys.at80(row); // OPS: Pick an actual prediction adapted to model values <0, nclass-1) err = 1.0f-1.0f/_nclass; // Then take ycls=0, uniform predictive power } else { if (_cavr && ys.isNA0(row)) { // Handle adapted validation response - actual response was adapted but does not contain NA - it is implicit misprediction, err = 1f; } else { // No adaptation of validation response yact = yact_orig = (int) ys.at80(row);// OPS: Pick an actual prediction adapted to model values <0, nclass-1) assert 0 <= yact && yact < _nclass : "weird ycls="+yact+", y="+ys.at0(row); err = Float.isInfinite(sum) ? (Float.isInfinite(fs[yact+1]) ? 0f : 1f) : 1.0f-fs[yact+1]/sum; // Error: distance from predicting ycls as 1.0 } } assert !Double.isNaN(err) : "fs[cls]="+fs[yact+1] + ", sum=" + sum; // Overwrite response by adapted value to provide correct CM if (_cavr) yact = (int) ays.at80(row); } else { // Regression err = (float)ys.at0(row) - sum; } _sum += err*err; // Squared error assert !Double.isNaN(_sum); // Pick highest prob for our prediction. if (_nclass > 1) { // fill CM only for classification if(_nclass == 2) { // Binomial classification -> compute AUC, draw ROC float snd = _validation ? fs[2] : (!Float.isInfinite(sum) ? fs[2] / sum : Float.isInfinite(fs[2]) ? 1 : 0); // for validation dataset sum is always 1 for(int i = 0; i < ModelUtils.DEFAULT_THRESHOLDS.length; i++) { int p = snd >= ModelUtils.DEFAULT_THRESHOLDS[i] ? 1 : 0; // Compute prediction based on threshold _cms[i][yact_orig][p]++; // Increase matrix } } int ypred = _validation ? (int) chks[_ncols+1+_nclass].at80(row) : getPrediction(fs, row); _cm[yact][ypred]++; // actual v. predicted } _snrows++; } } @Override public void reduce( Score t ) { _sum += t._sum; Utils.add(_cm,t._cm); _snrows += t._snrows; if (_cms!=null) for (int i = 0; i < _cms.length; i++) Utils.add(_cms[i], t._cms[i]); } public Score report( Sys tag, int ntree, DTree[] trees ) { assert !Double.isNaN(_sum); Log.info(tag,"============================================================== "); int lcnt=0; if( trees!=null ) for( DTree t : trees ) if( t != null ) lcnt += t._len; long err=_snrows; for( int c=0; c<_nclass; c++ ) err -= _cm[c][c]; Log.info(tag,"Mean Squared Error is "+(_sum/_snrows)+", with "+ntree+"x"+_nclass+" trees (average of "+((float)lcnt/_nclass)+" nodes)"); if( _nclass > 1 ) Log.info(tag,"Total of "+err+" errors on "+_snrows+" rows, CM= "+Arrays.deepToString(_cm)); else Log.info("Reported on "+_snrows+" rows."); return this; } } @Override public String speedDescription() { return "time/tree"; } @Override public long speedValue() { Value value = DKV.get(dest()); DTree.TreeModel m = value != null ? (DTree.TreeModel) value.get() : null; long numTreesBuiltSoFar = m == null ? 0 : m.ntrees(); long sv = (numTreesBuiltSoFar <= 0) ? 
      0 : (runTimeMs() / numTreesBuiltSoFar);
    return sv;
  }

  /** Returns a log tag for a particular model builder (e.g., DRF, GBM) */
  protected abstract water.util.Log.Tag.Sys logTag();

  /**
   * Builds the model.
   * @param initialModel initial model created by the makeModel() method.
   * @param trainFr training dataset which can contain additional temporary vectors prepared by the buildModel() method.
   * @param names names of columns in <code>trainFr</code> used for model training
   * @param domains domains of columns in <code>trainFr</code> used for model training
   * @param t_build timer to measure the model-building process
   * @return resulting model
   */
  protected abstract TM buildModel( TM initialModel, Frame trainFr, String names[], String domains[][], Timer t_build );

  /**
   * Initialize the algorithm - e.g., allocate algorithm-specific data structures.
   *
   * @param initialModel
   */
  protected abstract void initAlgo( TM initialModel);

  /**
   * Initialize the working frame.
   * @param initialModel initial model
   * @param fr working frame which contains train data and additional columns prepared by this builder.
   */
  protected abstract void initWorkFrame( TM initialModel, Frame fr);

  protected abstract TM makeModel( Key outputKey, Key dataKey, Key testKey, int ntrees, String names[], String domains[][], String[] cmDomain, float[] priorClassDist, float[] classDist);
  protected abstract TM makeModel( TM model, double err, ConfusionMatrix cm, VarImp varimp, AUCData validAUC);
  protected abstract TM makeModel( TM model, DTree ktrees[], DTree.TreeModel.TreeStats tstats);
  protected abstract TM updateModel( TM model, TM checkpoint, boolean overwriteCheckpoint);

  protected AUCData makeAUC(ConfusionMatrix[] cms, float[] threshold) {
    assert _nclass == 2;
    return cms != null ? new AUC(cms, threshold, _cmDomain).data() : null;
  }

  protected boolean inBagRow(Chunk[] chks, int row) { return false; }
  protected final boolean isClassification() { return _nclass > 1; }

  static public final boolean isOOBRow(int nid) { return nid <= OUT_OF_BAG; }
  static public final boolean isDecidedRow(int nid) { return nid == DECIDED_ROW; }
  static public final int oob2Nid(int oobNid) { return -oobNid + OUT_OF_BAG; }
  static public final int nid2Oob(int nid) { return -nid + OUT_OF_BAG; }

  // Helper to unify use of M-T RNG
  public static Random createRNG(long seed) {
    return new MersenneTwisterRNG(new int[] { (int)(seed>>32L),(int)seed });
  }

  static int counter = 0; // helper for debugging
  static protected void printGenerateTrees(DTree[] trees) {
    for( int k=0; k<trees.length; k++ )
      if( trees[k] != null ) {
        try {
          PrintWriter writer = new PrintWriter("/tmp/h2o.tree" + ++counter + ".txt", "UTF-8");
          writer.println(trees[k].root().toString2(new StringBuilder(), 0));
          writer.close();
        } catch (FileNotFoundException e) {
          e.printStackTrace();
        } catch (UnsupportedEncodingException e) {
          e.printStackTrace();
        }
        System.out.println(trees[k].root().toString2(new StringBuilder(), 0));
      }
  }

  protected final void debugPrintTreeColumns(Frame fr) {
    new MRTask2() {
      @Override public void map(Chunk[] cs) {
        for (int r=0; r<cs[0]._len; r++) {
          System.err.print("Row "+ r +": ");
          for (int i=0; i<_nclass; i++) {
            Chunk c = chk_tree(cs, i);
            System.err.print(c.at0(r));
            System.err.print(',');
          }
          if (supportsBagging()) {
            Chunk c = chk_oobt(cs);
            System.err.print(c.at80(r)>0 ? ":OUT" : ":IN");
          }
          System.err.println();
        }
      }
    }.doAll(fr);
  }
}
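// --------------------------------------------------------------------------
// Working-frame column layout assumed by the chk_*/vec_* accessors above (documentation sketch,
// derived directly from the accessor arithmetic; the builder itself is unchanged):
//
//   chks[0              .. _ncols-1]             predictor columns
//   chks[_ncols]                                 response                    (chk_resp)
//   chks[_ncols+1           .. _ncols+_nclass]   "Tree_<class>" forest sums  (chk_tree)
//   chks[_ncols+1+_nclass   .. _ncols+2*_nclass] "Work_<class>" work columns (chk_work)
//   chks[_ncols+1+2*_nclass .. _ncols+3*_nclass] "NIDs_<class>" node ids     (chk_nids)
//   chks[_ncols+1+3*_nclass]                     out-of-bag counter          (chk_oobt, only if supportsBagging())
//
// e.g. with 10 predictors and 3 classes, chk_work(chks, 2) is chks[10+1+3+2] == chks[16].

/**
 * Tiny, self-contained usage sketch (not used by the builder): shows how the NIDs row markers
 * round-trip through the nid2Oob/oob2Nid helpers. Illustrative only; safe to delete.
 */
class NidMarkerExample {
  public static void main(String[] args) {
    int nid = 5;                                          // a node id inside some tree
    int marker = SharedTreeModelBuilder.nid2Oob(nid);     // -7: the row was sampled out of the bag
    assert SharedTreeModelBuilder.isOOBRow(marker);       // <= OUT_OF_BAG flags "sampled away"
    assert SharedTreeModelBuilder.oob2Nid(marker) == nid; // the tree position is still recoverable
    System.out.println("nid=" + nid + " <-> oob marker=" + marker);
  }
}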