// ModelBuilder.java example
//
// Explorer
// h2o-3-master
package hex;

import hex.genmodel.utils.DistributionFamily;
import water.*;
import water.exceptions.H2OIllegalArgumentException;
import water.exceptions.H2OModelBuilderIllegalArgumentException;
import water.fvec.*;
import water.rapids.ast.prims.advmath.AstKFold;
import water.util.*;

import java.lang.reflect.Method;
import java.util.*;

/**
 *  Model builder parent class.  Contains the common interfaces and fields across all model builders.
 */
abstract public class ModelBuilder<M extends Model<M,P,O>, P extends Model.Parameters, O extends Model.Output> extends Iced {

  /**
   * Supplies the {@code ToEigenVec} used by this builder.
   * @return {@code null} in this base implementation
   */
  public ToEigenVec getToEigenVec() {
    return null;
  }
  /**
   * @param v vector under consideration (not inspected by this implementation)
   * @return true when the configured categorical encoding needs the response
   *         and this builder is supervised
   */
  public boolean shouldReorder(Vec v) {
    final boolean encodingNeedsResponse = _parms._categorical_encoding.needsResponse();
    return encodingNeedsResponse && isSupervised();
  }

  // Key -> String map handed to FrameUtils.cleanUp() by cleanUp() below.
  // NOTE(review): presumably the temporary keys created during a build - confirm against FrameUtils.
  transient private IcedHashMap<Key,String> _toDelete = new IcedHashMap<>();
  /** Passes the {@code _toDelete} map to {@code FrameUtils.cleanUp}. */
  void cleanUp() { FrameUtils.cleanUp(_toDelete); }

  // May be a Job shared with other builders (see the shared-Job constructor),
  // and is null for the one-time start-up instances.
  public Job<M> _job;     // Job controlling this build
  /** Blocks until the controlling Job finishes, then returns the built model.
   *  The assert documents the precondition: a Job is not required to control
   *  only this build (it may, e.g., drive a Grid search of which this model is
   *  one result), but calling 'get' only makes sense when the Job's result is
   *  exactly this builder's result key.
   *  @return the built model, as returned by the Job */
  public final M get() {
    assert _job._result == _result;
    return _job.get();
  }
  /** @return whether the controlling Job reports itself as stopped */
  public final boolean isStopped() {
    return _job.isStopped();
  }

  // Key of the model being built; note that this is DIFFERENT from
  // _job._result if the Job is being shared by many sub-models
  // e.g. cross-validation.  Null for the one-time start-up instances.
  protected Key<M> _result;  // Built Model key
  /** @return the key under which the built model is stored (may be null) */
  public final Key<M> dest() { return _result; }

  // Set from System.currentTimeMillis() right before each model build is launched.
  private long _start_time; //start time in msecs - only used for time-based stopping
  /**
   * Time-based stopping check.
   * @return true when a positive {@code _max_runtime_secs} is configured and more
   *         than that many seconds have passed since {@code _start_time}
   */
  protected boolean timeout() {
    assert(_start_time > 0) : "Must set _start_time for each individual model.";
    // Written as a negated '>' so a NaN limit is treated exactly like "no limit".
    if (!(_parms._max_runtime_secs > 0)) return false;
    final long elapsedMillis = System.currentTimeMillis() - _start_time;
    final long budgetMillis = (long) (_parms._max_runtime_secs * 1e3);
    return elapsedMillis > budgetMillis;
  }
  /** @return true if the Job asked for a stop, or this build ran out of time */
  protected boolean stop_requested() {
    if (_job.stop_requested()) return true;
    return timeout();
  }

  /** Default model-builder key.
   *  @param algoName algorithm name the unique model id is derived from
   *  @return a key made from the next unique model id for {@code algoName} */
  public static <S extends Model> Key<S> defaultKey(String algoName) {
    final String uniqueModelId = H2O.calcNextUniqueModelId(algoName);
    return Key.make(uniqueModelId);
  }

  /** Default easy constructor: Unique new job and unique new result key.
   *  The result key is generated by {@link #defaultKey} from the parameters'
   *  algo name.
   *  @param parms model parameters */
  protected ModelBuilder(P parms) {
    this(parms, ModelBuilder.<M>defaultKey(parms.algoName()));
  }

  /** Unique new job and named result key.
   *  @param parms model parameters
   *  @param key   key the built model will be stored under; also the new Job's result key */
  protected ModelBuilder(P parms, Key<M> key) {
    _result = key;
    _job = new Job<>(key, parms.javaName(), parms.algoName());
    _parms = parms;
  }

  /** Shared pre-existing Job and unique new result key.
   *  @param parms model parameters
   *  @param job   existing Job that will control this build */
  protected ModelBuilder(P parms, Job<M> job) {
    _parms = parms;
    _result = defaultKey(parms.algoName());
    _job = job;
  }

  /** List of known ModelBuilders with all default args; endlessly cloned by
   *  the GUI for new private instances, then the GUI overrides some of the
   *  defaults with user args.
   *  The three arrays below are parallel: index i of each describes the same
   *  registered builder, and the start-up constructor grows them together. */
  private static String[] ALGOBASES = new String[0]; // lower-cased simple class name of each registered builder
  /** @return the registered algo base names (the internal array itself, not a copy) */
  public static String[] algos() { return ALGOBASES; }
  private static String[] SCHEMAS = new String[0]; // schema directory passed at registration
  private static ModelBuilder[] BUILDERS = new ModelBuilder[0]; // the registered start-up instances

  /** One-time start-up only ModelBuilder, endlessly cloned by the GUI for the
   *  default settings.  Delegates to the three-argument start-up constructor
   *  with the schema directory "hex.schemas.". */
  protected ModelBuilder(P parms, boolean startup_once) { this(parms,startup_once,"hex.schemas."); }
  /** One-time start-up constructor that registers this instance as the
   *  prototype for its algorithm.  The instance has no Job and no result key.
   *  @param parms default parameters for the algorithm
   *  @param startup_once must be true (asserted)
   *  @param externalSchemaDirectory schema directory recorded for this algo
   *  Fails via {@code H2O.fail} if a builder with the same lower-cased simple
   *  class name was already registered. */
  protected ModelBuilder(P parms, boolean startup_once, String externalSchemaDirectory ) {
    assert startup_once;
    _job = null;
    _result = null;
    _parms = parms;
    init(false); // Default cheap init
    // Registration name is the lower-cased simple class name, e.g. GBM -> gbm
    String base = getClass().getSimpleName().toLowerCase();
    if( ArrayUtils.find(ALGOBASES,base) != -1 )
      throw H2O.fail("Only called once at startup per ModelBuilder, and "+base+" has already been called");
    // FIXME: this is not thread safe!
    // Grow the three parallel arrays by one slot each, then fill the new last slot.
    ALGOBASES = Arrays.copyOf(ALGOBASES,ALGOBASES.length+1);
    BUILDERS  = Arrays.copyOf(BUILDERS ,BUILDERS .length+1);
    SCHEMAS   = Arrays.copyOf(SCHEMAS  ,SCHEMAS  .length+1);
    ALGOBASES[ALGOBASES.length-1] = base;
    BUILDERS [BUILDERS .length-1] = this;
    SCHEMAS  [SCHEMAS  .length-1] = externalSchemaDirectory;
  }

  /** gbm -> GBM, deeplearning -> DeepLearning.
   *  @param urlName registered algo base name
   *  @return the algo name reported by the registered builder's parameters */
  public static String algoName(String urlName) {
    final int idx = ArrayUtils.find(ALGOBASES, urlName);
    return BUILDERS[idx]._parms.algoName();
  }
  /** gbm -> hex.tree.gbm.GBM, deeplearning -> hex.deeplearning.DeepLearning.
   *  @param urlName registered algo base name
   *  @return the java name reported by the registered builder's parameters */
  public static String javaName(String urlName) {
    final int idx = ArrayUtils.find(ALGOBASES, urlName);
    return BUILDERS[idx]._parms.javaName();
  }
  /** gbm -> GBMParameters.
   *  @param urlName registered algo base name
   *  @return the algo name with "Parameters" appended */
  public static String paramName(String urlName) {
    final String algo = algoName(urlName);
    return algo + "Parameters";
  }
  /** gbm -> "hex.schemas." ; custAlgo -> "org.myOrg.schemas."
   *  @param urlName registered algo base name
   *  @return the schema directory recorded when the builder was registered */
  public static String schemaDirectory(String urlName) {
    final int idx = ArrayUtils.find(ALGOBASES, urlName);
    return SCHEMAS[idx];
  }


  /** Factory method to create a ModelBuilder instance for the given algo name.
   *  Shallow clone of both the default ModelBuilder instance and a Parameter.
   *  @param algo   algorithm name; matched case-insensitively against the registered base names
   *  @param job    Job that will control the new builder
   *  @param result key the built model will be stored under
   *  @return a fresh builder with its own cloned parameters
   *  @throws H2OIllegalArgumentException if no builder is registered for {@code algo} */
  public static <B extends ModelBuilder> B make(String algo, Job job, Key<Model> result) {
    int idx = ArrayUtils.find(ALGOBASES,algo.toLowerCase());
    // Explicit check instead of an assert: with assertions disabled an unknown
    // algo would otherwise fail as an uninformative ArrayIndexOutOfBoundsException(-1).
    if( idx == -1 )
      throw new H2OIllegalArgumentException("Unregistered algorithm "+algo);
    B mb = (B)BUILDERS[idx].clone();
    mb._job = job;
    mb._result = result;
    mb._parms = BUILDERS[idx]._parms.clone(); // private parameter copy; the prototype's defaults stay untouched
    return mb;
  }


  /** All the parameters required to build the model.  Replaced with a private
   *  clone on builders produced by {@code make} and on cross-validation sub-builders. */
  public P _parms;              // Not final, so CV can set-after-clone


  /** Training frame: derived from the parameter's training frame, excluding
   *  all ignored columns, all constant and bad columns, perhaps flipping the
   *  response column to a Categorical, etc.
   *  @return the working training frame (can be replaced via {@code setTrain}) */
  public final Frame train() { return _train; }
  protected transient Frame _train; // working training frame; transient

  /** Replaces the working training frame.
   *  @param train frame to train on from now on */
  public void setTrain(Frame train) {
    this._train = train;
  }
  /** Validation frame: derived from the parameter's validation frame, excluding
   *  all ignored columns, all constant and bad columns, perhaps flipping the
   *  response column to a Categorical, etc.  Is null if no validation key is set.
   *  @return the working validation frame, or null */
  protected final Frame valid() { return _valid; }
  protected transient Frame _valid; // working validation frame; transient

  // TODO: tighten up the type
  // NOTE(review): none of the four maps below is referenced in the visible part of
  // this file; they are plain HashMaps, so any concurrent use needs external
  // synchronization - confirm against the code that populates them.
  // Map the algo name (e.g., "deeplearning") to the builder class (e.g., DeepLearning.class) :
  private static final Map<String, Class<? extends ModelBuilder>> _builders = new HashMap<>();

  // Map the Model class (e.g., DeepLearningModel.class) to the algo name (e.g., "deeplearning"):
  private static final Map<Class<? extends Model>, String> _model_class_to_algo = new HashMap<>();

  // Map the simple algo name (e.g., deeplearning) to the full algo name (e.g., "Deep Learning"):
  private static final Map<String, String> _algo_to_algo_full_name = new HashMap<>();

  // Map the algo name (e.g., "deeplearning") to the Model class (e.g., DeepLearningModel.class):
  private static final Map<String, Class<? extends Model>> _algo_to_model_class = new HashMap<>();

  /** Train response vector.
   *  @return the training response column, or null if none was set */
  public Vec response() {
    return _response;
  }
  /** Validation response vector.
   *  @return the validation response column, falling back to the training
   *          response when no validation response is set */
  public Vec vresponse() {
    if (_vresponse != null) return _vresponse;
    return _response;
  }

  /** Base class for the task that performs a model build.  Subclasses put the
   *  algorithm in {@link #computeImpl()}; {@link #compute2()} wraps it with the
   *  common set-up and tear-down. */
  abstract protected class Driver extends H2O.H2OCountedCompleter<Driver> {
    protected Driver(){ super(); }
    protected Driver(H2O.H2OCountedCompleter completer){ super(completer); }
    // Pull the boilerplate out of the computeImpl(), so the algo writer doesn't need to worry about the following:
    // 1) Scope (unless they want to keep data, then they must call Scope.untrack(Key<Vec>[]))
    // 2) Train/Valid frame locking and unlocking
    // 3) calling tryComplete()
    public void compute2() {
      try {
        Scope.enter();
        _parms.read_lock_frames(_job); // Fetch & read-lock input frames
        computeImpl();
      } finally {
        // Tear-down runs whether or not computeImpl() threw.
        // NOTE(review): if one of these calls throws, the remaining ones are skipped.
        setFinalState();
        _parms.read_unlock_frames(_job);
        if (!_parms._is_cv_model) cleanUp(); //cv calls cleanUp on its own terms
        Scope.exit();
      }
      // Only reached when the try block completed without an exception.
      tryComplete();
    }
    /** Algorithm-specific build; called by {@link #compute2()} after the input frames are read-locked. */
    public abstract void computeImpl();
  }

  /** If the result key resolves to a model with an output, records the
   *  controlling Job on that output and stops its clock.  Does nothing when
   *  there is no result key, no model, or no output. */
  private void setFinalState() {
    final Key<M> modelKey = dest();
    if (modelKey == null) return;
    final M model = modelKey.get();
    if (model == null || model._output == null) return;
    model._output._job = _job;
    model._output.stopClock();
  }

  /** Launches training of a Model, based on its parameters.
   *  @return the Job that was started
   *  @throws H2OModelBuilderIllegalArgumentException if validation errors were recorded */
  final public Job<M> trainModel() {
    if (error_count() > 0)
      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(this);
    _start_time = System.currentTimeMillis();

    if( nFoldCV() ) {
      // cross-validation needs to be forked off to allow continuous (non-blocking) progress bar
      final H2O.H2OCountedCompleter cvTask = new H2O.H2OCountedCompleter() {
        @Override
        public void compute2() {
          computeCrossValidation();
          tryComplete();
        }
      };
      // Work units: one share per fold model plus one for the main model
      return _job.start(cvTask, (nFoldWork()+1/*main model*/) * _parms.progressUnits(), _parms._max_runtime_secs);
    }

    // Plain (non cross-validated) build
    return _job.start(trainModelImpl(), _parms.progressUnits(), _parms._max_runtime_secs);
  }

  /**
   * Trains a model synchronously as part of a larger Job.
   *
   * @param fr Input frame override, ignored if null.
   *   Some algos do not work directly on the original frame in the K/V store
   *   but on a private anonymous copy (e.g. a rebalanced dataset).  Pass that
   *   working copy here so the nested build uses it instead of the original
   *   Frame, e.g. to avoid a needless second rebalance when the outer job has
   *   already rebalanced the data.
   * @return the model found under this builder's result key once the build has run
   * @throws H2OModelBuilderIllegalArgumentException if validation errors were recorded
   */
  final public M trainModelNested(Frame fr) {
    // Use the working copy (e.g. rebalanced) instead of the original K/V store version
    if( fr != null ) setTrain(fr);
    if (error_count() > 0)
      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(this);
    _start_time = System.currentTimeMillis();
    if( nFoldCV() ) {
      computeCrossValidation();
    } else {
      final Driver driver = trainModelImpl();
      driver.compute2(); // run in the calling thread
    }
    return _result.get();
  }

  /** Model-specific implementation of model training.  Callers in this class
   *  either submit the returned Driver as a task or invoke its
   *  {@code compute2()} directly.
   * @return A F/J Job, which, when executed, does the build.  F/J is NOT started.  */
  abstract protected Driver trainModelImpl();

  /**
   * Number of cross-validation models to train concurrently.
   * Returns the fold count only when parallel CV is enabled, no runtime limit is
   * set and the training frame is small; otherwise models are trained one at a time.
   * Each model can override this logic, based on parameters, dataset size, etc.
   * @return How many models to train in parallel during cross-validation
   */
  protected int nModelsInParallel() {
    //user demands serial building (or we need to honor the time constraints for all CV models equally)
    final boolean serialOnly = !_parms._parallelize_cross_validation || _parms._max_runtime_secs != 0;
    if (serialOnly) return 1;
    //for small data, parallelize over CV models; otherwise fall back to the safe serial choice
    final boolean smallData = _train.byteSize() < 1e6;
    return smallData ? _parms._nfolds : 1;
  }

  /** Work for each requested fold.
   *  @return {@code _nfolds} when no fold column is set; otherwise the number of
   *          levels of the fold column after conversion to a categorical */
  protected int nFoldWork() {
    final String foldColumn = _parms._fold_column;
    if( foldColumn == null ) return _parms._nfolds;
    final Vec original = _parms._train.get().vec(foldColumn);
    final Vec asCategorical = VecUtils.toCategoricalVec(original);
    final int levels = asCategorical.domain().length;
    asCategorical.remove(); // drop the temporary categorical copy
    return levels;
  }

  /**
   * Default naive (serial) implementation of N-fold cross-validation
   * (builds N+1 models, all have train+validation metrics, the main model has N-fold cross-validated validation metrics)
   *
   * Expects the controlling Job to be running (asserted).  The Job is hidden
   * from view while the steps run and is made viewable again only if every
   * step completes without throwing.
   */
  public void computeCrossValidation() {
    assert _job.isRunning();    // main Job is still running
    _job.setReadyForView(false); //wait until the main job starts to let the user inspect the main job
    final Integer N = nFoldWork();
    init(false);
    try {
      Scope.enter();

      // Step 1: Assign each row to a fold
      final Vec foldAssignment = cv_AssignFold(N);

      // Step 2: Make 2*N binary weight vectors
      final Vec[] weights = cv_makeWeights(N,foldAssignment);

      // Step 3: Build N train & validation frames; build N ModelBuilders; error check them all
      ModelBuilder<M, P, O> cvModelBuilders[] = cv_makeFramesAndBuilders(N,weights);

      // Step 4: Run all the CV models
      cv_buildModels(N, cvModelBuilders);

      // Step 5: Score the CV models
      ModelMetrics.MetricBuilder mbs[] = cv_scoreCVModels(N, weights, cvModelBuilders);

      // Step 6: Build the main model
      buildMainModel();

      // Step 7: Combine cross-validation scores; compute main model x-val
      // scores; compute gains/lifts
      cv_mainModelScores(N, mbs, cvModelBuilders);

      // Step 8: Clean up potentially created temp frames
      for (ModelBuilder mb : cvModelBuilders)
        mb.cleanUp();

      _job.setReadyForView(true);
      DKV.put(_job);

    } finally {
      // Always release this builder's own temporaries and leave the Scope,
      // even when one of the steps above threw.
      cleanUp();
      Scope.exit();
    }
  }

  // Step 1: Assign each row to a fold
  // TODO: Implement better splitting algo (with Strata if response is
  // categorical), e.g. http://www.lexjansen.com/scsug/2009/Liang_Xie2.pdf
  /** Returns the user's fold column when one is present in the training frame
   *  (after checking it is integer valued and spans 0..N-1 or 1..N); otherwise
   *  creates a fold assignment according to {@code _parms._fold_assignment}.
   *  @param N number of folds, at least 2 (asserted)
   *  @return the vector holding the fold of each row
   *  @throws H2OIllegalArgumentException if the fold column has unsupported values */
  public Vec cv_AssignFold(int N) {
    assert(N>=2);
    final Vec userFold = train().vec(_parms._fold_column);
    if( userFold != null ) {
      if( userFold.isInt() ) {
        if( userFold.min() == 0 && userFold.max() == N-1 ) return userFold; // 0 to N-1
        if( userFold.min() == 1 && userFold.max() == N   ) return userFold; // 1 to N
      }
      throw new H2OIllegalArgumentException("Fold column must be either categorical or contiguous integers from 0..N-1 or 1..N");
    }

    // No usable fold column: generate an assignment
    final long seed = _parms.getOrMakeRealSeed();
    Log.info("Creating " + N + " cross-validation splits with random number seed: " + seed);
    switch( _parms._fold_assignment ) {
      case AUTO:
      case Random:
        return AstKFold.kfoldColumn(train().anyVec().makeZero(),N,seed);
      case Modulo:
        return AstKFold.moduloKfoldColumn(train().anyVec().makeZero(),N);
      case Stratified:
        return AstKFold.stratifiedKFoldColumn(response(),N,seed);
      default:
        throw H2O.unimpl();
    }
  }

  // Step 2: Make 2*N binary weight vectors
  /** Builds the per-fold weight vectors.  For fold f, vector 2*f carries the
   *  original weight for rows outside the fold and 0 for rows inside it
   *  (training weights), while vector 2*f+1 is the opposite (holdout weights).
   *  When no weights column is configured a constant weight of 1.0 is used.
   *  @param N number of folds
   *  @param foldAssignment fold of each row; removed here when it was generated
   *         (no fold column) and the fold assignment is not being kept
   *  @return 2*N weight vectors
   *  @throws H2OIllegalArgumentException if any resulting weight vector is constant */
  public Vec[] cv_makeWeights( final int N, Vec foldAssignment ) {
    String origWeightsName = _parms._weights_column;
    Vec origWeight  = origWeightsName != null ? train().vec(origWeightsName) : train().anyVec().makeCon(1.0);
    Frame folds_and_weights = new Frame(foldAssignment, origWeight);
    Vec[] weights = new MRTask() {
        @Override public void map(Chunk chks[], NewChunk nchks[]) {
          Chunk fold = chks[0], orig = chks[1];
          for( int row=0; row< orig._len; row++ ) {
            // Modulo N maps fold N (1..N numbering) onto index 0
            int foldIdx = (int)fold.at8(row) % N;
            double w = orig.atd(row);
            for( int f = 0; f < N; f++ ) {
              boolean holdout = foldIdx == f;
              nchks[2 * f].addNum(holdout ? 0 : w);
              nchks[2*f+1].addNum(holdout ? w : 0);
            }
          }
        }
      }.doAll(2*N,Vec.T_NUM,folds_and_weights).outputFrame().vecs();
    // Optionally publish a copy of the fold assignment under a key derived from the result key
    if (_parms._keep_cross_validation_fold_assignment)
      DKV.put(new Frame(Key.<Frame>make("cv_fold_assignment_" + _result.toString()), new String[]{"fold_assignment"}, new Vec[]{foldAssignment.makeCopy()}));
    if( _parms._fold_column == null && !_parms._keep_cross_validation_fold_assignment) foldAssignment.remove();
    if( origWeightsName == null ) origWeight.remove(); // Cleanup temp

    // A constant weight vector means some fold has an empty training or holdout side
    for( Vec weight : weights )
      if( weight.isConst() )
        throw new H2OIllegalArgumentException("Not enough data to create " + N + " random cross-validation splits. Either reduce nfolds, specify a larger dataset (or specify another random number seed, if applicable).");
    return weights;
  }

  // Step 3: Build N train & validation frames; build N ModelBuilders; error check them all
  /** Creates, for every fold, a training and a validation frame (sharing the
   *  data columns, each with its own weight column) and a cloned builder
   *  configured to train on them.  Every clone is initialised with
   *  {@code init(false)}; its validation messages are copied onto this builder.
   *  @param N number of folds
   *  @param weights 2*N weight vectors: index 2*i for training, 2*i+1 for validation of fold i
   *  @return the N sub-model builders
   *  @throws H2OIllegalArgumentException if the training frame already has a column named like the internal weight column
   *  @throws H2OModelBuilderIllegalArgumentException if this builder has validation errors after all sub-builders were checked */
  public ModelBuilder<M, P, O>[] cv_makeFramesAndBuilders( int N, Vec[] weights ) {
    final long old_cs = _parms.checksum();
    final String origDest = _result.toString();

    final String weightName = "__internal_cv_weights__";
    if (train().find(weightName) != -1) throw new H2OIllegalArgumentException("Frame cannot contain a Vec called '" + weightName + "'.");

    Frame cv_fr = new Frame(train().names(),train().vecs());
    if( _parms._weights_column!=null ) cv_fr.remove( _parms._weights_column ); // The CV frames will have their own private weight column

    ModelBuilder<M, P, O>[] cvModelBuilders = new ModelBuilder[N];
    List<Frame> cvFramesForFailedModels = new ArrayList<>();
    for( int i=0; i<N; i++ ) {
      String identifier = origDest + "_cv_" + (i+1);
      // Training/Validation share the same data, but will have exclusive weights
      Frame cvTrain = new Frame(Key.<Frame>make(identifier+"_train"),cv_fr.names(),cv_fr.vecs());
      cvTrain.add(weightName, weights[2*i]);
      DKV.put(cvTrain);
      Frame cvValid = new Frame(Key.<Frame>make(identifier+"_valid"),cv_fr.names(),cv_fr.vecs());
      cvValid.add(weightName, weights[2*i+1]);
      DKV.put(cvValid);

      // Shallow clone - not everything is a private copy!!!
      ModelBuilder<M, P, O> cv_mb = (ModelBuilder)this.clone();
      cv_mb.setTrain(cvTrain);
      cv_mb._result = Key.make(identifier); // Each submodel gets its own key
      cv_mb._parms = (P) _parms.clone();
      // Fix up some parameters of the clone
      cv_mb._parms._is_cv_model = true;
      cv_mb._parms._weights_column = weightName;// All submodels have a weight column, which the main model does not
      cv_mb._parms.setTrain(cvTrain._key);       // Each submodel trains on its own CV training frame
      cv_mb._parms._valid = cvValid._key;
      cv_mb._parms._fold_assignment = Model.Parameters.FoldAssignmentScheme.AUTO;
      cv_mb._parms._nfolds = 0; // Each submodel is not itself folded
      cv_mb.clearValidationErrors(); // each submodel gets its own validation messages and error_count()

      // Error-check all the cross-validation Builders before launching any
      cv_mb.init(false);
      if( cv_mb.error_count() > 0 ) { // Gather all submodel error messages
        Log.info("Marking frame for failed cv model for removal: " + cvTrain._key);
        cvFramesForFailedModels.add(cvTrain);
        Log.info("Marking frame for failed cv model for removal: " + cvValid._key);
        cvFramesForFailedModels.add(cvValid);

        for (ValidationMessage vm : cv_mb._messages)
          message(vm._log_level, vm._field_name, vm._message);
      }
      cvModelBuilders[i] = cv_mb;
    }

    if( error_count() > 0 ) {               // Found an error in one or more submodels
      // NOTE(review): only the frames of the failed submodels are removed here;
      // frames already put in the DKV for the other submodels are left in place - confirm intended.
      Futures fs = new Futures();
      for (Frame cvf : cvFramesForFailedModels) {
        cvf.vec(weightName).remove(fs);     // delete the Vec's chunks
        DKV.remove(cvf._key, fs);           // delete the Frame from the DKV, leaving its vecs
        Log.info("Removing frame for failed cv model: " + cvf._key);
      }
      fs.blockForPending();
      throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(this);
    }
    // check that this Job's original _params haven't changed
    assert old_cs == _parms.checksum();
    return cvModelBuilders;
  }

  // Step 4: Run all the CV models (the main model is built in a later step)
  /** Launches the sub-model builds, at most {@link #nModelsInParallel()} at a
   *  time, and waits for all launched builds to finish.  Once every sub-model
   *  has been built, {@link #cv_computeAndSetOptimalParameters} is invoked.
   *  If a stop is requested while models are still waiting to be launched, the
   *  remaining models are skipped, the running ones are joined, and the method
   *  returns without calling the hook.
   *  @param N number of sub-models
   *  @param cvModelBuilders the N sub-model builders
   *  @throws RuntimeException the first failure raised by any joined sub-model build */
  public void cv_buildModels(int N, ModelBuilder<M, P, O>[] cvModelBuilders ) {
    H2O.H2OCountedCompleter submodel_tasks[] = new H2O.H2OCountedCompleter[N];
    int nRunning=0;
    RuntimeException rt = null;
    for( int i=0; i<N; ++i ) {
      if( _job.stop_requested() ) break; // Stop launching but still must block for all async jobs
      Log.info("Building cross-validation model " + (i + 1) + " / " + N + ".");
      cvModelBuilders[i]._start_time = System.currentTimeMillis();
      submodel_tasks[i] = H2O.submitTask(cvModelBuilders[i].trainModelImpl());
      if(++nRunning == nModelsInParallel()) { //piece-wise advance in training the CV models
        // Wait for the whole batch, remembering only the first failure
        while (nRunning > 0) try {
          submodel_tasks[i + 1 - nRunning--].join();
        } catch (RuntimeException t) {
          if (rt == null) rt = t;
        }
        if(rt != null) throw rt;
      }
    }
    boolean allLaunched = true;
    for( int i=0; i<N; ++i ) { //all sub-models must be completed before the main model can be built
      final H2O.H2OCountedCompleter task = submodel_tasks[i];
      if( task == null ) {     // never launched because a stop was requested;
        allLaunched = false;   // joining it would turn a clean stop into a NullPointerException
        continue;
      }
      try {
        task.join();
      } catch(RuntimeException t){
        if(rt == null) rt = t;
      }
    }
    if(rt != null) throw rt;
    // The hook expects a complete set of sub-models; skip it when the run was cut short
    if( allLaunched ) cv_computeAndSetOptimalParameters(cvModelBuilders);
  }

  /** Builds the main model and blocks until the build has finished.  Returns
   *  immediately, without building, when the Job has been asked to stop. */
  private void buildMainModel() {
    if (_job.stop_requested()) return;
    assert _job.isRunning();
    Log.info("Building main model.");
    _start_time = System.currentTimeMillis();
    // submit and wait for completion
    H2O.submitTask(trainModelImpl()).join();
  }

  // Step 5: Score the CV models
  /** Scores every sub-model on its validation frame and frees the per-fold
   *  frames and weight vectors as soon as each fold has been scored.  Holdout
   *  predictions are additionally produced for binomial models, when the user
   *  asked to keep them, or for the huber distribution.
   *  @param N number of sub-models
   *  @param weights the 2*N weight vectors; entries are removed once their fold is scored
   *  @param cvModelBuilders the N sub-model builders
   *  @return one metric builder per fold, or null if a stop was requested */
  public ModelMetrics.MetricBuilder[] cv_scoreCVModels(int N, Vec[] weights, ModelBuilder<M, P, O>[] cvModelBuilders) {
    if( _job.stop_requested() ) return null;
    ModelMetrics.MetricBuilder[] mbs = new ModelMetrics.MetricBuilder[N];
    Futures fs = new Futures();
    for (int i=0; i<N; ++i) {
      if( _job.stop_requested() ) { //don't waste time scoring if the CV run is stopped
        fs.blockForPending();       // but do wait for the removals already queued for earlier folds
        return null;
      }
      Frame cvValid = cvModelBuilders[i].valid();
      Frame adaptFr = new Frame(cvValid);
      M cvModel = cvModelBuilders[i].dest().get();
      cvModel.adaptTestForTrain(adaptFr, true, !isSupervised());
      mbs[i] = cvModel.scoreMetrics(adaptFr);
      if (nclasses() == 2 /* need holdout predictions for gains/lift table */ ||
              _parms._keep_cross_validation_predictions ||
              (_parms._distribution== DistributionFamily.huber /*need to compute quantiles on abs error of holdout predictions*/)) {
        String predName = "prediction_" + cvModelBuilders[i]._result.toString();
        cvModel.predictScoreImpl(cvValid, adaptFr, predName, _job, true);
        DKV.put(cvModel);
      }
      // free resources as early as possible
      Model.cleanup_adapt(adaptFr, cvValid);
      DKV.remove(adaptFr._key,fs);
      DKV.remove(cvModelBuilders[i]._parms._train,fs);
      DKV.remove(cvModelBuilders[i]._parms._valid,fs);
      weights[2*i  ].remove(fs);
      weights[2*i+1].remove(fs);
    }
    fs.blockForPending();
    return mbs;
  }

  // Step 7: Combine cross-validation scores; compute main model x-val scores; compute gains/lifts
  /** Reduces the per-fold metric builders into {@code mbs[0]}, stores the
   *  resulting cross-validation metrics, the sub-model keys and the summary
   *  table on the main model, and writes the main model back to the DKV.
   *  Returns without doing anything when a stop was requested.
   *  @param N number of folds
   *  @param mbs per-fold metric builders; {@code mbs[0]} is modified in place
   *  @param cvModelBuilders the N sub-model builders */
  public void cv_mainModelScores(int N, ModelMetrics.MetricBuilder mbs[], ModelBuilder<M, P, O> cvModelBuilders[]) {
    if( _job.stop_requested() ) return;
    assert _job.isRunning();

    M mainModel = _result.get();

    // Compute and put the cross-validation metrics into the main model
    Log.info("Computing " + N + "-fold cross-validation metrics.");
    mainModel._output._cross_validation_models = new Key[N];
    Key<Frame>[] predKeys = new Key[N];
    mainModel._output._cross_validation_predictions = _parms._keep_cross_validation_predictions ? predKeys : null;

    for (int i = 0; i < N; ++i) {
      if (i > 0) mbs[0].reduce(mbs[i]);
      Key<M> cvModelKey = cvModelBuilders[i]._result;
      mainModel._output._cross_validation_models[i] = cvModelKey;
      predKeys[i] = Key.make("prediction_" + cvModelKey.toString()); //must be the same as in cv_scoreCVModels above
    }
    Frame holdoutPreds = null;
    // Same condition under which cv_scoreCVModels produced per-fold predictions
    if (_parms._keep_cross_validation_predictions || (nclasses()==2 /*GainsLift needs this*/ || _parms._distribution == DistributionFamily.huber)) {
      Key<Frame> cvhp = Key.make("cv_holdout_prediction_" + mainModel._key.toString());
      if (_parms._keep_cross_validation_predictions) //only show the user if they asked for it
        mainModel._output._cross_validation_holdout_predictions_frame_id = cvhp;
      holdoutPreds = combineHoldoutPredictions(predKeys, cvhp);
    }
    if (_parms._keep_cross_validation_fold_assignment) {
      // Key must match the one used when the fold assignment frame was published in cv_makeWeights
      mainModel._output._cross_validation_fold_assignment_frame_id = Key.make("cv_fold_assignment_" + _result.toString());
      Frame xvalidation_fold_assignment_frame = mainModel._output._cross_validation_fold_assignment_frame_id.get();
      if (xvalidation_fold_assignment_frame != null)
        Scope.untrack(xvalidation_fold_assignment_frame.keysList());
    }
    // Keep or toss predictions
    for (Key<Frame> k : predKeys) {
      Frame fr = DKV.getGet(k);
      if( fr != null ) {
        if (_parms._keep_cross_validation_predictions) Scope.untrack(fr.keysList());
        else fr.remove();
      }
    }
    mainModel._output._cross_validation_metrics = mbs[0].makeModelMetrics(mainModel, _parms.train(), null, holdoutPreds);
    if (holdoutPreds != null) {
      if (_parms._keep_cross_validation_predictions) Scope.untrack(holdoutPreds.keysList());
      else holdoutPreds.remove();
    }
    mainModel._output._cross_validation_metrics._description = N + "-fold cross-validation on training data (Metrics computed for combined holdout predictions)";
    Log.info(mainModel._output._cross_validation_metrics.toString());

    mainModel._output._cross_validation_metrics_summary = makeCrossValidationSummaryTable(mainModel._output._cross_validation_models);

    // Now, the main model is complete (has cv metrics)
    DKV.put(mainModel);
  }

  /** Hook called after the cross-validation models have been built.  Override
   *  for model-specific checks / modifications to _parms for the main model
   *  during N-fold cross-validation, or to adjust the cv models themselves.
   *  For example, the model might need to be told to not do early stopping,
   *  or CV models might have their lambda value modified, etc.
   *  This base implementation does nothing.
   *  @param cvModelBuilders the cross-validation sub-model builders
   */
  public void cv_computeAndSetOptimalParameters(ModelBuilder<M, P, O>[] cvModelBuilders) {
    // intentionally empty
  }

  /** @return Whether n-fold cross-validation is done: true when a fold column
   *  is set or {@code _nfolds} is non-zero */
  public boolean nFoldCV() {
    final boolean hasFoldColumn = _parms._fold_column != null;
    return hasFoldColumn || _parms._nfolds != 0;
  }

  /** List containing the categories of models that this builder can
   *  build.  Each ModelBuilder must have one of these.
   *  @return the model categories supported by this builder */
  abstract public ModelCategory[] can_build();


  /** Visibility for this algo: is it always visible (Stable), is it beta (always
   *  visible but with a note in the UI) or is it experimental (hidden by
   *  default, visible in the UI if the user gives an "experimental" flag at
   *  startup); test-only builders are "experimental"  */
  public enum BuilderVisibility { Experimental, Beta, Stable }
  /** @return the visibility of this builder; {@code Stable} unless overridden */
  public BuilderVisibility builderVisibility() {
    return BuilderVisibility.Stable;
  }

  /** Resets the state produced by init() so that init() can be run again.
   *  This base implementation only clears the validation errors. */
  public void clearInitState() {
    this.clearValidationErrors();
  }
  /** @return {@code true} in this base implementation */
  protected boolean logMe() {
    return true;
  }

  /** @return whether this builder is supervised; a supervised builder takes its
   *  response from {@code _parms._response_column} in separateFeatureVecs() */
  abstract public boolean isSupervised();

  // Special columns; _weights, _offset, _fold and _response are (re)assigned by
  // separateFeatureVecs().  All of them are transient.
  protected transient Vec _response; // Handy response column
  protected transient Vec _vresponse; // Handy validation response column
  protected transient Vec _offset; // Handy offset column
  protected transient Vec _weights; // observation weight column
  protected transient Vec _fold; // fold id column
  protected transient String[] _origNames;
  protected transient String[][] _origDomains;

  /** @return whether an offset column is configured in the parameters */
  public boolean hasOffsetCol() {
    // don't look at transient Vec
    return _parms._offset_column != null;
  }
  /** @return whether a weights column is configured in the parameters */
  public boolean hasWeightCol() {
    // don't look at transient Vec
    return _parms._weights_column != null;
  }
  /** @return whether a fold column is configured in the parameters */
  public boolean hasFoldCol() {
    // don't look at transient Vec
    return _parms._fold_column != null;
  }
  /** @return how many of the offset, weights and fold columns are configured (0 to 3) */
  public int numSpecialCols() {
    int count = 0;
    if (hasOffsetCol()) count++;
    if (hasWeightCol()) count++;
    if (hasFoldCol())   count++;
    return count;
  }
  /** @return the names of the configured special columns, in the order
   *  offset, weights, fold; columns that are not configured are left out */
  public String[] specialColNames() {
    final String[] names = new String[numSpecialCols()];
    int next = 0;
    if (hasOffsetCol()) { names[next] = _parms._offset_column;  next++; }
    if (hasWeightCol()) { names[next] = _parms._weights_column; next++; }
    if (hasFoldCol())   { names[next] = _parms._fold_column;    next++; }
    return names;
  }
  // no hasResponse, call isSupervised instead (response is mandatory if isSupervised is true)

  /** @return {@code false} in this base implementation */
  public boolean havePojo() {
    return false;
  }
  /** @return {@code false} in this base implementation */
  public boolean haveMojo() {
    return false;
  }

  protected int _nclass; // Number of classes; 1 for regression; 2+ for classification

  /** @return the number of classes: 1 for regression, 2 or more for classification */
  public int nclasses(){return _nclass;}

  /** @return true when there is more than one class */
  public final boolean isClassifier() {
    final int classes = nclasses();
    return classes > 1;
  }

  /**
   * Find and set response/weights/offset/fold and put them all in the end.
   * Each configured special column is removed from the training frame,
   * validated, and re-added, so the special columns end up after the features
   * in the order weights, offset, fold, response.  Problems are recorded via
   * {@code error(...)} rather than thrown.  A column that cannot be found is
   * reported and not counted; the matching field is cleared only when the
   * column is not configured at all.
   * @return number of non-feature vecs
   */
  public int separateFeatureVecs() {
    int res = 0;
    if(_parms._weights_column != null) {
      Vec w = _train.remove(_parms._weights_column);
      if(w == null)
        error("_weights_column","Weights column '" + _parms._weights_column  + "' not found in the training frame");
      else {
        if(!w.isNumeric())
          error("_weights_column","Invalid weights column '" + _parms._weights_column  + "', weights must be numeric");
        _weights = w;
        // The three checks below report against "_weights_column" (the real
        // parameter name) so the messages attach to the right field.
        if(w.naCnt() > 0)
          error("_weights_column","Weights cannot have missing values.");
        if(w.min() < 0)
          error("_weights_column","Weights must be >= 0");
        if(w.max() == 0)
          error("_weights_column","Max. weight must be > 0");
        _train.add(_parms._weights_column, w);
        ++res;
      }
    } else {
      _weights = null;
      assert(!hasWeightCol());
    }
    if(_parms._offset_column != null) {
      Vec o = _train.remove(_parms._offset_column);
      if(o == null)
        error("_offset_column","Offset column '" + _parms._offset_column  + "' not found in the training frame");
      else {
        if(!o.isNumeric())
          error("_offset_column","Invalid offset column '" + _parms._offset_column  + "', offset must be numeric");
        _offset = o;
        if(o.naCnt() > 0)
          error("_offset_column","Offset cannot have missing values.");
        if(_weights == _offset) // same Vec instance used for both roles
          error("_offset_column", "Offset must be different from weights");
        _train.add(_parms._offset_column, o);
        ++res;
      }
    } else {
      _offset = null;
      assert(!hasOffsetCol());
    }
    if(_parms._fold_column != null) {
      Vec f = _train.remove(_parms._fold_column);
      if(f == null)
        error("_fold_column","Fold column '" + _parms._fold_column  + "' not found in the training frame");
      else {
        if(!f.isInt() && !f.isCategorical())
          error("_fold_column","Invalid fold column '" + _parms._fold_column  + "', fold must be integer or categorical");
        if(f.min() < 0)
          error("_fold_column","Invalid fold column '" + _parms._fold_column  + "', fold must be non-negative");
        if(f.isConst())
          error("_fold_column","Invalid fold column '" + _parms._fold_column  + "', fold cannot be constant");
        _fold = f;
        if(f.naCnt() > 0)
          error("_fold_column","Fold cannot have missing values.");
        if(_fold == _weights)
          error("_fold_column", "Fold must be different from weights");
        if(_fold == _offset)
          error("_fold_column", "Fold must be different from offset");
        _train.add(_parms._fold_column, f);
        ++res;
      }
    } else {
      _fold = null;
      assert(!hasFoldCol());
    }
    if(isSupervised() && _parms._response_column != null) {
      _response = _train.remove(_parms._response_column);
      if (_response == null) {
        // isSupervised() already holds in this branch
        error("_response_column", "Response column '" + _parms._response_column + "' not found in the training frame");
      } else {
        if(_response == _offset)
          error("_response_column", "Response column must be different from offset_column");
        if(_response == _weights)
          error("_response_column", "Response column must be different from weights_column");
        if(_response == _fold)
          error("_response_column", "Response column must be different from fold_column");
        _train.add(_parms._response_column, _response);
        ++res;
      }
    } else {
      _response = null;
    }
    return res;
  }

  /** @return whether string columns are dropped by ignoreBadColumns();
   *  {@code true} in this base implementation */
  protected boolean ignoreStringColumns() {
    return true;
  }
  /** Tells {@link #ignoreBadColumns(int, boolean)} whether constant columns should be dropped; mirrors {@code _parms._ignore_const_cols}. */
  protected boolean ignoreConstColumns() {
    final boolean dropConstants = _parms._ignore_const_cols;
    return dropConstants;
  }

  /**
   * Drops unusable columns from {@code _train}: columns reported bad by {@code Vec.isBad()},
   * and, subject to {@link #ignoreConstColumns()} and {@link #ignoreStringColumns()},
   * constant columns and string columns.  Nothing at all is dropped unless
   * {@code _parms._ignore_const_cols} is set.
   *
   * @param npredictors number of trailing columns of {@code _train} that are exempt from
   *                    filtering (handed to {@link FilterCols} as its special-vec count)
   * @param expensive   forwarded to {@code FilterCols.doIt}; when true the drop message is also logged
   */
  protected void ignoreBadColumns(int npredictors, boolean expensive){
    // The whole filter is gated on the ignore-const-cols parameter.
    if (!_parms._ignore_const_cols) return;

    final FilterCols unusableColumns = new FilterCols(npredictors) {
      @Override protected boolean filter(Vec v) {
        // All three predicates are evaluated before being combined.
        final boolean bad = v.isBad();
        final boolean constant = ignoreConstColumns() && v.isConst();
        final boolean string = ignoreStringColumns() && v.isString();
        return bad || constant || string;
      }
    };
    unusableColumns.doIt(_train, "Dropping bad and constant columns: ", expensive);
  }

  /**
   * Ignore invalid columns (columns that have a very high max value, which can cause issues in DHistogram).
   * This base implementation is an empty hook and drops nothing; {@code init} calls it right
   * after {@code ignoreBadColumns} with the same arguments.
   *
   * @param npredictors value returned by {@code separateFeatureVecs()} at the call site in {@code init}
   * @param expensive   the {@code expensive} flag that was passed to {@code init}
   */
  protected void ignoreInvalidColumns(int npredictors, boolean expensive){}

  /**
   * Override this method to call error() if the model is expected to not fit in memory, and say why.
   * This base implementation is empty and therefore never reports an error.
   */
  protected void checkMemoryFootPrint() {}


  // Filled in by init(): ClassDist.dist() for classifiers, otherwise a single
  // entry holding (mean weight, or 1.0 without weights) * number of training rows.
  transient double [] _distribution;
  // Filled in by init(): ClassDist.rel_dist() for classifiers, otherwise {1.0}.
  transient protected double [] _priorClassDist;

  /**
   * Decides whether {@code init} should fill {@code _distribution} and {@code _priorClassDist}.
   * @return the value of {@code isClassifier()} in this base implementation
   */
  protected boolean computePriorClassDistribution(){
    final boolean classification = isClassifier();
    return classification;
  }

  /** Validation messages gathered so far, at every log level, in the order they were reported. */
  public ValidationMessage[] _messages = new ValidationMessage[0];

  // -1 ==> init not run yet, for those Jobs that have an init, like ModelBuilder.
  // Note, this counts ONLY errors, not WARNs and etc.
  private int _error_count = -1;

  /** @return the current error counter; asserts that it is no longer the -1 "init not run" marker. */
  public int error_count() {
    assert _error_count >= 0 : "init() not run yet";
    return _error_count;
  }

  /** Report a TRACE-level message for a field. */
  public void hide(String field_name, String message) {
    message(Log.TRACE, field_name, message);
  }

  /** Report an INFO-level message for a field. */
  public void info(String field_name, String message) {
    message(Log.INFO, field_name, message);
  }

  /** Report a WARN-level message for a field. */
  public void warn(String field_name, String message) {
    message(Log.WARN, field_name, message);
  }
  public void error(String field_name, String message) { message(Log.ERRR , field_name, message); _error_count++; }
  /** Forget every recorded validation message and reset the error counter to zero. */
  public void clearValidationErrors() {
    _error_count = 0;
    _messages = new ValidationMessage[0];
  }

  /**
   * Append one validation message to {@code _messages}; ERROR-level messages also bump the error counter.
   *
   * @param log_level  one of the {@code Log} level constants
   * @param field_name name of the parameter field the message is about
   * @param message    message text
   */
  public void message(byte log_level, String field_name, String message) {
    final int slot = _messages.length;
    // Grow the array by one and fill the new last slot.
    _messages = Arrays.copyOf(_messages, slot + 1);
    _messages[slot] = new ValidationMessage(log_level, field_name, message);

    if (Log.ERRR == log_level) {
      _error_count++;
    }
  }

 /** Render only the ERROR-level ValidationMessages, one per line (e.g., for the text of a thrown exception). */
  public String validationErrors() {
    final StringBuilder errors = new StringBuilder();
    for (int i = 0; i < _messages.length; i++) {
      final ValidationMessage candidate = _messages[i];
      if (candidate._log_level != Log.ERRR) continue; // skip everything that is not an error
      errors.append(candidate).append("\n");
    }
    return errors.toString();
  }

  /** One validation finding about a parameter field.  An ERROR means the
   *  parameters cannot be used as they are; a TRACE means the named field
   *  should be hidden given the values of other fields; WARN and INFO carry
   *  informative messages for the user. */
  public static final class ValidationMessage extends Iced {
    final byte _log_level; // See util/Log.java for levels
    final String _field_name;
    final String _message;

    /** Stores the three parts and immediately logs "field: message" at the given level. */
    public ValidationMessage(byte log_level, String field_name, String message) {
      this._log_level = log_level;
      this._field_name = field_name;
      this._message = message;
      Log.log(log_level,field_name + ": " + message);
    }

    /** @return the log level this message was created with */
    public int log_level() {
      return _log_level;
    }

    @Override public String toString() {
      final String levelName = Log.LVLS[_log_level];
      return levelName + " on field: " + _field_name + ": " + _message;
    }
  }

  // ==========================================================================
  /** Initialize the ModelBuilder, validating all arguments and preparing the
   *  training frame.  This call is expected to be overridden in the subclasses
   *  and each subclass will start with "super.init();".  This call is made by
   *  the front-end whenever the GUI is clicked, and needs to be fast whenever
   *  {@code expensive} is false; it will be called once again at the start of
   *  model building (see {@code trainModel()}) with expensive set to true.
   *<p>
   *  The incoming training frame (and validation frame) will have ignored
   *  columns dropped out, plus whatever work the parent init did.
   *<p>
   *  NOTE: The front end initially calls this through the parameters validation
   *  endpoint with no training_frame, so each subclass's {@code init()} method
   *  has to work correctly with the training_frame missing.
   *<p>
   *  Problems are reported through {@code error}/{@code warn}/{@code hide}
   *  rather than thrown.  The method returns early when there is no training
   *  frame, and (for supervised builders) when the response column parameter
   *  is not set.
   *
   *  @param expensive when true, additionally logs the parameters, fixes the
   *         seed, rebalances the frames, checks the distribution against the
   *         response, and encodes / reorders categorical columns
   */
  public void init(boolean expensive) {
    // Log parameters
    if( expensive && logMe() ) {
      Log.info("Building H2O " + this.getClass().getSimpleName() + " model with these parameters:");
      Log.info(new String(_parms.writeJSON(new AutoBuffer()).buf()));
    }
    // NOTE: allow re-init:
    clearInitState();
    assert _parms != null;      // Parms must already be set by the time init runs
    if( _parms._train == null ) {
      // No training frame key: only an error on the expensive pass, but nothing more can be checked either way.
      if (expensive)
        error("_train", "Missing training frame");
      return;
    }
    // Prefer an already-present _train over fetching the frame through the parameters.
    Frame tr = _train != null?_train:_parms.train();
    if( tr == null ) { error("_train", "Missing training frame: "+_parms._train); return; }
    // Wrap cloned name/vec arrays in a new key-less Frame; presumably setTrain installs it as _train,
    // so that the column removals below operate on the copy's arrays.
    setTrain(new Frame(null /* not putting this into KV */, tr._names.clone(), tr.vecs().clone()));
    if (expensive) {
      _parms.getOrMakeRealSeed();
    }
    if (_parms._categorical_encoding.needsResponse() && !isSupervised()) {
      error("_categorical_encoding", "Categorical encoding scheme cannot be "
          + _parms._categorical_encoding.toString() + " - no response column available.");
    }
    // ---- Cross-validation parameter checks ----
    if (_parms._nfolds < 0 || _parms._nfolds == 1) {
      error("_nfolds", "nfolds must be either 0 or >1.");
    }
    if (_parms._nfolds > 1 && _parms._nfolds > train().numRows()) {
      error("_nfolds", "nfolds cannot be larger than the number of rows (" + train().numRows() + ").");
    }
    // A fold column excludes both nfolds > 1 and any non-AUTO fold assignment scheme.
    if (_parms._fold_column != null) {
      hide("_fold_assignment", "Fold assignment is ignored when a fold column is specified.");
      if (_parms._nfolds > 1) {
        error("_nfolds", "nfolds cannot be specified at the same time as a fold column.");
      } else {
        hide("_nfolds", "nfolds is ignored when a fold column is specified.");
      }
      if (_parms._fold_assignment != Model.Parameters.FoldAssignmentScheme.AUTO) {
        error("_fold_assignment", "Fold assignment is not allowed in conjunction with a fold column.");
      }
    }
    if (_parms._nfolds > 1) {
      hide("_fold_column", "Fold column is ignored when nfolds > 1.");
    }
    // hide cross-validation parameters unless cross-val is enabled
    if (!nFoldCV()) {
      hide("_keep_cross_validation_predictions", "Only for cross-validation.");
      hide("_keep_cross_validation_fold_assignment", "Only for cross-validation.");
      hide("_fold_assignment", "Only for cross-validation.");
      if (_parms._fold_assignment != Model.Parameters.FoldAssignmentScheme.AUTO) {
        error("_fold_assignment", "Fold assignment is only allowed for cross-validation.");
      }
    }
    // ---- Distribution parameter checks ----
    if (_parms._distribution == DistributionFamily.modified_huber) {
      error("_distribution", "Modified Huber distribution is not supported yet.");
    }
    if (_parms._distribution != DistributionFamily.tweedie) {
      hide("_tweedie_power", "Only for Tweedie Distribution.");
    }
    // NOTE(review): this range check runs for every distribution, not only Tweedie - confirm that is intended.
    if (_parms._tweedie_power <= 1 || _parms._tweedie_power >= 2) {
      error("_tweedie_power", "Tweedie power must be between 1 and 2 (exclusive).");
    }

    // Drop explicitly dropped columns
    if( _parms._ignored_columns != null ) {
      _train.remove(_parms._ignored_columns);
      if( expensive ) Log.info("Dropping ignored columns: "+Arrays.toString(_parms._ignored_columns));
    }
    // Rebalance train and valid datasets (expensive pass only, and only while no error has been recorded)
    if (expensive && error_count() == 0 && _parms._auto_rebalance) {
      setTrain(rebalance(_train, false, _result + ".temporary.train"));
      _valid = rebalance(_valid, false, _result + ".temporary.valid");
    }

    // Drop all non-numeric columns (e.g., String and UUID).  No current algo
    // can use them, and otherwise all algos will then be forced to remove
    // them.  Text algos (grep, word2vec) take raw text columns - which are
    // numeric (arrays of bytes).
    // separateFeatureVecs() is invoked once per call; its result is passed on as the
    // count of trailing columns the filters must leave alone.
    ignoreBadColumns(separateFeatureVecs(), expensive);
    ignoreInvalidColumns(separateFeatureVecs(), expensive);
    // Check that at least some columns are not-constant and not-all-NAs
    if( _train.numCols() == 0 )
      error("_train","There are no usable columns to generate model");

    if(isSupervised()) {
      // ---- Supervised builders: response, class balancing and prior distribution ----
      if(_response != null) {
        if (_parms._distribution != DistributionFamily.tweedie) {
          hide("_tweedie_power", "Tweedie power is only used for Tweedie distribution.");
        }
        if (_parms._distribution != DistributionFamily.quantile) {
          hide("_quantile_alpha", "Quantile (alpha) is only used for Quantile regression.");
        }
        if (expensive) checkDistributions();
        // A categorical response has one class per level; anything else counts as a single class.
        _nclass = _response.isCategorical() ? _response.cardinality() : 1;
        if (_response.isConst())
          error("_response","Response cannot be constant.");
      }
      if (! _parms._balance_classes)
        hide("_max_after_balance_size", "Balance classes is false, hide max_after_balance_size");
      else if (_parms._weights_column != null && _weights != null && !_weights.isBinary())
        error("_balance_classes", "Balance classes and observation weights are not currently supported together.");
      if( _parms._max_after_balance_size <= 0.0 )
        error("_max_after_balance_size","Max size after balancing needs to be positive, suggest 1.0f");

      if( _train != null ) {
        if (_train.numCols() <= 1)
          error("_train", "Training data must have at least 2 features (incl. response).");
        if( null == _parms._response_column) {
          // Without a response column none of the remaining checks can be performed.
          error("_response_column", "Response column parameter not set.");
          return;
        }
        if(_response != null && computePriorClassDistribution()) {
          if (isClassifier() && isSupervised()) {
            // Class distribution of the response, using the weights when a weights vec is present.
            MRUtils.ClassDist cdmt =
                _weights != null ? new MRUtils.ClassDist(nclasses()).doAll(_response, _weights) : new MRUtils.ClassDist(nclasses()).doAll(_response);
            _distribution = cdmt.dist();
            _priorClassDist = cdmt.rel_dist();
          } else {                    // Regression; only 1 "class"
            // Reached only when computePriorClassDistribution() returns true for a non-classifier.
            _distribution = new double[]{ (_weights != null ? _weights.mean() : 1.0) * train().numRows() };
            _priorClassDist = new double[]{1.0f};
          }
        }
      }

      if( !isClassifier() ) {
        hide("_balance_classes", "Balance classes is only applicable to classification problems.");
        hide("_class_sampling_factors", "Class sampling factors is only applicable to classification problems.");
        hide("_max_after_balance_size", "Max after balance size is only applicable to classification problems.");
        hide("_max_confusion_matrix_size", "Max confusion matrix size is only applicable to classification problems.");
      }
      if (_nclass <= 2) {
        hide("_max_hit_ratio_k", "Max K-value for hit ratio is only applicable to multi-class classification problems.");
        hide("_max_confusion_matrix_size", "Only for multi-class classification problems.");
      }
      if( !_parms._balance_classes ) {
        hide("_max_after_balance_size", "Only used with balanced classes");
        hide("_class_sampling_factors", "Class sampling factors is only applicable if balancing classes.");
      }
    }
    else {
      // ---- Unsupervised builders: hide response-related parameters and clear response state ----
      hide("_response_column", "Ignored for unsupervised methods.");
      hide("_balance_classes", "Ignored for unsupervised methods.");
      hide("_class_sampling_factors", "Ignored for unsupervised methods.");
      hide("_max_after_balance_size", "Ignored for unsupervised methods.");
      hide("_max_confusion_matrix_size", "Ignored for unsupervised methods.");
      _response = null;
      _vresponse = null;
      _nclass = 1;
    }

    if( _nclass > Model.Parameters.MAX_SUPPORTED_LEVELS ) {
      error("_nclass", "Too many levels in response column: " + _nclass + ", maximum supported number of classes is " + Model.Parameters.MAX_SUPPORTED_LEVELS + ".");
    }

    // Build the validation set to be compatible with the training set.
    // Toss out extra columns, complain about missing ones, remap categoricals
    Frame va = _parms.valid();  // User-given validation set
    if (va != null) {
      _valid = adaptFrameToTrain(va, "Validation Frame", "_validation_frame", expensive);
      _vresponse = _valid.vec(_parms._response_column);
    } else {
      _valid = null;
      _vresponse = null;
    }

    if (expensive) {
      // ---- Categorical encoding of the training and validation frames ----
      Frame newtrain = encodeFrameCategoricals(_train, ! _parms._is_cv_model);
      if (newtrain != _train) {
        // Remember the pre-encoding names and domains before swapping in the encoded frame.
        _origNames = _train.names();
        _origDomains = _train.domains();
        setTrain(newtrain);
        separateFeatureVecs(); //fix up the pointers to the special vecs
      }
      if (_valid != null) {
        _valid = encodeFrameCategoricals(_valid, ! _parms._is_cv_model /* for CV, need to score one more time in outer loop */);
        _vresponse = _valid.vec(_parms._response_column);
      }
      // ---- Optional reordering of categorical levels by mean (weighted) response ----
      boolean restructured = false;
      Vec[] vecs = _train.vecs();
      for (int j = 0; j < vecs.length; ++j) {
        Vec v = vecs[j];
        if (v == _response || v == _fold) continue; // never reorder the response or the fold column
        if (v.isCategorical() && shouldReorder(v)) {
          final int len = v.domain().length;
          Log.info("Reordering categorical column " + _train.name(j) + " (" + len + " levels) based on the mean (weighted) response per level.");
          // Without a weights column a constant-1.0 vec made from v stands in for the weights.
          VecUtils.MeanResponsePerLevelTask mrplt = new VecUtils.MeanResponsePerLevelTask(len).doAll(v,
                  _parms._weights_column != null ? _train.vec(_parms._weights_column) : v.makeCon(1.0),
                  _train.vec(_parms._response_column));
          double[] meanWeightedResponse  = mrplt.meanWeightedResponse;
//          for (int i=0;i<len;++i)
//            Log.info(v.domain()[i] + " -> " + meanWeightedResponse[i]);

          // Option 1: Order the categorical column by response to make better splits
          // idx starts as the identity permutation and is sorted by the per-level mean response.
          int[] idx=new int[len];
          for (int i=0;i<len;++i) idx[i] = i;
          ArrayUtils.sort(idx, meanWeightedResponse);
          // invIdx maps an old level index to its position in the sorted order.
          int[] invIdx=new int[len];
          for (int i=0;i<len;++i) invIdx[idx[i]] = i;
          Vec vNew = new VecUtils.ReorderTask(invIdx).doAll(1, Vec.T_NUM, new Frame(v)).outputFrame().anyVec();
          // The new domain lists the old level names in sorted order.
          String[] newDomain = new String[len];
          for (int i = 0; i < len; ++i) newDomain[i] = v.domain()[idx[i]];
          vNew.setDomain(newDomain);
//          for (int i=0;i<len;++i)
//            Log.info(vNew.domain()[i] + " -> " + meanWeightedResponse[idx[i]]);
          vecs[j] = vNew;
          restructured = true;
        }
      }
      if (restructured)
        _train.restructure(_train.names(), vecs);
    }
    // After the expensive pass the validation frame must have the same column names as the
    // training frame, except under Binary categorical encoding, where only containment is asserted.
    assert (!expensive || _valid==null || Arrays.equals(_train._names, _valid._names) || _parms._categorical_encoding == Model.Parameters.CategoricalEncodingScheme.Binary);
    if (_valid!=null && !Arrays.equals(_train._names, _valid._names) && _parms._categorical_encoding == Model.Parameters.CategoricalEncodingScheme.Binary) {
      for (String name : _train._names)
        assert(ArrayUtils.contains(_valid._names, name)) : "Internal error during categorical encoding: training column " + name + " not in validation frame with columns " + Arrays.toString(_valid._names);
    }

    if (_parms._checkpoint != null && DKV.get(_parms._checkpoint) == null) {
      error("_checkpoint", "Checkpoint has to point to existing model!");
    }

    // ---- Early-stopping parameter checks ----
    if (_parms._stopping_tolerance < 0) {
      error("_stopping_tolerance", "Stopping tolerance must be >= 0.");
    }
    if (_parms._stopping_tolerance >= 1) {
      error("_stopping_tolerance", "Stopping tolerance must be < 1.");
    }
    if (_parms._stopping_rounds == 0) {
      // With zero stopping rounds, non-default metric/tolerance settings only earn a warning.
      if (_parms._stopping_metric != ScoreKeeper.StoppingMetric.AUTO)
        warn("_stopping_metric", "Stopping metric is ignored for _stopping_rounds=0.");
      if (_parms._stopping_tolerance != _parms.defaultStoppingTolerance())
        warn("_stopping_tolerance", "Stopping tolerance is ignored for _stopping_rounds=0.");
    } else if (_parms._stopping_rounds < 0) {
      error("_stopping_rounds", "Stopping rounds must be >= 0.");
    } else {
      // Positive stopping rounds: the metric has to fit the problem type.
      if (isClassifier()) {
        // Builders whose simple class name contains "GLM" are exempt from the deviance restriction.
        if (_parms._stopping_metric == ScoreKeeper.StoppingMetric.deviance && !getClass().getSimpleName().contains("GLM")) {
          error("_stopping_metric", "Stopping metric cannot be deviance for classification.");
        }
        if (nclasses()!=2 && _parms._stopping_metric == ScoreKeeper.StoppingMetric.AUC) {
          error("_stopping_metric", "Stopping metric cannot be AUC for multinomial classification.");
        }
      } else {
        if (_parms._stopping_metric == ScoreKeeper.StoppingMetric.misclassification ||
                _parms._stopping_metric == ScoreKeeper.StoppingMetric.AUC ||
                _parms._stopping_metric == ScoreKeeper.StoppingMetric.logloss)
        {
          error("_stopping_metric", "Stopping metric cannot be " + _parms._stopping_metric.toString() + " for regression.");
        }
      }
    }
    if (_parms._max_runtime_secs < 0) {
      error("_max_runtime_secs", "Max runtime (in seconds) must be greater than 0 (or 0 for unlimited).");
    }
  }

  /**
   * Brings a frame into the schema of the training frame and, on the expensive
   * pass, additionally encodes its categorical columns (Scope-tracked).
   *
   * Note: meant for use during ModelBuilder initialization only, i.e. from within init(..).
   *
   * @param fr input frame
   * @param frDesc frame description, eg. "Validation Frame" - shown in validation error messages
   * @param field name of the field that validation errors are attached to
   * @param expensive indicates full ("expensive") processing
   * @return the adapted (and, when expensive, encoded) frame
   */
  protected Frame init_adaptFrameToTrain(Frame fr, String frDesc, String field, boolean expensive) {
    final Frame schemaAdapted = adaptFrameToTrain(fr, frDesc, field, expensive);
    return expensive ? encodeFrameCategoricals(schemaAdapted, true) : schemaAdapted;
  }

  /**
   * Adapts a copy of {@code fr} to the names and domains of {@code _train} via
   * {@code Model.adaptTestForTrain}.  An empty frame, a missing response column and any
   * IllegalArgumentException from the adaptation are reported as errors on {@code field};
   * the adaptation messages are logged and recorded as warnings only when {@code expensive}.
   *
   * @return the new frame wrapping cloned name/vec arrays of {@code fr}
   */
  private Frame adaptFrameToTrain(Frame fr, String frDesc, String field, boolean expensive) {
    if (fr.numRows() == 0) {
      error(field, frDesc + " must have > 0 rows.");
    }
    final Frame result = new Frame(null /* not putting this into KV */, fr._names.clone(), fr.vecs().clone());
    try {
      final String[] adaptMsgs = Model.adaptTestForTrain(result, null, null, _train._names, _train.domains(), _parms, expensive, true, null, getToEigenVec(), _toDelete, false);
      final Vec adaptedResponse = result.vec(_parms._response_column);
      if (_parms._response_column != null && adaptedResponse == null) {
        error(field, frDesc + " must have a response column '" + _parms._response_column + "'.");
      }
      if (expensive) {
        for (int i = 0; i < adaptMsgs.length; i++) {
          final String note = adaptMsgs[i];
          Log.info(note);
          warn(field, note);
        }
      }
    } catch (IllegalArgumentException problem) {
      error(field, problem.getMessage());
    }
    return result;
  }

  /**
   * Runs the categorical encoder over {@code fr}, leaving the weights, offset, fold and
   * response columns untouched.  When the encoder hands back a different frame, that frame
   * is either tracked in the current Scope or registered in {@code _toDelete}.
   *
   * @param fr         frame to encode
   * @param scopeTrack true to Scope-track a newly created frame, false to register it in {@code _toDelete}
   * @return the encoder's result, which may be {@code fr} itself
   */
  private Frame encodeFrameCategoricals(Frame fr, boolean scopeTrack) {
    final String[] untouched = {_parms._weights_column, _parms._offset_column, _parms._fold_column, _parms._response_column};
    final Frame result = FrameUtils.categoricalEncoder(fr, untouched, _parms._categorical_encoding, getToEigenVec());
    if (result == fr) {
      return result; // same frame back: nothing new to keep track of
    }
    assert result._key != null;
    if (!scopeTrack) {
      // The current stack trace is stored as the value for the key.
      _toDelete.put(result._key, Arrays.toString(Thread.currentThread().getStackTrace()));
    } else {
      Scope.track(result);
    }
    return result;
  }

  /**
   * Rebalance a frame for load balancing.
   * The frame is returned unchanged when it is null, has no vec to inspect, or already
   * has at least {@link #desiredChunks(Frame, boolean)} chunks.  Otherwise a rebalanced
   * copy is built under a hidden key derived from {@code name}, tracked in the current
   * Scope, and returned.
   *
   * @param original_fr Input frame
   * @param local Whether to only create enough chunks to max out all cores on one node only
   * @param name Name of rebalanced frame; its last (up to) five characters label the log messages
   * @return Frame that has potentially more chunks
   */

  protected Frame rebalance(final Frame original_fr, boolean local, final String name) {
    if (original_fr == null) return null;
    final Vec anyVec = original_fr.anyVec();
    // init() rebalances before it checks for "no usable columns", so a frame without any
    // vec can get here; there is nothing to rebalance in that case.
    if (anyVec == null) return original_fr;
    final int chunks = desiredChunks(original_fr, local);
    final int currentChunks = anyVec.nChunks();
    // Use the trailing five characters as a label, but tolerate names shorter than that
    // instead of throwing StringIndexOutOfBoundsException.
    final String label = name.substring(Math.max(0, name.length() - 5));
    if (currentChunks >= chunks) {
      if (chunks>1)
        Log.info(label + " dataset already contains " + currentChunks +
              " chunks. No need to rebalance.");
      return original_fr;
    }
    Log.info("Rebalancing " + label + " dataset into " + chunks + " chunks.");
    Key newKey = Key.makeUserHidden(name + ".chunks" + chunks);
    RebalanceDataSet rb = new RebalanceDataSet(original_fr, newKey, chunks);
    H2O.submitTask(rb).join(); // block until the rebalanced frame has been produced
    Frame rebalanced_fr = DKV.get(newKey).get();
    Scope.track(rebalanced_fr);
    return rebalanced_fr;
  }

  /**
   * Lower bound on the number of chunks wanted for a frame; {@link #rebalance} rebalances
   * frames that have fewer.  This implementation asks for one chunk per (started) thousand
   * rows, capped at {@code H2O.NUMCPUS}, and does not look at {@code local}.
   *
   * @return Lower bound on number of chunks after rebalancing.
   */
  protected int desiredChunks(final Frame original_fr, boolean local) {
    final int chunksByRows = (int) Math.ceil(original_fr.numRows() / 1e3);
    return Math.min(chunksByRows, H2O.NUMCPUS);
  }

  /**
   * Validates the response and the distribution-specific parameters for the selected
   * distribution family: a non-negative response for Poisson, Gamma and Tweedie, plus
   * range checks on the Tweedie power, the quantile alpha and the Huber alpha.
   * Other families are not checked here.
   */
  public void checkDistributions() {
    final DistributionFamily family = _parms._distribution;

    if (family == DistributionFamily.poisson) {
      if (_response.min() < 0) {
        error("_response", "Response must be non-negative for Poisson distribution.");
      }
      return;
    }
    if (family == DistributionFamily.gamma) {
      if (_response.min() < 0) {
        error("_response", "Response must be non-negative for Gamma distribution.");
      }
      return;
    }
    if (family == DistributionFamily.tweedie) {
      if (_parms._tweedie_power <= 1 || _parms._tweedie_power >= 2) {
        error("_tweedie_power", "Tweedie power must be between 1 and 2.");
      }
      if (_response.min() < 0) {
        error("_response", "Response must be non-negative for Tweedie distribution.");
      }
      return;
    }
    if (family == DistributionFamily.quantile) {
      if (_parms._quantile_alpha < 0 || _parms._quantile_alpha > 1) {
        error("_quantile_alpha", "Quantile alpha must be between 0 and 1.");
      }
      return;
    }
    if (family == DistributionFamily.huber) {
      if (_parms._huber_alpha > 1 || _parms._huber_alpha < 0) {
        error("_huber_alpha", "Huber alpha must be between 0 and 1.");
      }
    }
  }

  // Names of the columns removed by the most recent FilterCols.doIt call that removed anything
  // (doIt replaces this set each time it drops columns).
  transient public HashSet<String> _removedCols = new HashSet<>();
  /**
   * Column filter: removes from a frame every leading column for which {@link #filter(Vec)}
   * returns true, leaving the last {@code _specialVecs} columns alone.
   */
  public abstract class FilterCols {
    final int _specialVecs; // special vecs to skip at the end

    public FilterCols(int n) {
      _specialVecs = n;
    }

    /** @return true when the given column should be removed */
    abstract protected boolean filter(Vec v);

    /**
     * Applies the filter to {@code f}.  When at least one column is removed, the removed
     * names are stored in {@code _removedCols}, appended to {@code msg}, recorded as a
     * warning on "_train" and, if {@code expensive}, also logged.
     */
    public void doIt( Frame f, String msg, boolean expensive ) {
      final List<Integer> doomed = new ArrayList<>();
      int col = 0;
      while (col < f.vecs().length - _specialVecs) {
        if (filter(f.vec(col))) doomed.add(col);
        col++;
      }
      if (doomed.isEmpty()) return;

      final int ndoomed = doomed.size();
      _removedCols = new HashSet<>(ndoomed);
      final int[] rmcols = new int[ndoomed];
      int next = 0;
      for (int idx : doomed) {
        rmcols[next++] = idx;
        _removedCols.add(f._names[idx]);
      }
      f.remove(rmcols); //bulk-remove
      final String report = msg + _removedCols.toString();
      warn("_train", report);
      if (expensive) Log.info(report);
    }
  }

  // Stitch together holdout predictions into one large Frame.
  // The first prediction frame serves as the template for the column count, types, names
  // and domains of the result; all prediction vecs are laid out fold by fold and handed to
  // HoldoutPredictionCombiner.  Each prediction frame is fetched through its key only once
  // per fold instead of once per loop test and once per column.
  private static Frame combineHoldoutPredictions(Key<Frame>[] predKeys, Key key) {
    final int N = predKeys.length;
    final Frame template = predKeys[0].get();
    final int ncols = template.numCols();
    Vec[] vecs = new Vec[N*ncols];
    int idx=0;
    for (int i=0;i<N;++i) {
      final Frame foldPreds = predKeys[i].get(); // one lookup per fold
      final int foldCols = foldPreds.numCols();
      for (int j=0;j<foldCols;++j)
        vecs[idx++]=foldPreds.vec(j);
    }
    return new HoldoutPredictionCombiner(N,ncols).doAll(template.types(),new Frame(vecs)).outputFrame(key, template.names(),template.domains());
  }

  // Helper that folds several holdout prediction Vecs (each only has 1/N-th filled with
  // non-zeros) into 1 Vec per output column by summing, row by row, across the folds.
  // Input chunks are expected fold-major: chunk (f * _cols + c) is column c of fold f.
  private static class HoldoutPredictionCombiner extends MRTask<HoldoutPredictionCombiner> {
    int _folds, _cols;

    public HoldoutPredictionCombiner(int folds, int cols) {
      _folds = folds;
      _cols = cols;
    }

    @Override public void map(Chunk[] cs, NewChunk[] nc) {
      for (int col = 0; col < _cols; ++col) {
        final int nrows = cs[0].len();
        final double[] summed = new double[nrows];
        for (int row = 0; row < nrows; ++row) {
          // Add the folds in ascending order, starting from 0.0.
          double acc = 0;
          for (int fold = 0; fold < _folds; ++fold)
            acc += cs[fold * _cols + col].atd(row);
          summed[row] = acc;
        }
        nc[col].setDoubles(summed);
      }
    }
  }

  /**
   * Builds (and logs) the "Cross-Validation Metrics Summary" table: one row per metric,
   * with a "mean" column, an "sd" column and one "cv_i_valid" column per CV model.
   * The metric rows are discovered by reflection on the first CV model's validation
   * metrics (and its confusion matrix, if any): a method qualifies when invoking it
   * without arguments succeeds and the result can be cast to double.
   *
   * @param cvmodels keys of the cross-validation models
   * @return the filled table, or null when {@code cvmodels} is null or empty
   */
  private TwoDimTable makeCrossValidationSummaryTable(Key[] cvmodels) {
    if (cvmodels == null || cvmodels.length == 0) return null;
    int N = cvmodels.length;
    int extra_length=2; //mean/sigma/cv1/cv2/.../cvN
    // Every column is declared as a string column with a "%s" format.
    String[] colTypes = new String[N+extra_length];
    Arrays.fill(colTypes, "string");
    String[] colFormats = new String[N+extra_length];
    Arrays.fill(colFormats, "%s");
    String[] colNames = new String[N+extra_length];
    colNames[0] = "mean";
    colNames[1] = "sd";
    for (int i=0;i<N;++i)
    colNames[i+extra_length] = "cv_" + (i+1) + "_valid";
    // Method names that are never reported as metric rows.
    Set<String> excluded = new HashSet<>();
    excluded.add("total_rows");
    excluded.add("makeSchema");
    excluded.add("hr");
    excluded.add("frame");
    excluded.add("remove");
    excluded.add("cm");
    excluded.add("auc_obj");
    List<Method> methods = new ArrayList<>();
    {
      // Probe the first CV model only; its metrics decide which rows the table gets.
      Model m = DKV.getGet(cvmodels[0]);
      ModelMetrics mm = m._output._validation_metrics;

      if (mm != null) {

        for (Method meth : mm.getClass().getMethods()) {
          if (excluded.contains(meth.getName())) continue;
          try {
            // The result is unused; the invoke-and-cast only tests whether the method qualifies.
            double c = (double) meth.invoke(mm);
            methods.add(meth);
          } catch (Exception ignored) {}
        }

        ConfusionMatrix cm = mm.cm();
        if (cm != null) {
          for (Method meth : cm.getClass().getMethods()) {
            if (excluded.contains(meth.getName())) continue;
            try {
              // Same probe, this time against the confusion matrix.
              double c = (double) meth.invoke(cm);
              methods.add(meth);
            } catch (Exception ignored) {}
          }
        }
      }
    }

    // make unique, and sort alphabetically
    Set<String> rowNames=new TreeSet<>();
    for (Method m : methods) rowNames.add(m.getName());
    List<Method> meths = new ArrayList<>();
    OUTER:
    for (String n : rowNames)
      for (Method m : methods)
        if (m.getName().equals(n)) { //find the first method that has that name
          meths.add(m);
          continue OUTER;
        }

    int numMetrics = rowNames.size();

    TwoDimTable table = new TwoDimTable("Cross-Validation Metrics Summary",
            null,
            rowNames.toArray(new String[0]), colNames, colTypes, colFormats, "");

    MathUtils.BasicStats stats = new MathUtils.BasicStats(numMetrics);
    double[][] vals = new double[N][numMetrics];
    int i = 0;
    for (Key<Model> km : cvmodels) {
      Model m = DKV.getGet(km);
      // NOTE(review): a missing model is skipped without advancing i, so later models
      // fill the column the missing one would have used - confirm that is intended.
      if (m==null) continue;
      ModelMetrics mm = m._output._validation_metrics;
      int j=0;
      for (Method meth : meths) {
        if (excluded.contains(meth.getName())) continue;
        // Each method is tried against the metrics object first and then against the
        // confusion matrix; the row index j only advances when an invocation succeeds.
        try {
          double val = (double) meth.invoke(mm);
          vals[i][j] = val;
          table.set(j++, i+extra_length, (float)val);
        } catch (Throwable e) { }
        if (mm.cm()==null) continue;
        try {
          double val = (double) meth.invoke(mm.cm());
          vals[i][j] = val;
          table.set(j++, i+extra_length, (float)val);
        } catch (Throwable e) { }
      }
      i++;
    }

    // Aggregate the per-model rows (each with weight 1) and fill the mean / sd columns.
    for (i=0;i<N;++i)
      stats.add(vals[i],1);
    for (i=0;i<numMetrics;++i) {
      table.set(i, 0, (float)stats.mean()[i]);
      table.set(i, 1, (float)stats.sigma()[i]);
    }

    Log.info(table);
    return table;
  }

  /**
   * Trains the given builders, launching them in batches of {@code parallelization} and
   * blocking until every launched build has finished.
   * <p>
   * If the job reports a stop request, no further builds are launched; the builds that
   * were already launched are still waited for, and the method then returns normally
   * (unless one of them failed).  Builders that were never launched leave a null slot in
   * the task array, which is skipped when joining instead of being dereferenced.  It is
   * up to the caller to check {@code job.stop_requested()} afterwards.
   *
   * @param job             job whose stop request is honoured and whose progress is updated
   * @param modelBuilders   builders to train, in launch order
   * @param parallelization number of builds launched before the batch is joined
   * @throws RuntimeException the first RuntimeException raised while joining a build
   */
  public static void bulkBuildModels(Job job, ModelBuilder[] modelBuilders, int parallelization) {
    final int N = modelBuilders.length;
    H2O.H2OCountedCompleter submodel_tasks[] = new H2O.H2OCountedCompleter[N];
    int nRunning=0;
    RuntimeException rt = null;
    for( int i=0; i<N; ++i ) {
      if (job.stop_requested() ) break; // Stop launching but still must block for all async jobs
      modelBuilders[i]._start_time = System.currentTimeMillis();
      submodel_tasks[i] = H2O.submitTask(modelBuilders[i].trainModelImpl());
      if(++nRunning == parallelization) { //piece-wise advance in training the models
        // Join the batch oldest-first; nRunning is decremented before join() can throw,
        // so a failing build does not stall the loop.
        while (nRunning > 0) try {
          submodel_tasks[i + 1 - nRunning--].join();
          job.update(1); // One job finished
        } catch (RuntimeException t) {
          if (rt == null) rt = t;
        }
        if(rt != null) throw rt;
      }
    }
    for( int i=0; i<N; ++i ) { //all sub-models must be completed before the main model can be built
      final H2O.H2OCountedCompleter task = submodel_tasks[i];
      if (task == null) continue; // never launched because a stop was requested
      try {
        task.join();
      } catch(RuntimeException t){
        if(rt == null) rt = t;
      }
    }
    if(rt != null) throw rt;
  }

}