package hex;
import hex.genmodel.utils.DistributionFamily;
import water.*;
import water.exceptions.H2OIllegalArgumentException;
import water.exceptions.H2OModelBuilderIllegalArgumentException;
import water.fvec.*;
import water.rapids.ast.prims.advmath.AstKFold;
import water.util.*;
import java.lang.reflect.Method;
import java.util.*;
/**
* Model builder parent class. Contains the common interfaces and fields across all model builders.
*/
abstract public class ModelBuilder<M extends Model<M,P,O>, P extends Model.Parameters, O extends Model.Output> extends Iced {
// Hook for subclasses that support eigen-style categorical encoding; null here means "not supported".
public ToEigenVec getToEigenVec() { return null; }
// Reordering is only needed when the encoding scheme requires a response and this builder is supervised.
public boolean shouldReorder(Vec v) { return _parms._categorical_encoding.needsResponse() && isSupervised(); }
transient private IcedHashMap<Key,String> _toDelete = new IcedHashMap<>();
void cleanUp() { FrameUtils.cleanUp(_toDelete); }
public Job<M> _job; // Job controlling this build
/** Block till completion, and return the built model from the DKV. Note the
 * funny assert: the Job does NOT have to be controlling this model build,
 * but might, e.g. be controlling a Grid search for which this is just one
 * of many results. Calling 'get' means that we are blocking on the Job
 * which is controlling ONLY this ModelBuilder, and when the Job completes
 * we can return built Model. */
public final M get() { assert _job._result == _result; return _job.get(); }
// Delegates to the controlling Job's stopped state (done, cancelled, or failed).
public final boolean isStopped() { return _job.isStopped(); }
// Key of the model being built; note that this is DIFFERENT from
// _job._result if the Job is being shared by many sub-models
// e.g. cross-validation.
protected Key<M> _result; // Built Model key
public final Key<M> dest() { return _result; }
private long _start_time; //start time in msecs - only used for time-based stopping
/** True when a per-model runtime budget is configured and this model has exceeded it. */
protected boolean timeout() {
  assert (_start_time > 0) : "Must set _start_time for each individual model.";
  if (!(_parms._max_runtime_secs > 0)) return false; // no (valid) time limit configured
  final long budgetMillis = (long) (_parms._max_runtime_secs * 1e3);
  return System.currentTimeMillis() - _start_time > budgetMillis;
}
// A build should stop either on an explicit user/job stop request or on hitting the time budget.
protected boolean stop_requested() {
return _job.stop_requested() || timeout();
}
/** Default model-builder key */
/** Default model-builder key: a fresh cluster-unique key derived from the algo name. */
public static <S extends Model> Key<S> defaultKey(String algoName) {
  final String uniqueModelId = H2O.calcNextUniqueModelId(algoName);
  return Key.make(uniqueModelId);
}
/** Default easy constructor: Unique new job and unique new result key */
protected ModelBuilder(P parms) {
this(parms, ModelBuilder.<M>defaultKey(parms.algoName()));
}
/** Unique new job and named result key */
protected ModelBuilder(P parms, Key<M> key) {
_job = new Job<>(_result = key, parms.javaName(), parms.algoName());
_parms = parms;
}
/** Shared pre-existing Job and unique new result key */
protected ModelBuilder(P parms, Job<M> job) {
_job = job;
_result = defaultKey(parms.algoName());
_parms = parms;
}
/** List of known ModelBuilders with all default args; endlessly cloned by
* the GUI for new private instances, then the GUI overrides some of the
* defaults with user args. */
private static String[] ALGOBASES = new String[0];
public static String[] algos() { return ALGOBASES; }
private static String[] SCHEMAS = new String[0];
private static ModelBuilder[] BUILDERS = new ModelBuilder[0];
/** One-time start-up only ModelBuilder, endlessly cloned by the GUI for the
* default settings. */
protected ModelBuilder(P parms, boolean startup_once) { this(parms,startup_once,"hex.schemas."); }
protected ModelBuilder(P parms, boolean startup_once, String externalSchemaDirectory ) {
assert startup_once;
_job = null;
_result = null;
_parms = parms;
init(false); // Default cheap init
String base = getClass().getSimpleName().toLowerCase();
if( ArrayUtils.find(ALGOBASES,base) != -1 )
throw H2O.fail("Only called once at startup per ModelBuilder, and "+base+" has already been called");
// FIXME: this is not thread safe!
ALGOBASES = Arrays.copyOf(ALGOBASES,ALGOBASES.length+1);
BUILDERS = Arrays.copyOf(BUILDERS ,BUILDERS .length+1);
SCHEMAS = Arrays.copyOf(SCHEMAS ,SCHEMAS .length+1);
ALGOBASES[ALGOBASES.length-1] = base;
BUILDERS [BUILDERS .length-1] = this;
SCHEMAS [SCHEMAS .length-1] = externalSchemaDirectory;
}
/** gbm -> GBM, deeplearning -> DeepLearning */
public static String algoName(String urlName) { return BUILDERS[ArrayUtils.find(ALGOBASES,urlName)]._parms.algoName(); }
/** gbm -> hex.tree.gbm.GBM, deeplearning -> hex.deeplearning.DeepLearning */
public static String javaName(String urlName) { return BUILDERS[ArrayUtils.find(ALGOBASES,urlName)]._parms.javaName(); }
/** gbm -> GBMParameters */
public static String paramName(String urlName) { return algoName(urlName)+"Parameters"; }
/** gbm -> "hex.schemas." ; custAlgo -> "org.myOrg.schemas." */
public static String schemaDirectory(String urlName) { return SCHEMAS[ArrayUtils.find(ALGOBASES,urlName)]; }
/** Factory method to create a ModelBuilder instance for given the algo name.
* Shallow clone of both the default ModelBuilder instance and a Parameter. */
public static <B extends ModelBuilder> B make(String algo, Job job, Key<Model> result) {
  final int idx = ArrayUtils.find(ALGOBASES, algo.toLowerCase());
  assert idx != -1 : "Unregistered algorithm "+algo;
  final ModelBuilder prototype = BUILDERS[idx];
  // Shallow-clone the startup-time prototype, then attach this build's job/result
  // and a private clone of the default parameters.
  B builder = (B) prototype.clone();
  builder._job = job;
  builder._result = result;
  builder._parms = prototype._parms.clone();
  return builder;
}
/** All the parameters required to build the model. */
public P _parms; // Not final, so CV can set-after-clone
/** Training frame: derived from the parameter's training frame, excluding
* all ignored columns, all constant and bad columns, perhaps flipping the
* response column to an Categorical, etc. */
public final Frame train() { return _train; }
protected transient Frame _train;
public void setTrain(Frame train) {
_train = train;
}
/** Validation frame: derived from the parameter's validation frame, excluding
* all ignored columns, all constant and bad columns, perhaps flipping the
* response column to a Categorical, etc. Is null if no validation key is set. */
protected final Frame valid() { return _valid; }
protected transient Frame _valid;
// TODO: tighten up the type
// Map the algo name (e.g., "deeplearning") to the builder class (e.g., DeepLearning.class) :
private static final Map<String, Class<? extends ModelBuilder>> _builders = new HashMap<>();
// Map the Model class (e.g., DeepLearningModel.class) to the algo name (e.g., "deeplearning"):
private static final Map<Class<? extends Model>, String> _model_class_to_algo = new HashMap<>();
// Map the simple algo name (e.g., deeplearning) to the full algo name (e.g., "Deep Learning"):
private static final Map<String, String> _algo_to_algo_full_name = new HashMap<>();
// Map the algo name (e.g., "deeplearning") to the Model class (e.g., DeepLearningModel.class):
private static final Map<String, Class<? extends Model>> _algo_to_model_class = new HashMap<>();
/** Train response vector. */
public Vec response(){return _response;}
/** Validation response vector. */
public Vec vresponse(){return _vresponse == null ? _response : _vresponse;}
// F/J task wrapping a single model build. Handles Scope entry/exit, frame read-locks,
// final-state bookkeeping and tryComplete() so algo writers only implement computeImpl().
abstract protected class Driver extends H2O.H2OCountedCompleter<Driver> {
protected Driver(){ super(); }
protected Driver(H2O.H2OCountedCompleter completer){ super(completer); }
// Pull the boilerplate out of the computeImpl(), so the algo writer doesn't need to worry about the following:
// 1) Scope (unless they want to keep data, then they must call Scope.untrack(Key<Vec>[]))
// 2) Train/Valid frame locking and unlocking
// 3) calling tryComplete()
public void compute2() {
try {
Scope.enter();
_parms.read_lock_frames(_job); // Fetch & read-lock input frames
computeImpl();
} finally {
// Runs even when computeImpl() throws: stamp the model output, release locks,
// clean temp keys (CV models clean up on their own schedule), and pop the Scope.
setFinalState();
_parms.read_unlock_frames(_job);
if (!_parms._is_cv_model) cleanUp(); //cv calls cleanUp on its own terms
Scope.exit();
}
// NOTE: tryComplete() is intentionally outside the try/finally — if computeImpl()
// threw, the exception propagates to the F/J framework instead of completing normally.
tryComplete();
}
public abstract void computeImpl();
}
// Stamp the built model's output with the controlling Job and stop its training clock.
// Safe to call when the model was never created (e.g. failed/aborted builds).
private void setFinalState() {
  final Key<M> modelKey = dest();
  if (modelKey == null) return;                         // no result key at all
  final M model = modelKey.get();
  if (model == null || model._output == null) return;   // model never materialized
  model._output._job = _job;
  model._output.stopClock();
}
/** Method to launch training of a Model, based on its parameters. */
/** Method to launch training of a Model, based on its parameters. */
final public Job<M> trainModel() {
// Validation errors accumulated by init() abort the build up front.
if (error_count() > 0)
throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(this);
_start_time = System.currentTimeMillis();
// Plain (non-CV) build: run the Driver directly under the Job.
if( !nFoldCV() )
return _job.start(trainModelImpl(), _parms.progressUnits(), _parms._max_runtime_secs);
// cross-validation needs to be forked off to allow continuous (non-blocking) progress bar
return _job.start(new H2O.H2OCountedCompleter() {
@Override
public void compute2() {
computeCrossValidation();
tryComplete();
}
},
(nFoldWork()+1/*main model*/) * _parms.progressUnits(), _parms._max_runtime_secs);
}
/**
* Train a model as part of a larger Job;
*
* @param fr: Input frame override, ignored if null.
* In some cases, algos do not work directly with the original frame in the K/V store.
* Instead they run on a private anonymous copy (eg: reblanced dataset).
* Use this argument if you want nested job to work on the actual working copy rather than the original Frame in the K/V.
* Example: Outer job rebalances dataset and then calls nested job. To avoid needless second reblance, pass in the (already rebalanced) working copy.
* */
final public M trainModelNested(Frame fr) {
if(fr != null) // Use the working copy (e.g. rebalanced) instead of the original K/V store version
setTrain(fr);
if (error_count() > 0)
throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(this);
_start_time = System.currentTimeMillis();
// Runs synchronously (no _job.start): compute2() / CV are invoked on the caller's thread.
if( !nFoldCV() ) trainModelImpl().compute2();
else computeCrossValidation();
return _result.get();
}
/** Model-specific implementation of model training
* @return A F/J Job, which, when executed, does the build. F/J is NOT started. */
abstract protected Driver trainModelImpl();
/**
* How many should be trained in parallel during N-fold cross-validation?
* Train all CV models in parallel when parallelism is enabled, otherwise train one at a time
* Each model can override this logic, based on parameters, dataset size, etc.
* @return How many models to train in parallel during cross-validation
*/
protected int nModelsInParallel() {
  // Serial when the user disabled parallel CV, or when a runtime budget must be
  // honored equally across all CV models.
  final boolean forceSerial =
      !_parms._parallelize_cross_validation || _parms._max_runtime_secs != 0;
  if (forceSerial) return 1;
  // Small data: train all folds at once; otherwise fall back to one at a time.
  return (_train.byteSize() < 1e6) ? _parms._nfolds : 1;
}
// Work for each requested fold
// Number of CV models to build: nfolds, or the number of distinct levels in the fold column.
protected int nFoldWork() {
  if( _parms._fold_column == null ) return _parms._nfolds;
  Vec foldVec = _parms._train.get().vec(_parms._fold_column);
  Vec asCategorical = VecUtils.toCategoricalVec(foldVec); // temp copy just to read the domain
  int nFolds = asCategorical.domain().length;
  asCategorical.remove(); // drop the temp vec
  return nFolds;
}
/**
* Default naive (serial) implementation of N-fold cross-validation
* (builds N+1 models, all have train+validation metrics, the main model has N-fold cross-validated validation metrics)
*/
public void computeCrossValidation() {
assert _job.isRunning(); // main Job is still running
_job.setReadyForView(false); //wait until the main job starts to let the user inspect the main job
final Integer N = nFoldWork();
init(false);
try {
Scope.enter();
// Step 1: Assign each row to a fold
final Vec foldAssignment = cv_AssignFold(N);
// Step 2: Make 2*N binary weight vectors
final Vec[] weights = cv_makeWeights(N,foldAssignment);
// Step 3: Build N train & validation frames; build N ModelBuilders; error check them all
ModelBuilder<M, P, O> cvModelBuilders[] = cv_makeFramesAndBuilders(N,weights);
// Step 4: Run all the CV models
cv_buildModels(N, cvModelBuilders);
// Step 5: Score the CV models
ModelMetrics.MetricBuilder mbs[] = cv_scoreCVModels(N, weights, cvModelBuilders);
// Step 6: Build the main model
buildMainModel();
// Step 7: Combine cross-validation scores; compute main model x-val
// scores; compute gains/lifts
cv_mainModelScores(N, mbs, cvModelBuilders);
// Step 8: Clean up potentially created temp frames
for (ModelBuilder mb : cvModelBuilders)
mb.cleanUp();
_job.setReadyForView(true);
DKV.put(_job);
} finally {
// Always clean temp keys and pop the Scope, even on failure part-way through.
cleanUp();
Scope.exit();
}
}
// Step 1: Assign each row to a fold
// TODO: Implement better splitting algo (with Strata if response is
// categorical), e.g. http://www.lexjansen.com/scsug/2009/Liang_Xie2.pdf
// Returns a per-row fold-index Vec: either the user-supplied fold column (validated),
// or a freshly generated assignment per _fold_assignment scheme.
public Vec cv_AssignFold(int N) {
assert(N>=2);
Vec fold = train().vec(_parms._fold_column);
if( fold != null ) {
if( !fold.isInt() ||
(!(fold.min() == 0 && fold.max() == N-1) &&
!(fold.min() == 1 && fold.max() == N ) )) // Allow 0 to N-1, or 1 to N
throw new H2OIllegalArgumentException("Fold column must be either categorical or contiguous integers from 0..N-1 or 1..N");
return fold;
}
final long seed = _parms.getOrMakeRealSeed();
Log.info("Creating " + N + " cross-validation splits with random number seed: " + seed);
switch( _parms._fold_assignment ) {
case AUTO:
case Random: return AstKFold. kfoldColumn(train().anyVec().makeZero(),N,seed);
case Modulo: return AstKFold. moduloKfoldColumn(train().anyVec().makeZero(),N );
case Stratified: return AstKFold.stratifiedKFoldColumn(response(),N,seed);
default: throw H2O.unimpl();
}
}
// Step 2: Make 2*N binary weight vectors
// Builds 2*N weight vectors: for fold f, slot 2f is the train weight (0 on held-out rows)
// and slot 2f+1 the holdout weight (0 on training rows). Respects an existing weights column.
public Vec[] cv_makeWeights( final int N, Vec foldAssignment ) {
String origWeightsName = _parms._weights_column;
Vec origWeight = origWeightsName != null ? train().vec(origWeightsName) : train().anyVec().makeCon(1.0);
Frame folds_and_weights = new Frame(foldAssignment, origWeight);
Vec[] weights = new MRTask() {
@Override public void map(Chunk chks[], NewChunk nchks[]) {
Chunk fold = chks[0], orig = chks[1];
for( int row=0; row< orig._len; row++ ) {
int foldIdx = (int)fold.at8(row) % N; // % N maps 1..N fold labels into range as well
double w = orig.atd(row);
for( int f = 0; f < N; f++ ) {
boolean holdout = foldIdx == f;
nchks[2 * f].addNum(holdout ? 0 : w);
nchks[2*f+1].addNum(holdout ? w : 0);
}
}
}
}.doAll(2*N,Vec.T_NUM,folds_and_weights).outputFrame().vecs();
if (_parms._keep_cross_validation_fold_assignment)
DKV.put(new Frame(Key.<Frame>make("cv_fold_assignment_" + _result.toString()), new String[]{"fold_assignment"}, new Vec[]{foldAssignment.makeCopy()}));
// Drop temporaries we created (but never a user-supplied fold or weights column).
if( _parms._fold_column == null && !_parms._keep_cross_validation_fold_assignment) foldAssignment.remove();
if( origWeightsName == null ) origWeight.remove(); // Cleanup temp
// A constant weight vector means some fold got no rows (or no holdout rows).
for( Vec weight : weights )
if( weight.isConst() )
throw new H2OIllegalArgumentException("Not enough data to create " + N + " random cross-validation splits. Either reduce nfolds, specify a larger dataset (or specify another random number seed, if applicable).");
return weights;
}
// Step 3: Build N train & validation frames; build N ModelBuilders; error check them all
// Builds N (train, valid) frame pairs sharing the parent's vecs plus exclusive weight
// columns, clones this builder per fold, and cheap-validates each clone before launch.
public ModelBuilder<M, P, O>[] cv_makeFramesAndBuilders( int N, Vec[] weights ) {
final long old_cs = _parms.checksum();
final String origDest = _result.toString();
final String weightName = "__internal_cv_weights__";
if (train().find(weightName) != -1) throw new H2OIllegalArgumentException("Frame cannot contain a Vec called '" + weightName + "'.");
Frame cv_fr = new Frame(train().names(),train().vecs());
if( _parms._weights_column!=null ) cv_fr.remove( _parms._weights_column ); // The CV frames will have their own private weight column
ModelBuilder<M, P, O>[] cvModelBuilders = new ModelBuilder[N];
List<Frame> cvFramesForFailedModels = new ArrayList<>();
for( int i=0; i<N; i++ ) {
String identifier = origDest + "_cv_" + (i+1);
// Training/Validation share the same data, but will have exclusive weights
Frame cvTrain = new Frame(Key.<Frame>make(identifier+"_train"),cv_fr.names(),cv_fr.vecs());
cvTrain.add(weightName, weights[2*i]);
DKV.put(cvTrain);
Frame cvValid = new Frame(Key.<Frame>make(identifier+"_valid"),cv_fr.names(),cv_fr.vecs());
cvValid.add(weightName, weights[2*i+1]);
DKV.put(cvValid);
// Shallow clone - not everything is a private copy!!!
ModelBuilder<M, P, O> cv_mb = (ModelBuilder)this.clone();
cv_mb.setTrain(cvTrain);
cv_mb._result = Key.make(identifier); // Each submodel gets its own key
cv_mb._parms = (P) _parms.clone();
// Fix up some parameters of the clone
cv_mb._parms._is_cv_model = true;
cv_mb._parms._weights_column = weightName;// All submodels have a weight column, which the main model does not
cv_mb._parms.setTrain(cvTrain._key); // All submodels have a weight column, which the main model does not
cv_mb._parms._valid = cvValid._key;
cv_mb._parms._fold_assignment = Model.Parameters.FoldAssignmentScheme.AUTO;
cv_mb._parms._nfolds = 0; // Each submodel is not itself folded
cv_mb.clearValidationErrors(); // each submodel gets its own validation messages and error_count()
// Error-check all the cross-validation Builders before launching any
cv_mb.init(false);
if( cv_mb.error_count() > 0 ) { // Gather all submodel error messages
Log.info("Marking frame for failed cv model for removal: " + cvTrain._key);
cvFramesForFailedModels.add(cvTrain);
Log.info("Marking frame for failed cv model for removal: " + cvValid._key);
cvFramesForFailedModels.add(cvValid);
// Re-log submodel messages into this (parent) builder so they surface together.
for (ValidationMessage vm : cv_mb._messages)
message(vm._log_level, vm._field_name, vm._message);
}
cvModelBuilders[i] = cv_mb;
}
if( error_count() > 0 ) { // Found an error in one or more submodels
Futures fs = new Futures();
for (Frame cvf : cvFramesForFailedModels) {
cvf.vec(weightName).remove(fs); // delete the Vec's chunks
DKV.remove(cvf._key, fs); // delete the Frame from the DKV, leaving its vecs
Log.info("Removing frame for failed cv model: " + cvf._key);
}
fs.blockForPending();
throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(this);
}
// check that this Job's original _params haven't changed
assert old_cs == _parms.checksum();
return cvModelBuilders;
}
// Step 4: Run all the CV models and launch the main model
/** Step 4: Run all the CV models with bounded parallelism (see nModelsInParallel()),
 *  surfacing the first submodel failure after all launched tasks have been joined.
 *  @param N number of folds / submodels
 *  @param cvModelBuilders one pre-validated builder per fold
 *  @throws RuntimeException the first exception thrown by any submodel build */
public void cv_buildModels(int N, ModelBuilder<M, P, O>[] cvModelBuilders ) {
  H2O.H2OCountedCompleter submodel_tasks[] = new H2O.H2OCountedCompleter[N];
  int nRunning=0;
  RuntimeException rt = null;
  for( int i=0; i<N; ++i ) {
    if( _job.stop_requested() ) break; // Stop launching but still must block for all async jobs
    Log.info("Building cross-validation model " + (i + 1) + " / " + N + ".");
    cvModelBuilders[i]._start_time = System.currentTimeMillis();
    submodel_tasks[i] = H2O.submitTask(cvModelBuilders[i].trainModelImpl());
    if(++nRunning == nModelsInParallel()) { //piece-wise advance in training the CV models
      // Join the current batch (most recently launched first); remember only the first failure.
      while (nRunning > 0) try {
        submodel_tasks[i + 1 - nRunning--].join();
      } catch (RuntimeException t) {
        if (rt == null) rt = t;
      }
      if(rt != null) throw rt;
    }
  }
  for( int i=0; i<N; ++i ) { //all sub-models must be completed before the main model can be built
    // A stop request can break the launch loop early, leaving trailing null slots;
    // joining them unconditionally would NPE.
    if (submodel_tasks[i] == null) continue;
    try {
      submodel_tasks[i].join();
    } catch(RuntimeException t){
      if(rt == null) rt = t;
    }
  }
  if(rt != null) throw rt;
  cv_computeAndSetOptimalParameters(cvModelBuilders);
}
// Launch the main (non-CV) model build on the F/J pool and block until it finishes.
// Skipped entirely when a stop was requested during the CV phase.
private void buildMainModel() {
  if (_job.stop_requested()) return;
  assert _job.isRunning();
  Log.info("Building main model.");
  _start_time = System.currentTimeMillis();
  final H2O.H2OCountedCompleter mainModelTask = H2O.submitTask(trainModelImpl());
  mainModelTask.join(); // wait for completion
}
// Step 5: Score the CV models
// Step 5: Score each CV model on its holdout split; optionally keep holdout predictions.
// Frees CV frames and weight vecs as it goes. Returns null if the job was stopped.
public ModelMetrics.MetricBuilder[] cv_scoreCVModels(int N, Vec[] weights, ModelBuilder<M, P, O>[] cvModelBuilders) {
if( _job.stop_requested() ) return null;
ModelMetrics.MetricBuilder[] mbs = new ModelMetrics.MetricBuilder[N];
Futures fs = new Futures();
for (int i=0; i<N; ++i) {
if( _job.stop_requested() ) return null; //don't waste time scoring if the CV run is stopped
Frame cvValid = cvModelBuilders[i].valid();
Frame adaptFr = new Frame(cvValid);
M cvModel = cvModelBuilders[i].dest().get();
cvModel.adaptTestForTrain(adaptFr, true, !isSupervised());
mbs[i] = cvModel.scoreMetrics(adaptFr);
// Materialize per-fold holdout predictions only when something downstream needs them.
if (nclasses() == 2 /* need holdout predictions for gains/lift table */ ||
_parms._keep_cross_validation_predictions ||
(_parms._distribution== DistributionFamily.huber /*need to compute quantiles on abs error of holdout predictions*/)) {
String predName = "prediction_" + cvModelBuilders[i]._result.toString();
cvModel.predictScoreImpl(cvValid, adaptFr, predName, _job, true);
DKV.put(cvModel);
}
// free resources as early as possible
if (adaptFr != null) {
Model.cleanup_adapt(adaptFr, cvValid);
DKV.remove(adaptFr._key,fs);
}
DKV.remove(cvModelBuilders[i]._parms._train,fs);
DKV.remove(cvModelBuilders[i]._parms._valid,fs);
weights[2*i ].remove(fs);
weights[2*i+1].remove(fs);
}
fs.blockForPending();
return mbs;
}
// Step 6: Combine cross-validation scores; compute main model x-val scores; compute gains/lifts
// Step 6/7: Reduce per-fold metrics into the main model's cross-validation metrics,
// wire up CV model/prediction/fold-assignment keys, and persist the finished main model.
public void cv_mainModelScores(int N, ModelMetrics.MetricBuilder mbs[], ModelBuilder<M, P, O> cvModelBuilders[]) {
if( _job.stop_requested() ) return;
assert _job.isRunning();
M mainModel = _result.get();
// Compute and put the cross-validation metrics into the main model
Log.info("Computing " + N + "-fold cross-validation metrics.");
mainModel._output._cross_validation_models = new Key[N];
Key<Frame>[] predKeys = new Key[N];
mainModel._output._cross_validation_predictions = _parms._keep_cross_validation_predictions ? predKeys : null;
for (int i = 0; i < N; ++i) {
if (i > 0) mbs[0].reduce(mbs[i]); // fold all per-fold metric builders into mbs[0]
Key<M> cvModelKey = cvModelBuilders[i]._result;
mainModel._output._cross_validation_models[i] = cvModelKey;
predKeys[i] = Key.make("prediction_" + cvModelKey.toString()); //must be the same as in cv_scoreCVModels above
}
Frame holdoutPreds = null;
if (_parms._keep_cross_validation_predictions || (nclasses()==2 /*GainsLift needs this*/ || _parms._distribution == DistributionFamily.huber)) {
Key<Frame> cvhp = Key.make("cv_holdout_prediction_" + mainModel._key.toString());
if (_parms._keep_cross_validation_predictions) //only show the user if they asked for it
mainModel._output._cross_validation_holdout_predictions_frame_id = cvhp;
holdoutPreds = combineHoldoutPredictions(predKeys, cvhp);
}
if (_parms._keep_cross_validation_fold_assignment) {
mainModel._output._cross_validation_fold_assignment_frame_id = Key.make("cv_fold_assignment_" + _result.toString());
Frame xvalidation_fold_assignment_frame = mainModel._output._cross_validation_fold_assignment_frame_id.get();
if (xvalidation_fold_assignment_frame != null)
Scope.untrack(xvalidation_fold_assignment_frame.keysList()); // survive Scope.exit()
}
// Keep or toss predictions
for (Key<Frame> k : predKeys) {
Frame fr = DKV.getGet(k);
if( fr != null ) {
if (_parms._keep_cross_validation_predictions) Scope.untrack(fr.keysList());
else fr.remove();
}
}
mainModel._output._cross_validation_metrics = mbs[0].makeModelMetrics(mainModel, _parms.train(), null, holdoutPreds);
if (holdoutPreds != null) {
if (_parms._keep_cross_validation_predictions) Scope.untrack(holdoutPreds.keysList());
else holdoutPreds.remove();
}
mainModel._output._cross_validation_metrics._description = N + "-fold cross-validation on training data (Metrics computed for combined holdout predictions)";
Log.info(mainModel._output._cross_validation_metrics.toString());
mainModel._output._cross_validation_metrics_summary = makeCrossValidationSummaryTable(mainModel._output._cross_validation_models);
// Now, the main model is complete (has cv metrics)
DKV.put(mainModel);
}
/** Override for model-specific checks / modifications to _parms for the main model during N-fold cross-validation.
* Also allow the cv models to be modified after all of them have been built.
* For example, the model might need to be told to not do early stopping. CV models might have their lambda value modified, etc.
*/
public void cv_computeAndSetOptimalParameters(ModelBuilder<M, P, O>[] cvModelBuilders) { }
/** @return Whether n-fold cross-validation is done */
/** @return Whether n-fold cross-validation is enabled: an explicit fold column or a non-zero nfolds. */
public boolean nFoldCV() {
  if (_parms._fold_column != null) return true;
  return _parms._nfolds != 0;
}
/** List containing the categories of models that this builder can
* build. Each ModelBuilder must have one of these. */
abstract public ModelCategory[] can_build();
/** Visibility for this algo: is it always visible, is it beta (always
* visible but with a note in the UI) or is it experimental (hidden by
* default, visible in the UI if the user gives an "experimental" flag at
* startup); test-only builders are "experimental" */
public enum BuilderVisibility { Experimental, Beta, Stable }
public BuilderVisibility builderVisibility() { return BuilderVisibility.Stable; }
/** Clear whatever was done by init() so it can be run again. */
public void clearInitState() {
clearValidationErrors();
}
protected boolean logMe() { return true; }
abstract public boolean isSupervised();
protected transient Vec _response; // Handy response column
protected transient Vec _vresponse; // Handy response column
protected transient Vec _offset; // Handy offset column
protected transient Vec _weights; // observation weight column
protected transient Vec _fold; // fold id column
protected transient String[] _origNames;
protected transient String[][] _origDomains;
public boolean hasOffsetCol(){ return _parms._offset_column != null;} // don't look at transient Vec
public boolean hasWeightCol(){return _parms._weights_column != null;} // don't look at transient Vec
public boolean hasFoldCol(){return _parms._fold_column != null;} // don't look at transient Vec
public int numSpecialCols() { return (hasOffsetCol() ? 1 : 0) + (hasWeightCol() ? 1 : 0) + (hasFoldCol() ? 1 : 0); }
// Names of the configured special (non-feature) columns, in offset/weights/fold order.
public String[] specialColNames() {
  final String[] names = new String[numSpecialCols()];
  int idx = 0;
  if (hasOffsetCol()) names[idx++] = _parms._offset_column;
  if (hasWeightCol()) names[idx++] = _parms._weights_column;
  if (hasFoldCol())   names[idx++] = _parms._fold_column;
  return names;
}
// no hasResponse, call isSupervised instead (response is mandatory if isSupervised is true)
public boolean havePojo() { return false; }
public boolean haveMojo() { return false; }
protected int _nclass; // Number of classes; 1 for regression; 2+ for classification
public int nclasses(){return _nclass;}
public final boolean isClassifier() { return nclasses() > 1; }
/**
* Find and set response/weights/offset/fold and put them all in the end,
* @return number of non-feature vecs
*/
/**
 * Find, validate and set the response/weights/offset/fold columns, re-appending each
 * to the end of the training frame so features come first.
 * Fix: the weights NA/min/max validation messages previously used the nonexistent
 * field name "_weights_columns" (plural); they now consistently use "_weights_column".
 * @return number of non-feature vecs found
 */
public int separateFeatureVecs() {
  int res = 0;
  if(_parms._weights_column != null) {
    Vec w = _train.remove(_parms._weights_column);
    if(w == null)
      error("_weights_column","Weights column '" + _parms._weights_column + "' not found in the training frame");
    else {
      if(!w.isNumeric())
        error("_weights_column","Invalid weights column '" + _parms._weights_column + "', weights must be numeric");
      _weights = w;
      if(w.naCnt() > 0)
        error("_weights_column","Weights cannot have missing values.");
      if(w.min() < 0)
        error("_weights_column","Weights must be >= 0");
      if(w.max() == 0)
        error("_weights_column","Max. weight must be > 0");
      _train.add(_parms._weights_column, w); // re-append at the end, after features
      ++res;
    }
  } else {
    _weights = null;
    assert(!hasWeightCol());
  }
  if(_parms._offset_column != null) {
    Vec o = _train.remove(_parms._offset_column);
    if(o == null)
      error("_offset_column","Offset column '" + _parms._offset_column + "' not found in the training frame");
    else {
      if(!o.isNumeric())
        error("_offset_column","Invalid offset column '" + _parms._offset_column + "', offset must be numeric");
      _offset = o;
      if(o.naCnt() > 0)
        error("_offset_column","Offset cannot have missing values.");
      if(_weights == _offset)
        error("_offset_column", "Offset must be different from weights");
      _train.add(_parms._offset_column, o);
      ++res;
    }
  } else {
    _offset = null;
    assert(!hasOffsetCol());
  }
  if(_parms._fold_column != null) {
    Vec f = _train.remove(_parms._fold_column);
    if(f == null)
      error("_fold_column","Fold column '" + _parms._fold_column + "' not found in the training frame");
    else {
      if(!f.isInt() && !f.isCategorical())
        error("_fold_column","Invalid fold column '" + _parms._fold_column + "', fold must be integer or categorical");
      if(f.min() < 0)
        error("_fold_column","Invalid fold column '" + _parms._fold_column + "', fold must be non-negative");
      if(f.isConst())
        error("_fold_column","Invalid fold column '" + _parms._fold_column + "', fold cannot be constant");
      _fold = f;
      if(f.naCnt() > 0)
        error("_fold_column","Fold cannot have missing values.");
      if(_fold == _weights)
        error("_fold_column", "Fold must be different from weights");
      if(_fold == _offset)
        error("_fold_column", "Fold must be different from offset");
      _train.add(_parms._fold_column, f);
      ++res;
    }
  } else {
    _fold = null;
    assert(!hasFoldCol());
  }
  if(isSupervised() && _parms._response_column != null) {
    _response = _train.remove(_parms._response_column);
    if (_response == null) {
      // isSupervised() is already known true in this branch
      error("_response_column", "Response column '" + _parms._response_column + "' not found in the training frame");
    } else {
      if(_response == _offset)
        error("_response_column", "Response column must be different from offset_column");
      if(_response == _weights)
        error("_response_column", "Response column must be different from weights_column");
      if(_response == _fold)
        error("_response_column", "Response column must be different from fold_column");
      _train.add(_parms._response_column, _response);
      ++res;
    }
  } else {
    _response = null;
  }
  return res;
}
protected boolean ignoreStringColumns(){return true;}
protected boolean ignoreConstColumns(){return _parms._ignore_const_cols;}
/**
* Ignore constant columns, columns with all NAs and strings.
* @param npredictors
* @param expensive
*/
protected void ignoreBadColumns(int npredictors, boolean expensive){
// Drop all-constant and all-bad columns.
// NOTE(review): the whole filter (including all-NA "bad" and string columns) is gated
// on _ignore_const_cols — when that flag is false, bad/string columns are kept too.
// Confirm this coupling is intended.
if(_parms._ignore_const_cols)
new FilterCols(npredictors) {
@Override protected boolean filter(Vec v) {
boolean isBad = v.isBad();
boolean skipConst = ignoreConstColumns() && v.isConst();
boolean skipString = ignoreStringColumns() && v.isString();
boolean skip = isBad || skipConst || skipString;
return skip;
}
}.doIt(_train,"Dropping bad and constant columns: ",expensive);
}
/**
* Ignore invalid columns (columns that have a very high max value, which can cause issues in DHistogram)
* @param npredictors
* @param expensive
*/
protected void ignoreInvalidColumns(int npredictors, boolean expensive){}
/**
* Override this method to call error() if the model is expected to not fit in memory, and say why
*/
protected void checkMemoryFootPrint() {}
transient double [] _distribution;
transient protected double [] _priorClassDist;
protected boolean computePriorClassDistribution(){
return isClassifier();
}
/** A list of field validation issues. */
public ValidationMessage[] _messages = new ValidationMessage[0];
private int _error_count = -1; // -1 ==> init not run yet, for those Jobs that have an init, like ModelBuilder. Note, this counts ONLY errors, not WARNs and etc.
public int error_count() { assert _error_count >= 0 : "init() not run yet"; return _error_count; }
/** Record a TRACE-level message (used to hide a field in the UI). */
public void hide (String field_name, String message) { message(Log.TRACE, field_name, message); }
/** Record an INFO-level validation message. */
public void info (String field_name, String message) { message(Log.INFO , field_name, message); }
/** Record a WARN-level validation message. */
public void warn (String field_name, String message) { message(Log.WARN , field_name, message); }
/** Record an ERROR-level validation message. message() already increments
 *  _error_count for ERRR-level messages, so there must be no extra increment
 *  here — the previous additional ++ double-counted every error. */
public void error(String field_name, String message) { message(Log.ERRR , field_name, message); }
public void clearValidationErrors() {
_messages = new ValidationMessage[0];
_error_count = 0;
}
// Append a validation message; ERRR-level messages also bump _error_count.
public void message(byte log_level, String field_name, String message) {
_messages = Arrays.copyOf(_messages, _messages.length + 1);
_messages[_messages.length - 1] = new ValidationMessage(log_level, field_name, message);
if (log_level == Log.ERRR) _error_count++;
}
/** Get a string representation of only the ERROR ValidationMessages (e.g., to use in an exception throw). */
/** Concatenate all ERROR-level validation messages, one per line (e.g. for exception text). */
public String validationErrors() {
  final StringBuilder errors = new StringBuilder();
  for (ValidationMessage msg : _messages) {
    if (msg._log_level != Log.ERRR) continue; // errors only; skip INFO/WARN/TRACE
    errors.append(msg.toString()).append("\n");
  }
  return errors.toString();
}
/** Can be an ERROR, meaning the parameters can't be used as-is,
* a TRACE, which means the specified field should be hidden given
* the values of other fields, or a WARN or INFO for informative
* messages to the user. */
// Immutable record of one validation finding (level + offending field + text).
// Construction also writes the message to the H2O log.
public static final class ValidationMessage extends Iced {
final byte _log_level; // See util/Log.java for levels
final String _field_name;
final String _message;
public ValidationMessage(byte log_level, String field_name, String message) {
_log_level = log_level;
_field_name = field_name;
_message = message;
Log.log(log_level,field_name + ": " + message);
}
public int log_level() { return _log_level; }
@Override public String toString() { return Log.LVLS[_log_level] + " on field: " + _field_name + ": " + _message; }
}
// ==========================================================================
/** Initialize the ModelBuilder, validating all arguments and preparing the
* training frame. This call is expected to be overridden in the subclasses
* and each subclass will start with "super.init();". This call is made by
* the front-end whenever the GUI is clicked, and needs to be fast whenever
* {@code expensive} is false; it will be called once again at the start of
* model building {@see #trainModel()} with expensive set to true.
*<p>
* The incoming training frame (and validation frame) will have ignored
* columns dropped out, plus whatever work the parent init did.
*<p>
* NOTE: The front end initially calls this through the parameters validation
* endpoint with no training_frame, so each subclass's {@code init()} method
* has to work correctly with the training_frame missing.
*<p>
*/
public void init(boolean expensive) {
// Log parameters
if( expensive && logMe() ) {
Log.info("Building H2O " + this.getClass().getSimpleName() + " model with these parameters:");
Log.info(new String(_parms.writeJSON(new AutoBuffer()).buf()));
}
// NOTE: allow re-init:
clearInitState();
assert _parms != null; // Parms must already be set in
// Missing training frame is only reported as an error on the expensive pass; nothing more to validate.
if( _parms._train == null ) {
if (expensive)
error("_train", "Missing training frame");
return;
}
// Use the already-set _train if present (re-init), otherwise resolve the frame from the parameters.
Frame tr = _train != null?_train:_parms.train();
if( tr == null ) { error("_train", "Missing training frame: "+_parms._train); return; }
// Defensive copy: column removals below must not mutate the user's original frame.
setTrain(new Frame(null /* not putting this into KV */, tr._names.clone(), tr.vecs().clone()));
if (expensive) {
// Make sure a concrete seed is available for the expensive run.
_parms.getOrMakeRealSeed();
}
// Response-dependent categorical encodings require a supervised problem.
if (_parms._categorical_encoding.needsResponse() && !isSupervised()) {
error("_categorical_encoding", "Categorical encoding scheme cannot be "
+ _parms._categorical_encoding.toString() + " - no response column available.");
}
// --- Cross-validation settings ---
if (_parms._nfolds < 0 || _parms._nfolds == 1) {
error("_nfolds", "nfolds must be either 0 or >1.");
}
if (_parms._nfolds > 1 && _parms._nfolds > train().numRows()) {
error("_nfolds", "nfolds cannot be larger than the number of rows (" + train().numRows() + ").");
}
// A fold column and nfolds are mutually exclusive ways to specify cross-validation.
if (_parms._fold_column != null) {
hide("_fold_assignment", "Fold assignment is ignored when a fold column is specified.");
if (_parms._nfolds > 1) {
error("_nfolds", "nfolds cannot be specified at the same time as a fold column.");
} else {
hide("_nfolds", "nfolds is ignored when a fold column is specified.");
}
if (_parms._fold_assignment != Model.Parameters.FoldAssignmentScheme.AUTO) {
error("_fold_assignment", "Fold assignment is not allowed in conjunction with a fold column.");
}
}
if (_parms._nfolds > 1) {
hide("_fold_column", "Fold column is ignored when nfolds > 1.");
}
// hide cross-validation parameters unless cross-val is enabled
if (!nFoldCV()) {
hide("_keep_cross_validation_predictions", "Only for cross-validation.");
hide("_keep_cross_validation_fold_assignment", "Only for cross-validation.");
hide("_fold_assignment", "Only for cross-validation.");
if (_parms._fold_assignment != Model.Parameters.FoldAssignmentScheme.AUTO) {
error("_fold_assignment", "Fold assignment is only allowed for cross-validation.");
}
}
// --- Distribution-specific settings ---
if (_parms._distribution == DistributionFamily.modified_huber) {
error("_distribution", "Modified Huber distribution is not supported yet.");
}
if (_parms._distribution != DistributionFamily.tweedie) {
hide("_tweedie_power", "Only for Tweedie Distribution.");
}
if (_parms._tweedie_power <= 1 || _parms._tweedie_power >= 2) {
error("_tweedie_power", "Tweedie power must be between 1 and 2 (exclusive).");
}
// Drop explicitly dropped columns
if( _parms._ignored_columns != null ) {
_train.remove(_parms._ignored_columns);
if( expensive ) Log.info("Dropping ignored columns: "+Arrays.toString(_parms._ignored_columns));
}
// Rebalance train and valid datasets
if (expensive && error_count() == 0 && _parms._auto_rebalance) {
setTrain(rebalance(_train, false, _result + ".temporary.train"));
_valid = rebalance(_valid, false, _result + ".temporary.valid");
}
// Drop all non-numeric columns (e.g., String and UUID). No current algo
// can use them, and otherwise all algos will then be forced to remove
// them. Text algos (grep, word2vec) take raw text columns - which are
// numeric (arrays of bytes).
ignoreBadColumns(separateFeatureVecs(), expensive);
ignoreInvalidColumns(separateFeatureVecs(), expensive);
// Check that at least some columns are not-constant and not-all-NAs
if( _train.numCols() == 0 )
error("_train","There are no usable columns to generate model");
// --- Supervised-only checks: response column, class balancing, prior class distribution ---
if(isSupervised()) {
if(_response != null) {
if (_parms._distribution != DistributionFamily.tweedie) {
hide("_tweedie_power", "Tweedie power is only used for Tweedie distribution.");
}
if (_parms._distribution != DistributionFamily.quantile) {
hide("_quantile_alpha", "Quantile (alpha) is only used for Quantile regression.");
}
if (expensive) checkDistributions();
// Categorical response => classification with one class per level; otherwise regression (1 "class").
_nclass = _response.isCategorical() ? _response.cardinality() : 1;
if (_response.isConst())
error("_response","Response cannot be constant.");
}
if (! _parms._balance_classes)
hide("_max_after_balance_size", "Balance classes is false, hide max_after_balance_size");
else if (_parms._weights_column != null && _weights != null && !_weights.isBinary())
error("_balance_classes", "Balance classes and observation weights are not currently supported together.");
if( _parms._max_after_balance_size <= 0.0 )
error("_max_after_balance_size","Max size after balancing needs to be positive, suggest 1.0f");
if( _train != null ) {
if (_train.numCols() <= 1)
error("_train", "Training data must have at least 2 features (incl. response).");
if( null == _parms._response_column) {
error("_response_column", "Response column parameter not set.");
return;
}
// Compute the (weighted) class distribution of the response; for regression there is a single pseudo-class.
if(_response != null && computePriorClassDistribution()) {
if (isClassifier() && isSupervised()) {
MRUtils.ClassDist cdmt =
_weights != null ? new MRUtils.ClassDist(nclasses()).doAll(_response, _weights) : new MRUtils.ClassDist(nclasses()).doAll(_response);
_distribution = cdmt.dist();
_priorClassDist = cdmt.rel_dist();
} else { // Regression; only 1 "class"
_distribution = new double[]{ (_weights != null ? _weights.mean() : 1.0) * train().numRows() };
_priorClassDist = new double[]{1.0f};
}
}
}
if( !isClassifier() ) {
hide("_balance_classes", "Balance classes is only applicable to classification problems.");
hide("_class_sampling_factors", "Class sampling factors is only applicable to classification problems.");
hide("_max_after_balance_size", "Max after balance size is only applicable to classification problems.");
hide("_max_confusion_matrix_size", "Max confusion matrix size is only applicable to classification problems.");
}
if (_nclass <= 2) {
hide("_max_hit_ratio_k", "Max K-value for hit ratio is only applicable to multi-class classification problems.");
hide("_max_confusion_matrix_size", "Only for multi-class classification problems.");
}
if( !_parms._balance_classes ) {
hide("_max_after_balance_size", "Only used with balanced classes");
hide("_class_sampling_factors", "Class sampling factors is only applicable if balancing classes.");
}
}
// Unsupervised: response and class-balancing parameters are irrelevant.
else {
hide("_response_column", "Ignored for unsupervised methods.");
hide("_balance_classes", "Ignored for unsupervised methods.");
hide("_class_sampling_factors", "Ignored for unsupervised methods.");
hide("_max_after_balance_size", "Ignored for unsupervised methods.");
hide("_max_confusion_matrix_size", "Ignored for unsupervised methods.");
_response = null;
_vresponse = null;
_nclass = 1;
}
if( _nclass > Model.Parameters.MAX_SUPPORTED_LEVELS ) {
error("_nclass", "Too many levels in response column: " + _nclass + ", maximum supported number of classes is " + Model.Parameters.MAX_SUPPORTED_LEVELS + ".");
}
// Build the validation set to be compatible with the training set.
// Toss out extra columns, complain about missing ones, remap categoricals
Frame va = _parms.valid(); // User-given validation set
if (va != null) {
_valid = adaptFrameToTrain(va, "Validation Frame", "_validation_frame", expensive);
_vresponse = _valid.vec(_parms._response_column);
} else {
_valid = null;
_vresponse = null;
}
// Expensive pass: apply categorical encoding and (optionally) reorder categorical levels by mean response.
if (expensive) {
Frame newtrain = encodeFrameCategoricals(_train, ! _parms._is_cv_model);
if (newtrain != _train) {
// Remember the pre-encoding schema so predictions on raw data can be adapted later.
_origNames = _train.names();
_origDomains = _train.domains();
setTrain(newtrain);
separateFeatureVecs(); //fix up the pointers to the special vecs
}
if (_valid != null) {
_valid = encodeFrameCategoricals(_valid, ! _parms._is_cv_model /* for CV, need to score one more time in outer loop */);
_vresponse = _valid.vec(_parms._response_column);
}
boolean restructured = false;
Vec[] vecs = _train.vecs();
for (int j = 0; j < vecs.length; ++j) {
Vec v = vecs[j];
if (v == _response || v == _fold) continue;
if (v.isCategorical() && shouldReorder(v)) {
final int len = v.domain().length;
Log.info("Reordering categorical column " + _train.name(j) + " (" + len + " levels) based on the mean (weighted) response per level.");
// Unweighted case uses a constant-1 weight vector so the same task covers both paths.
VecUtils.MeanResponsePerLevelTask mrplt = new VecUtils.MeanResponsePerLevelTask(len).doAll(v,
_parms._weights_column != null ? _train.vec(_parms._weights_column) : v.makeCon(1.0),
_train.vec(_parms._response_column));
double[] meanWeightedResponse = mrplt.meanWeightedResponse;
// for (int i=0;i<len;++i)
// Log.info(v.domain()[i] + " -> " + meanWeightedResponse[i]);
// Option 1: Order the categorical column by response to make better splits
int[] idx=new int[len];
for (int i=0;i<len;++i) idx[i] = i;
ArrayUtils.sort(idx, meanWeightedResponse);
// invIdx maps old level index -> new (sorted) level index, applied row-wise by ReorderTask.
int[] invIdx=new int[len];
for (int i=0;i<len;++i) invIdx[idx[i]] = i;
Vec vNew = new VecUtils.ReorderTask(invIdx).doAll(1, Vec.T_NUM, new Frame(v)).outputFrame().anyVec();
String[] newDomain = new String[len];
for (int i = 0; i < len; ++i) newDomain[i] = v.domain()[idx[i]];
vNew.setDomain(newDomain);
// for (int i=0;i<len;++i)
// Log.info(vNew.domain()[i] + " -> " + meanWeightedResponse[idx[i]]);
vecs[j] = vNew;
restructured = true;
}
}
if (restructured)
_train.restructure(_train.names(), vecs);
}
// Sanity: train/valid must share column names after encoding; Binary encoding may reorder
// columns, but every train column must still exist in the validation frame.
assert (!expensive || _valid==null || Arrays.equals(_train._names, _valid._names) || _parms._categorical_encoding == Model.Parameters.CategoricalEncodingScheme.Binary);
if (_valid!=null && !Arrays.equals(_train._names, _valid._names) && _parms._categorical_encoding == Model.Parameters.CategoricalEncodingScheme.Binary) {
for (String name : _train._names)
assert(ArrayUtils.contains(_valid._names, name)) : "Internal error during categorical encoding: training column " + name + " not in validation frame with columns " + Arrays.toString(_valid._names);
}
// --- Checkpoint and early-stopping settings ---
if (_parms._checkpoint != null && DKV.get(_parms._checkpoint) == null) {
error("_checkpoint", "Checkpoint has to point to existing model!");
}
if (_parms._stopping_tolerance < 0) {
error("_stopping_tolerance", "Stopping tolerance must be >= 0.");
}
if (_parms._stopping_tolerance >= 1) {
error("_stopping_tolerance", "Stopping tolerance must be < 1.");
}
if (_parms._stopping_rounds == 0) {
// Early stopping disabled: warn about related parameters that will have no effect.
if (_parms._stopping_metric != ScoreKeeper.StoppingMetric.AUTO)
warn("_stopping_metric", "Stopping metric is ignored for _stopping_rounds=0.");
if (_parms._stopping_tolerance != _parms.defaultStoppingTolerance())
warn("_stopping_tolerance", "Stopping tolerance is ignored for _stopping_rounds=0.");
} else if (_parms._stopping_rounds < 0) {
error("_stopping_rounds", "Stopping rounds must be >= 0.");
} else {
// Early stopping enabled: the metric must match the problem type.
if (isClassifier()) {
if (_parms._stopping_metric == ScoreKeeper.StoppingMetric.deviance && !getClass().getSimpleName().contains("GLM")) {
error("_stopping_metric", "Stopping metric cannot be deviance for classification.");
}
if (nclasses()!=2 && _parms._stopping_metric == ScoreKeeper.StoppingMetric.AUC) {
error("_stopping_metric", "Stopping metric cannot be AUC for multinomial classification.");
}
} else {
if (_parms._stopping_metric == ScoreKeeper.StoppingMetric.misclassification ||
_parms._stopping_metric == ScoreKeeper.StoppingMetric.AUC ||
_parms._stopping_metric == ScoreKeeper.StoppingMetric.logloss)
{
error("_stopping_metric", "Stopping metric cannot be " + _parms._stopping_metric.toString() + " for regression.");
}
}
}
if (_parms._max_runtime_secs < 0) {
error("_max_runtime_secs", "Max runtime (in seconds) must be greater than 0 (or 0 for unlimited).");
}
}
/**
 * Adapts a given frame to the same schema as the training frame, additionally
 * applying categorical encoding when {@code expensive} processing is requested.
 *
 * Note: This method should only be used during ModelBuilder initialization - it should be called in init(..) method.
 *
 * @param fr input frame
 * @param frDesc frame description, eg. "Validation Frame" - will be shown in validation error messages
 * @param field name of a field for validation errors
 * @param expensive indicates full ("expensive") processing
 * @return adapted frame
 */
protected Frame init_adaptFrameToTrain(Frame fr, String frDesc, String field, boolean expensive) {
  Frame result = adaptFrameToTrain(fr, frDesc, field, expensive);
  return expensive ? encodeFrameCategoricals(result, true) : result;
}
/** Build a defensive copy of {@code fr} adapted to the training frame's schema
 *  (column set/order, categorical domains). Any problems are reported through
 *  error()/warn() against the given {@code field} rather than thrown. */
private Frame adaptFrameToTrain(Frame fr, String frDesc, String field, boolean expensive) {
  if (fr.numRows()==0) error(field, frDesc + " must have > 0 rows.");
  // Shallow copy so the user's frame is never mutated; deliberately not registered in the KV store.
  Frame copy = new Frame(null, fr._names.clone(), fr.vecs().clone());
  try {
    String[] adaptMsgs = Model.adaptTestForTrain(copy, null, null, _train._names, _train.domains(), _parms, expensive, true, null, getToEigenVec(), _toDelete, false);
    Vec responseVec = copy.vec(_parms._response_column);
    if (responseVec == null && _parms._response_column != null)
      error(field, frDesc + " must have a response column '" + _parms._response_column + "'.");
    if (expensive) {
      // Surface adaptation messages both in the log and as validation warnings.
      for (String adaptMsg : adaptMsgs) {
        Log.info(adaptMsg);
        warn(field, adaptMsg);
      }
    }
  } catch (IllegalArgumentException iae) {
    error(field, iae.getMessage());
  }
  return copy;
}
/** Apply the configured categorical encoding to {@code fr}, leaving the special
 *  columns (weights/offset/fold/response) untouched. A newly created encoded frame
 *  is either Scope-tracked or queued in _toDelete for later cleanup, depending on
 *  {@code scopeTrack}. Returns {@code fr} unchanged when no encoding was needed. */
private Frame encodeFrameCategoricals(Frame fr, boolean scopeTrack) {
  final String[] specialCols = {
      _parms._weights_column, _parms._offset_column, _parms._fold_column, _parms._response_column
  };
  Frame encodedFr = FrameUtils.categoricalEncoder(fr, specialCols, _parms._categorical_encoding, getToEigenVec());
  if (encodedFr == fr) return fr; // nothing was encoded
  assert encodedFr._key != null;
  if (scopeTrack) {
    Scope.track(encodedFr);
  } else {
    // Record the call site alongside the key to aid debugging of leaked temporaries.
    _toDelete.put(encodedFr._key, Arrays.toString(Thread.currentThread().getStackTrace()));
  }
  return encodedFr;
}
/**
 * Rebalance a frame for load balancing
 * @param original_fr Input frame
 * @param local Whether to only create enough chunks to max out all cores on one node only
 * @param name Name of rebalanced frame
 * @return Frame that has potentially more chunks
 */
protected Frame rebalance(final Frame original_fr, boolean local, final String name) {
  if (original_fr == null) return null;
  final int wantedChunks = desiredChunks(original_fr, local);
  final int haveChunks = original_fr.anyVec().nChunks();
  // Last 5 chars of the name are the dataset tag ("train"/"valid") — used for log readability.
  final String shortName = name.substring(name.length()-5);
  if (haveChunks >= wantedChunks) {
    if (wantedChunks > 1)
      Log.info(shortName + " dataset already contains " + haveChunks + " chunks. No need to rebalance.");
    return original_fr;
  }
  Log.info("Rebalancing " + shortName + " dataset into " + wantedChunks + " chunks.");
  Key rebalancedKey = Key.makeUserHidden(name + ".chunks" + wantedChunks);
  RebalanceDataSet rb = new RebalanceDataSet(original_fr, rebalancedKey, wantedChunks);
  H2O.submitTask(rb).join();
  Frame rebalancedFr = DKV.get(rebalancedKey).get();
  Scope.track(rebalancedFr);
  return rebalancedFr;
}
/**
 * Find desired number of chunks. If fewer, dataset will be rebalanced.
 * Roughly one chunk per 1000 rows, capped at the number of available CPUs.
 * NOTE: {@code local} is unused in this default implementation; subclasses may
 * override with a locality-aware policy.
 * @return Lower bound on number of chunks after rebalancing.
 */
protected int desiredChunks(final Frame original_fr, boolean local) {
  int chunksByRows = (int) Math.ceil(original_fr.numRows() / 1e3);
  return Math.min(chunksByRows, H2O.NUMCPUS);
}
/** Validate the response and distribution-specific parameters for the selected
 *  distribution family; records validation errors via error(). */
public void checkDistributions() {
  final DistributionFamily family = _parms._distribution;
  if (family == null) return; // original '==' chain matches nothing on null
  switch (family) {
    case poisson:
      if (_response.min() < 0)
        error("_response", "Response must be non-negative for Poisson distribution.");
      break;
    case gamma:
      if (_response.min() < 0)
        error("_response", "Response must be non-negative for Gamma distribution.");
      break;
    case tweedie:
      if (_parms._tweedie_power >= 2 || _parms._tweedie_power <= 1)
        error("_tweedie_power", "Tweedie power must be between 1 and 2.");
      if (_response.min() < 0)
        error("_response", "Response must be non-negative for Tweedie distribution.");
      break;
    case quantile:
      if (_parms._quantile_alpha > 1 || _parms._quantile_alpha < 0)
        error("_quantile_alpha", "Quantile alpha must be between 0 and 1.");
      break;
    case huber:
      if (_parms._huber_alpha < 0 || _parms._huber_alpha > 1)
        error("_huber_alpha", "Huber alpha must be between 0 and 1.");
      break;
    default:
      // Other families need no extra validation here.
      break;
  }
}
// Names of the columns removed by the most recent FilterCols.doIt() pass.
transient public HashSet<String> _removedCols = new HashSet<>();
/** Template for bulk-dropping unusable feature columns from a frame.
 *  Subclasses implement filter() to decide per-column whether it should be removed;
 *  the trailing {@code _specialVecs} columns (e.g. weights/response) are never touched. */
public abstract class FilterCols {
final int _specialVecs; // special vecs to skip at the end
public FilterCols(int n) {_specialVecs = n;}
/** @return true if the given column should be removed from the frame. */
abstract protected boolean filter(Vec v);
/** Remove all columns matching filter() from {@code f} (except the trailing special vecs),
 *  record their names in _removedCols, and emit a warning listing them. */
public void doIt( Frame f, String msg, boolean expensive ) {
List<Integer> rmcolsList = new ArrayList<>();
for( int i = 0; i < f.vecs().length - _specialVecs; i++ )
if( filter(f.vec(i)) ) rmcolsList.add(i);
if( !rmcolsList.isEmpty() ) {
// Collect indices and names first, then remove in a single bulk operation.
_removedCols = new HashSet<>(rmcolsList.size());
int[] rmcols = new int[rmcolsList.size()];
for (int i=0;i<rmcols.length;++i) {
rmcols[i]=rmcolsList.get(i);
_removedCols.add(f._names[rmcols[i]]);
}
f.remove(rmcols); //bulk-remove
msg += _removedCols.toString();
warn("_train", msg);
if (expensive) Log.info(msg);
}
}
}
//stitch together holdout predictions into one large Frame
/** Combine the N per-fold holdout prediction frames into one frame stored under {@code key}.
 *  All folds' columns are laid side-by-side and summed row-wise by HoldoutPredictionCombiner
 *  (each fold's vecs only hold non-zeros for its own holdout rows).
 *  Fix: fetch each fold's frame from the DKV once per fold instead of once per column
 *  (the original re-resolved {@code predKeys[i].get()} inside the inner loop).
 *  @param predKeys keys of the N per-fold prediction frames (same schema as predKeys[0])
 *  @param key      destination key for the combined output frame */
private static Frame combineHoldoutPredictions(Key<Frame>[] predKeys, Key key) {
  int N = predKeys.length;
  Frame template = predKeys[0].get();
  Vec[] vecs = new Vec[N * template.numCols()];
  int idx = 0;
  for (int i = 0; i < N; ++i) {
    Frame foldPreds = predKeys[i].get(); // one DKV lookup per fold
    for (int j = 0; j < foldPreds.numCols(); ++j)
      vecs[idx++] = foldPreds.vec(j);
  }
  return new HoldoutPredictionCombiner(N, template.numCols())
      .doAll(template.types(), new Frame(vecs))
      .outputFrame(key, template.names(), template.domains());
}
// helper to combine multiple holdout prediction Vecs (each only has 1/N-th filled with non-zeros) into 1 Vec
private static class HoldoutPredictionCombiner extends MRTask<HoldoutPredictionCombiner> {
int _folds, _cols; // number of folds, and number of columns contributed by each fold
public HoldoutPredictionCombiner(int folds, int cols) { _folds=folds; _cols=cols; }
// Input chunks are laid out as [fold0 cols][fold1 cols]...; for each output column c, sum the
// c-th column of every fold. Since only one fold holds non-zero values for any given row,
// the sum reconstructs the complete holdout prediction column.
@Override public void map(Chunk[] cs, NewChunk[] nc) {
for (int c=0;c<_cols;++c) {
double [] vals = new double[cs[0].len()];
for (int f=0;f<_folds;++f)
for (int row = 0; row < cs[0].len(); ++row)
vals[row] += cs[f * _cols + c].atd(row);
nc[c].setDoubles(vals);
}
}
}
/** Build a summary table of cross-validation metrics: one row per metric, with columns
 *  [mean, sd, cv_1_valid, ..., cv_N_valid]. Metric names are discovered by reflection:
 *  any public no-arg method on the ModelMetrics (and its ConfusionMatrix, if present)
 *  that returns a double and is not in the exclusion list counts as a metric. */
private TwoDimTable makeCrossValidationSummaryTable(Key[] cvmodels) {
if (cvmodels == null || cvmodels.length == 0) return null;
int N = cvmodels.length;
int extra_length=2; //mean/sigma/cv1/cv2/.../cvN
String[] colTypes = new String[N+extra_length];
Arrays.fill(colTypes, "string");
String[] colFormats = new String[N+extra_length];
Arrays.fill(colFormats, "%s");
String[] colNames = new String[N+extra_length];
colNames[0] = "mean";
colNames[1] = "sd";
for (int i=0;i<N;++i)
colNames[i+extra_length] = "cv_" + (i+1) + "_valid";
// Public no-arg methods on the metrics objects that are NOT scalar metrics.
Set<String> excluded = new HashSet<>();
excluded.add("total_rows");
excluded.add("makeSchema");
excluded.add("hr");
excluded.add("frame");
excluded.add("remove");
excluded.add("cm");
excluded.add("auc_obj");
// Probe the first CV model's metrics: keep every method whose invocation yields a double.
List<Method> methods = new ArrayList<>();
{
Model m = DKV.getGet(cvmodels[0]);
ModelMetrics mm = m._output._validation_metrics;
if (mm != null) {
for (Method meth : mm.getClass().getMethods()) {
if (excluded.contains(meth.getName())) continue;
try {
double c = (double) meth.invoke(mm);
methods.add(meth);
} catch (Exception ignored) {}
}
ConfusionMatrix cm = mm.cm();
if (cm != null) {
for (Method meth : cm.getClass().getMethods()) {
if (excluded.contains(meth.getName())) continue;
try {
double c = (double) meth.invoke(cm);
methods.add(meth);
} catch (Exception ignored) {}
}
}
}
}
// make unique, and sort alphabetically
Set<String> rowNames=new TreeSet<>();
for (Method m : methods) rowNames.add(m.getName());
List<Method> meths = new ArrayList<>();
OUTER:
for (String n : rowNames)
for (Method m : methods)
if (m.getName().equals(n)) { //find the first method that has that name
meths.add(m);
continue OUTER;
}
int numMetrics = rowNames.size();
TwoDimTable table = new TwoDimTable("Cross-Validation Metrics Summary",
null,
rowNames.toArray(new String[0]), colNames, colTypes, colFormats, "");
MathUtils.BasicStats stats = new MathUtils.BasicStats(numMetrics);
double[][] vals = new double[N][numMetrics];
int i = 0;
// Fill one column per CV model; each metric is tried first on the metrics object,
// then on its confusion matrix (invocation failures are ignored, leaving the cell empty).
for (Key<Model> km : cvmodels) {
Model m = DKV.getGet(km);
// NOTE(review): if a CV model is missing, its row of 'vals' stays all-zero but is still
// averaged into the mean/sd below — confirm this is the intended behavior.
if (m==null) continue;
ModelMetrics mm = m._output._validation_metrics;
int j=0;
for (Method meth : meths) {
if (excluded.contains(meth.getName())) continue;
try {
double val = (double) meth.invoke(mm);
vals[i][j] = val;
table.set(j++, i+extra_length, (float)val);
} catch (Throwable e) { }
if (mm.cm()==null) continue;
try {
double val = (double) meth.invoke(mm.cm());
vals[i][j] = val;
table.set(j++, i+extra_length, (float)val);
} catch (Throwable e) { }
}
i++;
}
// Aggregate across folds and fill the leading mean/sd columns.
for (i=0;i<N;++i)
stats.add(vals[i],1);
for (i=0;i<numMetrics;++i) {
table.set(i, 0, (float)stats.mean()[i]);
table.set(i, 1, (float)stats.sigma()[i]);
}
Log.info(table);
return table;
}
/** Train a number of model builders, at most {@code parallelization} at a time, blocking
 *  until all launched builds complete.
 *  Fix: if {@code job.stop_requested()} breaks the launch loop early, the trailing entries of
 *  the task array are never assigned; the final join loop now skips those null slots instead
 *  of throwing a NullPointerException.
 *  @param job             controlling job; checked for stop requests and updated once per finished model
 *  @param modelBuilders   builders to run; each gets its own _start_time for time-based stopping
 *  @param parallelization maximum number of model builds running concurrently
 *  @throws RuntimeException the first exception raised by any sub-model build, re-thrown after
 *          all launched builds have been joined */
public static void bulkBuildModels(Job job, ModelBuilder[] modelBuilders, int parallelization) {
  final int N = modelBuilders.length;
  final H2O.H2OCountedCompleter[] submodelTasks = new H2O.H2OCountedCompleter[N];
  int nRunning = 0;
  RuntimeException rt = null;
  for (int i = 0; i < N; ++i) {
    if (job.stop_requested()) break; // Stop launching but still must block for all async jobs
    modelBuilders[i]._start_time = System.currentTimeMillis();
    submodelTasks[i] = H2O.submitTask(modelBuilders[i].trainModelImpl());
    if (++nRunning == parallelization) { // piece-wise advance in training the models
      while (nRunning > 0) {
        try {
          submodelTasks[i + 1 - nRunning--].join();
          job.update(1); // One job finished
        } catch (RuntimeException t) {
          if (rt == null) rt = t; // remember first failure, keep joining the rest
        }
      }
      if (rt != null) throw rt;
    }
  }
  // All sub-models must be completed before the main model can be built.
  for (int i = 0; i < N; ++i) {
    if (submodelTasks[i] == null) continue; // never launched (early break above)
    try {
      submodelTasks[i].join();
    } catch (RuntimeException t) {
      if (rt == null) rt = t;
    }
  }
  if (rt != null) throw rt;
}
}