package hex.deepwater;

import hex.*;
import hex.genmodel.GenModel;
import hex.genmodel.utils.DistributionFamily;
import hex.schemas.DeepWaterModelV3;
import hex.util.LinearAlgebraUtils;
import water.*;
import water.api.schemas3.ModelSchemaV3;
import water.exceptions.H2OIllegalArgumentException;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.parser.BufferedString;
import water.util.FrameUtils;
import water.util.Log;
import water.util.PrettyPrint;
import water.util.RandomUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Random;
import java.util.concurrent.atomic.AtomicInteger;

import static hex.ModelMetrics.calcVarImp;
import static water.H2O.technote;

/**
 * The Deep Water model.
 * It contains a DeepWaterModelInfo with the most up-to-date model,
 * a scoring history, as well as some helpers to indicate training progress.
 */
public class DeepWaterModel extends Model<DeepWaterModel,DeepWaterParameters,DeepWaterModelOutput> implements Model.DeepFeatures {
  @Override public DeepwaterMojoWriter getMojo() { return new DeepwaterMojoWriter(this); }

  // Default publicly visible schema is V3
  public ModelSchemaV3 schema() { return new DeepWaterModelV3(); }

  void set_model_info(DeepWaterModelInfo mi) { model_info = mi; }
  final public DeepWaterModelInfo model_info() { return model_info; }
  @Override public ToEigenVec getToEigenVec() { return LinearAlgebraUtils.toEigen; }
//  final public VarImp varImp() { return _output.errors.variable_importances; }

  private volatile DeepWaterModelInfo model_info;

  // timing
  private long total_checkpointed_run_time_ms; // time spent in previous models
  private long total_training_time_ms;         // total time spent running (training+scoring, including all previous models)
  private long total_scoring_time_ms;          // total time spent scoring (including all previous models)
  long total_setup_time_ms;                    // total time spent setting up (including all previous models)
  private long time_of_start_ms;               // start time for this model (this checkpoint restart)

  // auto-tuning
  long actual_train_samples_per_iteration;
  long time_for_iteration_overhead_ms; // helper for auto-tuning: time in milliseconds for collective bcast/reduce of the model

  // helpers for diagnostics
  double epoch_counter;
  int iterations;
  private boolean stopped_early;
  long training_rows;
  long validation_rows;

  // Keep the best model so far, based on a single criterion (overall classification error or MSE)
  private float _bestLoss = Float.POSITIVE_INFINITY;
  Key actual_best_model_key;

  static final String unstable_msg = technote(4,
      "\n\nTrying to predict with an unstable model." +
      "\nJob was aborted due to observed numerical instability (exponential growth)." +
      "\nEither the weights or the bias values are unreasonably large or lead to large activation values." +
      "\nTry a different network architecture, a bounded activation function (tanh), adding regularization" +
      "\n(via dropout) or use a smaller learning rate and/or momentum.");

  public DeepWaterScoringInfo last_scored() { return (DeepWaterScoringInfo) super.last_scored(); }

  /**
   * Get the parameters actually used for model building, not the user-given ones (_parms).
   * They might differ since some defaults are filled in, and some invalid combinations are auto-disabled in modifyParms.
   * @return actually used parameters
   */
  public final DeepWaterParameters get_params() { return model_info.get_params(); }

  @Override public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
    switch(_output.getModelCategory()) {
      case Binomial:    return new ModelMetricsBinomial.MetricBuilderBinomial(domain);
      case Multinomial: return new ModelMetricsMultinomial.MetricBuilderMultinomial(_output.nclasses(), domain);
      case Regression:  return new ModelMetricsRegression.MetricBuilderRegression();
      case AutoEncoder: return new ModelMetricsAutoEncoder.MetricBuilderAutoEncoder(_output.nfeatures());
      default: throw H2O.unimpl("Invalid ModelCategory " + _output.getModelCategory());
    }
  }

  static DataInfo makeDataInfo(Frame train, Frame valid, DeepWaterParameters parms) {
    double x = 0.782347234; // probe an arbitrary value: the link is the identity iff link(x) == x
    boolean identityLink = new Distribution(parms).link(x) == x;
    return new DataInfo(
        train,
        valid,
        parms._autoencoder ? 0 : 1, // nResponses
        parms._autoencoder || parms._use_all_factor_levels, // use all factor levels for auto-encoder
        parms._standardize ? (parms._autoencoder ? DataInfo.TransformType.NORMALIZE : parms._sparse ? DataInfo.TransformType.DESCALE : DataInfo.TransformType.STANDARDIZE) : DataInfo.TransformType.NONE, // transform predictors
        !parms._standardize || train.lastVec().isCategorical() ? DataInfo.TransformType.NONE : identityLink ? DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE, // transform response for regression with identity link
        parms._missing_values_handling == DeepWaterParameters.MissingValuesHandling.Skip, // whether to skip missing
        false, // do not replace NAs in numeric cols with mean
        true,  // always add a bucket for missing values
        parms._weights_column != null, // observation weights
        parms._offset_column != null,
        parms._fold_column != null
    );
  }

  /**
   * Constructor to restart from a checkpointed model
   * @param destKey New destination key for the model
   * @param parms User-given parameters for checkpoint restart
   * @param cp Checkpoint to restart from
   * @param dataInfo Data descriptor (possibly rebuilt, since the data may change between checkpoint restarts)
   */
  public DeepWaterModel(final Key<DeepWaterModel> destKey, final DeepWaterParameters parms, final DeepWaterModel cp, final DataInfo dataInfo) {
    super(destKey, parms == null ? (DeepWaterParameters)cp._parms.clone() : IcedUtils.deepCopy(parms), (DeepWaterModelOutput)cp._output.clone());
    DeepWaterParameters.Sanity.modifyParms(_parms, _parms, cp._output.nclasses()); // sanitize the model_info's parameters
    assert(_parms != cp._parms); // make sure we have a clone
    assert(_parms._checkpoint == cp._key);
    model_info = IcedUtils.deepCopy(cp.model_info);
    model_info._dataInfo = dataInfo;
    assert(model_info._network != null);
    assert(model_info._modelparams != null);
    model_info.javaToNative();
    _dist = new Distribution(get_params());
    assert(_dist.distribution != DistributionFamily.AUTO); // Note: must use sanitized parameters via get_params(), as this._parms can still have defaults (AUTO, etc.)
    actual_best_model_key = cp.actual_best_model_key;
    if (actual_best_model_key.get() == null) {
      DeepWaterModel best = IcedUtils.deepCopy(cp);
      //best.model_info.data_info = model_info.data_info; // Note: we currently DO NOT use the checkpoint's data info - as data may change during checkpoint restarts
      actual_best_model_key = Key.<DeepWaterModel>make(H2O.SELF);
      DKV.put(actual_best_model_key, best);
    }
    time_of_start_ms = cp.time_of_start_ms;
    total_training_time_ms = cp.total_training_time_ms;
    total_checkpointed_run_time_ms = cp.total_training_time_ms;
    total_scoring_time_ms = cp.total_scoring_time_ms;
    total_setup_time_ms = cp.total_setup_time_ms;
    training_rows = cp.training_rows;     // copy the value to display the right number on the model page before training has started
    validation_rows = cp.validation_rows; // copy the value to display the right number on the model page before training has started
    _bestLoss = cp._bestLoss;
    epoch_counter = cp.epoch_counter;
    iterations = cp.iterations;

    // deep clone scoring history
    scoringInfo = cp.scoringInfo.clone();
    for (int i = 0; i < scoringInfo.length; ++i)
      scoringInfo[i] = IcedUtils.deepCopy(cp.scoringInfo[i]);
    _output.errors = last_scored();
    _output._scoring_history = DeepWaterScoringInfo.createScoringHistoryTable(scoringInfo, (null != get_params()._valid), false, _output.getModelCategory(), _output.isAutoencoder());
    _output._variable_importances = calcVarImp(last_scored().variable_importances);
    if (dataInfo != null) {
      _output.setNames(dataInfo._adaptedFrame.names());
      _output._domains = dataInfo._adaptedFrame.domains();
    }
    assert(_key.equals(destKey));
  }

  private void setDataInfoToOutput(DataInfo dinfo) {
    if (dinfo == null) return;
    // update the model's expected frame format - needed for train/test adaptation
    _output.setNames(dinfo._adaptedFrame.names());
    _output._domains = dinfo._adaptedFrame.domains();
    _output._nums = dinfo._nums;
    _output._cats = dinfo._cats;
    _output._catOffsets = dinfo._catOffsets;
    _output._normMul = dinfo._normMul;
    _output._normSub = dinfo._normSub;
    _output._normRespMul = dinfo._normRespMul;
    _output._normRespSub = dinfo._normRespSub;
    _output._useAllFactorLevels = dinfo._useAllFactorLevels;
  }

  /**
   * Regular constructor (from scratch)
   * @param destKey destination key
   * @param params Deep Water parameters
   * @param output Deep Water model output
   * @param train training frame
   * @param valid validation frame (can be null)
   * @param nClasses Number of classes (1 for regression or autoencoder)
   */
  public DeepWaterModel(final Key<DeepWaterModel> destKey, final DeepWaterParameters params, final DeepWaterModelOutput output, Frame train, Frame valid, int nClasses) {
    super(destKey, params, output);
    if (H2O.getCloudSize() != 1)
      throw new IllegalArgumentException("Deep Water currently only supports execution on a single node.");
    _output._origNames = params._train.get().names();
    _output._origDomains = params._train.get().domains();
    DeepWaterParameters parms = (DeepWaterParameters) params.clone(); // make a copy, don't change model's parameters
    DeepWaterParameters.Sanity.modifyParms(parms, parms, nClasses);   // sanitize the model_info's parameters
    DataInfo dinfo = null;
    if (parms._problem_type == DeepWaterParameters.ProblemType.dataset) {
      dinfo = makeDataInfo(train, valid, parms);
      DKV.put(dinfo);
      setDataInfoToOutput(dinfo);
      // either provide no image_shape (i.e., (0,0)), or provide both values and channels >= 1 (to turn it into an image problem)
      if (parms._image_shape != null && parms._image_shape[0] != 0) {
        if (parms._image_shape[0] < 0) {
          throw new IllegalArgumentException("image_shape must either have both values == 0 or both values >= 1 for " + parms._problem_type.getClass().toString() + "=" + parms._problem_type.toString());
        }
        if (parms._image_shape[1] <= 0) {
          throw new IllegalArgumentException("image_shape must either have both values == 0 or both values >= 1 for " + parms._problem_type.getClass().toString() + "=" + parms._problem_type.toString());
        }
        if (parms._channels <= 0) {
          throw new IllegalArgumentException("channels must be >= 1 when image_shape is provided for " + parms._problem_type.getClass().toString() + "=" + parms._problem_type.toString());
        }
        if (dinfo.fullN() != parms._image_shape[0] * parms._image_shape[1] * parms._channels) {
          throw new IllegalArgumentException("Data input size mismatch: Expect image_shape[0] x image_shape[1] x channels == #cols(H2OFrame), but got: "
              + parms._image_shape[0] + " x " + parms._image_shape[1] + " x " + parms._channels + " != " + dinfo.fullN()
              + ". Check these parameters, or disable ignore_const_cols.");
        }
      }
    }
    model_info = new DeepWaterModelInfo(parms, nClasses, dinfo != null ? dinfo.fullN() : -1);
    model_info._dataInfo = dinfo;
    if (dinfo != null) {
      FrameUtils.printTopCategoricalLevels(dinfo._adaptedFrame, dinfo.fullN() > 10000, 10);
      Log.info("Building the model on " + dinfo.numNums() + " numeric features and " + dinfo.numCats() + " (one-hot encoded) categorical features.");
    }

    // now, parms is get_params()
    _dist = new Distribution(get_params());
    assert(_dist.distribution != DistributionFamily.AUTO); // Note: must use sanitized parameters via get_params(), as this._parms can still have defaults (AUTO, etc.)
    actual_best_model_key = Key.make(H2O.SELF);
    if (get_params()._nfolds != 0) actual_best_model_key = null;
    if (!get_params()._autoencoder) {
      scoringInfo = new DeepWaterScoringInfo[1];
      scoringInfo[0] = new DeepWaterScoringInfo();
      scoringInfo[0].validation = (get_params()._valid != null);
      scoringInfo[0].time_stamp_ms = System.currentTimeMillis();
      _output.errors = last_scored();
      _output._scoring_history = DeepWaterScoringInfo.createScoringHistoryTable(scoringInfo, (null != get_params()._valid), false, _output.getModelCategory(), _output.isAutoencoder());
      _output._variable_importances = calcVarImp(last_scored().variable_importances);
    }
    time_of_start_ms = System.currentTimeMillis();
    assert _key.equals(destKey);
    boolean fail = false;
    long byte_size = 0;
    try {
      byte_size = new AutoBuffer().put(this).buf().length;
    } catch (Throwable t) {
      fail = true;
    }
    if (byte_size > Value.MAX || fail)
      throw new IllegalArgumentException(technote(5, "Model is too large to fit into the DKV (larger than " + PrettyPrint.bytes(Value.MAX) + ")."));
  }

  long _timeLastIterationEnter;
  private long _timeLastScoreStart; // start actual scoring
  private long _timeLastScoreEnd;   // finished actual scoring
  private long _timeLastPrintStart;

  private void checkTimingConsistency() {
    assert(total_scoring_time_ms <= total_training_time_ms);
    assert(total_setup_time_ms <= total_training_time_ms);
    assert(total_setup_time_ms + total_scoring_time_ms <= total_training_time_ms);
    assert(total_training_time_ms >= total_checkpointed_run_time_ms);
    assert(total_checkpointed_run_time_ms >= 0);
    assert(total_training_time_ms >= 0);
    assert(total_scoring_time_ms >= 0);
  }

  private void updateTiming(Key<Job> job_key) {
    final long now = System.currentTimeMillis();
    long start_time_current_model = job_key.get().start_time();
    total_training_time_ms = total_checkpointed_run_time_ms + (now - start_time_current_model);
    checkTimingConsistency();
  }

  /**
   * Score this DeepWater model
   * @param fTrain potentially downsampled training data for scoring
   * @param fValid potentially downsampled validation data for scoring
   * @param jobKey key of the owning job
   * @param iteration Map/Reduce iteration count
   * @param finalScoring true for the final scoring run at the end of training (skips best-model and early-stopping bookkeeping)
   * @return true if model building is ongoing
   */
  boolean doScoring(Frame fTrain, Frame fValid, Key<Job> jobKey, int iteration, boolean finalScoring) {
    final long now = System.currentTimeMillis();
    final double time_since_last_iter = now - _timeLastIterationEnter;
    updateTiming(jobKey);
    _timeLastIterationEnter = now;
    epoch_counter = (double)model_info().get_processed_total() / training_rows;
    boolean keep_running;

    // Auto-tuning
    // If auto-tuning is enabled and there are at least 10 ms of communication and per-iteration overhead
    // (to avoid doing things on multi-JVM on the same node), then adjust the auto-tuning parameter
    // 'actual_train_samples_per_iteration' such that the targeted ratio of comm to comp is achieved.
    if (get_params()._train_samples_per_iteration == -2 && iteration > 1) {
      Log.debug("Auto-tuning train_samples_per_iteration.");
      if (time_for_iteration_overhead_ms > 10) {
        Log.debug(" Time taken for per-iteration comm overhead: " + PrettyPrint.msecs(time_for_iteration_overhead_ms, true));
        Log.debug(" Time taken for Map/Reduce iteration: " + PrettyPrint.msecs((long) time_since_last_iter, true));
        final double comm_to_work_ratio = time_for_iteration_overhead_ms / time_since_last_iter;
        Log.debug(" Ratio of per-iteration comm overhead to computation: " + String.format("%.5f", comm_to_work_ratio));
        Log.debug(" target_comm_to_work: " + get_params()._target_ratio_comm_to_comp);
        Log.debug("Old value of train_samples_per_iteration: " + actual_train_samples_per_iteration);
        double correction = get_params()._target_ratio_comm_to_comp / comm_to_work_ratio;
        correction = Math.max(0.5, Math.min(2, correction)); // it's ok to train up to 2x more training rows per iteration, but not fewer than half
        if (Math.abs(correction) < 0.8 || Math.abs(correction) > 1.2) { // don't correct unless it's significant (avoid slow drift)
          actual_train_samples_per_iteration /= correction;
          actual_train_samples_per_iteration = Math.max(1, actual_train_samples_per_iteration);
          Log.debug("New value of train_samples_per_iteration: " + actual_train_samples_per_iteration);
        } else {
          Log.debug("Keeping value of train_samples_per_iteration the same (would deviate too little from previous value): " + actual_train_samples_per_iteration);
        }
      } else {
        Log.debug("Iteration overhead is below 10 ms. Not modifying train_samples_per_iteration: " + actual_train_samples_per_iteration);
      }
    }

    keep_running = (epoch_counter < get_params()._epochs) && !stopped_early;
    final long sinceLastScore = now - _timeLastScoreStart;

    // this is potentially slow - only do every so often
    if (!keep_running || get_params()._score_each_iteration ||
        (sinceLastScore > get_params()._score_interval * 1000 // don't score too often
         && (double)(_timeLastScoreEnd - _timeLastScoreStart) / sinceLastScore < get_params()._score_duty_cycle)) { // duty cycle
      Log.info(logNvidiaStats());
      jobKey.get().update(0, "Scoring on " + fTrain.numRows() + " training samples" + (fValid != null ? (", " + fValid.numRows() + " validation samples") : ""));
      final boolean printme = !get_params()._quiet_mode;
      _timeLastScoreStart = System.currentTimeMillis();
      DeepWaterScoringInfo scoringInfo = new DeepWaterScoringInfo();
      scoringInfo.time_stamp_ms = _timeLastScoreStart;
      updateTiming(jobKey);
      scoringInfo.total_training_time_ms = total_training_time_ms;
      scoringInfo.total_scoring_time_ms = total_scoring_time_ms;
      scoringInfo.total_setup_time_ms = total_setup_time_ms;
      scoringInfo.epoch_counter = epoch_counter;
      scoringInfo.iterations = iterations;
      scoringInfo.training_samples = (double)model_info().get_processed_total();
      scoringInfo.validation = fValid != null;
      scoringInfo.score_training_samples = fTrain.numRows();
      scoringInfo.score_validation_samples = get_params()._score_validation_samples;
      scoringInfo.is_classification = _output.isClassifier();
      scoringInfo.is_autoencoder = _output.isAutoencoder();

      if (printme) Log.info("Scoring the model.");
      // compute errors
      final String m = model_info().toString();
      if (m.length() > 0) Log.info(m);

      // For GainsLift and Huber, we need the full predictions to compute the model metrics
      boolean needPreds = _output.nclasses() == 2 /* gains/lift table requires predictions */
          || get_params()._distribution == DistributionFamily.huber;

      // Scoring on training data
      ModelMetrics mtrain;
      Frame preds = null;
      if (needPreds) {
        // allocate predictions since they are needed
        preds = score(fTrain);
        mtrain = ModelMetrics.getFromDKV(this, fTrain);
      } else {
        // no need to allocate predictions
        ModelMetrics.MetricBuilder mb = scoreMetrics(fTrain);
        mtrain = mb.makeModelMetrics(this, fTrain, fTrain, null);
      }
      if (preds != null) preds.remove();
      _output._training_metrics = mtrain;
      scoringInfo.scored_train = new ScoreKeeper(mtrain);
      ModelMetricsSupervised mm1 = (ModelMetricsSupervised)mtrain;
      if (mm1 instanceof ModelMetricsBinomial) {
        ModelMetricsBinomial mm = (ModelMetricsBinomial)(mm1);
        scoringInfo.training_AUC = mm._auc;
      }
      if (fTrain.numRows() != training_rows) {
        _output._training_metrics._description = "Metrics reported on temporary training frame with " + fTrain.numRows() + " samples";
      } else if (fTrain._key != null && fTrain._key.toString().contains("chunks")) {
        _output._training_metrics._description = "Metrics reported on temporary (load-balanced) training frame";
      } else {
        _output._training_metrics._description = "Metrics reported on full training frame";
      }

      // Scoring on validation data
      ModelMetrics mvalid;
      if (fValid != null) {
        preds = null;
        if (needPreds) {
          // allocate predictions since they are needed
          preds = score(fValid);
          mvalid = ModelMetrics.getFromDKV(this, fValid);
        } else {
          // no need to allocate predictions
          ModelMetrics.MetricBuilder mb = scoreMetrics(fValid);
          mvalid = mb.makeModelMetrics(this, fValid, fValid, null);
        }
        if (preds != null) preds.remove();
        _output._validation_metrics = mvalid;
        scoringInfo.scored_valid = new ScoreKeeper(mvalid);
        if (mvalid != null) {
          if (mvalid instanceof ModelMetricsBinomial) {
            ModelMetricsBinomial mm = (ModelMetricsBinomial) mvalid;
            scoringInfo.validation_AUC = mm._auc;
          }
          if (fValid.numRows() != validation_rows) {
            _output._validation_metrics._description = "Metrics reported on temporary validation frame with " + fValid.numRows() + " samples";
          } else if (fValid._key != null && fValid._key.toString().contains("chunks")) {
            _output._validation_metrics._description = "Metrics reported on temporary (load-balanced) validation frame";
          } else {
            _output._validation_metrics._description = "Metrics reported on full validation frame";
          }
        }
      }

//      if (get_params()._variable_importances) {
//        if (!get_params()._quiet_mode) Log.info("Computing variable importances.");
//        throw H2O.unimpl();
//        final float[] vi = model_info().computeVariableImportances();
//        scoringInfo.variable_importances = new VarImp(vi, Arrays.copyOfRange(model_info().data_info().coefNames(), 0, vi.length));
//      }

      _timeLastScoreEnd = System.currentTimeMillis();
      long scoringTime = _timeLastScoreEnd - _timeLastScoreStart;
      total_scoring_time_ms += scoringTime;
      updateTiming(jobKey);
      // update the scoringInfo object to report proper speed
      scoringInfo.total_training_time_ms = total_training_time_ms;
      scoringInfo.total_scoring_time_ms = total_scoring_time_ms;
      scoringInfo.this_scoring_time_ms = scoringTime;
      // enlarge the error array by one, push latest score back
      if (this.scoringInfo == null) {
        this.scoringInfo = new DeepWaterScoringInfo[]{scoringInfo};
      } else {
        DeepWaterScoringInfo[] err2 = new DeepWaterScoringInfo[this.scoringInfo.length + 1];
        System.arraycopy(this.scoringInfo, 0, err2, 0, this.scoringInfo.length);
        err2[err2.length - 1] = scoringInfo;
        this.scoringInfo = err2;
      }
      _output.errors = last_scored();
      _output._scoring_history = DeepWaterScoringInfo.createScoringHistoryTable(this.scoringInfo, (null != get_params()._valid), false, _output.getModelCategory(), _output.isAutoencoder());
      _output._variable_importances = calcVarImp(last_scored().variable_importances);
      _output._model_summary = model_info.createSummaryTable();

      // always keep a copy of the best model so far (based on the following criterion)
      if (!finalScoring) {
        if (actual_best_model_key != null && get_params()._overwrite_with_best_model && (
            // if we have a best_model in DKV, then compare against its error() (unless it's a different model as judged by the network size)
            (DKV.get(actual_best_model_key) != null && !(loss() >= DKV.get(actual_best_model_key).<DeepWaterModel>get().loss())) ||
            // otherwise, compare against our own _bestLoss
            (DKV.get(actual_best_model_key) == null && loss() < _bestLoss))) {
          _bestLoss = loss();
          model_info.nativeToJava();
          putMeAsBestModel(actual_best_model_key);
        }
        // print the freshly scored model to ASCII
        if (keep_running && printme) Log.info(toString());
        if (ScoreKeeper.stopEarly(ScoringInfo.scoreKeepers(scoring_history()),
            get_params()._stopping_rounds, _output.isClassifier(), get_params()._stopping_metric, get_params()._stopping_tolerance, "model's last", true)) {
          Log.info("Convergence detected based on simple moving average of the loss function for the past " + get_params()._stopping_rounds + " scoring events. Model building completed.");
          stopped_early = true;
        }
        if (printme) Log.info("Time taken for scoring and diagnostics: " + PrettyPrint.msecs(scoringInfo.this_scoring_time_ms, true));
      }
    }
    if (stopped_early) {
      // pretend as if we finished all epochs to get the progress bar pretty (especially for N-fold and grid-search)
      ((Job) DKV.getGet(jobKey)).update((long) (get_params()._epochs * training_rows));
      update(jobKey);
      return false;
    }
    progressUpdate(jobKey, keep_running);
    //update(jobKey);
    return keep_running;
  }

  private void putMeAsBestModel(Key bestModelKey) {
    DKV.put(bestModelKey, IcedUtils.deepCopy(this));
    assert DKV.get(bestModelKey) != null;
    assert ((DeepWaterModel)DKV.getGet(bestModelKey)).compareTo(this) <= 0;
  }

  private void progressUpdate(Key<Job> job_key, boolean keep_running) {
    updateTiming(job_key);
    Job job = job_key.get();
    double progress = job.progress();
//    Log.info("2nd speed: (samples: " + model_info().get_processed_total() + ", total_run_time: " + total_training_time_ms + ", total_scoring_time: " + total_scoring_time_ms + ", total_setup_time: " + total_setup_time_ms + ")");
    float speed = (float)(model_info().get_processed_total() * 1000. / (total_training_time_ms - total_scoring_time_ms - total_setup_time_ms));
    assert(speed >= 0) : "negative speed computed! (total_run_time: " + total_training_time_ms + ", total_scoring_time: " + total_scoring_time_ms + ", total_setup_time: " + total_setup_time_ms + ")";
    String msg = "Iterations: " + String.format("%,d", iterations)
        + ". Epochs: " + String.format("%g", epoch_counter)
        + ". Speed: " + (speed > 10 ? String.format("%d", (int)speed) : String.format("%g", speed)) + " samples/sec."
        + (progress == 0 ? "" : " Estimated time left: " + PrettyPrint.msecs((long) (total_training_time_ms * (1. - progress) / progress), true));
    job.update(actual_train_samples_per_iteration, msg); // mark the amount of work done for the progress bar
    long now = System.currentTimeMillis();
    long sinceLastPrint = now - _timeLastPrintStart;
    if (!keep_running || sinceLastPrint > get_params()._score_interval * 1000) { // print this after every score_interval, not considering duty cycle
      _timeLastPrintStart = now;
      if (!get_params()._quiet_mode) {
        Log.info("Training time: " + PrettyPrint.msecs(total_training_time_ms, true)
            + " (scoring: " + PrettyPrint.msecs(total_scoring_time_ms, true) + "). "
            + "Processed " + String.format("%,d", model_info().get_processed_total()) + " samples"
            + " (" + String.format("%.3f", epoch_counter) + " epochs).\n");
        Log.info(msg);
      }
    }
  }

  private int backendCount = 0;

  @Override protected void setupBigScorePredict() {
    synchronized (model_info()) {
      backendCount++;
      // Initial init of backend + model, backend is shared across threads
      if (null == model_info()._backend) {
        model_info().javaToNative();
      }
      // Backend already initialized, initialize model per thread
      if (null == model_info().getModel().get()) {
        model_info().initModel();
      }
    }
  }

  @Override protected void closeBigScorePredict() {
    synchronized (model_info()) {
      if (0 == --backendCount) {
        // No more threads using the backend, nuke backend + model
        model_info().nukeBackend();
      } else if (null != model_info().getModel().get()) {
        // Backend still used by other threads, nuke only model
        model_info().nukeModel();
      }
    }
  }

  /**
   * Single-instance scoring - slow, not optimized for mini-batches - do not use unless you know what you're doing
   * @param data One single observation unrolled into a double[], with a length equal to the number of input neurons
   * @param preds Array to store the predictions in (nclasses+1)
   * @return vector of [0, p0, p1, p2, etc.]
   */
  @Override protected double[] score0(double[] data, double[] preds) {
    // allocate a big enough array for the model to be able to score with mini_batch
    float[] f = new float[_parms._mini_batch_size * data.length];
    for (int i = 0; i < data.length; ++i) f[i] = (float)data[i]; // only fill the first observation
    //float[] predFloats = model_info().predict(f);
    float[] predFloats = model_info._backend.predict(model_info.getModel().get(), f);
    if (_output.nclasses() >= 2) {
      for (int i = 1; i < _output.nclasses() + 1; ++i) preds[i] = predFloats[i];
    } else {
      preds[0] = predFloats[0];
    }
    return preds;
  }

  @Override public double[] score0(double[] data, double[] preds, double weight, double offset) {
    assert(weight == 1);
    assert(offset == 0);
    return score0(data, preds);
  }

  @Override protected long checksum_impl() {
    return super.checksum_impl() * _output._run_time + model_info().hashCode();
  }

  @Override public Frame scoreAutoEncoder(Frame frame, Key destination_key, boolean reconstruction_error_per_feature) { throw H2O.unimpl(); }
  @Override public Frame scoreDeepFeatures(Frame frame, int layer) { throw H2O.unimpl(); }
  @Override public Frame scoreDeepFeatures(Frame frame, int layer, Job j) { throw H2O.unimpl(); }

  @Override public Frame scoreDeepFeatures(Frame frame, String layer, Job job) {
    if (layer == null)
      throw new H2OIllegalArgumentException("must give hidden layer (symbol) name to extract - cannot be null");
    if (isSupervised()) {
      int ridx = frame.find(_output.responseName());
      if (ridx != -1) {
        // drop the response for scoring!
        frame = new Frame(frame);
        frame.remove(ridx);
      }
    }
    Frame adaptFrm = new Frame(frame);
    Scope.enter();
    adaptTestForTrain(adaptFrm, true, false);
    Frame _fr = adaptFrm;
    DataInfo di = model_info()._dataInfo;
    if (di != null) {
      di = IcedUtils.deepCopy(di);
      di._adaptedFrame = _fr; // dinfo logic on _adaptedFrame is what we'll need for extracting standardized features from the data for scoring
    }
    final int dataIdx = 0; //FIXME
    final int weightIdx = _fr.find(get_params()._weights_column);
    final int batch_size = get_params()._mini_batch_size;
    ArrayList score_data = new ArrayList(); // for binary data (path to data)
    ArrayList<Integer> skipped = new ArrayList<>();
    // randomly add more rows to fill up to a multiple of batch_size
    long seed = 0xDECAF + 0xD00D * model_info().get_processed_global();
    Random rng = RandomUtils.getRNG(seed);
    // make predictions for all rows - even those with weights 0 for now (easier to deal with minibatch)
    BufferedString bs = new BufferedString();
    if ((int)_fr.numRows() != _fr.numRows()) {
      throw new IllegalArgumentException("Cannot handle datasets with more than 2 billion rows.");
    }
    for (int i = 0; i < _fr.numRows(); ++i) {
      double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
      if (weight == 0) { // don't send observations with weight 0 to the GPU
        skipped.add(i);
        continue;
      }
      if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.image
          || model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
        BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
        if (file != null) score_data.add(file.toString());
      } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
        score_data.add(i);
      } else throw H2O.unimpl();
    }
    while (score_data.size() % batch_size != 0) {
      int pick = rng.nextInt(score_data.size());
      score_data.add(score_data.get(pick));
    }
    assert(isSupervised()); // not yet implemented for autoencoder
    final boolean makeNative = model_info()._backend == null;
    if (makeNative) model_info().javaToNative();
    Frame _predFrame = null;
    DeepWaterIterator iter;
    try {
      // first, figure out hidden layer dimensionality - do this the hard way
      int cols;
      {
        if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.image) {
          int width = model_info()._width;
          int height = model_info()._height;
          int channels = model_info()._channels;
          iter = new DeepWaterImageIterator(score_data, null /*no labels*/, model_info()._meanData, batch_size, width, height, channels, model_info().get_params()._cache_data);
        } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
          iter = new DeepWaterDatasetIterator(score_data, null /*no labels*/, di, batch_size, model_info().get_params()._cache_data);
        } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
          iter = new DeepWaterTextIterator(score_data, null /*no labels*/, batch_size, 56 /*FIXME*/, model_info().get_params()._cache_data);
        } else {
          throw H2O.unimpl();
        }
        float[] data = iter.getData();
        float[] predFloats = model_info().extractLayer(layer, data); // just to see how big this gets
        if (predFloats.length == 0) {
          throw new IllegalArgumentException(model_info().listAllLayers());
        }
        cols = predFloats.length;
        assert (cols % batch_size == 0);
        cols /= batch_size;
      }
      // allocate the predictions Vec/Frame
      Vec[] predVecs = new Vec[cols];
      for (int i = 0; i < cols; ++i)
        predVecs[i] = _fr.anyVec().makeZero();
      _predFrame = new Frame(predVecs);
      String[] names = new String[cols];
      for (int j = 0; j < cols; ++j) {
        names[j] = "DF." + layer + ".C" + (j+1);
      }
      _predFrame.setNames(names);
      Vec.Writer[] vw = new Vec.Writer[cols];
      // prep predictions vec for writing
      for (int i = 0; i < vw.length; ++i)
        vw[i] = _predFrame.vec(i).open();
      // re-create the iterators
      long row = 0;
      int skippedIdx = 0;
      int skippedRow = skipped.isEmpty() ? -1 : skipped.get(skippedIdx);
      if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.image) {
        int width = model_info()._width;
        int height = model_info()._height;
        int channels = model_info()._channels;
        iter = new DeepWaterImageIterator(score_data, null /*no labels*/, model_info()._meanData, batch_size, width, height, channels, model_info().get_params()._cache_data);
      } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
        iter = new DeepWaterDatasetIterator(score_data, null /*no labels*/, di, batch_size, model_info().get_params()._cache_data);
      } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
        iter = new DeepWaterTextIterator(score_data, null /*no labels*/, batch_size, 56 /*FIXME*/, model_info().get_params()._cache_data);
      } else {
        throw H2O.unimpl();
      }
      // extract actual hidden layer data
      Futures fs = new Futures();
      while (iter.Next(fs)) {
        float[] data = iter.getData();
        float[] predFloats = model_info().extractLayer(layer, data);
//        System.err.println("preds: " + Arrays.toString(predFloats));
        // fill the pre-created output Frame
        for (int j = 0; j < batch_size; ++j) {
          while (row == skippedRow) {
            assert(weightIdx == -1 || _fr.vec(weightIdx).at(row) == 0);
            if (skipped.size() > skippedIdx+1) {
              skippedRow = skipped.get(++skippedIdx);
            }
            row++;
          }
          if (row >= _fr.numRows()) break;
          for (int i = 0; i < cols; ++i)
            vw[i].set(row, predFloats[j*cols + i]);
          row++;
        }
        for (Vec.Writer aVw : vw) aVw.close(fs);
        fs.blockForPending();
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (makeNative) model_info().nukeBackend();
    }
    return _predFrame;
  }

  class DeepWaterBigScore extends BigScore {
    Frame _predFrame; // OUTPUT

    @Override public Frame outputFrame(Key<Frame> key, String[] names, String[][] domains) {
      _predFrame = new Frame(key, names, _predFrame.vecs());
      if (domains != null)
        _predFrame.vec(0).setDomain(domains[0]); // only the label is ever categorical
      if (_predFrame._key != null)
        DKV.put(_predFrame);
      return _predFrame;
    }

    @Override public void map(Chunk[] chks, NewChunk[] cpreds) { }
    @Override public void reduce(BigScore bs) { }

    @Override protected void setupLocal() {
      if (model_info._unstable) {
        Log.err("Cannot score with an unstable model.");
        Log.err(unstable_msg);
        throw new UnsupportedOperationException(unstable_msg);
      }
      DataInfo di = model_info()._dataInfo;
      if (di != null) {
        di = IcedUtils.deepCopy(di);
        di._adaptedFrame = _fr; // dinfo logic on _adaptedFrame is what we'll need for extracting standardized features from the data for scoring
      }
      final int dataIdx = 0; //FIXME
      final int weightIdx = _fr.find(get_params()._weights_column);
      final int respIdx = _fr.find(get_params()._response_column);
      final int batch_size = get_params()._mini_batch_size;
      final int classes = _output.nclasses();
      ArrayList score_data = new ArrayList(); // for binary data (path to data)
      ArrayList<Integer> skipped = new ArrayList<>();
      // randomly add more rows to fill up to a multiple of batch_size
      long seed = 0xDECAF + 0xD00D * model_info().get_processed_global();
      Random rng = RandomUtils.getRNG(seed);
      // make predictions for all rows - even those with weights 0 for now (easier to deal with minibatch)
      BufferedString bs = new BufferedString();
      if ((int)_fr.numRows() != _fr.numRows()) {
        throw new IllegalArgumentException("Cannot handle datasets with more than 2 billion rows.");
      }
      for (int i = 0; i < _fr.numRows(); ++i) {
        if (isCancelled() || _j != null && _j.stop_requested()) return;
        double weight = weightIdx == -1 ? 1 : _fr.vec(weightIdx).at(i);
        if (weight == 0) { // don't send observations with weight 0 to the GPU
          skipped.add(i);
          continue;
        }
        if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.image
            || model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
          BufferedString file = _fr.vec(dataIdx).atStr(bs, i);
          if (file != null) score_data.add(file.toString());
        } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
          score_data.add(i);
        } else throw H2O.unimpl();
      }
      while (score_data.size() % batch_size != 0) {
        int pick = rng.nextInt(score_data.size());
        score_data.add(score_data.get(pick));
      }
      _mb = makeMetricBuilder(_domain);
      assert(isSupervised()); // not yet implemented for autoencoder
      int cols = _output.nclasses() + (_output.isClassifier() ? 1 : 0);
      if (_makePreds) {
        Vec[] predVecs = new Vec[cols];
        for (int i = 0; i < cols; ++i)
          predVecs[i] = _fr.anyVec().makeZero();
        _predFrame = new Frame(predVecs);
      }
      DeepWaterIterator iter;
      try {
        Vec.Writer[] vw = new Vec.Writer[cols];
        if (_makePreds) {
          // prep predictions vec for writing
          for (int i = 0; i < vw.length; ++i)
            vw[i] = _predFrame.vec(i).open();
        }
        long row = 0;
        int skippedIdx = 0;
        int skippedRow = skipped.isEmpty() ? -1 : skipped.get(skippedIdx);
        double mul = 1;
        double sub = 0;
        if (_output._normRespMul != null && _output._normRespSub != null) {
          mul = _output._normRespMul[0];
          sub = _output._normRespSub[0];
        }
        if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.image) {
          int width = model_info()._width;
          int height = model_info()._height;
          int channels = model_info()._channels;
          iter = new DeepWaterImageIterator(score_data, null /*no labels*/, model_info()._meanData, batch_size, width, height, channels, model_info().get_params()._cache_data);
        } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.dataset) {
          iter = new DeepWaterDatasetIterator(score_data, null /*no labels*/, di, batch_size, model_info().get_params()._cache_data);
        } else if (model_info().get_params()._problem_type == DeepWaterParameters.ProblemType.text) {
          iter = new DeepWaterTextIterator(score_data, null /*no labels*/, batch_size, 56 /*FIXME*/, model_info().get_params()._cache_data);
        } else {
          throw H2O.unimpl();
        }
        Futures fs = new Futures();
        while (iter.Next(fs)) {
          if (isCancelled() || _j != null && _j.stop_requested()) return;
          float[] data = iter.getData();
          float[] predFloats = model_info().predict(data);
//          System.err.println("preds: " + Arrays.toString(predFloats));
//          Log.info("Scoring on " + batch_size + " samples (rows " + row + " and up): " + Arrays.toString(((DeepWaterImageIterator)iter).getFiles()));
          // fill the pre-created output Frame
          boolean unstable = false;
          for (int j = 0; j < batch_size; ++j) {
            while (row == skippedRow) {
              assert(weightIdx == -1 || _fr.vec(weightIdx).at(row) == 0);
              if (skipped.size() > skippedIdx+1) {
                skippedRow = skipped.get(++skippedIdx);
              }
              row++;
            }
            if (row >= _fr.numRows()) break;
            float[] actual = null;
            if (_computeMetrics)
              actual = new float[]{(float)_fr.vec(respIdx).at(row)};
            if (_output.isClassifier()) {
              double[] preds = new double[classes+1];
              for (int i = 0; i < classes; ++i) {
                int idx = j*classes + i; // [p0,...,p9, p0,...,p9, ..., p0,...,p9]
                preds[1+i] = predFloats[idx];
                if (Double.isNaN(preds[1+i])) unstable = true;
              }
              if (_parms._balance_classes)
                GenModel.correctProbabilities(preds, _output._priorClassDist, _output._modelClassDist);
              preds[0] = hex.genmodel.GenModel.getPrediction(preds, _output._priorClassDist, null, defaultThreshold());
              if (_makePreds) {
                //Log.info(iter.getFiles()[j] + " -> preds: " + Arrays.toString(preds));
                for (int i = 0; i <= classes; ++i)
                  vw[i].set(row, preds[i]);
              }
              if (_computeMetrics)
                _mb.perRow(preds, actual, DeepWaterModel.this);
            } else {
              double pred = predFloats[j] * mul + sub;
              if (Double.isNaN(pred)) unstable = true;
              if (_makePreds) vw[0].set(row, pred);
              if (_computeMetrics)
                _mb.perRow(new double[]{pred}, actual, DeepWaterModel.this);
            }
            row++;
          }
          if (_makePreds) {
            for (Vec.Writer aVw : vw) aVw.close(fs);
            fs.blockForPending();
          }
          if (unstable) {
            model_info._unstable = true;
            Log.err(unstable_msg);
            throw new UnsupportedOperationException(unstable_msg);
          }
        }
        if (_j != null) _j.update(_fr.anyVec().nChunks());
      } catch (IOException e) {
        e.printStackTrace();
      }
    }

    DeepWaterBigScore(String[] domain, int ncols, double[] mean, boolean testHasWeights, boolean computeMetrics, boolean makePreds, Job j) {
      super(domain, ncols, mean, testHasWeights, computeMetrics, makePreds, j);
    }
  }

  @Override protected Frame predictScoreImpl(Frame fr, Frame adaptFrm, String destination_key, Job j, boolean computeMetrics) {
    final boolean makeNative = model_info()._backend == null;
    if (makeNative) model_info().javaToNative();
    // Build up the names & domains.
    String[] names = makeScoringNames();
    String[][] domains = new String[names.length][];
    domains[0] = names.length == 1 ? null : !computeMetrics ? _output._domains[_output._domains.length-1] : adaptFrm.lastVec().domain();

    //DEBUGGING ONLY
    /*
    DataInfo _dinfo = model_info._dataInfoKey.get();
    for (int r=0; r<_dinfo._adaptedFrame.numRows(); ++r) {
      // Version 1 - via DataInfo
      DataInfo.Row row = _dinfo.newDenseRow();
      Chunk[] chks = new Chunk[_dinfo._adaptedFrame.numCols()];
      for (int i = 0; i < chks.length; ++i) chks[i] = _dinfo._adaptedFrame.vec(i).chunkForRow(r);
      for (int i = 0; i < chks.length; ++i) assert (chks[i]._len == chks[0]._len);
      _dinfo.extractDenseRow(chks, r - (int)chks[0].start(), row);
      // Version 2 - via GenModel
      double[] from = new double[chks.length];
      for (int i = 0; i < chks.length; ++i) from[i] = chks[i].atd(r - (int)chks[0].start());
      float[] _destData = new float[_output._nums + _output._catOffsets[_output._cats]];
      GenModel.setInput(from, _destData, _output._nums, _output._cats, _output._catOffsets, _output._normMul, _output._normSub, _output._useAllFactorLevels);
      // Compare the two
      for (int i = 0; i < _dinfo.fullN(); ++i)
        assert Math.abs(_destData[i] - row.get(i)) <= 1e-6 * Math.abs(_destData[i] + row.get(i)) : " feature " + i + " is " + _destData[i] + " vs " + row.get(i);
    }
    */

    // Score the dataset, building the class distribution & predictions
    BigScore bs = new DeepWaterBigScore(domains[0], names.length, adaptFrm.means(), _output.hasWeights() && adaptFrm.find(_output.weightsName()) >= 0, computeMetrics, true /*make preds*/, j).doAll(adaptFrm);
    if (computeMetrics)
      bs._mb.makeModelMetrics(this, fr, adaptFrm, bs.outputFrame());
    if (makeNative) removeNativeState();
    return bs.outputFrame(null == destination_key ? Key.<Frame>make() : Key.<Frame>make(destination_key), names, domains);
  }

  @Override protected ModelMetrics.MetricBuilder scoreMetrics(Frame adaptFrm) {
    final boolean makeNative = model_info()._backend == null;
    if (makeNative) model_info().javaToNative();
    final boolean computeMetrics = (!isSupervised() || (adaptFrm.vec(_output.responseName()) != null && !adaptFrm.vec(_output.responseName()).isBad()));
    // Build up the names & domains.
    String[] domain = !computeMetrics ? _output._domains[_output._domains.length-1] : adaptFrm.lastVec().domain();
    // Score the dataset, building the class distribution & predictions
    BigScore bs = new DeepWaterBigScore(domain, 0, adaptFrm.means(), _output.hasWeights() && adaptFrm.find(_output.weightsName()) >= 0, computeMetrics, false /*no preds*/, null).doAll(adaptFrm);
    if (makeNative) removeNativeState();
    return bs._mb;
  }

  void removeNativeState() {
    model_info().nukeBackend();
  }

  @Override protected Futures remove_impl(Futures fs) {
    cleanUpCache(fs);
    removeNativeState();
    if (actual_best_model_key != null)
      DKV.remove(actual_best_model_key);
    if (model_info()._dataInfo != null)
      model_info()._dataInfo.remove(fs);
    return super.remove_impl(fs);
  }

  void exportNativeModel(String path, int iteration) {
    model_info().saveNativeState(path, iteration);
  }

  static String CACHE_MARKER = "__d33pW473r_1n73rn4l__";

  void cleanUpCache() {
    cleanUpCache(null);
  }

  private void cleanUpCache(Futures fs) {
    final Key[] cacheKeys = KeySnapshot.globalSnapshot().filter(new KeySnapshot.KVFilter() {
      @Override public boolean filter(KeySnapshot.KeyInfo k) {
        return Value.isSubclassOf(k._type, DeepWaterImageIterator.IcedImage.class) && k._key.toString().contains(CACHE_MARKER)
            || Value.isSubclassOf(k._type, DeepWaterDatasetIterator.IcedRow.class) && k._key.toString().contains(CACHE_MARKER);
      }
    }).keys();
    if (fs == null) fs = new Futures();
    for (Key k : cacheKeys) DKV.remove(k, fs);
    fs.blockForPending();
  }

  private static String getNvidiaStats() throws java.io.IOException {
    String cmd = "nvidia-smi";
    InputStream stdin = Runtime.getRuntime().exec(cmd).getInputStream();
    InputStreamReader isr = new InputStreamReader(stdin);
    BufferedReader br = new BufferedReader(isr);
    StringBuilder sb = new StringBuilder();
    String s;
    while ((s = br.readLine()) != null) {
      sb.append(s).append("\n");
    }
    return sb.toString();
  }

  private static String logNvidiaStats() {
    try {
      return (getNvidiaStats());
    } catch (IOException e) {
      return null;
    }
  }
}