package hex.deeplearning; import hex.*; import water.*; import water.util.*; import static water.util.MRUtils.sampleFrame; import static water.util.MRUtils.sampleFrameStratified; import hex.FrameTask.DataInfo; import water.api.*; import water.fvec.Frame; import water.fvec.RebalanceDataSet; import water.fvec.Vec; import java.lang.reflect.Field; import java.util.Arrays; import java.util.Random; /** * Deep Learning Neural Net implementation based on MRTask2 */ public class DeepLearning extends Job.ValidatedJob { static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields public static DocGen.FieldDoc[] DOC_FIELDS; public static final String DOC_GET = "Deep Learning"; /** * A model key associated with a previously trained Deep Learning * model. This option allows users to build a new model as a * continuation of a previously generated model (e.g., by a grid search). */ @API(help = "Model checkpoint to resume training with", filter= Default.class, json = true) public Key checkpoint; /** * If enabled, store the best model under the destination key of this model at the end of training. * Only applicable if training is not cancelled. */ @API(help = "If enabled, override the final model with the best model found during training", filter= Default.class, json = true) public boolean override_with_best_model = true; /** * Unlock expert mode parameters that can affect model building speed, * predictive accuracy and scoring. Leaving expert mode parameters at default * values is fine for many problems, but best results on complex datasets are often * only attainable via expert mode options. */ @API(help = "Enable expert mode (to access all options from GUI)", filter = Default.class, json = true) public boolean expert_mode = false; @API(help = "Auto-Encoder (Experimental)", filter= Default.class, json = true) public boolean autoencoder = false; @API(help="Use all factor levels of categorical variables. Otherwise, the first factor level is omitted (without loss of accuracy). Useful for variable importances and auto-enabled for autoencoder.",filter=Default.class, json=true, importance = ParamImportance.SECONDARY) public boolean use_all_factor_levels = true; /*Neural Net Topology*/ /** * The activation function (non-linearity) to be used by the neurons in the hidden layers. * Tanh: Hyperbolic tangent function (same as scaled and shifted sigmoid). * Rectifier: Chooses the maximum of (0, x) where x is the input value. * Maxout: Chooses the maximum coordinate of the input vector. * With Dropout: Zero out a random user-given fraction of the * incoming weights to each hidden layer during training, for each * training row. This effectively trains exponentially many models at * once, and can improve generalization. */ @API(help = "Activation function", filter = Default.class, json = true, importance = ParamImportance.CRITICAL) public Activation activation = Activation.Rectifier; /** * The number and size of each hidden layer in the model. * For example, if a user specifies "100,200,100" a model with 3 hidden * layers will be produced, and the middle hidden layer will have 200 * neurons. To specify a grid search, add parentheses around each * model's specification: "(100,100), (50,50,50), (20,20,20,20)". */ @API(help = "Hidden layer sizes (e.g. 100,100). 
Grid search: (10,10), (20,20,20)", filter = Default.class, json = true, importance = ParamImportance.CRITICAL) public int[] hidden = new int[] { 200, 200 }; /** * The number of passes over the training dataset to be carried out. * It is recommended to start with lower values for initial grid searches. * This value can be modified during checkpoint restarts and allows continuation * of selected models. */ @API(help = "How many times the dataset should be iterated (streamed), can be fractional", filter = Default.class, dmin = 1e-3, json = true, importance = ParamImportance.CRITICAL) public double epochs = 10; /** * The number of training data rows to be processed per iteration. Note that * independent of this parameter, each row is used immediately to update the model * with (online) stochastic gradient descent. This parameter controls the * synchronization period between nodes in a distributed environment and the * frequency at which scoring and model cancellation can happen. For example, if * it is set to 10,000 on H2O running on 4 nodes, then each node will * process 2,500 rows per iteration, sampling randomly from their local data. * Then, model averaging between the nodes takes place, and scoring can happen * (dependent on scoring interval and duty factor). Special values are 0 for * one epoch per iteration, -1 for processing the maximum amount of data * per iteration (if **replicate training data** is enabled, N epochs * will be trained per iteration on N nodes, otherwise one epoch). Special value * of -2 turns on automatic mode (auto-tuning). */ @API(help = "Number of training samples (globally) per MapReduce iteration. Special values are 0: one epoch, -1: all available data (e.g., replicated training data), -2: automatic", filter = Default.class, lmin = -2, json = true, importance = ParamImportance.SECONDARY) public long train_samples_per_iteration = -2; // @API(help = "Target ratio of communication overhead to computation. Only for multi-node operation and train_samples_per_iteration=-2 (auto-tuning)", filter = Default.class, dmin = 1e-3, dmax=0.999, json = true, importance = ParamImportance.SECONDARY) public double target_ratio_comm_to_comp = 0.02; /** * The random seed controls sampling and initialization. Reproducible * results are only expected with single-threaded operation (i.e., * when running on one node, turning off load balancing and providing * a small dataset that fits in one chunk). In general, the * multi-threaded asynchronous updates to the model parameters will * result in (intentional) race conditions and non-reproducible * results. Note that deterministic sampling and initialization might * still lead to some weak sense of determinism in the model. */ @API(help = "Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded", filter = Default.class, json = true) public long seed = new Random().nextLong(); /*Adaptive Learning Rate*/ /** * The implemented adaptive learning rate algorithm (ADADELTA) automatically * combines the benefits of learning rate annealing and momentum * training to avoid slow convergence. Specification of only two * parameters (rho and epsilon) simplifies hyper parameter search. * In some cases, manually controlled (non-adaptive) learning rate and * momentum specifications can lead to better results, but require the * specification (and hyper parameter search) of up to 7 parameters. 
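* As a rough sketch of the standard published ADADELTA update (the internal
* implementation may differ in details; E_g2 and E_dx2 are illustrative
* running-average variables, not fields of this class), the per-weight update
* for a gradient g is:
* <pre>{@code
* E_g2  = rho * E_g2  + (1 - rho) * g * g;                      // decaying average of squared gradients
* dw    = -Math.sqrt((E_dx2 + epsilon) / (E_g2 + epsilon)) * g; // adaptive step; no global learning rate
* E_dx2 = rho * E_dx2 + (1 - rho) * dw * dw;                    // decaying average of squared updates
* w    += dw;
* }</pre>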
* If the model is built on a topology with many local minima or * long plateaus, it is possible for a constant learning rate to produce * sub-optimal results. Learning rate annealing allows digging deeper into * local minima, while rate decay allows specification of different * learning rates per layer. When the gradient is being estimated in * a long valley in the optimization landscape, a large learning rate * can cause the gradient to oscillate and move in the wrong * direction. When the gradient is computed on a relatively flat * surface with small learning rates, the model can converge far * slower than necessary. */ @API(help = "Adaptive learning rate (ADADELTA)", filter = Default.class, json = true, importance = ParamImportance.SECONDARY) public boolean adaptive_rate = true; /** * The first of two hyper parameters for adaptive learning rate (ADADELTA). * It is similar to momentum and relates to the memory of prior weight updates. * Typical values are between 0.9 and 0.999. * This parameter is only active if adaptive learning rate is enabled. */ @API(help = "Adaptive learning rate time decay factor (similarity to prior updates)", filter = Default.class, dmin = 0.01, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double rho = 0.99; /** * The second of two hyper parameters for adaptive learning rate (ADADELTA). * It is similar to learning rate annealing during initial training * and momentum at later stages where it allows forward progress. * Typical values are between 1e-10 and 1e-4. * This parameter is only active if adaptive learning rate is enabled. */ @API(help = "Adaptive learning rate smoothing factor (to avoid divisions by zero and allow progress)", filter = Default.class, dmin = 1e-15, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double epsilon = 1e-8; /*Learning Rate*/ /** * When adaptive learning rate is disabled, the magnitude of the weight * updates is determined by the user-specified learning rate * (potentially annealed), and is a function of the difference * between the predicted value and the target value. That difference, * generally called delta, is only available at the output layer. To * correct the output at each hidden layer, back propagation is * used. Momentum modifies back propagation by allowing prior * iterations to influence the current update. Using the momentum * parameter can aid in avoiding local minima and the associated * instability. Too much momentum can lead to instabilities; that is * why the momentum is best ramped up slowly. * This parameter is only active if adaptive learning rate is disabled. */ @API(help = "Learning rate (higher => less stable, lower => slower convergence)", filter = Default.class, dmin = 1e-10, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double rate = .005; /** * Learning rate annealing reduces the learning rate to "freeze" into * local minima in the optimization landscape. The annealing rate is the * inverse of the number of training samples it takes to cut the learning rate in half * (e.g., 1e-6 means that it takes 1e6 training samples to halve the learning rate). * This parameter is only active if adaptive learning rate is disabled. */ @API(help = "Learning rate annealing: rate / (1 + rate_annealing * samples)", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double rate_annealing = 1e-6; /** * The learning rate decay parameter controls the change of learning rate across layers. 
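* Concretely (per the help string below), the weights feeding the N-th hidden
* layer are trained with an effective rate of {@code rate * rate_decay^(N-1)}.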
* For example, assume the rate parameter is set to 0.01, and the rate_decay parameter is set to 0.5. * Then the learning rate for the weights connecting the input and first hidden layer will be 0.01, * the learning rate for the weights connecting the first and the second hidden layer will be 0.005, * and the learning rate for the weights connecting the second and third hidden layer will be 0.0025, etc. * This parameter is only active if adaptive learning rate is disabled. */ @API(help = "Learning rate decay factor between layers (N-th layer: rate*alpha^(N-1))", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.EXPERT) public double rate_decay = 1.0; /*Momentum*/ /** * The momentum_start parameter controls the amount of momentum at the beginning of training. * This parameter is only active if adaptive learning rate is disabled. */ @API(help = "Initial momentum at the beginning of training (try 0.5)", filter = Default.class, dmin = 0, dmax = 0.9999999999, json = true, importance = ParamImportance.SECONDARY) public double momentum_start = 0; /** * The momentum_ramp parameter controls the amount of learning for which momentum increases * (assuming momentum_stable is larger than momentum_start). The ramp is measured in the number * of training samples. * This parameter is only active if adaptive learning rate is disabled. */ @API(help = "Number of training samples for which momentum increases", filter = Default.class, dmin = 1, json = true, importance = ParamImportance.SECONDARY) public double momentum_ramp = 1e6; /** * The momentum_stable parameter controls the final momentum value reached after momentum_ramp training samples. * The momentum used for training will remain the same for training beyond reaching that point. * This parameter is only active if adaptive learning rate is disabled. */ @API(help = "Final momentum after the ramp is over (try 0.99)", filter = Default.class, dmin = 0, dmax = 0.9999999999, json = true, importance = ParamImportance.SECONDARY) public double momentum_stable = 0; /** * The Nesterov accelerated gradient descent method is a modification to * traditional gradient descent for convex functions. The method relies on * gradient information at various points to build a polynomial approximation that * minimizes the residuals in fewer iterations of the descent. * This parameter is only active if adaptive learning rate is disabled. */ @API(help = "Use Nesterov accelerated gradient (recommended)", filter = Default.class, json = true, importance = ParamImportance.SECONDARY) public boolean nesterov_accelerated_gradient = true; /*Regularization*/ /** * A fraction of the features for each training row to be omitted from training in order * to improve generalization (dimension sampling). */ @API(help = "Input layer dropout ratio (can improve generalization, try 0.1 or 0.2)", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double input_dropout_ratio = 0.0; /** * A fraction of the inputs for each hidden layer to be omitted from training in order * to improve generalization. Defaults to 0.5 for each hidden layer if omitted. 
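* A minimal sketch of a per-layer specification (the values shown are
* arbitrary illustrations, not recommendations; requires one of the
* *WithDropout activation functions):
* <pre>{@code
* hidden                = new int[]    { 200, 200, 200 };
* hidden_dropout_ratios = new double[] { 0.5, 0.4, 0.3 }; // one ratio per hidden layer
* }</pre>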
*/ @API(help = "Hidden layer dropout ratios (can improve generalization), specify one value per hidden layer, defaults to 0.5", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double[] hidden_dropout_ratios; /** * A regularization method that constrains the absolute value of the weights and * has the net effect of dropping some weights (setting them to zero) from a model * to reduce complexity and avoid overfitting. */ @API(help = "L1 regularization (can add stability and improve generalization, causes many weights to become 0)", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double l1 = 0.0; /** * A regularization method that constrdains the sum of the squared * weights. This method introduces bias into parameter estimates, but * frequently produces substantial gains in modeling as estimate variance is * reduced. */ @API(help = "L2 regularization (can add stability and improve generalization, causes many weights to be small", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY) public double l2 = 0.0; /** * A maximum on the sum of the squared incoming weights into * any one neuron. This tuning parameter is especially useful for unbound * activation functions such as Maxout or Rectifier. */ @API(help = "Constraint for squared sum of incoming weights per unit (e.g. for Rectifier)", filter = Default.class, dmin = 1e-10, json = true, importance = ParamImportance.EXPERT) public float max_w2 = Float.POSITIVE_INFINITY; /*Initialization*/ /** * The distribution from which initial weights are to be drawn. The default * option is an optimized initialization that considers the size of the network. * The "uniform" option uses a uniform distribution with a mean of 0 and a given * interval. The "normal" option draws weights from the standard normal * distribution with a mean of 0 and given standard deviation. */ @API(help = "Initial Weight Distribution", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public InitialWeightDistribution initial_weight_distribution = InitialWeightDistribution.UniformAdaptive; /** * The scale of the distribution function for Uniform or Normal distributions. * For Uniform, the values are drawn uniformly from -initial_weight_scale...initial_weight_scale. * For Normal, the values are drawn from a Normal distribution with a standard deviation of initial_weight_scale. */ @API(help = "Uniform: -value...value, Normal: stddev)", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.EXPERT) public double initial_weight_scale = 1.0; /** * The loss (error) function to be minimized by the model. * Cross Entropy loss is used when the model output consists of independent * hypotheses, and the outputs can be interpreted as the probability that each * hypothesis is true. Cross entropy is the recommended loss function when the * target values are class labels, and especially for imbalanced data. * It strongly penalizes error in the prediction of the actual class label. * Mean Square loss is used when the model output are continuous real values, but can * be used for classification as well (where it emphasizes the error on all * output classes, not just for the actual class). 
*/ @API(help = "Loss function", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public Loss loss = Loss.Automatic; /*Scoring*/ /** * The minimum time (in seconds) to elapse between model scoring. The actual * interval is determined by the number of training samples per iteration and the scoring duty cycle. */ @API(help = "Shortest time interval (in secs) between model scoring", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.SECONDARY) public double score_interval = 5; /** * The number of training dataset points to be used for scoring. Will be * randomly sampled. Use 0 for selecting the entire training dataset. */ @API(help = "Number of training set samples for scoring (0 for all)", filter = Default.class, lmin = 0, json = true, importance = ParamImportance.EXPERT) public long score_training_samples = 10000l; /** * The number of validation dataset points to be used for scoring. Can be * randomly sampled or stratified (if "balance classes" is set and "score * validation sampling" is set to stratify). Use 0 for selecting the entire * training dataset. */ @API(help = "Number of validation set samples for scoring (0 for all)", filter = Default.class, lmin = 0, json = true, importance = ParamImportance.EXPERT) public long score_validation_samples = 0l; /** * Maximum fraction of wall clock time spent on model scoring on training and validation samples, * and on diagnostics such as computation of feature importances (i.e., not on training). */ @API(help = "Maximum duty cycle fraction for scoring (lower: more training, higher: more scoring).", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.EXPERT) public double score_duty_cycle = 0.1; /** * The stopping criteria in terms of classification error (1-accuracy) on the * training data scoring dataset. When the error is at or below this threshold, * training stops. */ @API(help = "Stopping criterion for classification error fraction on training data (-1 to disable)", filter = Default.class, dmin=-1, dmax=1, json = true, importance = ParamImportance.EXPERT) public double classification_stop = 0; /** * The stopping criteria in terms of regression error (MSE) on the training * data scoring dataset. When the error is at or below this threshold, training * stops. */ @API(help = "Stopping criterion for regression error (MSE) on training data (-1 to disable)", filter = Default.class, dmin=-1, json = true, importance = ParamImportance.EXPERT) public double regression_stop = 1e-6; /** * Enable quiet mode for less output to standard output. */ @API(help = "Enable quiet mode for less output to standard output", filter = Default.class, json = true) public boolean quiet_mode = false; /** * For classification models, the maximum size (in terms of classes) of the * confusion matrix for it to be printed. This option is meant to avoid printing * extremely large confusion matrices. */ @API(help = "Max. size (number of classes) for confusion matrices to be shown", filter = Default.class, json = true) public int max_confusion_matrix_size = 20; /** * The maximum number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable) */ @API(help = "Max. 
number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)", filter = Default.class, lmin=0, json = true, importance = ParamImportance.EXPERT) public int max_hit_ratio_k = 10; /*Imbalanced Classes*/ /** * For imbalanced data, balance training data class counts via * over/under-sampling. This can result in improved predictive accuracy. */ @API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public boolean balance_classes = false; /** * Desired over/under-sampling ratios per class (lexicographic order). Only when balance_classes is enabled. If not specified, they will be automatically computed to obtain class balance during training. */ @API(help = "Desired over/under-sampling ratios per class (lexicographic order).", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.SECONDARY) public float[] class_sampling_factors; /** * When classes are balanced, limit the resulting dataset size to the * specified multiple of the original dataset size. */ @API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, importance = ParamImportance.EXPERT) public float max_after_balance_size = 5.0f; /** * Method used to sample the validation dataset for scoring, see Score Validation Samples above. */ @API(help = "Method used to sample validation dataset for scoring", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public ClassSamplingMethod score_validation_sampling = ClassSamplingMethod.Uniform; /*Misc*/ /** * Gather diagnostics for hidden layers, such as mean and RMS values of learning * rate, momentum, weights and biases. */ @API(help = "Enable diagnostics for hidden layers", filter = Default.class, json = true) public boolean diagnostics = true; /** * Whether to compute variable importances for input features. * The implemented method (by Gedeon) considers the weights connecting the * input features to the first two hidden layers. */ @API(help = "Compute variable importances for input features (Gedeon method) - can be slow for large networks", filter = Default.class, json = true) public boolean variable_importances = false; /** * Enable fast mode (minor approximation in back-propagation), should not affect results significantly. */ @API(help = "Enable fast mode (minor approximation in back-propagation)", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public boolean fast_mode = true; /** * Ignore constant training columns (no information can be gained anyway). */ @API(help = "Ignore constant training columns (no information can be gained anyway)", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public boolean ignore_const_cols = true; /** * Increase training speed on small datasets by splitting it into many chunks * to allow utilization of all cores. */ @API(help = "Force extra load balancing to increase training speed for small datasets (to keep all cores busy)", filter = Default.class, json = true) public boolean force_load_balance = true; /** * Replicate the entire training dataset onto every node for faster training on small datasets. 
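* For example (a worked case of the train_samples_per_iteration semantics
* described above): with replication on a 4-node cluster and
* train_samples_per_iteration = -1, each MapReduce iteration processes
* 4 x #rows training samples (one epoch per node), followed by model
* averaging; see also shuffle_training_data below.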
*/ @API(help = "Replicate the entire training dataset onto every node for faster training on small datasets", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public boolean replicate_training_data = true; /** * Run on a single node for fine-tuning of model parameters. Can be useful for * checkpoint resumes after training on multiple nodes for fast initial * convergence. */ @API(help = "Run on a single node for fine-tuning of model parameters", filter = Default.class, json = true) public boolean single_node_mode = false; /** * Enable shuffling of training data (on each node). This option is * recommended if training data is replicated on N nodes, and the number of training samples per iteration * is close to N times the dataset size, where all nodes train will (almost) all * the data. It is automatically enabled if the number of training samples per iteration is set to -1 (or to N * times the dataset size or larger). */ @API(help = "Enable shuffling of training data (recommended if training data is replicated and train_samples_per_iteration is close to #nodes x #rows)", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public boolean shuffle_training_data = false; // @API(help = "Handling of missing values. Either Skip or MeanImputation.", filter= Default.class, json = true) public MissingValuesHandling missing_values_handling = MissingValuesHandling.MeanImputation; @API(help = "Sparse data handling (Experimental).", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public boolean sparse = false; @API(help = "Use a column major weight matrix for input layer. Can speed up forward propagation, but might slow down backpropagation (Experimental).", filter = Default.class, json = true, importance = ParamImportance.EXPERT) public boolean col_major = false; @API(help = "Average activation for sparse auto-encoder (Experimental)", filter= Default.class, json = true) public double average_activation = 0; @API(help = "Sparsity regularization (Experimental)", filter= Default.class, json = true) public double sparsity_beta = 0; @API(help = "Max. 
number of categorical features, enforced via hashing (Experimental).", filter= Default.class, lmin = 1, json = true) public int max_categorical_features = Integer.MAX_VALUE; @API(help = "Force reproducibility on small data (will be slow - only uses 1 thread)", filter= Default.class, json = true) public boolean reproducible = false; public enum MissingValuesHandling { Skip, MeanImputation } public enum ClassSamplingMethod { Uniform, Stratified } public enum InitialWeightDistribution { UniformAdaptive, Uniform, Normal } /** * Activation functions */ public enum Activation { Tanh, TanhWithDropout, Rectifier, RectifierWithDropout, Maxout, MaxoutWithDropout } /** * Loss functions * CrossEntropy is recommended */ public enum Loss { Automatic, MeanSquare, CrossEntropy } // the following parameters can only be specified in expert mode transient final String [] expert_options = new String[] { "use_all_factor_levels", "loss", "max_w2", "score_training_samples", "score_validation_samples", "initial_weight_distribution", "initial_weight_scale", "diagnostics", "rate_decay", "score_duty_cycle", "variable_importances", "fast_mode", "score_validation_sampling", "ignore_const_cols", "force_load_balance", "replicate_training_data", "shuffle_training_data", "nesterov_accelerated_gradient", "classification_stop", "regression_stop", "quiet_mode", "max_confusion_matrix_size", "max_hit_ratio_k", "hidden_dropout_ratios", "single_node_mode", "sparse", "col_major", "autoencoder", "average_activation", "sparsity_beta", "max_categorical_features", }; // the following parameters can be modified when restarting from a checkpoint transient final String [] cp_modifiable = new String[] { "expert_mode", "seed", "epochs", "score_interval", "train_samples_per_iteration", "target_ratio_comm_to_comp", "score_duty_cycle", "classification_stop", "regression_stop", "quiet_mode", "max_confusion_matrix_size", "max_hit_ratio_k", "diagnostics", "variable_importances", "force_load_balance", "replicate_training_data", "shuffle_training_data", "single_node_mode", "sparse", "col_major", // Allow modification of the regularization parameters after a checkpoint restart "l1", "l2", "max_w2", }; /** * Helper to specify which arguments trigger a refresh on change * @param ver */ @Override protected void registered(RequestServer.API_VERSION ver) { super.registered(ver); for (Argument arg : _arguments) { if ( arg._name.equals("activation") || arg._name.equals("initial_weight_distribution") || arg._name.equals("expert_mode") || arg._name.equals("adaptive_rate") || arg._name.equals("replicate_training_data") || arg._name.equals("balance_classes") || arg._name.equals("n_folds") || arg._name.equals("autoencoder") || arg._name.equals("checkpoint")) { arg.setRefreshOnChange(); } } } /** * Helper to handle arguments based on existing input values * @param arg * @param inputArgs */ @Override protected void queryArgumentValueSet(Argument arg, java.util.Properties inputArgs) { super.queryArgumentValueSet(arg, inputArgs); if (!arg._name.equals("checkpoint") && !Utils.contains(cp_modifiable, arg._name)) { if (checkpoint != null) { arg.disable("Taken from model checkpoint."); final DeepLearningModel cp_model = UKV.get(checkpoint); if (cp_model == null) { throw new IllegalArgumentException("Checkpointed model was not found."); } if (cp_model.model_info().unstable()) { throw new IllegalArgumentException("Checkpointed model was unstable. 
Not restarting."); } return; } } if(arg._name.equals("initial_weight_scale") && (initial_weight_distribution == InitialWeightDistribution.UniformAdaptive) ) { arg.disable("Using sqrt(6 / (# units + # units of previous layer)) for Uniform distribution.", inputArgs); } if (classification) { if(arg._name.equals("regression_stop")) { arg.disable("Only for regression.", inputArgs); } if((arg._name.equals("max_after_balance_size") || arg._name.equals("class_sampling_factors")) && !balance_classes) { arg.disable("Requires balance_classes.", inputArgs); } } else { if(arg._name.equals("classification_stop") || arg._name.equals("max_confusion_matrix_size") || arg._name.equals("max_hit_ratio_k") || arg._name.equals("max_after_balance_size") || arg._name.equals("balance_classes") || arg._name.equals("class_sampling_factors") ) { arg.disable("Only for classification.", inputArgs); } if (validation != null && arg._name.equals("score_validation_sampling")) { score_validation_sampling = ClassSamplingMethod.Uniform; arg.disable("Using uniform sampling for validation scoring dataset.", inputArgs); } } if ((arg._name.equals("score_validation_samples") || arg._name.equals("score_validation_sampling")) && validation == null) { arg.disable("Requires a validation data set.", inputArgs); } if (Utils.contains(expert_options, arg._name) && !expert_mode) { arg.disable("Only in expert mode.", inputArgs); } if (!adaptive_rate) { if (arg._name.equals("rho") || arg._name.equals("epsilon")) { arg.disable("Only for adaptive learning rate.", inputArgs); rho = 0; epsilon = 0; } } else { if (arg._name.equals("rate") || arg._name.equals("rate_annealing") || arg._name.equals("rate_decay") || arg._name.equals("nesterov_accelerated_gradient") || arg._name.equals("momentum_start") || arg._name.equals("momentum_ramp") || arg._name.equals("momentum_stable") ) { arg.disable("Only for non-adaptive learning rate.", inputArgs); momentum_start = 0; momentum_stable = 0; } } if (arg._name.equals("hidden_dropout_ratios")) { if (activation != Activation.TanhWithDropout && activation != Activation.MaxoutWithDropout && activation != Activation.RectifierWithDropout) { arg.disable("Only for activation functions with dropout.", inputArgs); } } if (arg._name.equals("replicate_training_data") && (H2O.CLOUD.size() == 1)) { arg.disable("Only for multi-node operation."); replicate_training_data = false; } if (arg._name.equals("single_node_mode") && (H2O.CLOUD.size() == 1 || !replicate_training_data)) { arg.disable("Only for multi-node operation with replication."); single_node_mode = false; } if (arg._name.equals("use_all_factor_levels") && autoencoder ) { arg.disable("Automatically enabled for auto-encoders."); use_all_factor_levels = true; } if(arg._name.equals("override_with_best_model") && n_folds != 0) { arg.disable("Only without n-fold cross-validation.", inputArgs); override_with_best_model = false; } if(arg._name.equals("average_activation") && !autoencoder) { arg.disable("Only for autoencoder.", inputArgs); } if(arg._name.equals("sparsity_beta") && !autoencoder) { arg.disable("Only for autoencoder.", inputArgs); } } /** Print model parameters as JSON */ @Override public boolean toHTML(StringBuilder sb) { try { return makeJsonBox(sb); } catch (Throwable t) { return false; } } /** * Return a query link to this page * @param k Model Key * @param content Link text * @return HTML Link */ public static String link(Key k, String content) { return link(k, content, null, null, null); } /** * Return a query link to this page * @param k Model Key * 
@param content Link text * @param cp Key to checkpoint to continue training with (optional) * @param response Response * @param val Validation data set key * @return HTML Link */ public static String link(Key k, String content, Key cp, String response, Key val) { DeepLearning req = new DeepLearning(); RString rs = new RString("<a href='" + req.href() + ".query?source=%$key" + (cp == null ? "" : "&checkpoint=%$cp") + (response == null ? "" : "&response=%$resp") + (val == null ? "" : "&validation=%$valkey") + "'>%content</a>"); rs.replace("key", k.toString()); rs.replace("content", content); if (cp != null) rs.replace("cp", cp.toString()); if (response != null) rs.replace("resp", response); if (val != null) rs.replace("valkey", val); return rs.toString(); } /** * Report the relative progress of building a Deep Learning model (measured by how many epochs are done) * @return floating point number between 0 and 1 */ @Override public float progress(){ if(UKV.get(dest()) == null)return 0; DeepLearningModel m = UKV.get(dest()); if (m != null && m.model_info()!=null ) { final float p = (float) Math.min(1, (m.epoch_counter / m.model_info().get_params().epochs)); return cv_progress(p); } return 0; } @Override protected final void execImpl() { try { buildModel(); if (n_folds > 0) CrossValUtils.crossValidate(this); } finally { delete(); state = UKV.<Job>get(self()).state; new TAtomic<DeepLearningModel>() { @Override public DeepLearningModel atomic(DeepLearningModel m) { if (m != null) m.get_params().state = state; return m; } }.invoke(dest()); } } /** * Train a Deep Learning model, assumes that all members are populated * If checkpoint == null, then start training a new model, otherwise continue from a checkpoint */ private void buildModel() { DeepLearningModel cp = null; if (checkpoint == null) { cp = initModel(); cp.start_training(null); } else { final DeepLearningModel previous = UKV.get(checkpoint); if (previous == null) throw new IllegalArgumentException("Checkpoint not found."); Log.info("Resuming from checkpoint."); if (n_folds != 0) { throw new UnsupportedOperationException("n_folds must be 0: Cross-validation is not supported during checkpoint restarts."); } else { ((ValidatedJob)previous.job()).xval_models = null; //remove existing cross-validation keys after checkpoint restart } if (source == null || (previous.model_info().get_params().source != null && !Arrays.equals(source._key._kb, previous.model_info().get_params().source._key._kb))) { throw new IllegalArgumentException("source must be the same as for the checkpointed model."); } autoencoder = previous.model_info().get_params().autoencoder; if (!autoencoder && (response == null || !source.names()[source.find(response)].equals(previous.responseName()))) { throw new IllegalArgumentException("response must be the same as for the checkpointed model."); } // if (!autoencoder && (response == null || !Arrays.equals(response._key._kb, previous.model_info().get_params().response._key._kb))) { // throw new IllegalArgumentException("response must be the same as for the checkpointed model."); // } if (Utils.difference(ignored_cols, previous.model_info().get_params().ignored_cols).length != 0 || Utils.difference(previous.model_info().get_params().ignored_cols, ignored_cols).length != 0) { ignored_cols = previous.model_info().get_params().ignored_cols; Log.warn("Automatically re-using ignored_cols from the checkpointed model."); } if ((validation == null) == (previous._validationKey != null) || (validation != null && validation._key != null && 
previous._validationKey != null && !Arrays.equals(validation._key._kb, previous._validationKey._kb))) { throw new IllegalArgumentException("validation must be the same as for the checkpointed model."); } if (classification != previous.model_info().get_params().classification) { Log.warn("Automatically switching to " + ((classification=!classification) ? "classification" : "regression") + " (same as the checkpointed model)."); } epochs += previous.epoch_counter; //add new epochs to existing model Log.info("Adding " + String.format("%.3f", previous.epoch_counter) + " epochs from the checkpointed model."); try { final DataInfo dataInfo = prepareDataInfo(); cp = new DeepLearningModel(previous, destination_key, job_key, dataInfo); cp.write_lock(self()); cp.start_training(previous); assert(state==JobState.RUNNING); final DeepLearning A = cp.model_info().get_params(); Object B = this; for (Field fA : A.getClass().getDeclaredFields()) { if (Utils.contains(cp_modifiable, fA.getName())) { if (!expert_mode && Utils.contains(expert_options, fA.getName())) continue; for (Field fB : B.getClass().getDeclaredFields()) { if (fA.equals(fB)) { try { if (fB.get(B) == null || fA.get(A) == null || !fA.get(A).toString().equals(fB.get(B).toString())) { // if either of the two parameters is null, skip the toString() if (fA.get(A) == null && fB.get(B) == null) continue; //if both parameters are null, we don't need to do anything Log.info("Applying user-requested modification of '" + fA.getName() + "': " + fA.get(A) + " -> " + fB.get(B)); fA.set(A, fB.get(B)); } } catch (IllegalAccessException e) { e.printStackTrace(); } } } } } if (A.n_folds != 0) { Log.warn("Disabling cross-validation: Not supported when resuming training from a checkpoint."); A.n_folds = 0; } cp.update(self()); } finally { if (cp != null) cp.unlock(self()); } } trainModel(cp); cp.stop_training(); } /** * Redirect to the model page for that model that is trained by this job * @return Response */ @Override protected Response redirect() { return DeepLearningProgressPage.redirect(this, self(), dest()); } private boolean _fakejob; //Sanity check for Deep Learning job parameters private void checkParams() { if (source.numCols() <= 1) throw new IllegalArgumentException("Training data must have at least 2 features (incl. 
response)."); if (hidden == null) throw new IllegalArgumentException("There must be at least one hidden layer."); for (int i=0;i<hidden.length;++i) { if (hidden[i]==0) throw new IllegalArgumentException("Hidden layer size must be >0."); } //Auto-fill defaults if (hidden_dropout_ratios == null) { if (activation == Activation.TanhWithDropout || activation == Activation.MaxoutWithDropout || activation == Activation.RectifierWithDropout) { hidden_dropout_ratios = new double[hidden.length]; if (!quiet_mode) Log.info("Automatically setting all hidden dropout ratios to 0.5."); Arrays.fill(hidden_dropout_ratios, 0.5); } } else if (hidden_dropout_ratios.length != hidden.length) throw new IllegalArgumentException("Must have " + hidden.length + " hidden layer dropout ratios."); else if (activation != Activation.TanhWithDropout && activation != Activation.MaxoutWithDropout && activation != Activation.RectifierWithDropout) { if (!quiet_mode) Log.info("Ignoring hidden_dropout_ratios because a non-Dropout activation function was specified."); } if (input_dropout_ratio < 0 || input_dropout_ratio >= 1) { throw new IllegalArgumentException("Input dropout must be in [0,1)."); } if (class_sampling_factors != null && !balance_classes) { if (!quiet_mode) Log.info("Ignoring class_sampling_factors since balance_classes is not enabled."); } if (!quiet_mode) { if (adaptive_rate) { Log.info("Using automatic learning rate. Ignoring the following input parameters:"); Log.info(" rate, rate_decay, rate_annealing, momentum_start, momentum_ramp, momentum_stable, nesterov_accelerated_gradient."); } else { Log.info("Using manual learning rate. Ignoring the following input parameters:"); Log.info(" rho, epsilon."); } if (initial_weight_distribution == InitialWeightDistribution.UniformAdaptive) { Log.info("Ignoring initial_weight_scale for UniformAdaptive weight distribution."); } if (n_folds != 0) { if (override_with_best_model) { Log.info("Automatically setting override_with_best_model to false, since the final model is the only scored model with n-fold cross-validation."); override_with_best_model = false; } } } if(loss == Loss.Automatic) { if (!classification) { if (!quiet_mode) Log.info("Automatically setting loss to MeanSquare for regression."); loss = Loss.MeanSquare; } else if (autoencoder) { if (!quiet_mode) Log.info("Automatically setting loss to MeanSquare for auto-encoder."); loss = Loss.MeanSquare; } else { if (!quiet_mode) Log.info("Automatically setting loss to Cross-Entropy for classification."); loss = Loss.CrossEntropy; } } if(autoencoder && sparsity_beta > 0) { if (activation == Activation.Tanh || activation == Activation.TanhWithDropout) { if (average_activation >= 1 || average_activation <= -1) throw new IllegalArgumentException("Tanh average activation must be in (-1,1)."); } else if (activation == Activation.Rectifier || activation == Activation.RectifierWithDropout) { if (average_activation <= 0) throw new IllegalArgumentException("Rectifier average activation must be positive."); } } if (!classification && loss == Loss.CrossEntropy) throw new IllegalArgumentException("Cannot use CrossEntropy loss function for regression."); if (autoencoder && loss != Loss.MeanSquare) throw new IllegalArgumentException("Must use MeanSquare loss function for auto-encoder."); if (autoencoder && classification) { classification = false; Log.info("Using regression mode for auto-encoder.");} // reason for the error message below is that validation might not have the same horizontalized features as the training data (or 
different order) if (autoencoder && validation != null) throw new UnsupportedOperationException("Cannot specify a validation dataset for auto-encoder."); if (autoencoder && activation == Activation.Maxout) throw new UnsupportedOperationException("Maxout activation is not supported for auto-encoder."); if (max_categorical_features < 1) throw new IllegalArgumentException("max_categorical_features must be at least 1"); // make default job_key and destination_key in case they are missing if (dest() == null) { destination_key = Key.make(); } if (self() == null) { job_key = Key.make(); } if (UKV.get(self()) == null) { start_time = System.currentTimeMillis(); state = JobState.RUNNING; UKV.put(self(), this); _fakejob = true; } if (!sparse && col_major) { throw new IllegalArgumentException("Cannot use column major storage for non-sparse data handling."); } if (reproducible) { if (!quiet_mode) Log.info("Automatically enabling force_load_balance, disabling single_node_mode and replicate_training_data\nand setting train_samples_per_iteration to -1 to enforce reproducibility."); force_load_balance = true; single_node_mode = false; train_samples_per_iteration = -1; replicate_training_data = false; //there's no benefit from having multiple nodes compute the exact same thing, and then average it back to the same result // replicate_training_data = true; //doesn't hurt, but does replicate identical work } } /** * Helper to create a DataInfo object from the source and response * @return DataInfo object */ private DataInfo prepareDataInfo() { final boolean del_enum_resp = classification && !response.isEnum(); final Frame train = FrameTask.DataInfo.prepareFrame(source, autoencoder ? null : response, ignored_cols, classification, ignore_const_cols, true /*drop >20% NA cols*/); final DataInfo dinfo = new FrameTask.DataInfo(train, autoencoder ? 0 : 1, true, autoencoder || use_all_factor_levels, //use all FactorLevels for auto-encoder autoencoder ? DataInfo.TransformType.NORMALIZE : DataInfo.TransformType.STANDARDIZE, //transform predictors classification ? DataInfo.TransformType.NONE : DataInfo.TransformType.STANDARDIZE); //transform response if (!autoencoder) { final Vec resp = dinfo._adaptedFrame.lastVec(); //convention from DataInfo: response is the last Vec assert (!classification ^ resp.isEnum()) : "Must have enum response for classification!"; //either regression or enum response if (del_enum_resp) ltrash(resp); } return dinfo; } /** * Create an initial Deep Learning model, typically to be trained by trainModel(model) * @return Randomly initialized model */ public final DeepLearningModel initModel() { try { lock_data(); checkParams(); final DataInfo dinfo = prepareDataInfo(); final Vec resp = dinfo._adaptedFrame.lastVec(); //convention from DataInfo: response is the last Vec float[] priorDist = classification ? 
new MRUtils.ClassDist(resp).doAll(resp).rel_dist() : null; final DeepLearningModel model = new DeepLearningModel(dest(), self(), source._key, dinfo, (DeepLearning)this.clone(), priorDist); model.model_info().initializeMembers(); return model; } finally { unlock_data(); } } /** * Helper to update a Frame reference, adding the new Frame to the local trash at the same time * @param target Frame reference, to be overwritten * @param src Newly made frame, to be deleted via local trash * @return src */ Frame updateFrame(Frame target, Frame src) { if (src != target) ltrash(src); return src; } /** * Train a Deep Learning neural net model * @param model Input model (e.g., from initModel(), or from a previous training run) * @return Trained model */ public final DeepLearningModel trainModel(DeepLearningModel model) { Frame validScoreFrame = null; Frame train, trainScoreFrame; try { lock_data(); if (checkpoint == null && !quiet_mode) logStart(); //if checkpoint is given, some Job's params might be uninitialized (but the restarted model's parameters are correct) if (model == null) { model = UKV.get(dest()); } model.write_lock(self()); final DeepLearning mp = model.model_info().get_params(); //use the model's parameters for everything below - NOT the job's parameters (can be different after checkpoint restart) prepareValidationWithModel(model); final long model_size = model.model_info().size(); if (!quiet_mode) Log.info("Number of model parameters (weights/biases): " + String.format("%,d", model_size)); train = model.model_info().data_info()._adaptedFrame; if (mp.force_load_balance) train = updateFrame(train, reBalance(train, mp.replicate_training_data)); if (mp.classification && mp.balance_classes) { float[] trainSamplingFactors = new float[train.lastVec().domain().length]; //leave initialized to 0 -> will be filled up below if (class_sampling_factors != null) { if (class_sampling_factors.length != train.lastVec().domain().length) throw new IllegalArgumentException("class_sampling_factors must have " + train.lastVec().domain().length + " elements"); trainSamplingFactors = class_sampling_factors.clone(); //clone: don't modify the original } train = updateFrame(train, sampleFrameStratified( train, train.lastVec(), trainSamplingFactors, (long)(mp.max_after_balance_size*train.numRows()), mp.seed, true, false)); model.setModelClassDistribution(new MRUtils.ClassDist(train.lastVec()).doAll(train.lastVec()).rel_dist()); } model.training_rows = train.numRows(); trainScoreFrame = updateFrame(train, sampleFrame(train, mp.score_training_samples, mp.seed)); //training scoring dataset is always sampled uniformly from the training dataset if (!quiet_mode) Log.info("Number of chunks of the training data: " + train.anyVec().nChunks()); if (validation != null) { model.validation_rows = validation.numRows(); Frame adaptedValid = getValidation(); if (getValidAdaptor().needsAdaptation2CM()) { adaptedValid.add(getValidAdaptor().adaptedValidationResponse(_responseName), getValidAdaptor().getAdaptedValidationResponse2CM()); } // validation scoring dataset can be sampled in multiple ways from the given validation dataset if (mp.classification && mp.balance_classes && mp.score_validation_sampling == ClassSamplingMethod.Stratified) { validScoreFrame = updateFrame(adaptedValid, sampleFrameStratified(adaptedValid, adaptedValid.lastVec(), null, mp.score_validation_samples > 0 ? 
mp.score_validation_samples : adaptedValid.numRows(), mp.seed+1, false /* no oversampling */, false)); } else { validScoreFrame = updateFrame(adaptedValid, sampleFrame(adaptedValid, mp.score_validation_samples, mp.seed+1)); } if (mp.force_load_balance) validScoreFrame = updateFrame(validScoreFrame, reBalance(validScoreFrame, false /*always split up globally since scoring should be distributed*/)); if (!quiet_mode) Log.info("Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks()); } // Set train_samples_per_iteration size (cannot be done earlier since this depends on whether stratified sampling is done) model.actual_train_samples_per_iteration = computeTrainSamplesPerIteration(mp, train.numRows(), model); // Determine whether shuffling is enforced if(mp.replicate_training_data && (model.actual_train_samples_per_iteration == train.numRows()*(mp.single_node_mode?1:H2O.CLOUD.size())) && !mp.shuffle_training_data && H2O.CLOUD.size() > 1 && !mp.reproducible) { Log.warn("Enabling training data shuffling, because all nodes train on the full dataset (replicated training data)."); mp.shuffle_training_data = true; } model._timeLastScoreEnter = System.currentTimeMillis(); //to keep track of time per iteration, must be called before first call to doScoring if (!mp.quiet_mode) Log.info("Initial model:\n" + model.model_info()); if (autoencoder) model.doScoring(train, trainScoreFrame, validScoreFrame, self(), getValidAdaptor()); //get the null model reconstruction error // put the initial version of the model into DKV model.update(self()); Log.info("Starting to train the Deep Learning model."); //main loop do model.set_model_info(H2O.CLOUD.size() > 1 && mp.replicate_training_data ? ( mp.single_node_mode ? new DeepLearningTask2(train, model.model_info(), rowFraction(train, mp, model)).invoke(Key.make()).model_info() : //replicated data + single node mode new DeepLearningTask2(train, model.model_info(), rowFraction(train, mp, model)).invokeOnAllNodes().model_info() ) : //replicated data + multi-node mode new DeepLearningTask(model.model_info(), rowFraction(train, mp, model)).doAll(train).model_info()); //distributed data (always in multi-node mode) while (model.doScoring(train, trainScoreFrame, validScoreFrame, self(), getValidAdaptor())); // replace the model with the best model so far (if it's better) if (!isCancelledOrCrashed() && override_with_best_model && model.actual_best_model_key != null && n_folds == 0) { DeepLearningModel best_model = UKV.get(model.actual_best_model_key); if (best_model != null && best_model.error() < model.error() && Arrays.equals(best_model.model_info().units, model.model_info().units)) { Log.info("Setting the model to be the best model so far (based on scoring history)."); DeepLearningModel.DeepLearningModelInfo mi = best_model.model_info().deep_clone(); // Don't cheat - count full amount of training samples, since that's the amount of training it took to train (without finding anything better) mi.set_processed_global(model.model_info().get_processed_global()); mi.set_processed_local(model.model_info().get_processed_local()); model.set_model_info(mi); model.update(self()); model.doScoring(train, trainScoreFrame, validScoreFrame, self(), getValidAdaptor()); assert(best_model.error() == model.error()); } } Log.info(model); Log.info("Finished training the Deep Learning model."); return model; } catch(JobCancelledException ex) { model = UKV.get(dest()); state = JobState.CANCELLED; //for JSON REST response model.get_params().state = state; //for parameter 
JSON on the HTML page Log.info("Deep Learning model building was cancelled."); return model; } catch(Throwable t) { t.printStackTrace(); model = UKV.get(dest()); state = JobState.FAILED; //for JSON REST response if (model != null) { model.get_params().state = state; //for parameter JSON on the HTML page Log.info("Deep Learning model building failed."); } return model; } finally { if (model != null && DKV.get(model._key) != null) model.unlock(self()); unlock_data(); } } /** * Lock the input datasets against deletes */ private void lock_data() { source.read_lock(self()); if( validation != null && source._key != null && validation._key !=null && !source._key.equals(validation._key) ) validation.read_lock(self()); } /** * Release the lock for the input datasets */ private void unlock_data() { source.unlock(self()); if( validation != null && source._key != null && validation._key != null && !source._key.equals(validation._key) ) validation.unlock(self()); } /** * Delete job related keys */ public void delete() { cleanup(); if (_fakejob) UKV.remove(job_key); remove(); } /** * Rebalance a frame for load balancing * @param fr Input frame * @param local whether to only create enough chunks to max out all cores on one node only * @return Frame that has potentially more chunks */ private Frame reBalance(final Frame fr, boolean local) { int chunks = (int)Math.min( 4 * H2O.NUMCPUS * (local ? 1 : H2O.CLOUD.size()), fr.numRows()); if (fr.anyVec().nChunks() > chunks && !reproducible) { Log.info("Dataset already contains " + fr.anyVec().nChunks() + " chunks. No need to rebalance."); return fr; } else if (reproducible) { Log.warn("Reproducibility enforced - using only 1 thread - can be slow."); chunks = 1; } if (!quiet_mode) Log.info("ReBalancing dataset into (at least) " + chunks + " chunks."); // return MRUtils.shuffleAndBalance(fr, chunks, seed, local, shuffle_training_data); String snewKey = fr._key != null ? (fr._key.toString() + ".balanced") : Key.rand(); Key newKey = Key.makeSystem(snewKey); RebalanceDataSet rb = new RebalanceDataSet(fr, newKey, chunks); H2O.submitTask(rb); rb.join(); return UKV.get(newKey); } /** * Compute the actual train_samples_per_iteration size from the user-given parameter * @param mp Model parameter (DeepLearning object) * @param numRows number of training rows * @param model DL Model * @return The total number of training rows to be processed per iteration (summed over on all nodes) */ private static long computeTrainSamplesPerIteration(final DeepLearning mp, final long numRows, DeepLearningModel model) { long tspi = mp.train_samples_per_iteration; assert(tspi == 0 || tspi == -1 || tspi == -2 || tspi >= 1); if (tspi == 0 || (!mp.replicate_training_data && tspi == -1) ) { tspi = numRows; if (!mp.quiet_mode) Log.info("Setting train_samples_per_iteration (" + mp.train_samples_per_iteration + ") to one epoch: #rows (" + tspi + ")."); } else if (tspi == -1) { tspi = (mp.single_node_mode ? 1 : H2O.CLOUD.size()) * numRows; if (!mp.quiet_mode) Log.info("Setting train_samples_per_iteration (" + mp.train_samples_per_iteration + ") to #nodes x #rows (" + tspi + ")."); } else if (tspi == -2) { // automatic tuning based on CPU speed, network speed and model size // measure cpu speed double total_gflops = 0; for (H2ONode h2o : H2O.CLOUD._memary) { HeartBeat hb = h2o._heartbeat; total_gflops += hb._gflops; } if (mp.single_node_mode) total_gflops /= H2O.CLOUD.size(); if (total_gflops == 0) { total_gflops = Linpack.run(H2O.SELF._heartbeat._cpus_allowed) * (mp.single_node_mode ? 
1 : H2O.CLOUD.size()); } final long model_size = model.model_info().size(); int[] msg_sizes = new int[]{ (int)(model_size*4) == (model_size*4) ? (int)(model_size*4) : Integer.MAX_VALUE }; double[] microseconds_collective = new double[msg_sizes.length]; NetworkTest.NetworkTester nt = new NetworkTest.NetworkTester(msg_sizes,null,microseconds_collective,model_size>1e6 ? 1 : 5 /*repeats*/,false,true /*only collectives*/); nt.compute2(); //length of the network traffic queue based on log-tree rollup (2 log(nodes)) int network_queue_length = mp.single_node_mode || H2O.CLOUD.size() == 1 ? 1 : 2*(int)Math.floor(Math.log(H2O.CLOUD.size())/Math.log(2)); // heuristics double flops_overhead_per_row = 30; if (mp.activation == Activation.Maxout || mp.activation == Activation.MaxoutWithDropout) { flops_overhead_per_row *= 8; } else if (mp.activation == Activation.Tanh || mp.activation == Activation.TanhWithDropout) { flops_overhead_per_row *= 5; } // target fraction of comm vs cpu time: 5% double fraction = mp.single_node_mode || H2O.CLOUD.size() == 1 ? 1e-3 : 0.05; //in single node mode, there's no model averaging effect, so less need to shorten the M/R iteration // estimate the time for communication (network) and training (compute) model.time_for_communication_us = (H2O.CLOUD.size() == 1 ? 1e4 /* add 10ms for single-node */ : 0) + network_queue_length * microseconds_collective[0]; double time_per_row_us = flops_overhead_per_row * model_size / (total_gflops * 1e9) / H2O.SELF._heartbeat._cpus_allowed * 1e6; // compute the optimal number of training rows per iteration // fraction := time_comm_us / (time_comm_us + tspi * time_per_row_us) ==> tspi = (time_comm_us/fraction - time_comm_us)/time_per_row_us tspi = (long)((model.time_for_communication_us / fraction - model.time_for_communication_us)/ time_per_row_us); tspi = Math.min(tspi, (mp.single_node_mode ? 1 : H2O.CLOUD.size()) * numRows * 10); //not more than 10x of what train_samples_per_iteration=-1 would do // If the number is close to a whole multiple of the epoch size (#rows), round down to that multiple -> prettier scoring if (tspi > numRows && Math.abs(tspi % numRows)/(double)numRows < 0.2) tspi = tspi - tspi % numRows; tspi = Math.min(tspi, (long)(mp.epochs * numRows / 10)); //limit to number of epochs desired, but at least 10 iterations total tspi = Math.max(1, tspi); //at least 1 point if (!mp.quiet_mode) { Log.info("Auto-tuning parameter 'train_samples_per_iteration':"); Log.info("Estimated compute power : " + (int)total_gflops + " GFlops"); Log.info("Estimated time for comm : " + PrettyPrint.usecs((long)model.time_for_communication_us)); Log.info("Estimated time per row : " + ((long)time_per_row_us > 0 ? 
PrettyPrint.usecs((long)time_per_row_us) : time_per_row_us + " usecs")); Log.info("Estimated training speed: " + (int)(1e6/time_per_row_us) + " rows/sec"); Log.info("Setting train_samples_per_iteration (" + mp.train_samples_per_iteration + ") to auto-tuned value: " + tspi); } } else { // limit user-given value to number of epochs desired tspi = Math.min(tspi, (long)(mp.epochs * numRows)); } assert(tspi != 0 && tspi != -1 && tspi != -2 && tspi >= 1); return tspi; } /** * Compute the fraction of rows that need to be used for training during one iteration * @param numRows number of training rows * @param train_samples_per_iteration number of training rows to be processed per iteration * @param replicate_training_data whether or not the training data is replicated on each node * @return fraction of rows to be used for training during one iteration */ private static float computeRowUsageFraction(final long numRows, final long train_samples_per_iteration, final boolean replicate_training_data) { float rowUsageFraction = (float)train_samples_per_iteration / numRows; if (replicate_training_data) rowUsageFraction /= H2O.CLOUD.size(); assert(rowUsageFraction > 0); return rowUsageFraction; } private static float rowFraction(Frame train, DeepLearning p, DeepLearningModel m) { return computeRowUsageFraction(train.numRows(), m.actual_train_samples_per_iteration, p.replicate_training_data); } /** * Cross-Validate a DeepLearning model by building new models on N train/test holdout splits * @param splits Frames containing train/test splits * @param cv_preds Array of Frames to store the predictions for each cross-validation run * @param offsets Array to store the offsets of starting row indices for each cross-validation run * @param i Which fold of cross-validation to perform */ @Override public void crossValidate(Frame[] splits, Frame[] cv_preds, long[] offsets, int i) { // Train a clone with slightly modified parameters (to account for cross-validation) final DeepLearning cv = (DeepLearning) this.clone(); cv.genericCrossValidation(splits, offsets, i); cv_preds[i] = ((DeepLearningModel) UKV.get(cv.dest())).score(cv.validation); new TAtomic<DeepLearningModel>() { @Override public DeepLearningModel atomic(DeepLearningModel m) { if (!keep_cross_validation_splits && /*paranoid*/cv.dest().toString().contains("xval")) { m.get_params().source = null; m.get_params().validation=null; m.get_params().response=null; } return m; } }.invoke(cv.dest()); } }
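// A minimal usage sketch (hypothetical frame variable and illustrative settings;
// assumes the standard water.Job entry points used elsewhere in this codebase):
//
//   Frame train = ...;                               // parsed training data, already in the DKV
//   DeepLearning dl = new DeepLearning();
//   dl.source = train;
//   dl.response = train.lastVec();                   // response column
//   dl.activation = Activation.RectifierWithDropout;
//   dl.hidden = new int[]{ 200, 200 };
//   dl.epochs = 10;
//   dl.invoke();                                     // runs execImpl(): buildModel() (+ n-fold CV if requested)
//   DeepLearningModel model = UKV.get(dl.dest());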