package hex.deeplearning;
import hex.*;
import water.*;
import water.util.*;
import static water.util.MRUtils.sampleFrame;
import static water.util.MRUtils.sampleFrameStratified;
import hex.FrameTask.DataInfo;
import water.api.*;
import water.fvec.Frame;
import water.fvec.RebalanceDataSet;
import water.fvec.Vec;
import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.Random;
/**
* Deep Learning Neural Net implementation based on MRTask2
*/
public class DeepLearning extends Job.ValidatedJob {
static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
public static DocGen.FieldDoc[] DOC_FIELDS;
public static final String DOC_GET = "Deep Learning";
/**
* A model key associated with a previously trained Deep Learning
* model. This option allows users to build a new model as a
* continuation of a previously generated model (e.g., by a grid search).
*/
@API(help = "Model checkpoint to resume training with", filter= Default.class, json = true)
public Key checkpoint;
/**
* If enabled, store the best model under the destination key of this model at the end of training.
* Only applicable if training is not cancelled.
*/
@API(help = "If enabled, override the final model with the best model found during training", filter= Default.class, json = true)
public boolean override_with_best_model = true;
/**
* Unlock expert mode parameters that can affect model building speed,
* predictive accuracy and scoring. Leaving expert mode parameters at default
* values is fine for many problems, but best results on complex datasets are often
* only attainable via expert mode options.
*/
@API(help = "Enable expert mode (to access all options from GUI)", filter = Default.class, json = true)
public boolean expert_mode = false;
@API(help = "Auto-Encoder (Experimental)", filter= Default.class, json = true)
public boolean autoencoder = false;
@API(help="Use all factor levels of categorical variables. Otherwise, the first factor level is omitted (without loss of accuracy). Useful for variable importances and auto-enabled for autoencoder.",filter=Default.class, json=true, importance = ParamImportance.SECONDARY)
public boolean use_all_factor_levels = true;
/*Neural Net Topology*/
/**
* The activation function (non-linearity) to be used by the neurons in the hidden layers.
* Tanh: Hyperbolic tangent function (same as scaled and shifted sigmoid).
* Rectifier: Chooses the maximum of (0, x) where x is the input value.
* Maxout: Chooses the maximum coordinate of the input vector.
* With Dropout: Zero out a random user-given fraction of the
* incoming weights to each hidden layer during training, for each
* training row. This effectively trains exponentially many models at
* once, and can improve generalization.
*/
@API(help = "Activation function", filter = Default.class, json = true, importance = ParamImportance.CRITICAL)
public Activation activation = Activation.Rectifier;
/**
* The number and size of each hidden layer in the model.
* For example, if a user specifies "100,200,100" a model with 3 hidden
* layers will be produced, and the middle hidden layer will have 200
* neurons. To specify a grid search, add parentheses around each
* model's specification: "(100,100), (50,50,50), (20,20,20,20)".
*/
@API(help = "Hidden layer sizes (e.g. 100,100). Grid search: (10,10), (20,20,20)", filter = Default.class, json = true, importance = ParamImportance.CRITICAL)
public int[] hidden = new int[] { 200, 200 };
/**
* The number of passes over the training dataset to be carried out.
* It is recommended to start with lower values for initial grid searches.
* This value can be modified during checkpoint restarts and allows continuation
* of selected models.
*/
@API(help = "How many times the dataset should be iterated (streamed), can be fractional", filter = Default.class, dmin = 1e-3, json = true, importance = ParamImportance.CRITICAL)
public double epochs = 10;
/**
* The number of training data rows to be processed per iteration. Note that
* independent of this parameter, each row is used immediately to update the model
* with (online) stochastic gradient descent. This parameter controls the
* synchronization period between nodes in a distributed environment and the
* frequency at which scoring and model cancellation can happen. For example, if
* it is set to 10,000 on H2O running on 4 nodes, then each node will
* process 2,500 rows per iteration, sampling randomly from their local data.
* Then, model averaging between the nodes takes place, and scoring can happen
* (dependent on scoring interval and duty factor). Special values are 0 for
* one epoch per iteration, -1 for processing the maximum amount of data
* per iteration (if replicate_training_data is enabled, N epochs
* will be trained per iteration on N nodes, otherwise one epoch). Special value
* of -2 turns on automatic mode (auto-tuning).
*/
@API(help = "Number of training samples (globally) per MapReduce iteration. Special values are 0: one epoch, -1: all available data (e.g., replicated training data), -2: automatic", filter = Default.class, lmin = -2, json = true, importance = ParamImportance.SECONDARY)
public long train_samples_per_iteration = -2;
// @API(help = "Target ratio of communication overhead to computation. Only for multi-node operation and train_samples_per_iteration=-2 (auto-tuning)", filter = Default.class, dmin = 1e-3, dmax=0.999, json = true, importance = ParamImportance.SECONDARY)
public double target_ratio_comm_to_comp = 0.02;
/**
* The random seed controls sampling and initialization. Reproducible
* results are only expected with single-threaded operation (i.e.,
* when running on one node, turning off load balancing and providing
* a small dataset that fits in one chunk). In general, the
* multi-threaded asynchronous updates to the model parameters will
* result in (intentional) race conditions and non-reproducible
* results. Note that deterministic sampling and initialization might
* still lead to some weak sense of determinism in the model.
*/
@API(help = "Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded", filter = Default.class, json = true)
public long seed = new Random().nextLong();
/*Adaptive Learning Rate*/
/**
* The implemented adaptive learning rate algorithm (ADADELTA) automatically
* combines the benefits of learning rate annealing and momentum
* training to avoid slow convergence. Specification of only two
* parameters (rho and epsilon) simplifies hyper parameter search.
* In some cases, manually controlled (non-adaptive) learning rate and
* momentum specifications can lead to better results, but require the
* specification (and hyper parameter search) of up to 7 parameters.
* If the model is built on a topology with many local minima or
* long plateaus, it is possible for a constant learning rate to produce
* sub-optimal results. Learning rate annealing allows digging deeper into
* local minima, while rate decay allows specification of different
* learning rates per layer. When the gradient is being estimated in
* a long valley in the optimization landscape, a large learning rate
* can cause the gradient to oscillate and move in the wrong
* direction. When the gradient is computed on a relatively flat
* surface with small learning rates, the model can converge far
* slower than necessary.
*/
@API(help = "Adaptive learning rate (ADADELTA)", filter = Default.class, json = true, importance = ParamImportance.SECONDARY)
public boolean adaptive_rate = true;
/**
* The first of two hyper parameters for adaptive learning rate (ADADELTA).
* It is similar to momentum and relates to the memory of prior weight updates.
* Typical values are between 0.9 and 0.999.
* This parameter is only active if adaptive learning rate is enabled.
*/
@API(help = "Adaptive learning rate time decay factor (similarity to prior updates)", filter = Default.class, dmin = 0.01, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double rho = 0.99;
/**
* The second of two hyper parameters for adaptive learning rate (ADADELTA).
* It is similar to learning rate annealing during initial training
* and momentum at later stages where it allows forward progress.
* Typical values are between 1e-10 and 1e-4.
* This parameter is only active if adaptive learning rate is enabled.
*/
@API(help = "Adaptive learning rate smoothing factor (to avoid divisions by zero and allow progress)", filter = Default.class, dmin = 1e-15, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double epsilon = 1e-8;
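// Illustrative per-weight ADADELTA update (following Zeiler, 2012), shown only to
// clarify the roles of rho and epsilon - the internal implementation may differ in details:
//   E[g^2]  <- rho * E[g^2]  + (1 - rho) * g^2                        // decaying average of squared gradients
//   dw      <- -g * sqrt(E[dw^2] + epsilon) / sqrt(E[g^2] + epsilon)  // epsilon avoids division by zero
//   E[dw^2] <- rho * E[dw^2] + (1 - rho) * dw^2                       // decaying average of squared updates
//   w       <- w + dw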
/*Learning Rate*/
/**
* When adaptive learning rate is disabled, the magnitude of the weight
* updates is determined by the user-specified learning rate
* (potentially annealed), and is a function of the difference
* between the predicted value and the target value. That difference,
* generally called delta, is only available at the output layer. To
* correct the output at each hidden layer, back propagation is
* used. Momentum modifies back propagation by allowing prior
* iterations to influence the current update. Using the momentum
* parameter can aid in avoiding local minima and the associated
* instability. Too much momentum, however, can lead to instability itself,
* which is why momentum is best ramped up slowly.
* This parameter is only active if adaptive learning rate is disabled.
*/
@API(help = "Learning rate (higher => less stable, lower => slower convergence)", filter = Default.class, dmin = 1e-10, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double rate = .005;
/**
* Learning rate annealing reduces the learning rate to "freeze" into
* local minima in the optimization landscape. The annealing rate is the
* inverse of the number of training samples it takes to cut the learning rate in half
* (e.g., 1e-6 means that it takes 1e6 training samples to halve the learning rate).
* This parameter is only active if adaptive learning rate is disabled.
*/
@API(help = "Learning rate annealing: rate / (1 + rate_annealing * samples)", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double rate_annealing = 1e-6;
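// Worked example of the annealing formula rate / (1 + rate_annealing * samples):
// with rate = 0.005 and rate_annealing = 1e-6, the effective learning rate after
// 1e6 training samples is 0.005 / (1 + 1e-6 * 1e6) = 0.0025, i.e., half the initial rate.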
/**
* The learning rate decay parameter controls the change of learning rate across layers.
* For example, assume the rate parameter is set to 0.01, and the rate_decay parameter is set to 0.5.
* Then the learning rate for the weights connecting the input and first hidden layer will be 0.01,
* the learning rate for the weights connecting the first and the second hidden layer will be 0.005,
* and the learning rate for the weights connecting the second and third hidden layer will be 0.0025, etc.
* This parameter is only active if adaptive learning rate is disabled.
*/
@API(help = "Learning rate decay factor between layers (N-th layer: rate*alpha^(N-1))", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.EXPERT)
public double rate_decay = 1.0;
/*Momentum*/
/**
* The momentum_start parameter controls the amount of momentum at the beginning of training.
* This parameter is only active if adaptive learning rate is disabled.
*/
@API(help = "Initial momentum at the beginning of training (try 0.5)", filter = Default.class, dmin = 0, dmax = 0.9999999999, json = true, importance = ParamImportance.SECONDARY)
public double momentum_start = 0;
/**
* The momentum_ramp parameter controls the amount of training for which momentum increases
* (assuming momentum_stable is larger than momentum_start). The ramp is measured in the number
* of training samples.
* This parameter is only active if adaptive learning rate is disabled.
*/
@API(help = "Number of training samples for which momentum increases", filter = Default.class, dmin = 1, json = true, importance = ParamImportance.SECONDARY)
public double momentum_ramp = 1e6;
/**
* The momentum_stable parameter controls the final momentum value reached after momentum_ramp training samples.
* The momentum used for training will remain the same for training beyond reaching that point.
* This parameter is only active if adaptive learning rate is disabled.
*/
@API(help = "Final momentum after the ramp is over (try 0.99)", filter = Default.class, dmin = 0, dmax = 0.9999999999, json = true, importance = ParamImportance.SECONDARY)
public double momentum_stable = 0;
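// Sketch of the resulting momentum schedule (a linear ramp is assumed here for
// illustration; n is the number of training samples processed so far):
//   momentum(n) = momentum_start + (momentum_stable - momentum_start) * min(1, n / momentum_ramp)
// e.g., with momentum_start = 0.5, momentum_stable = 0.99 and momentum_ramp = 1e6,
// momentum grows linearly from 0.5 to 0.99 over the first 1e6 training samples.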
/**
* The Nesterov accelerated gradient descent method is a modification to
* traditional gradient descent for convex functions. The method relies on
* gradient information at various points to build a polynomial approximation that
* minimizes the residuals in fewer iterations of the descent.
* This parameter is only active if adaptive learning rate is disabled.
*/
@API(help = "Use Nesterov accelerated gradient (recommended)", filter = Default.class, json = true, importance = ParamImportance.SECONDARY)
public boolean nesterov_accelerated_gradient = true;
/*Regularization*/
/**
* A fraction of the features for each training row to be omitted from training in order
* to improve generalization (dimension sampling).
*/
@API(help = "Input layer dropout ratio (can improve generalization, try 0.1 or 0.2)", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double input_dropout_ratio = 0.0;
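// Conceptual sketch of input dropout (illustrative only): for every training row,
// each input value is independently zeroed out with probability input_dropout_ratio,
// so the network cannot rely too heavily on any single input feature.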
/**
* A fraction of the inputs for each hidden layer to be omitted from training in order
* to improve generalization. Defaults to 0.5 for each hidden layer if omitted.
*/
@API(help = "Hidden layer dropout ratios (can improve generalization), specify one value per hidden layer, defaults to 0.5", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double[] hidden_dropout_ratios;
/**
* A regularization method that constrains the absolute value of the weights and
* has the net effect of dropping some weights (setting them to zero) from a model
* to reduce complexity and avoid overfitting.
*/
@API(help = "L1 regularization (can add stability and improve generalization, causes many weights to become 0)", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double l1 = 0.0;
/**
* A regularization method that constrains the sum of the squared
* weights. This method introduces bias into parameter estimates, but
* frequently produces substantial gains in modeling as estimate variance is
* reduced.
*/
@API(help = "L2 regularization (can add stability and improve generalization, causes many weights to be small", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.SECONDARY)
public double l2 = 0.0;
/**
* A maximum on the sum of the squared incoming weights into
* any one neuron. This tuning parameter is especially useful for unbounded
* activation functions such as Maxout or Rectifier.
*/
@API(help = "Constraint for squared sum of incoming weights per unit (e.g. for Rectifier)", filter = Default.class, dmin = 1e-10, json = true, importance = ParamImportance.EXPERT)
public float max_w2 = Float.POSITIVE_INFINITY;
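// Conceptual sketch of the max_w2 constraint (illustrative; details may differ):
// after an update, if the squared sum s of a unit's incoming weights exceeds max_w2,
// all incoming weights of that unit are rescaled so the constraint holds again:
//   if (s > max_w2) { for each incoming weight w: w *= Math.sqrt(max_w2 / s); }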
/*Initialization*/
/**
* The distribution from which initial weights are to be drawn. The default
* option is an optimized initialization that considers the size of the network.
* The "uniform" option uses a uniform distribution with a mean of 0 and a given
* interval. The "normal" option draws weights from the standard normal
* distribution with a mean of 0 and given standard deviation.
*/
@API(help = "Initial Weight Distribution", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public InitialWeightDistribution initial_weight_distribution = InitialWeightDistribution.UniformAdaptive;
/**
* The scale of the distribution function for Uniform or Normal distributions.
* For Uniform, the values are drawn uniformly from -initial_weight_scale...initial_weight_scale.
* For Normal, the values are drawn from a Normal distribution with a standard deviation of initial_weight_scale.
*/
@API(help = "Uniform: -value...value, Normal: stddev)", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.EXPERT)
public double initial_weight_scale = 1.0;
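// Illustrative sampling rules for the three options (s = initial_weight_scale; the
// UniformAdaptive bound matches the hint shown in queryArgumentValueSet below):
//   Uniform:         w ~ U(-s, s)
//   Normal:          w ~ N(0, s^2)   // standard deviation s
//   UniformAdaptive: w ~ U(-b, b) with b = sqrt(6 / (#units + #units of previous layer))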
/**
* The loss (error) function to be minimized by the model.
* Cross Entropy loss is used when the model output consists of independent
* hypotheses, and the outputs can be interpreted as the probability that each
* hypothesis is true. Cross entropy is the recommended loss function when the
* target values are class labels, and especially for imbalanced data.
* It strongly penalizes error in the prediction of the actual class label.
* Mean Square loss is used when the model outputs are continuous real values, but can
* be used for classification as well (where it emphasizes the error on all
* output classes, not just for the actual class).
*/
@API(help = "Loss function", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public Loss loss = Loss.Automatic;
/*Scoring*/
/**
* The minimum time (in seconds) to elapse between model scoring. The actual
* interval is determined by the number of training samples per iteration and the scoring duty cycle.
*/
@API(help = "Shortest time interval (in secs) between model scoring", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.SECONDARY)
public double score_interval = 5;
/**
* The number of training dataset points to be used for scoring. Will be
* randomly sampled. Use 0 for selecting the entire training dataset.
*/
@API(help = "Number of training set samples for scoring (0 for all)", filter = Default.class, lmin = 0, json = true, importance = ParamImportance.EXPERT)
public long score_training_samples = 10000L;
/**
* The number of validation dataset points to be used for scoring. Can be
* randomly sampled or stratified (if "balance classes" is set and "score
* validation sampling" is set to stratify). Use 0 for selecting the entire
* validation dataset.
*/
@API(help = "Number of validation set samples for scoring (0 for all)", filter = Default.class, lmin = 0, json = true, importance = ParamImportance.EXPERT)
public long score_validation_samples = 0L;
/**
* Maximum fraction of wall clock time spent on model scoring on training and validation samples,
* and on diagnostics such as computation of feature importances (i.e., not on training).
*/
@API(help = "Maximum duty cycle fraction for scoring (lower: more training, higher: more scoring).", filter = Default.class, dmin = 0, dmax = 1, json = true, importance = ParamImportance.EXPERT)
public double score_duty_cycle = 0.1;
/**
* The stopping criterion in terms of classification error (1-accuracy) on the
* training data scoring dataset. When the error is at or below this threshold,
* training stops.
*/
@API(help = "Stopping criterion for classification error fraction on training data (-1 to disable)", filter = Default.class, dmin=-1, dmax=1, json = true, importance = ParamImportance.EXPERT)
public double classification_stop = 0;
/**
* The stopping criterion in terms of regression error (MSE) on the training
* data scoring dataset. When the error is at or below this threshold, training
* stops.
*/
@API(help = "Stopping criterion for regression error (MSE) on training data (-1 to disable)", filter = Default.class, dmin=-1, json = true, importance = ParamImportance.EXPERT)
public double regression_stop = 1e-6;
/**
* Enable quiet mode for less output to standard output.
*/
@API(help = "Enable quiet mode for less output to standard output", filter = Default.class, json = true)
public boolean quiet_mode = false;
/**
* For classification models, the maximum size (in terms of classes) of the
* confusion matrix for it to be printed. This option is meant to avoid printing
* extremely large confusion matrices.
*/
@API(help = "Max. size (number of classes) for confusion matrices to be shown", filter = Default.class, json = true)
public int max_confusion_matrix_size = 20;
/**
* The maximum number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)
*/
@API(help = "Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable)", filter = Default.class, lmin=0, json = true, importance = ParamImportance.EXPERT)
public int max_hit_ratio_k = 10;
/*Imbalanced Classes*/
/**
* For imbalanced data, balance training data class counts via
* over/under-sampling. This can result in improved predictive accuracy.
*/
@API(help = "Balance training data class counts via over/under-sampling (for imbalanced data)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean balance_classes = false;
/**
* Desired over/under-sampling ratios per class (lexicographic order). Only when balance_classes is enabled. If not specified, they will be automatically computed to obtain class balance during training.
*/
@API(help = "Desired over/under-sampling ratios per class (lexicographic order).", filter = Default.class, dmin = 0, json = true, importance = ParamImportance.SECONDARY)
public float[] class_sampling_factors;
/**
* When classes are balanced, limit the resulting dataset size to the
* specified multiple of the original dataset size.
*/
@API(help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0)", filter = Default.class, json = true, dmin=1e-3, importance = ParamImportance.EXPERT)
public float max_after_balance_size = 5.0f;
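// Worked example: with max_after_balance_size = 5.0f, a 1e6-row training set can
// grow to at most 5e6 rows after over-sampling; a value below 1.0 forces
// under-sampling, shrinking the balanced dataset below its original size.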
/**
* Method used to sample the validation dataset for scoring, see Score Validation Samples above.
*/
@API(help = "Method used to sample validation dataset for scoring", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public ClassSamplingMethod score_validation_sampling = ClassSamplingMethod.Uniform;
/*Misc*/
/**
* Gather diagnostics for hidden layers, such as mean and RMS values of learning
* rate, momentum, weights and biases.
*/
@API(help = "Enable diagnostics for hidden layers", filter = Default.class, json = true)
public boolean diagnostics = true;
/**
* Whether to compute variable importances for input features.
* The implemented method (by Gedeon) considers the weights connecting the
* input features to the first two hidden layers.
*/
@API(help = "Compute variable importances for input features (Gedeon method) - can be slow for large networks", filter = Default.class, json = true)
public boolean variable_importances = false;
/**
* Enable fast mode (minor approximation in back-propagation), should not affect results significantly.
*/
@API(help = "Enable fast mode (minor approximation in back-propagation)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean fast_mode = true;
/**
* Ignore constant training columns (no information can be gained anyway).
*/
@API(help = "Ignore constant training columns (no information can be gained anyway)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean ignore_const_cols = true;
/**
* Increase training speed on small datasets by splitting it into many chunks
* to allow utilization of all cores.
*/
@API(help = "Force extra load balancing to increase training speed for small datasets (to keep all cores busy)", filter = Default.class, json = true)
public boolean force_load_balance = true;
/**
* Replicate the entire training dataset onto every node for faster training on small datasets.
*/
@API(help = "Replicate the entire training dataset onto every node for faster training on small datasets", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean replicate_training_data = true;
/**
* Run on a single node for fine-tuning of model parameters. Can be useful for
* checkpoint resumes after training on multiple nodes for fast initial
* convergence.
*/
@API(help = "Run on a single node for fine-tuning of model parameters", filter = Default.class, json = true)
public boolean single_node_mode = false;
/**
* Enable shuffling of training data (on each node). This option is
* recommended if training data is replicated on N nodes, and the number of training samples per iteration
* is close to N times the dataset size, where all nodes train with (almost) all
* the data. It is automatically enabled if the number of training samples per iteration is set to -1 (or to N
* times the dataset size or larger).
*/
@API(help = "Enable shuffling of training data (recommended if training data is replicated and train_samples_per_iteration is close to #nodes x #rows)", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean shuffle_training_data = false;
// @API(help = "Handling of missing values. Either Skip or MeanImputation.", filter= Default.class, json = true)
public MissingValuesHandling missing_values_handling = MissingValuesHandling.MeanImputation;
@API(help = "Sparse data handling (Experimental).", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean sparse = false;
@API(help = "Use a column major weight matrix for input layer. Can speed up forward propagation, but might slow down backpropagation (Experimental).", filter = Default.class, json = true, importance = ParamImportance.EXPERT)
public boolean col_major = false;
@API(help = "Average activation for sparse auto-encoder (Experimental)", filter= Default.class, json = true)
public double average_activation = 0;
@API(help = "Sparsity regularization (Experimental)", filter= Default.class, json = true)
public double sparsity_beta = 0;
@API(help = "Max. number of categorical features, enforced via hashing (Experimental).", filter= Default.class, lmin = 1, json = true)
public int max_categorical_features = Integer.MAX_VALUE;
@API(help = "Force reproducibility on small data (will be slow - only uses 1 thread)", filter= Default.class, json = true)
public boolean reproducible = false;
public enum MissingValuesHandling {
Skip, MeanImputation
}
public enum ClassSamplingMethod {
Uniform, Stratified
}
public enum InitialWeightDistribution {
UniformAdaptive, Uniform, Normal
}
/**
* Activation functions
*/
public enum Activation {
Tanh, TanhWithDropout, Rectifier, RectifierWithDropout, Maxout, MaxoutWithDropout
}
/**
* Loss functions
* CrossEntropy is recommended
*/
public enum Loss {
Automatic, MeanSquare, CrossEntropy
}
// the following parameters can only be specified in expert mode
transient final String [] expert_options = new String[] {
"use_all_factor_levels",
"loss",
"max_w2",
"score_training_samples",
"score_validation_samples",
"initial_weight_distribution",
"initial_weight_scale",
"diagnostics",
"rate_decay",
"score_duty_cycle",
"variable_importances",
"fast_mode",
"score_validation_sampling",
"ignore_const_cols",
"force_load_balance",
"replicate_training_data",
"shuffle_training_data",
"nesterov_accelerated_gradient",
"classification_stop",
"regression_stop",
"quiet_mode",
"max_confusion_matrix_size",
"max_hit_ratio_k",
"hidden_dropout_ratios",
"single_node_mode",
"sparse",
"col_major",
"autoencoder",
"average_activation",
"sparsity_beta",
"max_categorical_features",
};
// the following parameters can be modified when restarting from a checkpoint
transient final String [] cp_modifiable = new String[] {
"expert_mode",
"seed",
"epochs",
"score_interval",
"train_samples_per_iteration",
"target_ratio_comm_to_comp",
"score_duty_cycle",
"classification_stop",
"regression_stop",
"quiet_mode",
"max_confusion_matrix_size",
"max_hit_ratio_k",
"diagnostics",
"variable_importances",
"force_load_balance",
"replicate_training_data",
"shuffle_training_data",
"single_node_mode",
"sparse",
"col_major",
// Allow modification of the regularization parameters after a checkpoint restart
"l1",
"l2",
"max_w2",
};
/**
* Helper to specify which arguments trigger a refresh on change
* @param ver
*/
@Override
protected void registered(RequestServer.API_VERSION ver) {
super.registered(ver);
for (Argument arg : _arguments) {
if ( arg._name.equals("activation") || arg._name.equals("initial_weight_distribution")
|| arg._name.equals("expert_mode") || arg._name.equals("adaptive_rate")
|| arg._name.equals("replicate_training_data")
|| arg._name.equals("balance_classes")
|| arg._name.equals("n_folds")
|| arg._name.equals("autoencoder")
|| arg._name.equals("checkpoint")) {
arg.setRefreshOnChange();
}
}
}
/**
* Helper to handle arguments based on existing input values
* @param arg
* @param inputArgs
*/
@Override protected void queryArgumentValueSet(Argument arg, java.util.Properties inputArgs) {
super.queryArgumentValueSet(arg, inputArgs);
if (!arg._name.equals("checkpoint") && !Utils.contains(cp_modifiable, arg._name)) {
if (checkpoint != null) {
arg.disable("Taken from model checkpoint.");
final DeepLearningModel cp_model = UKV.get(checkpoint);
if (cp_model == null) {
throw new IllegalArgumentException("Checkpointed model was not found.");
}
if (cp_model.model_info().unstable()) {
throw new IllegalArgumentException("Checkpointed model was unstable. Not restarting.");
}
return;
}
}
if(arg._name.equals("initial_weight_scale") &&
(initial_weight_distribution == InitialWeightDistribution.UniformAdaptive)
) {
arg.disable("Using sqrt(6 / (# units + # units of previous layer)) for Uniform distribution.", inputArgs);
}
if (classification) {
if(arg._name.equals("regression_stop")) {
arg.disable("Only for regression.", inputArgs);
}
if((arg._name.equals("max_after_balance_size") || arg._name.equals("class_sampling_factors")) && !balance_classes) {
arg.disable("Requires balance_classes.", inputArgs);
}
}
else {
if(arg._name.equals("classification_stop")
|| arg._name.equals("max_confusion_matrix_size")
|| arg._name.equals("max_hit_ratio_k")
|| arg._name.equals("max_after_balance_size")
|| arg._name.equals("balance_classes")
|| arg._name.equals("class_sampling_factors")
) {
arg.disable("Only for classification.", inputArgs);
}
if (validation != null && arg._name.equals("score_validation_sampling")) {
score_validation_sampling = ClassSamplingMethod.Uniform;
arg.disable("Using uniform sampling for validation scoring dataset.", inputArgs);
}
}
if ((arg._name.equals("score_validation_samples") || arg._name.equals("score_validation_sampling")) && validation == null) {
arg.disable("Requires a validation data set.", inputArgs);
}
if (Utils.contains(expert_options, arg._name) && !expert_mode) {
arg.disable("Only in expert mode.", inputArgs);
}
if (!adaptive_rate) {
if (arg._name.equals("rho") || arg._name.equals("epsilon")) {
arg.disable("Only for adaptive learning rate.", inputArgs);
rho = 0;
epsilon = 0;
}
} else {
if (arg._name.equals("rate") || arg._name.equals("rate_annealing") || arg._name.equals("rate_decay") || arg._name.equals("nesterov_accelerated_gradient")
|| arg._name.equals("momentum_start") || arg._name.equals("momentum_ramp") || arg._name.equals("momentum_stable") ) {
arg.disable("Only for non-adaptive learning rate.", inputArgs);
momentum_start = 0;
momentum_stable = 0;
}
}
if (arg._name.equals("hidden_dropout_ratios")) {
if (activation != Activation.TanhWithDropout && activation != Activation.MaxoutWithDropout && activation != Activation.RectifierWithDropout) {
arg.disable("Only for activation functions with dropout.", inputArgs);
}
}
if (arg._name.equals("replicate_training_data") && (H2O.CLOUD.size() == 1)) {
arg.disable("Only for multi-node operation.");
replicate_training_data = false;
}
if (arg._name.equals("single_node_mode") && (H2O.CLOUD.size() == 1 || !replicate_training_data)) {
arg.disable("Only for multi-node operation with replication.");
single_node_mode = false;
}
if (arg._name.equals("use_all_factor_levels") && autoencoder ) {
arg.disable("Automatically enabled for auto-encoders.");
use_all_factor_levels = true;
}
if(arg._name.equals("override_with_best_model") && n_folds != 0) {
arg.disable("Only without n-fold cross-validation.", inputArgs);
override_with_best_model = false;
}
if(arg._name.equals("average_activation") && !autoencoder) {
arg.disable("Only for autoencoder.", inputArgs);
}
if(arg._name.equals("sparsity_beta") && !autoencoder) {
arg.disable("Only for autoencoder.", inputArgs);
}
}
/** Render the model parameters as a JSON box in HTML */
@Override public boolean toHTML(StringBuilder sb) {
try {
return makeJsonBox(sb);
} catch (Throwable t) {
return false;
}
}
/**
* Return a query link to this page
* @param k Model Key
* @param content Link text
* @return HTML Link
*/
public static String link(Key k, String content) {
return link(k, content, null, null, null);
}
/**
* Return a query link to this page
* @param k Model Key
* @param content Link text
* @param cp Key to checkpoint to continue training with (optional)
* @param response Response
* @param val Validation data set key
* @return HTML Link
*/
public static String link(Key k, String content, Key cp, String response, Key val) {
DeepLearning req = new DeepLearning();
RString rs = new RString("<a href='" + req.href() + ".query?source=%$key" +
(cp == null ? "" : "&checkpoint=%$cp") +
(response == null ? "" : "&response=%$resp") +
(val == null ? "" : "&validation=%$valkey") +
"'>%content</a>");
rs.replace("key", k.toString());
rs.replace("content", content);
if (cp != null) rs.replace("cp", cp.toString());
if (response != null) rs.replace("resp", response);
if (val != null) rs.replace("valkey", val);
return rs.toString();
}
/**
* Report the relative progress of building a Deep Learning model (measured by how many epochs are done)
* @return floating point number between 0 and 1
*/
@Override public float progress(){
if(UKV.get(dest()) == null)return 0;
DeepLearningModel m = UKV.get(dest());
if (m != null && m.model_info()!=null ) {
final float p = (float) Math.min(1, (m.epoch_counter / m.model_info().get_params().epochs));
return cv_progress(p);
}
return 0;
}
@Override
protected final void execImpl() {
try {
buildModel();
if (n_folds > 0) CrossValUtils.crossValidate(this);
} finally {
delete();
state = UKV.<Job>get(self()).state;
new TAtomic<DeepLearningModel>() {
@Override
public DeepLearningModel atomic(DeepLearningModel m) {
if (m != null) m.get_params().state = state;
return m;
}
}.invoke(dest());
}
}
/**
* Train a Deep Learning model, assumes that all members are populated
* If checkpoint == null, then start training a new model, otherwise continue from a checkpoint
*/
private void buildModel() {
DeepLearningModel cp = null;
if (checkpoint == null) {
cp = initModel();
cp.start_training(null);
} else {
final DeepLearningModel previous = UKV.get(checkpoint);
if (previous == null) throw new IllegalArgumentException("Checkpoint not found.");
Log.info("Resuming from checkpoint.");
if (n_folds != 0) {
throw new UnsupportedOperationException("n_folds must be 0: Cross-validation is not supported during checkpoint restarts.");
}
else {
((ValidatedJob)previous.job()).xval_models = null; //remove existing cross-validation keys after checkpoint restart
}
if (source == null || (previous.model_info().get_params().source != null && !Arrays.equals(source._key._kb, previous.model_info().get_params().source._key._kb))) {
throw new IllegalArgumentException("source must be the same as for the checkpointed model.");
}
autoencoder = previous.model_info().get_params().autoencoder;
if (!autoencoder && (response == null || !source.names()[source.find(response)].equals(previous.responseName()))) {
throw new IllegalArgumentException("response must be the same as for the checkpointed model.");
}
// if (!autoencoder && (response == null || !Arrays.equals(response._key._kb, previous.model_info().get_params().response._key._kb))) {
// throw new IllegalArgumentException("response must be the same as for the checkpointed model.");
// }
if (Utils.difference(ignored_cols, previous.model_info().get_params().ignored_cols).length != 0
|| Utils.difference(previous.model_info().get_params().ignored_cols, ignored_cols).length != 0) {
ignored_cols = previous.model_info().get_params().ignored_cols;
Log.warn("Automatically re-using ignored_cols from the checkpointed model.");
}
if ((validation == null) == (previous._validationKey != null)
|| (validation != null && validation._key != null && previous._validationKey != null
&& !Arrays.equals(validation._key._kb, previous._validationKey._kb))) {
throw new IllegalArgumentException("validation must be the same as for the checkpointed model.");
}
if (classification != previous.model_info().get_params().classification) {
Log.warn("Automatically switching to " + ((classification=!classification) ? "classification" : "regression") + " (same as the checkpointed model).");
}
epochs += previous.epoch_counter; //add new epochs to existing model
Log.info("Adding " + String.format("%.3f", previous.epoch_counter) + " epochs from the checkpointed model.");
try {
final DataInfo dataInfo = prepareDataInfo();
cp = new DeepLearningModel(previous, destination_key, job_key, dataInfo);
cp.write_lock(self());
cp.start_training(previous);
assert(state==JobState.RUNNING);
final DeepLearning A = cp.model_info().get_params();
Object B = this;
for (Field fA : A.getClass().getDeclaredFields()) {
if (Utils.contains(cp_modifiable, fA.getName())) {
if (!expert_mode && Utils.contains(expert_options, fA.getName())) continue;
for (Field fB : B.getClass().getDeclaredFields()) {
if (fA.equals(fB)) {
try {
if (fB.get(B) == null || fA.get(A) == null || !fA.get(A).toString().equals(fB.get(B).toString())) { // if either of the two parameters is null, skip the toString()
if (fA.get(A) == null && fB.get(B) == null) continue; //if both parameters are null, we don't need to do anything
Log.info("Applying user-requested modification of '" + fA.getName() + "': " + fA.get(A) + " -> " + fB.get(B));
fA.set(A, fB.get(B));
}
} catch (IllegalAccessException e) {
e.printStackTrace();
}
}
}
}
}
if (A.n_folds != 0) {
Log.warn("Disabling cross-validation: Not supported when resuming training from a checkpoint.");
A.n_folds = 0;
}
cp.update(self());
} finally {
if (cp != null) cp.unlock(self());
}
}
trainModel(cp);
cp.stop_training();
}
/**
* Redirect to the model page for the model being trained by this job
* @return Response
*/
@Override protected Response redirect() {
return DeepLearningProgressPage.redirect(this, self(), dest());
}
private boolean _fakejob;
//Sanity check for Deep Learning job parameters
private void checkParams() {
if (source.numCols() <= 1)
throw new IllegalArgumentException("Training data must have at least 2 features (incl. response).");
if (hidden == null) throw new IllegalArgumentException("There must be at least one hidden layer.");
for (int i=0;i<hidden.length;++i) {
if (hidden[i]==0)
throw new IllegalArgumentException("Hidden layer size must be >0.");
}
//Auto-fill defaults
if (hidden_dropout_ratios == null) {
if (activation == Activation.TanhWithDropout || activation == Activation.MaxoutWithDropout || activation == Activation.RectifierWithDropout) {
hidden_dropout_ratios = new double[hidden.length];
if (!quiet_mode) Log.info("Automatically setting all hidden dropout ratios to 0.5.");
Arrays.fill(hidden_dropout_ratios, 0.5);
}
}
else if (hidden_dropout_ratios.length != hidden.length) throw new IllegalArgumentException("Must have " + hidden.length + " hidden layer dropout ratios.");
else if (activation != Activation.TanhWithDropout && activation != Activation.MaxoutWithDropout && activation != Activation.RectifierWithDropout) {
if (!quiet_mode) Log.info("Ignoring hidden_dropout_ratios because a non-Dropout activation function was specified.");
}
if (input_dropout_ratio < 0 || input_dropout_ratio >= 1) {
throw new IllegalArgumentException("Input dropout must be in [0,1).");
}
if (class_sampling_factors != null && !balance_classes) {
if (!quiet_mode) Log.info("Ignoring class_sampling_factors since balance_classes is not enabled.");
}
if (!quiet_mode) {
if (adaptive_rate) {
Log.info("Using automatic learning rate. Ignoring the following input parameters:");
Log.info(" rate, rate_decay, rate_annealing, momentum_start, momentum_ramp, momentum_stable, nesterov_accelerated_gradient.");
} else {
Log.info("Using manual learning rate. Ignoring the following input parameters:");
Log.info(" rho, epsilon.");
}
if (initial_weight_distribution == InitialWeightDistribution.UniformAdaptive) {
Log.info("Ignoring initial_weight_scale for UniformAdaptive weight distribution.");
}
if (n_folds != 0) {
if (override_with_best_model) {
Log.info("Automatically setting override_with_best_model to false, since the final model is the only scored model with n-fold cross-validation.");
override_with_best_model = false;
}
}
}
if(loss == Loss.Automatic) {
if (!classification) {
if (!quiet_mode) Log.info("Automatically setting loss to MeanSquare for regression.");
loss = Loss.MeanSquare;
}
else if (autoencoder) {
if (!quiet_mode) Log.info("Automatically setting loss to MeanSquare for auto-encoder.");
loss = Loss.MeanSquare;
}
else {
if (!quiet_mode) Log.info("Automatically setting loss to Cross-Entropy for classification.");
loss = Loss.CrossEntropy;
}
}
if(autoencoder && sparsity_beta > 0) {
if (activation == Activation.Tanh || activation == Activation.TanhWithDropout) {
if (average_activation >= 1 || average_activation <= -1)
throw new IllegalArgumentException("Tanh average activation must be in (-1,1).");
}
else if (activation == Activation.Rectifier || activation == Activation.RectifierWithDropout) {
if (average_activation <= 0)
throw new IllegalArgumentException("Rectifier average activation must be positive.");
}
}
if (!classification && loss == Loss.CrossEntropy) throw new IllegalArgumentException("Cannot use CrossEntropy loss function for regression.");
if (autoencoder && loss != Loss.MeanSquare) throw new IllegalArgumentException("Must use MeanSquare loss function for auto-encoder.");
if (autoencoder && classification) { classification = false; Log.info("Using regression mode for auto-encoder.");}
// reason for the error message below is that the validation set might not have the same expanded (one-hot encoded) categorical features as the training data (or a different column order)
if (autoencoder && validation != null) throw new UnsupportedOperationException("Cannot specify a validation dataset for auto-encoder.");
if (autoencoder && activation == Activation.Maxout) throw new UnsupportedOperationException("Maxout activation is not supported for auto-encoder.");
if (max_categorical_features < 1) throw new IllegalArgumentException("max_categorical_features must be at least " + 1);
// make default job_key and destination_key in case they are missing
if (dest() == null) {
destination_key = Key.make();
}
if (self() == null) {
job_key = Key.make();
}
if (UKV.get(self()) == null) {
start_time = System.currentTimeMillis();
state = JobState.RUNNING;
UKV.put(self(), this);
_fakejob = true;
}
if (!sparse && col_major) {
if (!quiet_mode) throw new IllegalArgumentException("Cannot use column major storage for non-sparse data handling.");
}
if (reproducible) {
if (!quiet_mode)
Log.info("Automatically enabling force_load_balancing, disabling single_node_mode and replicate_training_data\nand setting train_samples_per_iteration to -1 to enforce reproducibility.");
force_load_balance = true;
single_node_mode = false;
train_samples_per_iteration = -1;
replicate_training_data = false; //there's no benefit from having multiple nodes compute the exact same thing, and then average it back to the same
// replicate_training_data = true; //doesn't hurt, but does replicated identical work
}
}
/**
* Helper to create a DataInfo object from the source and response
* @return DataInfo object
*/
private DataInfo prepareDataInfo() {
final boolean del_enum_resp = classification && !response.isEnum();
final Frame train = FrameTask.DataInfo.prepareFrame(source, autoencoder ? null : response, ignored_cols, classification, ignore_const_cols, true /*drop >20% NA cols*/);
final DataInfo dinfo = new FrameTask.DataInfo(train, autoencoder ? 0 : 1, true, autoencoder || use_all_factor_levels, //use all FactorLevels for auto-encoder
autoencoder ? DataInfo.TransformType.NORMALIZE : DataInfo.TransformType.STANDARDIZE, //transform predictors
classification ? DataInfo.TransformType.NONE : DataInfo.TransformType.STANDARDIZE); //transform response
if (!autoencoder) {
final Vec resp = dinfo._adaptedFrame.lastVec(); //convention from DataInfo: response is the last Vec
assert (!classification ^ resp.isEnum()) : "Must have enum response for classification!"; //either regression or enum response
if (del_enum_resp) ltrash(resp);
}
return dinfo;
}
/**
* Create an initial Deep Learning model, typically to be trained by trainModel(model)
* @return Randomly initialized model
*/
public final DeepLearningModel initModel() {
try {
lock_data();
checkParams();
final DataInfo dinfo = prepareDataInfo();
final Vec resp = dinfo._adaptedFrame.lastVec(); //convention from DataInfo: response is the last Vec
float[] priorDist = classification ? new MRUtils.ClassDist(resp).doAll(resp).rel_dist() : null;
final DeepLearningModel model = new DeepLearningModel(dest(), self(), source._key, dinfo, (DeepLearning)this.clone(), priorDist);
model.model_info().initializeMembers();
return model;
}
finally {
unlock_data();
}
}
/**
* Helper to update a Frame and add it to the local trash at the same time
* @param target Frame reference, to be overwritten
* @param src Newly made frame, to be deleted via local trash
* @return src
*/
Frame updateFrame(Frame target, Frame src) {
if (src != target) ltrash(src);
return src;
}
/**
* Train a Deep Learning neural net model
* @param model Input model (e.g., from initModel(), or from a previous training run)
* @return Trained model
*/
public final DeepLearningModel trainModel(DeepLearningModel model) {
Frame validScoreFrame = null;
Frame train, trainScoreFrame;
try {
lock_data();
if (checkpoint == null && !quiet_mode) logStart(); //if checkpoint is given, some Job's params might be uninitialized (but the restarted model's parameters are correct)
if (model == null) {
model = UKV.get(dest());
}
model.write_lock(self());
final DeepLearning mp = model.model_info().get_params(); //use the model's parameters for everything below - NOT the job's parameters (can be different after checkpoint restart)
prepareValidationWithModel(model);
final long model_size = model.model_info().size();
if (!quiet_mode) Log.info("Number of model parameters (weights/biases): " + String.format("%,d", model_size));
train = model.model_info().data_info()._adaptedFrame;
if (mp.force_load_balance) train = updateFrame(train, reBalance(train, mp.replicate_training_data));
if (mp.classification && mp.balance_classes) {
float[] trainSamplingFactors = new float[train.lastVec().domain().length]; //leave initialized to 0 -> will be filled up below
if (class_sampling_factors != null) {
if (class_sampling_factors.length != train.lastVec().domain().length)
throw new IllegalArgumentException("class_sampling_factors must have " + train.lastVec().domain().length + " elements");
trainSamplingFactors = class_sampling_factors.clone(); //clone: don't modify the original
}
train = updateFrame(train, sampleFrameStratified(
train, train.lastVec(), trainSamplingFactors, (long)(mp.max_after_balance_size*train.numRows()), mp.seed, true, false));
model.setModelClassDistribution(new MRUtils.ClassDist(train.lastVec()).doAll(train.lastVec()).rel_dist());
}
model.training_rows = train.numRows();
trainScoreFrame = updateFrame(train, sampleFrame(train, mp.score_training_samples, mp.seed)); //training scoring dataset is always sampled uniformly from the training dataset
if (!quiet_mode) Log.info("Number of chunks of the training data: " + train.anyVec().nChunks());
if (validation != null) {
model.validation_rows = validation.numRows();
Frame adaptedValid = getValidation();
if (getValidAdaptor().needsAdaptation2CM()) {
adaptedValid.add(getValidAdaptor().adaptedValidationResponse(_responseName), getValidAdaptor().getAdaptedValidationResponse2CM());
}
// validation scoring dataset can be sampled in multiple ways from the given validation dataset
if (mp.classification && mp.balance_classes && mp.score_validation_sampling == ClassSamplingMethod.Stratified) {
validScoreFrame = updateFrame(adaptedValid, sampleFrameStratified(adaptedValid, adaptedValid.lastVec(), null,
mp.score_validation_samples > 0 ? mp.score_validation_samples : adaptedValid.numRows(), mp.seed+1, false /* no oversampling */, false));
} else {
validScoreFrame = updateFrame(adaptedValid, sampleFrame(adaptedValid, mp.score_validation_samples, mp.seed+1));
}
if (mp.force_load_balance) validScoreFrame = updateFrame(validScoreFrame, reBalance(validScoreFrame, false /*always split up globally since scoring should be distributed*/));
if (!quiet_mode) Log.info("Number of chunks of the validation data: " + validScoreFrame.anyVec().nChunks());
}
// Set train_samples_per_iteration size (cannot be done earlier since this depends on whether stratified sampling is done)
model.actual_train_samples_per_iteration = computeTrainSamplesPerIteration(mp, train.numRows(), model);
// Determine whether shuffling is enforced
if(mp.replicate_training_data && (model.actual_train_samples_per_iteration == train.numRows()*(mp.single_node_mode?1:H2O.CLOUD.size())) && !mp.shuffle_training_data && H2O.CLOUD.size() > 1 && !mp.reproducible) {
Log.warn("Enabling training data shuffling, because all nodes train on the full dataset (replicated training data).");
mp.shuffle_training_data = true;
}
model._timeLastScoreEnter = System.currentTimeMillis(); //to keep track of time per iteration, must be called before first call to doScoring
if (!mp.quiet_mode) Log.info("Initial model:\n" + model.model_info());
if (autoencoder) model.doScoring(train, trainScoreFrame, validScoreFrame, self(), getValidAdaptor()); //get the null model reconstruction error
// put the initial version of the model into DKV
model.update(self());
Log.info("Starting to train the Deep Learning model.");
//main loop
do model.set_model_info(H2O.CLOUD.size() > 1 && mp.replicate_training_data ? ( mp.single_node_mode ?
new DeepLearningTask2(train, model.model_info(), rowFraction(train, mp, model)).invoke(Key.make()).model_info() : //replicated data + single node mode
new DeepLearningTask2(train, model.model_info(), rowFraction(train, mp, model)).invokeOnAllNodes().model_info() ) : //replicated data + multi-node mode
new DeepLearningTask(model.model_info(), rowFraction(train, mp, model)).doAll(train).model_info()); //distributed data (always in multi-node mode)
while (model.doScoring(train, trainScoreFrame, validScoreFrame, self(), getValidAdaptor()));
// replace the model with the best model so far (if it's better)
if (!isCancelledOrCrashed() && override_with_best_model && model.actual_best_model_key != null && n_folds == 0) {
DeepLearningModel best_model = UKV.get(model.actual_best_model_key);
if (best_model != null && best_model.error() < model.error() && Arrays.equals(best_model.model_info().units, model.model_info().units)) {
Log.info("Setting the model to be the best model so far (based on scoring history).");
DeepLearningModel.DeepLearningModelInfo mi = best_model.model_info().deep_clone();
// Don't cheat - count full amount of training samples, since that's the amount of training it took to train (without finding anything better)
mi.set_processed_global(model.model_info().get_processed_global());
mi.set_processed_local(model.model_info().get_processed_local());
model.set_model_info(mi);
model.update(self());
model.doScoring(train, trainScoreFrame, validScoreFrame, self(), getValidAdaptor());
assert(best_model.error() == model.error());
}
}
Log.info(model);
Log.info("Finished training the Deep Learning model.");
return model;
}
catch(JobCancelledException ex) {
model = UKV.get(dest());
state = JobState.CANCELLED; //for JSON REST response
model.get_params().state = state; //for parameter JSON on the HTML page
Log.info("Deep Learning model building was cancelled.");
return model;
}
catch(Throwable t) {
t.printStackTrace();
model = UKV.get(dest());
state = JobState.FAILED; //for JSON REST response
if (model != null) {
model.get_params().state = state; //for parameter JSON on the HTML page
Log.info("Deep Learning model building failed.");
}
return model;
}
finally {
if (model != null && DKV.get(model._key) != null) model.unlock(self());
unlock_data();
}
}
/**
* Lock the input datasets against deletes
*/
private void lock_data() {
source.read_lock(self());
if( validation != null && source._key != null && validation._key !=null && !source._key.equals(validation._key) )
validation.read_lock(self());
}
/**
* Release the lock for the input datasets
*/
private void unlock_data() {
source.unlock(self());
if( validation != null && source._key != null && validation._key != null && !source._key.equals(validation._key) )
validation.unlock(self());
}
/**
* Delete job related keys
*/
public void delete() {
cleanup();
if (_fakejob) UKV.remove(job_key);
remove();
}
/**
* Rebalance a frame for load balancing
* @param fr Input frame
* @param local whether to create only enough chunks to max out all cores on a single node
* @return Frame that has potentially more chunks
*/
private Frame reBalance(final Frame fr, boolean local) {
int chunks = (int)Math.min( 4 * H2O.NUMCPUS * (local ? 1 : H2O.CLOUD.size()), fr.numRows());
if (fr.anyVec().nChunks() > chunks && !reproducible) {
Log.info("Dataset already contains " + fr.anyVec().nChunks() + " chunks. No need to rebalance.");
return fr;
} else if (reproducible) {
Log.warn("Reproducibility enforced - using only 1 thread - can be slow.");
chunks = 1;
}
if (!quiet_mode) Log.info("ReBalancing dataset into (at least) " + chunks + " chunks.");
// return MRUtils.shuffleAndBalance(fr, chunks, seed, local, shuffle_training_data);
String snewKey = fr._key != null ? (fr._key.toString() + ".balanced") : Key.rand();
Key newKey = Key.makeSystem(snewKey);
RebalanceDataSet rb = new RebalanceDataSet(fr, newKey, chunks);
H2O.submitTask(rb);
rb.join();
return UKV.get(newKey);
}
/**
* Compute the actual train_samples_per_iteration size from the user-given parameter
* @param mp Model parameter (DeepLearning object)
* @param numRows number of training rows
* @param model DL Model
* @return The total number of training rows to be processed per iteration (summed over all nodes)
*/
private static long computeTrainSamplesPerIteration(final DeepLearning mp, final long numRows, DeepLearningModel model) {
long tspi = mp.train_samples_per_iteration;
assert(tspi == 0 || tspi == -1 || tspi == -2 || tspi >= 1);
if (tspi == 0 || (!mp.replicate_training_data && tspi == -1) ) {
tspi = numRows;
if (!mp.quiet_mode) Log.info("Setting train_samples_per_iteration (" + mp.train_samples_per_iteration + ") to one epoch: #rows (" + tspi + ").");
}
else if (tspi == -1) {
tspi = (mp.single_node_mode ? 1 : H2O.CLOUD.size()) * numRows;
if (!mp.quiet_mode) Log.info("Setting train_samples_per_iteration (" + mp.train_samples_per_iteration + ") to #nodes x #rows (" + tspi + ").");
} else if (tspi == -2) {
// automatic tuning based on CPU speed, network speed and model size
// measure cpu speed
double total_gflops = 0;
for (H2ONode h2o : H2O.CLOUD._memary) {
HeartBeat hb = h2o._heartbeat;
total_gflops += hb._gflops;
}
if (mp.single_node_mode) total_gflops /= H2O.CLOUD.size();
if (total_gflops == 0) {
total_gflops = Linpack.run(H2O.SELF._heartbeat._cpus_allowed) * (mp.single_node_mode ? 1 : H2O.CLOUD.size());
}
final long model_size = model.model_info().size();
int[] msg_sizes = new int[]{ (int)(model_size*4) == (model_size*4) ? (int)(model_size*4) : Integer.MAX_VALUE };
double[] microseconds_collective = new double[msg_sizes.length];
NetworkTest.NetworkTester nt = new NetworkTest.NetworkTester(msg_sizes,null,microseconds_collective,model_size>1e6 ? 1 : 5 /*repeats*/,false,true /*only collectives*/);
nt.compute2();
//length of the network traffic queue based on log-tree rollup (2 log(nodes))
int network_queue_length = mp.single_node_mode || H2O.CLOUD.size() == 1? 1 : 2*(int)Math.floor(Math.log(H2O.CLOUD.size())/Math.log(2));
// heuristics
double flops_overhead_per_row = 30;
if (mp.activation == Activation.Maxout || mp.activation == Activation.MaxoutWithDropout) {
flops_overhead_per_row *= 8;
} else if (mp.activation == Activation.Tanh || mp.activation == Activation.TanhWithDropout) {
flops_overhead_per_row *= 5;
}
// target fraction of comm vs cpu time: 5%
double fraction = mp.single_node_mode || H2O.CLOUD.size() == 1 ? 1e-3 : 0.05; //in single node mode, there's no model averaging effect, so there is less need to shorten the M/R iteration
// estimate the time for communication (network) and training (compute)
model.time_for_communication_us = (H2O.CLOUD.size() == 1 ? 1e4 /* add 10ms for single-node */ : 0) + network_queue_length * microseconds_collective[0];
double time_per_row_us = flops_overhead_per_row * model_size / (total_gflops * 1e9) / H2O.SELF._heartbeat._cpus_allowed * 1e6;
// compute the optimal number of training rows per iteration
// fraction := time_comm_us / (time_comm_us + tspi * time_per_row_us) ==> tspi = (time_comm_us/fraction - time_comm_us)/time_per_row_us
tspi = (long)((model.time_for_communication_us / fraction - model.time_for_communication_us)/ time_per_row_us);
tspi = Math.min(tspi, (mp.single_node_mode ? 1 : H2O.CLOUD.size()) * numRows * 10); //not more than 10x of what train_samples_per_iteration=-1 would do
// If the number is close to a multiple of epochs, use that -> prettier scoring
if (tspi > numRows && Math.abs(tspi % numRows)/(double)numRows < 0.2) tspi = tspi - tspi % numRows;
tspi = Math.min(tspi, (long)(mp.epochs * numRows / 10)); //limit to number of epochs desired, but at least 10 iterations total
tspi = Math.max(1, tspi); //at least 1 point
if (!mp.quiet_mode) {
Log.info("Auto-tuning parameter 'train_samples_per_iteration':");
Log.info("Estimated compute power : " + (int)total_gflops + " GFlops");
Log.info("Estimated time for comm : " + PrettyPrint.usecs((long)model.time_for_communication_us));
Log.info("Estimated time per row : " + ((long)time_per_row_us > 0 ? PrettyPrint.usecs((long)time_per_row_us) : time_per_row_us + " usecs"));
Log.info("Estimated training speed: " + (int)(1e6/time_per_row_us) + " rows/sec");
Log.info("Setting train_samples_per_iteration (" + mp.train_samples_per_iteration + ") to auto-tuned value: " + tspi);
}
} else {
// limit user-given value to number of epochs desired
tspi = Math.min(tspi, (long)(mp.epochs * numRows));
}
assert(tspi != 0 && tspi != -1 && tspi != -2 && tspi >= 1);
return tspi;
}
/**
* Compute the fraction of rows that need to be used for training during one iteration
* @param numRows number of training rows
* @param train_samples_per_iteration number of training rows to be processed per iteration
* @param replicate_training_data whether or not the training data is replicated on each node
* @return fraction of rows to be used for training during one iteration
*/
private static float computeRowUsageFraction(final long numRows, final long train_samples_per_iteration, final boolean replicate_training_data) {
float rowUsageFraction = (float)train_samples_per_iteration / numRows;
if (replicate_training_data) rowUsageFraction /= H2O.CLOUD.size();
assert(rowUsageFraction > 0);
return rowUsageFraction;
}
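// Worked example: numRows = 1e6 and train_samples_per_iteration = 250,000 yield a
// row usage fraction of 0.25 per iteration; with training data replicated on a
// 4-node cloud, each node samples only 0.25 / 4 = 0.0625 of its local copy per iteration.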
private static float rowFraction(Frame train, DeepLearning p, DeepLearningModel m) {
return computeRowUsageFraction(train.numRows(), m.actual_train_samples_per_iteration, p.replicate_training_data);
}
/**
* Cross-Validate a DeepLearning model by building new models on N train/test holdout splits
* @param splits Frames containing train/test splits
* @param cv_preds Array of Frames to store the predictions for each cross-validation run
* @param offsets Array to store the offsets of starting row indices for each cross-validation run
* @param i Which fold of cross-validation to perform
*/
@Override public void crossValidate(Frame[] splits, Frame[] cv_preds, long[] offsets, int i) {
// Train a clone with slightly modified parameters (to account for cross-validation)
final DeepLearning cv = (DeepLearning) this.clone();
cv.genericCrossValidation(splits, offsets, i);
cv_preds[i] = ((DeepLearningModel) UKV.get(cv.dest())).score(cv.validation);
new TAtomic<DeepLearningModel>() {
@Override public DeepLearningModel atomic(DeepLearningModel m) {
if (!keep_cross_validation_splits && /*paranoid*/cv.dest().toString().contains("xval")) {
m.get_params().source = null;
m.get_params().validation=null;
m.get_params().response=null;
}
return m;
}
}.invoke(cv.dest());
}
}