package hex.schemas;

import hex.Distribution;
import hex.deeplearning.DeepLearning;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import water.api.API;
import water.api.schemas3.ModelParametersSchemaV3;
import water.api.schemas3.KeyV3;

public class DeepLearningV3 extends ModelBuilderSchema<DeepLearning, DeepLearningV3, DeepLearningV3.DeepLearningParametersV3> {

  public static final class DeepLearningParametersV3 extends ModelParametersSchemaV3<DeepLearningParameters, DeepLearningParametersV3> {

    // Determines the order of parameters in the GUI
    public static String[] fields = {
        "model_id", "training_frame", "validation_frame", "nfolds",
        "keep_cross_validation_predictions", "keep_cross_validation_fold_assignment",
        "fold_assignment", "fold_column", "response_column", "ignored_columns",
        "ignore_const_cols", "score_each_iteration", "weights_column", "offset_column",
        "balance_classes", "class_sampling_factors", "max_after_balance_size",
        "max_confusion_matrix_size", "max_hit_ratio_k", "checkpoint",
        "pretrained_autoencoder", "overwrite_with_best_model", "use_all_factor_levels",
        "standardize", "activation", "hidden", "epochs", "train_samples_per_iteration",
        "target_ratio_comm_to_comp", "seed", "adaptive_rate", "rho", "epsilon",
        "rate", "rate_annealing", "rate_decay", "momentum_start", "momentum_ramp",
        "momentum_stable", "nesterov_accelerated_gradient", "input_dropout_ratio",
        "hidden_dropout_ratios", "l1", "l2", "max_w2", "initial_weight_distribution",
        "initial_weight_scale", "initial_weights", "initial_biases", "loss",
        "distribution", "quantile_alpha", "tweedie_power", "huber_alpha",
        "score_interval", "score_training_samples", "score_validation_samples",
        "score_duty_cycle", "classification_stop", "regression_stop",
        "stopping_rounds", "stopping_metric", "stopping_tolerance", "max_runtime_secs",
        "score_validation_sampling", "diagnostics", "fast_mode", "force_load_balance",
        "variable_importances", "replicate_training_data", "single_node_mode",
        "shuffle_training_data", "missing_values_handling", "quiet_mode",
        "autoencoder", "sparse", "col_major", "average_activation", "sparsity_beta",
        "max_categorical_features", "reproducible", "export_weights_and_biases",
        "mini_batch_size", "categorical_encoding", "elastic_averaging",
        "elastic_averaging_moving_rate", "elastic_averaging_regularization"
    };

    /* Imbalanced Classes */

    /**
     * For imbalanced data, balance training data class counts via
     * over/under-sampling. This can result in improved predictive accuracy.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Balance training data class counts via over/under-sampling (for imbalanced data).")
    public boolean balance_classes;

    /**
     * Desired over/under-sampling ratios per class (in lexicographic order).
     * Only applies when balance_classes is enabled.
     * If not specified, the factors will be automatically computed to obtain class balance during training.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling " +
               "factors will be automatically computed to obtain class balance during training. Requires balance_classes.")
    public float[] class_sampling_factors;
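    // Illustrative sketch (not part of this schema): one common way to derive
    // per-class sampling factors when class_sampling_factors is left unset is to
    // scale each class toward the mean class count. The helper below is
    // hypothetical, not the exact routine H2O uses internally.
    //
    //   static float[] autoSamplingFactors(long[] classCounts) {
    //     long total = 0;
    //     for (long c : classCounts) total += c;
    //     double mean = (double) total / classCounts.length;
    //     float[] factors = new float[classCounts.length];
    //     for (int i = 0; i < classCounts.length; i++)
    //       factors[i] = (float) (mean / classCounts[i]); // >1: over-sample, <1: under-sample
    //     return factors;
    //   }
    //
    // E.g., counts {900, 100} give factors of roughly {0.56, 5.0}, so the minority
    // class is over-sampled 5x toward balance (subject to max_after_balance_size below).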
    /**
     * When classes are balanced, limit the resulting dataset size to the
     * specified multiple of the original dataset size.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = false,
        help = "Maximum relative size of the training data after balancing class counts (can be less than 1.0). " +
               "Requires balance_classes.")
    public float max_after_balance_size;

    /**
     * For classification models, the maximum size (in terms of classes) of
     * the confusion matrix for it to be printed. This option is meant to
     * avoid printing extremely large confusion matrices.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = false,
        help = "[Deprecated] Maximum size (# classes) for confusion matrices to be printed in the Logs.")
    public int max_confusion_matrix_size;

    /**
     * The maximum number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to disable).
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Max. number (top K) of predictions to use for hit ratio computation (for multi-class only, 0 to " +
               "disable).")
    public int max_hit_ratio_k;

    /* Neural Net Topology */

    /**
     * The activation function (non-linearity) to be used by the neurons in the hidden layers.
     * Tanh: Hyperbolic tangent function (same as a scaled and shifted sigmoid).
     * Rectifier: Rectified Linear Unit: chooses the maximum of (0, x) where x is the input value.
     * Maxout: Chooses the maximum coordinate of the input vector.
     * ExpRectifier: Exponential Rectifier Linear Unit (http://arxiv.org/pdf/1511.07289v2.pdf).
     * With Dropout: Zero out a random user-given fraction of the
     * incoming weights to each hidden layer during training, for each
     * training row. This effectively trains exponentially many models at
     * once, and can improve generalization.
     */
    @API(level = API.Level.critical, direction = API.Direction.INOUT, gridable = true,
        values = {"Tanh", "TanhWithDropout", "Rectifier", "RectifierWithDropout", "Maxout", "MaxoutWithDropout"},
        help = "Activation function.")
    public DeepLearningParameters.Activation activation;

    /**
     * The number and size of each hidden layer in the model.
     * For example, if a user specifies "100,200,100" a model with 3 hidden
     * layers will be produced, and the middle hidden layer will have 200
     * neurons.
     */
    @API(level = API.Level.critical, direction = API.Direction.INOUT, gridable = true,
        help = "Hidden layer sizes (e.g. [100, 100]).")
    public int[] hidden;

    /**
     * The number of passes over the training dataset to be carried out.
     * It is recommended to start with lower values for initial grid searches.
     * This value can be modified during checkpoint restarts and allows continuation
     * of selected models.
     */
    @API(level = API.Level.critical, direction = API.Direction.INOUT, gridable = true,
        help = "How many times the dataset should be iterated (streamed), can be fractional.")
    public double epochs;
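    // Illustrative sketch (not part of this schema): configuring the topology
    // through the h2o-3 Java API, assuming the usual convention that
    // DeepLearningParameters mirrors these schema names with a leading
    // underscore; frame setup and training are elided.
    //
    //   DeepLearningParameters p = new DeepLearningParameters();
    //   p._hidden = new int[]{100, 200, 100};  // 3 hidden layers, middle one has 200 neurons
    //   p._activation = DeepLearningParameters.Activation.RectifierWithDropout;
    //   p._epochs = 10;                        // can be fractional, e.g. 0.5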
    /**
     * The number of training data rows to be processed per iteration. Note that
     * independent of this parameter, each row is used immediately to update the model
     * with (online) stochastic gradient descent. This parameter controls the
     * synchronization period between nodes in a distributed environment and the
     * frequency at which scoring and model cancellation can happen. For example, if
     * it is set to 10,000 on H2O running on 4 nodes, then each node will
     * process 2,500 rows per iteration, sampling randomly from its local data.
     * Then, model averaging between the nodes takes place, and scoring can happen
     * (dependent on scoring interval and duty factor). Special values are 0 for
     * one epoch per iteration, -1 for processing the maximum amount of data
     * per iteration (if **replicate training data** is enabled, N epochs
     * will be trained per iteration on N nodes, otherwise one epoch), and -2 for
     * automatic mode (auto-tuning).
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Number of training samples (globally) per MapReduce iteration. Special values are 0: one epoch, -1: " +
               "all available data (e.g., replicated training data), -2: automatic.")
    public long train_samples_per_iteration;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Target ratio of communication overhead to computation. Only for multi-node operation and " +
               "train_samples_per_iteration = -2 (auto-tuning).")
    public double target_ratio_comm_to_comp;
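    // Worked example (illustrative only) of the arithmetic in the Javadoc above:
    // with train_samples_per_iteration = 10_000 on a 4-node cluster, each node
    // processes 10_000 / 4 = 2_500 locally sampled rows, then the per-node models
    // are averaged and scoring may run.
    //
    //   long samplesPerIteration = 10_000;
    //   int nNodes = 4;
    //   long rowsPerNode = samplesPerIteration / nNodes;  // 2_500 rows per node per iteration
    //
    // The special values 0 (one epoch), -1 (all available data), and -2
    // (auto-tune against target_ratio_comm_to_comp) bypass this arithmetic.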
    /**
     * The random seed controls sampling and initialization. Reproducible
     * results are only expected with single-threaded operation (i.e.,
     * when running on one node, turning off load balancing and providing
     * a small dataset that fits in one chunk). In general, the
     * multi-threaded asynchronous updates to the model parameters will
     * result in (intentional) race conditions and non-reproducible
     * results. Note that deterministic sampling and initialization might
     * still lead to some weak sense of determinism in the model.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Seed for random numbers (affects sampling) - Note: only reproducible when running single threaded.")
    public long seed;

    /* Adaptive Learning Rate */

    /**
     * The implemented adaptive learning rate algorithm (ADADELTA) automatically
     * combines the benefits of learning rate annealing and momentum
     * training to avoid slow convergence. Specification of only two
     * parameters (rho and epsilon) simplifies hyperparameter search.
     * In some cases, manually controlled (non-adaptive) learning rate and
     * momentum specifications can lead to better results, but require the
     * specification (and hyperparameter search) of up to 7 parameters.
     * If the model is built on a topology with many local minima or
     * long plateaus, it is possible for a constant learning rate to produce
     * sub-optimal results. Learning rate annealing allows digging deeper into
     * local minima, while rate decay allows specification of different
     * learning rates per layer. When the gradient is being estimated in
     * a long valley in the optimization landscape, a large learning rate
     * can cause the gradient to oscillate and move in the wrong
     * direction. When the gradient is computed on a relatively flat
     * surface with small learning rates, the model can converge far
     * slower than necessary.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Adaptive learning rate.")
    public boolean adaptive_rate;

    /**
     * The first of two hyperparameters for adaptive learning rate (ADADELTA).
     * It is similar to momentum and relates to the memory of prior weight updates.
     * Typical values are between 0.9 and 0.999.
     * This parameter is only active if adaptive learning rate is enabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Adaptive learning rate time decay factor (similarity to prior updates).")
    public double rho;

    /**
     * The second of two hyperparameters for adaptive learning rate (ADADELTA).
     * It is similar to learning rate annealing during initial training
     * and momentum at later stages where it allows forward progress.
     * Typical values are between 1e-10 and 1e-4.
     * This parameter is only active if adaptive learning rate is enabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Adaptive learning rate smoothing factor (to avoid divisions by zero and allow progress).")
    public double epsilon;

    /* Learning Rate */

    /**
     * When adaptive learning rate is disabled, the magnitude of the weight
     * updates is determined by the user-specified learning rate
     * (potentially annealed), and is a function of the difference
     * between the predicted value and the target value. That difference,
     * generally called delta, is only available at the output layer. To
     * correct the output at each hidden layer, back propagation is
     * used. Momentum modifies back propagation by allowing prior
     * iterations to influence the current update. Using the momentum
     * parameter can aid in avoiding local minima and the associated
     * instability. Too much momentum can lead to instabilities, which is
     * why momentum is best ramped up slowly.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Learning rate (higher => less stable, lower => slower convergence).")
    public double rate;

    /**
     * Learning rate annealing reduces the learning rate to "freeze" into
     * local minima in the optimization landscape. The annealing rate is the
     * inverse of the number of training samples it takes to cut the learning rate in half
     * (e.g., 1e-6 means that it takes 1e6 training samples to halve the learning rate).
     * This parameter is only active if adaptive learning rate is disabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Learning rate annealing: rate / (1 + rate_annealing * samples).")
    public double rate_annealing;

    /**
     * The learning rate decay parameter controls the change of learning rate across layers.
     * For example, assume the rate parameter is set to 0.01, and the rate_decay parameter is set to 0.5.
     * Then the learning rate for the weights connecting the input and first hidden layer will be 0.01,
     * the learning rate for the weights connecting the first and the second hidden layer will be 0.005,
     * and the learning rate for the weights connecting the second and third hidden layer will be 0.0025, etc.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Learning rate decay factor between layers (N-th layer: rate * rate_decay ^ (n - 1)).")
    public double rate_decay;
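    // Illustrative sketch: one plausible way to combine the two formulas quoted
    // in the help strings above (rate / (1 + rate_annealing * samples) and
    // rate * rate_decay^(n - 1)). The helper name is hypothetical; it is not
    // part of the H2O code base.
    //
    //   static double effectiveRate(double rate, double rateAnnealing, double rateDecay,
    //                               long samplesSeen, int layerIndex /* 1-based */) {
    //     double annealed = rate / (1 + rateAnnealing * samplesSeen);
    //     return annealed * Math.pow(rateDecay, layerIndex - 1);
    //   }
    //
    // With rate = 0.01 and rate_decay = 0.5 this reproduces the worked example in
    // the rate_decay Javadoc: layers 1..3 start at 0.01, 0.005, and 0.0025.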
    /* Momentum */

    /**
     * The momentum_start parameter controls the amount of momentum at the beginning of training.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Initial momentum at the beginning of training (try 0.5).")
    public double momentum_start;

    /**
     * The momentum_ramp parameter controls the amount of learning for which momentum increases
     * (assuming momentum_stable is larger than momentum_start). The ramp is measured in the number
     * of training samples.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Number of training samples for which momentum increases.")
    public double momentum_ramp;

    /**
     * The momentum_stable parameter controls the final momentum value reached after momentum_ramp training samples.
     * The momentum used for training will remain the same for training beyond reaching that point.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Final momentum after the ramp is over (try 0.99).")
    public double momentum_stable;

    /**
     * The Nesterov accelerated gradient descent method is a modification to
     * traditional gradient descent for convex functions. The method relies on
     * gradient information at various points to build a polynomial approximation that
     * minimizes the residuals in fewer iterations of the descent.
     * This parameter is only active if adaptive learning rate is disabled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Use Nesterov accelerated gradient (recommended).")
    public boolean nesterov_accelerated_gradient;
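    // Illustrative sketch of the momentum schedule described above: a linear ramp
    // from momentum_start to momentum_stable over momentum_ramp training samples,
    // constant afterwards. This is a plausible reading of the Javadocs, not a
    // copy of H2O's internal implementation.
    //
    //   static double momentum(double start, double stable, double ramp, double samplesSeen) {
    //     if (ramp <= 0 || samplesSeen >= ramp) return stable;
    //     return start + (stable - start) * samplesSeen / ramp;
    //   }
    //
    // E.g., start = 0.5, stable = 0.99, ramp = 1e6: after 500_000 samples the
    // momentum is 0.745, and from 1_000_000 samples onward it stays at 0.99.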
    /* Regularization */

    /**
     * A fraction of the features for each training row to be omitted from training in order
     * to improve generalization (dimension sampling).
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Input layer dropout ratio (can improve generalization, try 0.1 or 0.2).")
    public double input_dropout_ratio;

    /**
     * A fraction of the inputs for each hidden layer to be omitted from training in order
     * to improve generalization. Defaults to 0.5 for each hidden layer if omitted.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Hidden layer dropout ratios (can improve generalization), specify one value per hidden layer, " +
               "defaults to 0.5.")
    public double[] hidden_dropout_ratios;

    /**
     * A regularization method that constrains the absolute value of the weights and
     * has the net effect of dropping some weights (setting them to zero) from a model
     * to reduce complexity and avoid overfitting.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "L1 regularization (can add stability and improve generalization, causes many weights to become 0).")
    public double l1;

    /**
     * A regularization method that constrains the sum of the squared
     * weights. This method introduces bias into parameter estimates, but
     * frequently produces substantial gains in modeling as estimate variance is
     * reduced.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "L2 regularization (can add stability and improve generalization, causes many weights to be small).")
    public double l2;

    /**
     * A maximum on the sum of the squared incoming weights into
     * any one neuron. This tuning parameter is especially useful for unbounded
     * activation functions such as Maxout or Rectifier.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Constraint for squared sum of incoming weights per unit (e.g. for Rectifier).")
    public float max_w2;

    /* Initialization */

    /**
     * The distribution from which initial weights are to be drawn. The default
     * option is an optimized initialization that considers the size of the network.
     * The "uniform" option uses a uniform distribution with a mean of 0 and a given
     * interval. The "normal" option draws weights from a normal distribution with
     * a mean of 0 and the given standard deviation.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        values = {"UniformAdaptive", "Uniform", "Normal"},
        help = "Initial weight distribution.")
    public DeepLearningParameters.InitialWeightDistribution initial_weight_distribution;

    /**
     * The scale of the distribution function for Uniform or Normal distributions.
     * For Uniform, the values are drawn uniformly from -initial_weight_scale...initial_weight_scale.
     * For Normal, the values are drawn from a Normal distribution with a standard deviation of initial_weight_scale.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Uniform: -value...value, Normal: stddev.")
    public double initial_weight_scale;
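    // Illustrative sketch of the two parameterized initializers described above,
    // using java.util.Random; a hypothetical stand-in, not H2O's initializer.
    //
    //   static double initialWeight(java.util.Random rng, boolean uniform, double scale) {
    //     return uniform
    //         ? (2 * rng.nextDouble() - 1) * scale   // Uniform: drawn from [-scale, scale]
    //         : rng.nextGaussian() * scale;          // Normal: mean 0, stddev = scale
    //   }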
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "A list of H2OFrame ids to initialize the weight matrices of this model with.")
    public KeyV3.FrameKeyV3[] initial_weights;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "A list of H2OFrame ids to initialize the bias vectors of this model with.")
    public KeyV3.FrameKeyV3[] initial_biases;

    /**
     * The loss (error) function to be minimized by the model.
     * CrossEntropy loss is used when the model output consists of independent
     * hypotheses, and the outputs can be interpreted as the probability that each
     * hypothesis is true. Cross entropy is the recommended loss function when the
     * target values are class labels, and especially for imbalanced data.
     * It strongly penalizes error in the prediction of the actual class label.
     * Quadratic loss is used when the model outputs are continuous real values, but can
     * be used for classification as well (where it emphasizes the error on all
     * output classes, not just for the actual class).
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true, required = false,
        values = {"Automatic", "CrossEntropy", "Quadratic", "Huber", "Absolute", "Quantile"},
        help = "Loss function.")
    public DeepLearningParameters.Loss loss;

    /* Scoring */

    /**
     * The minimum time (in seconds) to elapse between model scoring. The actual
     * interval is determined by the number of training samples per iteration and the scoring duty cycle.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Shortest time interval (in seconds) between model scoring.")
    public double score_interval;

    /**
     * The number of training dataset points to be used for scoring. Will be
     * randomly sampled. Use 0 for selecting the entire training dataset.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Number of training set samples for scoring (0 for all).")
    public long score_training_samples;

    /**
     * The number of validation dataset points to be used for scoring. Can be
     * randomly sampled or stratified (if "balance classes" is set and "score
     * validation sampling" is set to stratify). Use 0 for selecting the entire
     * validation dataset.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Number of validation set samples for scoring (0 for all).")
    public long score_validation_samples;

    /**
     * Maximum fraction of wall clock time spent on model scoring on training and validation samples,
     * and on diagnostics such as computation of feature importances (i.e., not on training).
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Maximum duty cycle fraction for scoring (lower: more training, higher: more scoring).")
    public double score_duty_cycle;

    /**
     * The stopping criterion in terms of classification error (1 - accuracy) on the
     * training data scoring dataset. When the error is at or below this threshold,
     * training stops.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Stopping criterion for classification error fraction on training data (-1 to disable).")
    public double classification_stop;

    /**
     * The stopping criterion in terms of regression error (MSE) on the training
     * data scoring dataset. When the error is at or below this threshold, training
     * stops.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Stopping criterion for regression error (MSE) on training data (-1 to disable).")
    public double regression_stop;
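    // Illustrative sketch of the threshold semantics documented above: training
    // stops once the scored training error falls to or below the threshold, and a
    // negative threshold disables the check. Hypothetical helper, for clarity only.
    //
    //   static boolean shouldStop(double threshold, double scoredError) {
    //     return threshold >= 0 && scoredError <= threshold;
    //   }
    //
    //   // shouldStop(0.01, 0.008) -> true;  shouldStop(-1, 0.008) -> false (disabled)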
    /**
     * Enable quiet mode for less output to standard output.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Enable quiet mode for less output to standard output.")
    public boolean quiet_mode;

    /**
     * Method used to sample the validation dataset for scoring, see Score Validation Samples above.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        values = {"Uniform", "Stratified"},
        help = "Method used to sample validation dataset for scoring.")
    public DeepLearningParameters.ClassSamplingMethod score_validation_sampling;

    /* Miscellaneous */

    /**
     * If enabled, store the best model under the destination key of this model at the end of training.
     * Only applicable if training is not cancelled.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "If enabled, override the final model with the best model found during training.")
    public boolean overwrite_with_best_model;

    @API(level = API.Level.secondary, direction = API.Direction.INOUT,
        help = "Auto-Encoder.")
    public boolean autoencoder;

    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Use all factor levels of categorical variables. Otherwise, the first factor level is omitted (without" +
               " loss of accuracy). Useful for variable importances and auto-enabled for autoencoder.")
    public boolean use_all_factor_levels;

    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "If enabled, automatically standardize the data. If disabled, the user must provide properly scaled " +
               "input data.")
    public boolean standardize;

    /**
     * Gather diagnostics for hidden layers, such as mean and RMS values of learning
     * rate, momentum, weights and biases.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT,
        help = "Enable diagnostics for hidden layers.")
    public boolean diagnostics;

    /**
     * Whether to compute variable importances for input features.
     * The implemented method (by Gedeon) considers the weights connecting the
     * input features to the first two hidden layers.
     */
    @API(level = API.Level.critical, direction = API.Direction.INOUT, gridable = true,
        help = "Compute variable importances for input features (Gedeon method) - can be slow for large networks.")
    public boolean variable_importances;

    /**
     * Enable fast mode (minor approximation in back-propagation), should not affect results significantly.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Enable fast mode (minor approximation in back-propagation).")
    public boolean fast_mode;

    /**
     * Increase training speed on small datasets by splitting the data into many chunks
     * to allow utilization of all cores.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Force extra load balancing to increase training speed for small datasets (to keep all cores busy).")
    public boolean force_load_balance;

    /**
     * Replicate the entire training dataset onto every node for faster training on small datasets.
     */
    @API(level = API.Level.secondary, direction = API.Direction.INOUT, gridable = true,
        help = "Replicate the entire training dataset onto every node for faster training on small datasets.")
    public boolean replicate_training_data;

    /**
     * Run on a single node for fine-tuning of model parameters. Can be useful for
     * checkpoint resumes after training on multiple nodes for fast initial
     * convergence.
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Run on a single node for fine-tuning of model parameters.")
    public boolean single_node_mode;

    /**
     * Enable shuffling of training data (on each node). This option is
     * recommended if training data is replicated on N nodes, and the number of training samples per iteration
     * is close to N times the dataset size, where all nodes train with (almost) all
     * the data. It is automatically enabled if the number of training samples per iteration is set to -1 (or to N
     * times the dataset size or larger).
     */
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Enable shuffling of training data (recommended if training data is replicated and " +
               "train_samples_per_iteration is close to #nodes x #rows, or if using balance_classes).")
    public boolean shuffle_training_data;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        values = {"MeanImputation", "Skip"},
        help = "Handling of missing values. Either MeanImputation or Skip.")
    public DeepLearningParameters.MissingValuesHandling missing_values_handling;
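    // Illustrative sketch of the two documented policies above: MeanImputation
    // replaces a missing input with the column mean, while Skip excludes the whole
    // row from training. Hypothetical helper, not H2O's data-preparation code.
    //
    //   // MeanImputation: NaN -> column mean; under Skip, rows with NaN never reach this point.
    //   static double imputed(double value, double columnMean) {
    //     return Double.isNaN(value) ? columnMean : value;
    //   }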
    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Sparse data handling (more efficient for data with lots of 0 values).")
    public boolean sparse;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "#DEPRECATED Use a column major weight matrix for input layer. Can speed up forward propagation, but " +
               "might slow down backpropagation.")
    public boolean col_major;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Average activation for sparse auto-encoder. #Experimental")
    public double average_activation;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Sparsity regularization. #Experimental")
    public double sparsity_beta;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Max. number of categorical features, enforced via hashing. #Experimental")
    public int max_categorical_features;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Force reproducibility on small data (will be slow - only uses 1 thread).")
    public boolean reproducible;

    @API(level = API.Level.expert, direction = API.Direction.INOUT,
        help = "Whether to export Neural Network weights and biases to H2O Frames.")
    public boolean export_weights_and_biases;

    @API(level = API.Level.expert, direction = API.Direction.INOUT,
        help = "Mini-batch size (smaller leads to better fit, larger can speed up and generalize better).")
    public int mini_batch_size;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Elastic averaging between compute nodes can improve distributed model convergence. #Experimental")
    public boolean elastic_averaging;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Elastic averaging moving rate (only if elastic averaging is enabled).")
    public double elastic_averaging_moving_rate;

    @API(level = API.Level.expert, direction = API.Direction.INOUT, gridable = true,
        help = "Elastic averaging regularization strength (only if elastic averaging is enabled).")
    public double elastic_averaging_regularization;

    @API(level = API.Level.expert, direction = API.Direction.INOUT,
        help = "Pretrained autoencoder model to initialize this model with.")
    public KeyV3.ModelKeyV3 pretrained_autoencoder;
  }
}
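// Illustrative sketch of the elastic averaging idea behind the last three
// parameters (following Zhang et al., "Deep learning with Elastic Averaging
// SGD"): each node's weights are pulled toward a shared consensus copy at the
// moving rate, and the consensus is pulled back toward the node. A conceptual
// sketch only, not H2O's distributed implementation.
//
//   static void elasticUpdate(double[] local, double[] consensus, double movingRate) {
//     for (int i = 0; i < local.length; i++) {
//       double diff = movingRate * (local[i] - consensus[i]);
//       local[i]     -= diff;  // pull the node toward the consensus
//       consensus[i] += diff;  // and the consensus toward the node
//     }
//   }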