package hex.deeplearning;

import hex.DataInfo;
import hex.genmodel.utils.DistributionFamily;
import static java.lang.Double.isNaN;
import hex.Model;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import water.*;
import water.fvec.Frame;
import water.util.*;

import java.util.Arrays;
import java.util.Random;

/**
 * This class contains the state of the Deep Learning model
 * This will be shared: one per node
 */
final public class DeepLearningModelInfo extends Iced<DeepLearningModelInfo> {

  public TwoDimTable summaryTable;

  public DataInfo data_info;
  public DataInfo data_info() { return data_info; }

  // model is described by parameters and the following arrays
  private Storage.DenseRowMatrix[] dense_row_weights; //one 2D weight matrix per layer (stored as a 1D array each)
  private Storage.DenseVector[] biases; //one 1D bias array per layer
  private Storage.DenseVector[] avg_activations; //one 1D array per hidden layer

  // helpers for storing previous step deltas
  // Note: These two arrays *could* be made transient and then initialized freshly in makeNeurons() and in DeepLearningTask.initLocal()
  // But then, after each reduction, the weights would be lost and would have to restart afresh -> not *exactly* right, but close...
  private Storage.DenseRowMatrix[] dense_row_weights_momenta;
  private Storage.DenseVector[] biases_momenta;

  // helpers for AdaDelta
  private Storage.DenseRowMatrix[] dense_row_ada_dx_g;
  private Storage.DenseVector[] biases_ada_dx_g;

  private boolean[] _saw_missing_cats;  // whether missing value was encountered for each categorical predictor - needed for varimp

  // compute model size (number of model parameters required for making predictions)
  // momenta are not counted here, but they are needed for model building
  public long size() {
    long siz = 0;
    for (Storage.DenseRowMatrix w : dense_row_weights) if (w != null) siz += w.size();
    for (Storage.Vector b : biases) siz += b.size();
    return siz;
  }

  /**
   * Check whether a missing value was found for every categorical predictor
   * @param cats activation of categorical buckets for a given row
   */
  void checkMissingCats(int[] cats) {
    if (cats == null) return;
    if (_saw_missing_cats == null) return;
    for (int i = 0; i < cats.length; ++i) {
      assert (data_info._catMissing[i]); //have a missing bucket for each categorical
      if (_saw_missing_cats[i]) continue;
      _saw_missing_cats[i] = (cats[i] == data_info._catOffsets[i + 1] - 1);
    }
  }
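  // Illustrative example (assumed numbers, not from the original source): for a hypothetical
  // network with 784 inputs, hidden=[200,200] and 10 output classes (no Maxout), size() above
  // counts
  //   weights: 784*200 + 200*200 + 200*10 = 198,800
  //   biases:  200 + 200 + 10 = 410
  // for a total of 199,210 parameters; momenta and AdaDelta state are excluded from the count.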
  // accessors to (shared) weights and biases - those will be updated racily (cf. Hogwild!)
  boolean has_momenta() { return get_params()._momentum_start != 0 || get_params()._momentum_stable != 0; }
  boolean adaDelta() { return get_params()._adaptive_rate; }
  public final Storage.DenseRowMatrix get_weights(int i) { return dense_row_weights[i]; }
  public final Storage.DenseVector get_biases(int i) { return biases[i]; }
  public final Storage.DenseRowMatrix get_weights_momenta(int i) { return dense_row_weights_momenta[i]; }
  public final Storage.DenseVector get_biases_momenta(int i) { return biases_momenta[i]; }
  public final Storage.DenseRowMatrix get_ada_dx_g(int i) { return dense_row_ada_dx_g[i]; }
  public final Storage.DenseVector get_biases_ada_dx_g(int i) { return biases_ada_dx_g[i]; }

  //accessor to shared parameter defining avg activations
  public final Storage.DenseVector get_avg_activations(int i) { return avg_activations[i]; }

  public DeepLearningParameters parameters;
  Key<Model> _model_id;
  public final DeepLearningParameters get_params() { return parameters; }
  public final void set_params(DeepLearningParameters p, Key<Model> model_id) {
    parameters = (DeepLearningParameters) p.clone();
    _model_id = model_id;
  }

  private double[] mean_rate;
  private double[] rms_rate;
  private double[] mean_bias;
  private double[] rms_bias;
  private double[] mean_weight;
  public double[] rms_weight;
  public double[] mean_a;

  private volatile boolean unstable = false;
  public boolean isUnstable() { return unstable; }
  public void setUnstable() {
    if (!unstable) computeStats();
    unstable = true;
  }

  private long processed_global;
  public synchronized long get_processed_global() { return processed_global; }
  public synchronized void set_processed_global(long p) { processed_global = p; }
  public synchronized void add_processed_global(long p) { processed_global += p; }

  private long processed_local;
  public synchronized long get_processed_local() { return processed_local; }
  public synchronized void set_processed_local(long p) { processed_local = p; }
  public synchronized void add_processed_local(long p) { processed_local += p; }

  public synchronized long get_processed_total() { return processed_global + processed_local; }
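  // Editorial note (an interpretation, not stated in the original source): processed_global
  // tracks training samples already folded into the shared model by previous reductions,
  // while processed_local counts samples this copy has seen since the last reduction;
  // add() below merges only the local counts, and get_processed_total() reports their sum.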
  // package local helpers
  int[] units; //number of neurons per layer, extracted from parameters and from datainfo

  final boolean _classification; // Classification cache (nclasses>1)
  final Frame _train;            // Prepared training frame
  final Frame _valid;            // Prepared validation frame

  /**
   * Dummy constructor, only to be used for deserialization from autobuffer
   */
  private DeepLearningModelInfo() {
    super(); // key is null
    _classification = false;
    _train = _valid = null;
  }

  /**
   * Main constructor
   * @param params Model parameters
   * @param dinfo Data Info
   * @param nClasses number of classes (1 for regression, 0 for autoencoder)
   * @param train User-given training data frame, prepared by AdaptTestTrain
   * @param valid User-specified validation data frame, prepared by AdaptTestTrain
   */
  public DeepLearningModelInfo(final DeepLearningParameters params, Key model_id, final DataInfo dinfo, int nClasses, Frame train, Frame valid) {
    _classification = nClasses > 1;
    _train = train;
    _valid = valid;
    data_info = dinfo;
    parameters = (DeepLearningParameters) params.clone(); //make a copy, don't change model's parameters
    _model_id = model_id;
    DeepLearningParameters.Sanity.modifyParms(parameters, parameters, nClasses); //sanitize the model_info's parameters

    final int num_input = dinfo.fullN();
    final int num_output = get_params()._autoencoder ? num_input :
            (_classification && parameters._distribution != DistributionFamily.modified_huber ?
                    train.vec(parameters._response_column).cardinality() : 1);
    if (!get_params()._autoencoder)
      assert (num_output == nClasses || parameters._distribution == DistributionFamily.modified_huber);

    _saw_missing_cats = dinfo._cats > 0 ? new boolean[data_info._cats] : null;
    assert (num_input > 0);
    assert (num_output > 0);
    if (has_momenta() && adaDelta())
      throw new IllegalArgumentException("Cannot have non-zero momentum and adaptive rate at the same time.");
    final int layers = get_params()._hidden.length;
    // units (# neurons for each layer)
    units = new int[layers + 2];
    if (get_params()._max_categorical_features <= Integer.MAX_VALUE - dinfo._nums)
      units[0] = Math.min(dinfo._nums + get_params()._max_categorical_features, num_input);
    else
      units[0] = num_input;
    System.arraycopy(get_params()._hidden, 0, units, 1, layers);
    units[layers + 1] = num_output;

    boolean printLevels = units[0] > 1000L;
    boolean warn = units[0] > 100000L;
    if (printLevels) {
      final String[][] domains = dinfo._adaptedFrame.domains();
      if (warn) {
        Log.warn("===================================================================================================================================");
        Log.warn(num_input + " input features" + (dinfo._cats > 0 ? " (after categorical one-hot encoding)" : "") + ". Can be slow and require a lot of memory.");
      }
      FrameUtils.printTopCategoricalLevels(dinfo._adaptedFrame, warn, 10);
      if (warn) {
        Log.warn("Suggestions:");
        Log.warn(" *) Limit the size of the first hidden layer");
        if (dinfo._cats > 0) {
          Log.warn(" *) Limit the total number of one-hot encoded features with the parameter 'max_categorical_features'");
          Log.warn(" *) Run h2o.interaction(...,pairwise=F) on high-cardinality categorical columns to limit the factor count, see http://learn.h2o.ai");
        }
        Log.warn("===================================================================================================================================");
      }
    }

    int[] mult = new int[layers + 1];
    for (int i = 0; i < layers; ++i) {
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout ||
                 get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    }
    mult[layers] = 1; //Output is never Maxout

    // weights (to connect layers)
    dense_row_weights = new Storage.DenseRowMatrix[layers + 1];
    dense_row_weights[0] = new Storage.DenseRowMatrix(mult[0] * units[1], units[0]);
    for (int i = 1; i <= layers; ++i)
      dense_row_weights[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1] /*rows*/, units[i] /*cols*/);

    // biases (only for hidden layers and output layer)
    biases = new Storage.DenseVector[layers + 1];
    for (int i = 0; i <= layers; ++i) biases[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    // average activation (only for hidden layers)
    if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
      avg_activations = new Storage.DenseVector[layers];
      mean_a = new double[layers];
      for (int i = 0; i < layers; ++i) avg_activations[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    }
    allocateHelperArrays();
    // for diagnostics
    mean_rate = new double[units.length - 1];
    rms_rate = new double[units.length - 1];
    mean_bias = new double[units.length - 1];
    rms_bias = new double[units.length - 1];
    mean_weight = new double[units.length - 1];
    rms_weight = new double[units.length - 1];
  }
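  // Illustrative example (assumed numbers, not from the original source): with 100 numeric
  // inputs, hidden=[5, 3] and 2 output classes under Maxout, units = {100, 5, 3, 2} and
  // mult = {2, 2, 1}, so the constructor above allocates
  //   dense_row_weights[0]: (2*5) x 100,  dense_row_weights[1]: (2*3) x 5,
  //   dense_row_weights[2]: (1*2) x 3,
  // i.e. Maxout doubles the rows of every matrix feeding a hidden layer, never the output.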
  /**
   * Allocate helper arrays for momentum/learning rate, etc.
   */
  void allocateHelperArrays() {
    int[] mult = new int[units.length - 1];
    for (int i = 0; i < units.length - 1; ++i) {
      mult[i] = (get_params()._activation == DeepLearningParameters.Activation.Maxout ||
                 get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout) ? 2 : 1;
    }
    mult[units.length - 2] = 1; //Output is never Maxout

    if (has_momenta()) {
      dense_row_weights_momenta = new Storage.DenseRowMatrix[dense_row_weights.length];
      if (dense_row_weights[0] != null)
        dense_row_weights_momenta[0] = new Storage.DenseRowMatrix(mult[0] * units[1], units[0]);
      for (int i = 1; i < dense_row_weights_momenta.length; ++i)
        dense_row_weights_momenta[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], units[i]);
      biases_momenta = new Storage.DenseVector[biases.length];
      for (int i = 0; i < biases_momenta.length; ++i)
        biases_momenta[i] = new Storage.DenseVector(mult[i] * units[i + 1]);
    } else if (adaDelta()) {
      dense_row_ada_dx_g = new Storage.DenseRowMatrix[dense_row_weights.length]; //AdaDelta state, two accumulators per weight
      dense_row_ada_dx_g[0] = new Storage.DenseRowMatrix(mult[0] * 2 * units[1], units[0]);
      for (int i = 1; i < dense_row_ada_dx_g.length; ++i) {
        dense_row_ada_dx_g[i] = new Storage.DenseRowMatrix(mult[i] * units[i + 1], 2 * units[i]);
      }
      biases_ada_dx_g = new Storage.DenseVector[biases.length];
      for (int i = 0; i < biases_ada_dx_g.length; ++i) {
        biases_ada_dx_g[i] = new Storage.DenseVector(mult[i] * 2 * units[i + 1]);
      }
    }
  }
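  // Editorial note (inferred from computeStats() below, not stated in the original source):
  // each AdaDelta buffer holds two accumulators per weight, stored interleaved in the raw
  // array - raw()[2*u] is the running average of squared deltas E[dx^2] and raw()[2*u+1] is
  // the running average of squared gradients E[g^2] - which is why every buffer above is
  // allocated with twice as many elements as its corresponding weight matrix or bias vector.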
  /**
   * Create a summary table
   * @return TwoDimTable with the summary of the model
   */
  TwoDimTable createSummaryTable() {
    computeStats();
    Neurons[] neurons = DeepLearningTask.makeNeuronsForTesting(this);
    long byte_size = new AutoBuffer().put(this).buf().length;
    TwoDimTable table = new TwoDimTable(
            "Status of Neuron Layers",
            (!get_params()._autoencoder ? ("predicting " + data_info._adaptedFrame.lastVecName() + ", ") : "") +
                    (get_params()._autoencoder ? "auto-encoder" :
                            _classification ? (units[units.length - 1] + "-class classification") : "regression")
                    + ", " + get_params()._distribution + " distribution, " + get_params()._loss + " loss, "
                    + String.format("%,d", size()) + " weights/biases, " + PrettyPrint.bytes(byte_size) + ", "
                    + String.format("%,d", get_processed_global()) + " training samples, "
                    + "mini-batch size " + String.format("%,d", get_params()._mini_batch_size),
            new String[neurons.length],
            new String[]{"Layer", "Units", "Type", "Dropout", "L1", "L2",
                    "Mean Rate", "Rate RMS", "Momentum",
                    "Mean Weight", "Weight RMS",
                    "Mean Bias", "Bias RMS"
            },
            new String[]{"int", "int", "string", "double", "double", "double",
                    "double", "double", "double",
                    "double", "double",
                    "double", "double"
            },
            new String[]{"%d", "%d", "%s", "%2.2f %%", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f", "%5f"},
            "");

    for (int i = 0; i < neurons.length; ++i) {
      table.set(i, 0, i + 1);
      table.set(i, 1, neurons[i].units);
      table.set(i, 2, neurons[i].getClass().getSimpleName());

      if (i == 0) {
        table.set(i, 3, neurons[i].params._input_dropout_ratio * 100);
        continue;
      } else if (i < neurons.length - 1) {
        if (neurons[i].params._hidden_dropout_ratios == null) {
          table.set(i, 3, 0);
        } else {
          table.set(i, 3, neurons[i].params._hidden_dropout_ratios[i - 1] * 100);
        }
      }
      table.set(i, 4, neurons[i].params._l1);
      table.set(i, 5, neurons[i].params._l2);
      table.set(i, 6, (get_params()._adaptive_rate ? mean_rate[i - 1] : neurons[i].rate(get_processed_total())));
      table.set(i, 7, (get_params()._adaptive_rate ? rms_rate[i - 1] : 0));
      table.set(i, 8, get_params()._adaptive_rate ? 0 : neurons[i].momentum(get_processed_total()));
      table.set(i, 9, mean_weight[i - 1]);
      table.set(i, 10, rms_weight[i - 1]);
      table.set(i, 11, mean_bias[i - 1]);
      table.set(i, 12, rms_bias[i - 1]);
    }
    summaryTable = table;
    return summaryTable;
  }

  /**
   * Print a summary table
   * @return String containing ASCII version of summary table
   */
  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    if (!get_params()._quiet_mode) {
      if (get_params()._sparsity_beta > 0) {
        for (int k = 0; k < get_params()._hidden.length; k++) {
          sb.append("Average activation in hidden layer ").append(k).append(" is ").append(mean_a[k]).append(" \n");
        }
      }
      createSummaryTable();
      sb.append(summaryTable.toString(1));
    }
    return sb.toString();
  }

  /**
   * Debugging printout
   * @return String with useful info
   */
  public String toStringAll() {
    StringBuilder sb = new StringBuilder();
    sb.append(toString());

    for (int i = 0; i < units.length - 1; ++i)
      sb.append("\nweights[").append(i).append("][]=").append(Arrays.toString(get_weights(i).raw()));
    for (int i = 0; i < units.length - 1; ++i) {
      sb.append("\nbiases[").append(i).append("][]=").append(Arrays.toString(get_biases(i).raw()));
    }
    if (has_momenta()) {
      for (int i = 0; i < units.length - 1; ++i)
        sb.append("\nweights_momenta[").append(i).append("][]=").append(Arrays.toString(get_weights_momenta(i).raw()));
    }
    if (biases_momenta != null) {
      for (int i = 0; i < units.length - 1; ++i) {
        sb.append("\nbiases_momenta[").append(i).append("][]=").append(Arrays.toString(biases_momenta[i].raw()));
      }
    }
    sb.append("\nunits[]=").append(Arrays.toString(units));
    sb.append("\nprocessed global: ").append(get_processed_global());
    sb.append("\nprocessed local: ").append(get_processed_local());
    sb.append("\nprocessed total: ").append(get_processed_total());
    sb.append("\n");
    return sb.toString();
  }
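  // Illustrative example (assumed shapes, not from the original source): for a network with
  // 3 input features, hidden=[4] and 2 output classes (no Maxout), initializeMembers() below
  // expects user-given frames with these dimensions:
  //   initial_weights[0]: 4 rows x 3 cols,  initial_biases[0]: 4 rows x 1 col
  //   initial_weights[1]: 2 rows x 4 cols,  initial_biases[1]: 2 rows x 1 col
  // i.e. weight frame i must match get_weights(i).rows() x get_weights(i).cols().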
  /**
   * Initialize weights/biases
   */
  void initializeMembers(Key<Frame>[] initial_weights, Key<Frame>[] initial_biases) {
    randomizeWeights();
    //TODO: determine good/optimal/best initialization scheme for biases
    // hidden layers
    for (int i = 0; i < get_params()._hidden.length; ++i) {
      if (get_params()._activation == DeepLearningParameters.Activation.Rectifier
              || get_params()._activation == DeepLearningParameters.Activation.RectifierWithDropout
              || get_params()._activation == DeepLearningParameters.Activation.Maxout
              || get_params()._activation == DeepLearningParameters.Activation.MaxoutWithDropout
      ) {
//        Arrays.fill(biases[i], 1.); //old behavior
        Arrays.fill(biases[i].raw(), i == 0 ? 0.5f : 1f); //new behavior, might be slightly better
      } else if (get_params()._activation == DeepLearningParameters.Activation.Tanh
              || get_params()._activation == DeepLearningParameters.Activation.TanhWithDropout) {
        Arrays.fill(biases[i].raw(), 0f);
      }
    }
    Arrays.fill(biases[biases.length - 1].raw(), 0f); //output layer

    if (initial_weights != null || initial_biases != null) {
      Log.info("Initializing initial model state from user-given weights/biases.");
      for (int i = 0; i < get_params()._hidden.length + 1; ++i) {
        if (initial_weights[i] == null) {
          Log.info("No user-given weight matrix given for weights #" + (i + 1) + ". Initializing those weights randomly.");
          continue;
        }
        if (initial_biases[i] == null) {
          Log.info("No user-given bias vector given for biases #" + (i + 1) + ". Initializing those biases randomly.");
          continue;
        }
        Frame w = initial_weights[i].get();
        if (w == null) {
          throw new IllegalArgumentException("User-given weight matrix for weights #" + (i + 1) + " '" + initial_weights[i].toString() + "' not found.");
        }
        if (w.numRows() != get_weights(i).rows() || w.numCols() != get_weights(i).cols()) {
          throw new IllegalArgumentException("Dimensionality mismatch: initial_weights matrix #" + i
                  + " should have " + get_weights(i).rows() + " rows and " + get_weights(i).cols()
                  + " columns, but has " + w.numRows() + " rows and " + w.numCols() + " columns.");
        }
        Frame b = initial_biases[i].get();
        if (b == null) {
          throw new IllegalArgumentException("User-given bias vector for biases #" + (i + 1) + " '" + initial_biases[i].toString() + "' not found.");
        }
        if (b.numRows() != get_biases(i).size() || b.numCols() != 1) {
          throw new IllegalArgumentException("Dimensionality mismatch: initial_biases vector #" + i
                  + " should have " + get_biases(i).size() + " rows and 1 column, but has "
                  + b.numRows() + " rows and " + b.numCols() + " column(s).");
        }
        for (int c = 0; c < w.numCols(); ++c)
          for (int r = 0; r < w.numRows(); ++r)
            get_weights(i).set(r, c, (float) w.vec(c).at(r));
        for (int r = 0; r < w.numRows(); ++r)
          get_biases(i).set(r, (float) b.vec(0).at(r));
      }
    } else {
      Log.info("Created random initial model state.");
    }
  }

  /**
   * Fill weights and biases from a pretrained autoencoder model
   * @param autoencoder Autoencoder DL model with matching inputs and hidden layers
   */
  void initializeFromPretrainedModel(DeepLearningModelInfo autoencoder) {
    assert (autoencoder.parameters._autoencoder);
    randomizeWeights();
    // now overwrite the weights with those from the pretrained model
    for (int w = 0; w < dense_row_weights.length - 1 /*skip output layer*/; ++w) {
      if (get_weights(w).rows() != autoencoder.get_weights(w).rows())
        throw new IllegalArgumentException("Mismatch between weights in pretrained model and this model: rows in layer " + w + ": "
                + autoencoder.get_weights(w).rows() + " vs " + get_weights(w).rows()
                + ". Enable ignored_const_cols for both models and/or check categorical levels for consistency.");
      if (get_weights(w).cols() != autoencoder.get_weights(w).cols())
        throw new IllegalArgumentException("Mismatch between weights in pretrained model and this model: cols in layer " + w + ": "
                + autoencoder.get_weights(w).cols() + " vs " + get_weights(w).cols()
                + ". Enable ignored_const_cols for both models and/or check categorical levels for consistency.");
      for (int i = 0; i < get_weights(w).rows(); i++) {
        for (int j = 0; j < get_weights(w).cols(); j++) {
          get_weights(w).set(i, j, autoencoder.get_weights(w).get(i, j));
        }
      }
    }
    for (int i = 0; i < get_params()._hidden.length; ++i) {
      for (int j = 0; j < biases[i].raw().length; ++j) {
        biases[i].set(j, autoencoder.biases[i].get(j));
      }
    }
    Arrays.fill(biases[biases.length - 1].raw(), 0f); //output layer
  }
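  // Editorial note (reasoning not stated in the original source): the copy loop above
  // deliberately stops before the last weight matrix - the autoencoder's final layer
  // reconstructs the inputs, so its weights have the wrong shape and meaning for the
  // supervised output layer, which is instead left randomly initialized with zeroed biases.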
  /**
   * Add another model info into this
   * This will add the weights/biases/learning rate helpers, and the number of processed training samples
   * Note: It will NOT add the elastic averaging helpers, which are always kept constant (they already are the result of a reduction)
   * @param other Other DeepLearningModelInfo to add into this one
   */
  public void add(DeepLearningModelInfo other) {
    for (int i = 0; i < dense_row_weights.length; ++i)
      ArrayUtils.add(get_weights(i).raw(), other.get_weights(i).raw());
    for (int i = 0; i < biases.length; ++i)
      ArrayUtils.add(biases[i].raw(), other.biases[i].raw());
    if (avg_activations != null)
      for (int i = 0; i < avg_activations.length; ++i)
        ArrayUtils.add(avg_activations[i].raw(), other.avg_activations[i].raw());
    if (has_momenta()) {
      assert (other.has_momenta());
      for (int i = 0; i < dense_row_weights_momenta.length; ++i)
        ArrayUtils.add(get_weights_momenta(i).raw(), other.get_weights_momenta(i).raw());
      for (int i = 0; i < biases_momenta.length; ++i)
        ArrayUtils.add(biases_momenta[i].raw(), other.biases_momenta[i].raw());
    }
    if (adaDelta()) {
      assert (other.adaDelta());
      for (int i = 0; i < dense_row_ada_dx_g.length; ++i) {
        ArrayUtils.add(get_ada_dx_g(i).raw(), other.get_ada_dx_g(i).raw());
      }
    }
    add_processed_local(other.get_processed_local());
  }

  /**
   * Multiply all weights/biases by a real-valued number
   * @param N multiplication factor
   */
  protected void mult(double N) {
    div(1 / N);
  }

  /**
   * Divide all weights/biases by a real-valued number
   * @param N divisor
   */
  protected void div(double N) {
    for (int i = 0; i < dense_row_weights.length; ++i)
      ArrayUtils.div(get_weights(i).raw(), (float) N);
    for (Storage.Vector bias : biases) ArrayUtils.div(bias.raw(), N);
    if (avg_activations != null)
      for (Storage.Vector avgac : avg_activations)
        ArrayUtils.div(avgac.raw(), N);
    if (has_momenta()) {
      for (int i = 0; i < dense_row_weights_momenta.length; ++i)
        ArrayUtils.div(get_weights_momenta(i).raw(), (float) N);
      for (Storage.Vector bias_momenta : biases_momenta) ArrayUtils.div(bias_momenta.raw(), N);
    }
    if (adaDelta()) {
      for (int i = 0; i < dense_row_ada_dx_g.length; ++i) {
        ArrayUtils.div(get_ada_dx_g(i).raw(), (float) N);
      }
    }
  }

  double uniformDist(Random rand, double min, double max) {
    return min + rand.nextFloat() * (max - min);
  }

  /**
   * Initialization of neural net weights
   * cf. http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
   */
  private void randomizeWeights() {
    for (int w = 0; w < dense_row_weights.length; ++w) {
      final Random rng = water.util.RandomUtils.getRNG(get_params()._seed + 0xBAD5EED + w + 1); //to match NeuralNet behavior
      final double range = Math.sqrt(6. / (units[w] + units[w + 1]));
      for (int i = 0; i < get_weights(w).rows(); i++) {
        for (int j = 0; j < get_weights(w).cols(); j++) {
          if (get_params()._initial_weight_distribution == DeepLearningParameters.InitialWeightDistribution.UniformAdaptive) {
            // cf. http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
            if (w == dense_row_weights.length - 1 && _classification)
              get_weights(w).set(i, j, (float) (4. * uniformDist(rng, -range, range))); //Softmax might need an extra factor 4, since it's like a sigmoid
            else
              get_weights(w).set(i, j, (float) uniformDist(rng, -range, range));
          } else if (get_params()._initial_weight_distribution == DeepLearningParameters.InitialWeightDistribution.Uniform) {
            get_weights(w).set(i, j, (float) uniformDist(rng, -get_params()._initial_weight_scale, get_params()._initial_weight_scale));
          } else if (get_params()._initial_weight_distribution == DeepLearningParameters.InitialWeightDistribution.Normal) {
            get_weights(w).set(i, j, (float) (rng.nextGaussian() * get_params()._initial_weight_scale));
          }
        }
      }
    }
  }

  // TODO: Add "subset randomize" function
//  int count = Math.min(15, _previous.units);
//  double min = -.1f, max = +.1f;
//  //double min = -1f, max = +1f;
//  for( int o = 0; o < units; o++ ) {
//    for( int n = 0; n < count; n++ ) {
//      int i = rand.nextInt(_previous.units);
//      int w = o * _previous.units + i;
//      _w[w] = uniformDist(rand, min, max);
//    }
//  }
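  // Illustrative example (assumed numbers, not from the original source): for a layer with
  // fan-in units[w] = 100 and fan-out units[w+1] = 50, the UniformAdaptive (Glorot) range
  // used in randomizeWeights() above is
  //   range = sqrt(6 / (100 + 50)) = sqrt(0.04) = 0.2,
  // so weights are drawn uniformly from [-0.2, 0.2]; a final softmax layer widens this to
  // [-0.8, 0.8] via the extra factor of 4.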
  /**
   * Compute Variable Importance, based on
   * GEDEON: DATA MINING OF INPUTS: ANALYSING MAGNITUDE AND FUNCTIONAL MEASURES
   *
   * @return variable importances for input features
   */
  public float[] computeVariableImportances() {
    float[] vi = new float[units[0]];
    Arrays.fill(vi, 0f);

    float[][] Qik = new float[units[0]][units[2]]; //importance of input i on output k
    float[] sum_wj = new float[units[1]]; //sum of incoming weights into first hidden layer
    float[] sum_wk = new float[units[2]]; //sum of incoming weights into output layer (or second hidden layer)
    for (float[] Qi : Qik) Arrays.fill(Qi, 0f);
    Arrays.fill(sum_wj, 0f);
    Arrays.fill(sum_wk, 0f);

    // compute sum of absolute incoming weights
    for (int j = 0; j < units[1]; j++) {
      for (int i = 0; i < units[0]; i++) {
        float wij = get_weights(0).get(j, i);
        sum_wj[j] += Math.abs(wij);
      }
    }
    for (int k = 0; k < units[2]; k++) {
      for (int j = 0; j < units[1]; j++) {
        float wjk = get_weights(1).get(k, j);
        sum_wk[k] += Math.abs(wjk);
      }
    }
    // compute importance of input i on output k as product of connecting weights going through j
    for (int i = 0; i < units[0]; i++) {
      for (int k = 0; k < units[2]; k++) {
        for (int j = 0; j < units[1]; j++) {
          float wij = get_weights(0).get(j, i);
          float wjk = get_weights(1).get(k, j);
          //Qik[i][k] += Math.abs(wij)/sum_wj[j] * wjk; //Wong,Gedeon,Taggart '95
          Qik[i][k] += Math.abs(wij) / sum_wj[j] * Math.abs(wjk) / sum_wk[k]; //Gedeon '97
        }
      }
    }
    // normalize Qik over all outputs k
    for (int k = 0; k < units[2]; k++) {
      float sumQk = 0;
      for (int i = 0; i < units[0]; i++) sumQk += Qik[i][k];
      for (int i = 0; i < units[0]; i++) Qik[i][k] /= sumQk;
    }
    // importance for feature i is the sum over k of i->k importances
    for (int i = 0; i < units[0]; i++) vi[i] = ArrayUtils.sum(Qik[i]);

    //normalize importances such that max(vi) = 1
    ArrayUtils.div(vi, ArrayUtils.maxValue(vi));

    // zero out missing categorical variables if they were never seen
    if (_saw_missing_cats != null) {
      for (int i = 0; i < _saw_missing_cats.length; ++i) {
        assert (data_info._catMissing[i]); //have a missing bucket for each categorical
        if (!_saw_missing_cats[i]) vi[data_info._catOffsets[i + 1] - 1] = 0;
      }
    }
    return vi;
  }
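  // Editorial note (restating the Gedeon '97 measure computed above, not original text):
  // the importance of input i on output k aggregates the normalized absolute weights along
  // every path through hidden unit j,
  //   Q_ik = sum_j ( |w_ij| / sum_i' |w_i'j| ) * ( |w_jk| / sum_j' |w_j'k| ),
  // and only the first two weight layers enter the measure, so for deeper nets this acts
  // as a first-hidden-layer approximation of importance.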
  /**
   * Compute statistics about this model on all nodes
   */
  public void computeStats() {
    float[][] rate = get_params()._adaptive_rate ? new float[units.length - 1][] : null;
    if (get_params()._autoencoder && get_params()._sparsity_beta > 0) {
      for (int k = 0; k < get_params()._hidden.length; k++) {
        mean_a[k] = 0;
        for (int j = 0; j < avg_activations[k].size(); j++)
          mean_a[k] += avg_activations[k].get(j);
        mean_a[k] /= avg_activations[k].size();
      }
    }

    for (int y = 0; y < units.length - 1; y++) {
      mean_rate[y] = rms_rate[y] = 0;
      mean_bias[y] = rms_bias[y] = 0;
      mean_weight[y] = rms_weight[y] = 0;
      for (int u = 0; u < biases[y].size(); u++) {
        mean_bias[y] += biases[y].get(u);
      }
      if (rate != null) rate[y] = new float[get_weights(y).raw().length];
      for (int u = 0; u < get_weights(y).raw().length; u++) {
        mean_weight[y] += get_weights(y).raw()[u];
        if (rate != null) {
//          final float RMS_dx = (float)Math.sqrt(ada[y][2*u]+(float)get_params().epsilon);
//          final float invRMS_g = (float)(1/Math.sqrt(ada[y][2*u+1]+(float)get_params().epsilon));
          final float RMS_dx = MathUtils.approxSqrt(get_ada_dx_g(y).raw()[2 * u] + (float) get_params()._epsilon);
          final float invRMS_g = MathUtils.approxInvSqrt(get_ada_dx_g(y).raw()[2 * u + 1] + (float) get_params()._epsilon);
          rate[y][u] = RMS_dx * invRMS_g; //not exactly right, RMS_dx should be from the previous time step -> but close enough for diagnostics.
          mean_rate[y] += rate[y][u];
        }
      }

      mean_bias[y] /= biases[y].size();
      mean_weight[y] /= get_weights(y).size();
      if (rate != null) mean_rate[y] /= rate[y].length;

      for (int u = 0; u < biases[y].size(); u++) {
        final double db = biases[y].get(u) - mean_bias[y];
        rms_bias[y] += db * db;
      }
      for (int u = 0; u < get_weights(y).size(); u++) {
        final double dw = get_weights(y).raw()[u] - mean_weight[y];
        rms_weight[y] += dw * dw;
        if (rate != null) {
          final double drate = rate[y][u] - mean_rate[y];
          rms_rate[y] += drate * drate;
        }
      }
      rms_bias[y] = MathUtils.approxSqrt(rms_bias[y] / biases[y].size());
      rms_weight[y] = MathUtils.approxSqrt(rms_weight[y] / get_weights(y).size());
      if (rate != null) rms_rate[y] = MathUtils.approxSqrt(rms_rate[y] / rate[y].length);
//      rms_bias[y] = (float)Math.sqrt(rms_bias[y]/biases[y].length);
//      rms_weight[y] = (float)Math.sqrt(rms_weight[y]/weights[y].length);
//      if (rate != null) rms_rate[y] = (float)Math.sqrt(rms_rate[y]/rate[y].length);

      // Abort the run if weights or biases are unreasonably large (Note that all input values are normalized upfront)
      // This can happen with Rectifier units when L1/L2/max_w2 are all set to 0, especially when using more than 1 hidden layer.
      final double thresh = 1e10;
      final double bthresh = 1e5;
      unstable |= isNaN(mean_bias[y]) || isNaN(rms_bias[y]) || isNaN(mean_weight[y]) || isNaN(rms_weight[y])
              // large weights
              || Math.abs(mean_weight[y]) > thresh || rms_weight[y] > thresh
              // large biases
              || Math.abs(mean_bias[y]) > bthresh || rms_bias[y] > bthresh;
    }
  }

  /**
   * Unique identifier for this model's state, based on raw numbers
   */
  protected long checksum_impl() {
    computeStats();
    Random rng = new Random(0xDECAFBBB);
    double cs = Double.longBitsToDouble(get_params()._seed);
    cs += size() * get_processed_total();
    for (double d : mean_bias) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : rms_bias) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : mean_weight) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : rms_weight) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : mean_rate) cs += (rng.nextDouble() * (d + 123.23));
    for (double d : rms_rate) cs += (rng.nextDouble() * (d + 123.23));
    return Double.doubleToRawLongBits(cs);
  }

  /**
   * TimeAveraging as part of Elastic Averaging Algorithm
   * Cf. equation 6 of arXiv:1412.6651v5
   * @param nodeAverageModel current average of per-node models
   * @return Time-average of node-averages (consensus model, "the" model)
   */
  public static DeepLearningModelInfo timeAverage(DeepLearningModelInfo nodeAverageModel) {
    float pa = (float) nodeAverageModel.get_params()._elastic_averaging_moving_rate;
    assert (pa > 0 && pa <= 1);
    DeepLearningModelInfo elasticAverage = DKV.getGet(nodeAverageModel.elasticAverageModelInfoKey()); //get latest version from DKV
    if (elasticAverage == null || pa == 1) {
      elasticAverage = IcedUtils.deepCopy(nodeAverageModel);
    } else {
      nodeAverageModel.mult(pa);
      elasticAverage.mult(1 - pa);
      elasticAverage.add(nodeAverageModel); //ignore processed local value set here
      elasticAverage.set_processed_global(nodeAverageModel.get_processed_global());
    }
    elasticAverage.set_processed_local(0);
    DKV.put(elasticAverage.elasticAverageModelInfoKey(), elasticAverage);

//    nodeAverageModel.computeStats();
//    elasticAverage.computeStats();
//    Log.info("Local Model :\n" + nodeAverageModel.toString());
//    Log.info("Elastic Average:\n" + elasticAverage.toString());
    return elasticAverage;
  }
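  // Editorial note (restating the update implemented above, not original text): with moving
  // rate pa, timeAverage() computes the exponential moving average
  //   elasticAverage <- (1 - pa) * elasticAverage + pa * nodeAverageModel,
  // expressed via mult()/add() because mult(x) is implemented as div(1/x); cf. equation 6
  // of arXiv:1412.6651v5.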
  public Key localModelInfoKey(H2ONode node) {
    return Key.make(_model_id + ".node" + node.index(), (byte) 1 /*replica factor*/, (byte) 31 /*hidden user-key*/, true, node);
  }

  public Key elasticAverageModelInfoKey() {
    return Key.make(_model_id + ".elasticaverage", (byte) 1 /*replica factor*/, (byte) 31 /*hidden user-key*/, true, H2O.CLOUD._memary[0]);
  }

  static public class GradientCheck {
    GradientCheck(int l, int r, int c) { layer = l; row = r; col = c; gradient = 0; }
    int layer;
    int row;
    int col;
    double gradient;
    void apply(int l, int r, int c, double g) {
      if (r == row && c == col && l == layer) {
        gradient += g;
      }
    }
  }
  static public GradientCheck gradientCheck = null;
  static public GradientCheck gradientCheckBias = null;
}