Neurons.java example

Explorer
h2o-3-master
package hex.deeplearning;

import hex.DataInfo;
import hex.Distribution;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import water.H2O;
import water.MemoryManager;
import water.util.ArrayUtils;
import water.util.MathUtils;

import java.nio.ByteBuffer;
import java.util.Arrays;

/**
 * This class implements the concept of a Neuron layer in a Neural Network
 * During training, every MRTask F/J thread is expected to create these neurons for every map call (Cheap to make).
 * These Neurons are NOT sent over the wire.
 * The weights connecting the neurons are in a separate class (DeepLearningModel.DeepLearningModelInfo), and will be shared per node.
 */
public abstract class Neurons {
  short _k; //number of parallel channels
  int[/*minibatch*/][/*k*/] _maxIncoming; //index of largest incoming signal (out of k channels)

  Distribution _dist;
  protected int units;

  /**
   * Constructor of a Neuron Layer
   * @param units How many neurons are in this layer?
   */
  Neurons(int units) {
    this.units = units;
  }

  /**
   * Print the status of this neuron layer
   * @return populated String
   */
  @Override
  public String toString() {
    String s = this.getClass().getSimpleName();
    s += "\nNumber of Neurons: " + units;
    s += "\nParameters:\n" + params.toString();
    if (_dropout != null) s += "\nDropout:\n" + _dropout.toString();
    return s;
  }

  /**
   * Parameters (deep-cloned() from the user input, can be modified here, e.g. learning rate decay)
   */
  protected transient DeepLearningParameters params;
  protected transient int _index; //which hidden layer it is

  /**
   * Layer state (one per neuron): activity, error
   */
  public transient Storage.DenseVector[] _origa;
  public transient Storage.DenseVector[] _a;
  public transient Storage.DenseVector[] _e;

  /**
   * References for feed-forward connectivity
   */
  public Neurons _previous;
  public Neurons _input;
  DeepLearningModelInfo _minfo; //reference to shared model info
  public Storage.DenseRowMatrix _w;
  public Storage.DenseRowMatrix _wEA; //weights for elastic averaging
  public Storage.DenseVector _b;
  public Storage.DenseVector _bEA; //bias for elastic averaging

  /**
   * References for momentum training
   */
  Storage.DenseRowMatrix _wm;
  Storage.DenseVector _bm;

  /**
   * References for ADADELTA
   */
  Storage.DenseRowMatrix _ada_dx_g;
  Storage.DenseVector _bias_ada_dx_g;

  /**
   * For Dropout training
   */
  protected Dropout _dropout;

  /**
   * Helper to shortcut bprop
   */
  private boolean _shortcut = false;

  public Storage.DenseVector _avg_a;

  /**
   * Helper to check sanity of Neuron layers
   * @param training whether training or testing is done
   */
  void sanityCheck(boolean training) {
    if (this instanceof Input) {
      assert(_previous == null);
    } else {
      assert(_previous != null);
      if (_minfo.has_momenta()) {
        assert(_wm != null);
        assert(_bm != null);
        assert(_ada_dx_g == null);
      }
      if (_minfo.adaDelta()) {
        if (params._rho == 0) throw new IllegalArgumentException("rho must be > 0 if epsilon is >0.");
        if (params._epsilon == 0) throw new IllegalArgumentException("epsilon must be > 0 if rho is >0.");
        assert(_minfo.adaDelta());
        assert(_bias_ada_dx_g != null);
        assert(_wm == null);
        assert(_bm == null);
      }
      if (this instanceof MaxoutDropout || this instanceof TanhDropout || this instanceof RectifierDropout) {
        assert (!training || _dropout != null);
      }
    }
  }

  /**
   * Initialization of the parameters and connectivity of a Neuron layer
   * @param neurons Array of all neuron layers, to establish feed-forward connectivity
   * @param index Which layer am I?
   * @param p User-given parameters (Job parental object hierarchy is not used)
   * @param minfo Model information (weights/biases and their momenta)
   * @param training Whether training is done or just testing (no need for dropout)
   */
  public final void init(Neurons[] neurons, int index, DeepLearningParameters p, final DeepLearningModelInfo minfo, boolean training) {
    _index = index-1;
    params = (DeepLearningParameters)p.clone();
    params._hidden_dropout_ratios = minfo.get_params()._hidden_dropout_ratios;
    params._rate *= Math.pow(params._rate_decay, index-1);
    params._distribution = minfo.get_params()._distribution;
    _dist = new Distribution(params);
    _a = new Storage.DenseVector[params._mini_batch_size];
    for (int mb=0;mb<_a.length;++mb) _a[mb] = new Storage.DenseVector(units);
    if (!(this instanceof Input)) {
      _e = new Storage.DenseVector[params._mini_batch_size];
      for (int mb=0;mb<_e.length;++mb) _e[mb] = new Storage.DenseVector(units);
    } else if (params._autoencoder && params._input_dropout_ratio > 0) {
      _origa = new Storage.DenseVector[params._mini_batch_size];
      for (int mb=0;mb<_origa.length;++mb) _origa[mb] = new Storage.DenseVector(units);
    }
    if (training && (this instanceof MaxoutDropout || this instanceof TanhDropout
            || this instanceof RectifierDropout || this instanceof ExpRectifierDropout || this instanceof Input) ) {
      _dropout = this instanceof Input ?
              (params._input_dropout_ratio==0 ? null : new Dropout(units, params._input_dropout_ratio)) //input dropout
              : new Dropout(units, params._hidden_dropout_ratios[_index]); //hidden dropout
    }
    if (!(this instanceof Input)) {
      _previous = neurons[_index]; //incoming neurons
      _minfo = minfo;
      _w = minfo.get_weights(_index); //incoming weights
      _b = minfo.get_biases(_index); //bias for this layer (starting at hidden layer)
      if(params._autoencoder && params._sparsity_beta > 0 && _index < params._hidden.length) {
        _avg_a = minfo.get_avg_activations(_index);
      }
      if (minfo.has_momenta()) {
        _wm = minfo.get_weights_momenta(_index); //incoming weights
        _bm = minfo.get_biases_momenta(_index); //bias for this layer (starting at hidden layer)
      }
      if (minfo.adaDelta()) {
        _ada_dx_g = minfo.get_ada_dx_g(_index);
        _bias_ada_dx_g = minfo.get_biases_ada_dx_g(_index);
      }
      _shortcut = (params._fast_mode || (
              // not doing fast mode, but also don't have anything else to update (neither momentum nor ADADELTA history), and no L1/L2
              !params._adaptive_rate && !_minfo.has_momenta() && params._l1 == 0.0 && params._l2 == 0.0));
    }
    sanityCheck(training);
  }

  /**
   * Forward propagation
   * @param seed For seeding the RNG inside (for dropout)
   * @param training Whether training is done or just testing (no need for dropout)
   * @param n number of actually trained samples in this mini-batch
   */
  protected abstract void fprop(long seed, boolean training, int n);

  /**
   *  Back propagation of error terms stored in _e (for non-final layers)
   */
  protected abstract void bprop(int n);

  /**
   * Back-propagate gradient in output layer
   */
  final protected void bpropOutputLayer(int n) {
    assert(_index == params._hidden.length);
    assert(_a.length == params._mini_batch_size);

    final int rows = _a[0].size();
    float m = _minfo.adaDelta() ? 0 : momentum();
    float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
    for( int row = 0; row < rows; row++ ) {
      double[] g = new double[n];
      for (int mb=0;mb<n;++mb)
        g[mb]=_e[mb].raw()[row];
      bprop(row, g, r, m, n);
    }
  }

  /**
   * Accumulation of reconstruction errors for a generic Neurons class
   * (This is only used for AutoEncoders)
   */
  protected void setOutputLayerGradient(double ignored, int mb, int n) {
    assert (_minfo.get_params()._autoencoder && _index == _minfo.get_params()._hidden.length);
    final int rows = _a[mb].size();
    for (int row = 0; row < rows; row++)
      _e[mb].set(row, autoEncoderGradient(row, mb)/n);
  }

  /**
   * Backpropagation: w -= rate * dE/dw, where dE/dw = dE/dy * dy/dnet * dnet/dw
   * This method adds the dnet/dw = activation term per unit
   * @param row row index (update weights feeding to this neuron)
   * @param partial_grad partial derivative dE/dnet = dE/dy * dy/net
   * @param rate learning rate
   * @param momentum momentum factor (needed only if ADADELTA isn't used)
   * @param n Actual mini-batch size
   */
  final void bprop(final int row, final double[/*actual mini-batch size*/] partial_grad, final float rate, final float momentum, int n) {
    final float rho = (float)params._rho;
    final float eps = (float)params._epsilon;
    final float l1 = (float)params._l1;
    final float l2 = (float)params._l2;
    final float max_w2 = params._max_w2;
    final boolean have_momenta = _minfo.has_momenta();
    final boolean have_ada = _minfo.adaDelta();
    final boolean nesterov = params._nesterov_accelerated_gradient;
    final boolean fast_mode = params._fast_mode;
    final int cols = _previous._a[0].size();
    assert(partial_grad.length == n);

    double avg_grad2 = 0;

    final int idx = row * cols;

    for( int mb = 0; mb < n; mb++ ) {
      if (_shortcut && partial_grad[mb] == 0f) return;
      final boolean update_prev = _previous._e != null && _previous._e[mb] != null;
      for( int col = 0; col < cols; col++ ) {
        int w = idx + col;

        // for Maxout, return the "winning" linear index into the matrix
        if (_k != 0) w = _k * w + _maxIncoming[mb][row];

        final double weight = _w.raw()[w];
        if( update_prev ) _previous._e[mb].add(col, partial_grad[mb] * weight); // propagate the error dE/dnet to the previous layer, via connecting weights
        final double previous_a = _previous._a[mb].get(col);
        if (fast_mode && previous_a == 0) continue;

        //this is the actual gradient dE/dw
        double grad = partial_grad[mb] * previous_a + Math.signum(weight) * l1 + weight * l2;
        if (_wEA !=null) {
          grad += params._elastic_averaging_regularization * (_w.raw()[w] -_wEA.raw()[w]);
//        Log.info("weight: my: " + _w.raw()[w] + ", consensus: " + _wEA.raw()[w] + ", delta: " + (_w.raw()[w] -_wEA.raw()[w]) + ", relative delta: " + (_w.raw()[w] -_wEA.raw()[w])/_w.raw()[w]);
        }

        // store the gradient
        if (DeepLearningModelInfo.gradientCheck != null)
          DeepLearningModelInfo.gradientCheck.apply(_index, row, col, grad);

        if (have_ada) {
          final double grad2 = grad*grad;
          avg_grad2 += grad2;
          float brate = computeAdaDeltaRateForWeight(grad, w, _ada_dx_g, rho, eps);
          _w.raw()[w] -= brate * grad;
        } else {
          if (!nesterov) {
            final double delta = -rate * grad;
            _w.raw()[w] += delta;
            if( have_momenta ) {
              _w.raw()[w] += momentum * _wm.raw()[w];
              _wm.raw()[w] = (float)delta;
            }
          } else {
            double tmp = -grad;
            if( have_momenta ) {
              _wm.raw()[w] *= momentum;
              _wm.raw()[w] += tmp;
              tmp = _wm.raw()[w];
            }
            _w.raw()[w] += rate * tmp;
          }
        }
      }
    }
    if (max_w2 != Float.POSITIVE_INFINITY)
      for( int mb = 0; mb < n; mb++ )
        rescale_weights(_w, row, max_w2, mb);
    if (have_ada) avg_grad2 /= cols * n;
    for( int mb = 0; mb < n; mb++ ) {
      update_bias(_b, _bEA, _bm, row, partial_grad, avg_grad2, rate, momentum, mb);
    }
  }

  private void rescale_weights(final Storage.DenseRowMatrix w, final int row, final float max_w2, int mb) {
    final int cols = _previous._a[0].size();
    int start;
    int end;
    if (_k != 0) {
      start = _k * (row*cols           ) + _maxIncoming[mb][row];
      end =   _k * (row*cols + (cols-1)) + _maxIncoming[mb][row];
    } else {
      if (mb>0) return; //already done rescaling for mb=0
      start = row * cols;
      end =   row * cols + cols;
    }
    float r2 = MathUtils.sumSquares(w.raw(), start, end);
//    float r2 = MathUtils.approxSumSquares(w.raw(), idx, idx + cols);
    if( r2 > max_w2) {
      final float scale = MathUtils.approxSqrt(max_w2 / r2);
      for( int c = start; c < end; c++ )
        w.raw()[c] *= scale;
    }
  }

  /**
   * Helper to compute the reconstruction error for auto-encoders (part of the gradient computation)
   * @param row neuron index
   * @param mb minibatch-internal index
   * @return difference between the output (auto-encoder output layer activation) and the target (input layer activation)
   */
  protected double autoEncoderGradient(int row, int mb) {
    assert (_minfo.get_params()._autoencoder && _index == _minfo.get_params()._hidden.length);
    final double t = _input._origa != null ? _input._origa[mb].get(row) : _input._a[mb].get(row);
    final double y = _a[mb].get(row);
    return -2*_dist.negHalfGradient(t, y);
  }

  /**
   * Compute learning rate with AdaDelta, specialized for DenseRowMatrix
   * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
   * @param grad gradient
   * @param w neuron index
   * @param ada_dx_g Matrix holding helper values (2 floats per weight)
   * @param rho hyper-parameter #1
   * @param eps hyper-parameter #2
   * @return learning rate
   */
  private static float computeAdaDeltaRateForWeight(final double grad, final int w,
                                                  final Storage.DenseRowMatrix ada_dx_g,
                                                  final float rho, final float eps) {
    final double grad2 = grad*grad;
    ada_dx_g.raw()[2*w+1] = (float)(rho * ada_dx_g.raw()[2*w+1] + (1 - rho) * grad2);
    final float rate = MathUtils.approxSqrt((ada_dx_g.raw()[2 * w] + eps) / (ada_dx_g.raw()[2 * w + 1] + eps));
    ada_dx_g.raw()[2*w  ] = (float)(rho * ada_dx_g.raw()[2*w]   + (1 - rho) * rate * rate * grad2);
    return rate;
  }

  /**
   * Compute learning rate with AdaDelta, specialized for DenseVector (Bias)
   * @param grad2 squared gradient
   * @param row neuron index
   * @param bias_ada_dx_g Matrix holding helper values (2 floats per weight)
   * @param rho hyper-parameter #1
   * @param eps hyper-parameter #2
   * @return learning rate
   */
  private static double computeAdaDeltaRateForBias(final double grad2, final int row,
                                                  final Storage.DenseVector bias_ada_dx_g,
                                                  final float rho, final float eps) {
    bias_ada_dx_g.raw()[2*row+1] = rho * bias_ada_dx_g.raw()[2*row+1] + (1f - rho) * grad2;
    final double rate = MathUtils.approxSqrt((bias_ada_dx_g.raw()[2 * row] + eps) / (bias_ada_dx_g.raw()[2 * row + 1] + eps));
    bias_ada_dx_g.raw()[2*row]   = rho * bias_ada_dx_g.raw()[2*row  ] + (1f - rho) * rate * rate * grad2;
    return rate;
  }

  /**
   * Helper to enforce learning rule to satisfy sparsity constraint:
   * Computes the (rolling) average activation for each (hidden) neuron.
   */
  void compute_sparsity() {
    if (_avg_a != null) {
      if (params._mini_batch_size > 1) throw H2O.unimpl("Sparsity constraint is not yet implemented for mini-batch size > 1.");
      for (int mb = 0; mb < _minfo.get_params()._mini_batch_size; ++mb) {
        for (int row = 0; row < _avg_a.size(); row++) {
          _avg_a.set(row, 0.999 * (_avg_a.get(row)) + 0.001 * (_a[mb].get(row))); //TODO: fix for mini-batch size > 1
        }
      }
    }
  }

  /**
   * Helper to update the bias values
   * @param _b bias vector
   * @param _bEA elastic average bias vector
   * @param _bm bias momentum vector
   * @param row index of the neuron for which we back-propagate
   * @param partial_grad partial derivative dE/dnet = dE/dy * dy/net
   * @param avg_grad2 average squared gradient for this neuron's incoming weights (only for ADADELTA)
   * @param rate learning rate
   * @param momentum momentum factor (needed only if ADADELTA isn't used)
   * @param mb which mini-batch index
   */
  private void update_bias(final Storage.DenseVector _b, final Storage.DenseVector _bEA, final Storage.DenseVector _bm, final int row,
                   double[/*actual mini-batch size*/] partial_grad, final double avg_grad2, double rate, final double momentum, int mb) {
    final boolean have_momenta = _minfo.has_momenta();
    final boolean have_ada = _minfo.adaDelta();
    final float l1 = (float)params._l1;
    final float l2 = (float)params._l2;
    final int b = _k != 0 ? _k*row+_maxIncoming[mb][row] : row;
    final double bias = _b.get(b);

    partial_grad[mb] += Math.signum(bias) * l1 + bias * l2;
    if (_bEA != null) partial_grad[mb] += (bias - _bEA.get(b)) * params._elastic_averaging_regularization;

    // store the gradient
    if (DeepLearningModelInfo.gradientCheck != null)
      DeepLearningModelInfo.gradientCheck.apply(_index, row, -1, partial_grad[mb]);

    if (have_ada) {
      final float rho = (float)params._rho;
      final float eps = (float)params._epsilon;
      rate = computeAdaDeltaRateForBias(avg_grad2, b, _bias_ada_dx_g, rho, eps);
    }
    if (!params._nesterov_accelerated_gradient) {
      final double delta = -rate * partial_grad[mb];
      _b.add(b, delta);
      if (have_momenta) {
        _b.add(b, momentum * _bm.get(b));
        _bm.set(b, delta);
      }
    } else {
      double d = -partial_grad[mb];
      if (have_momenta) {
        _bm.set(b, _bm.get(b) * momentum);
        _bm.add(b, d);
        d = _bm.get(b);
      }
      _b.add(b, rate * d);
    }
    //update for sparsity constraint
    if (params._autoencoder && params._sparsity_beta > 0 && !(this instanceof Output) && !(this instanceof Input) && (_index != params._hidden.length)) {
      _b.add(b, -(rate * params._sparsity_beta * (_avg_a.raw()[b] - params._average_activation)));
    }
    if (Double.isInfinite(_b.get(b))) _minfo.setUnstable();
  }


  /**
   * The learning rate
   * @param n The number of training samples seen so far (for rate_annealing greater than 0)
   * @return Learning rate
   */
  public float rate(double n) {
    return (float)(params._rate / (1 + params._rate_annealing * n));
  }

  protected float momentum() {
    return momentum(-1);
  }
  /**
   * The momentum - real number in [0, 1)
   * Can be a linear ramp from momentum_start to momentum_stable, over momentum_ramp training samples
   * @param n The number of training samples seen so far
   * @return momentum
   */
  final public float momentum(double n) {
    double m = params._momentum_start;
    if( params._momentum_ramp > 0 ) {
      final double num = n != -1 ? _minfo.get_processed_total() : n;
      if( num >= params._momentum_ramp)
        m = params._momentum_stable;
      else
        m += (params._momentum_stable - params._momentum_start) * num / params._momentum_ramp;
    }
    return (float)m;
  }

  /**
   * Input layer of the Neural Network
   * This layer is different from other layers as it has no incoming weights,
   * but instead gets its activation values from the training points.
   */
  public static class Input extends Neurons {

    private DataInfo _dinfo; //training data

    Input(DeepLearningParameters params, int units, final DataInfo d) {
      super(units);
      _dinfo = d;
      _a = new Storage.DenseVector[params._mini_batch_size];
      for (int i=0;i<_a.length;++i) _a[i] = new Storage.DenseVector(units);
    }

    @Override protected void bprop(int n) { throw new UnsupportedOperationException(); }
    @Override protected void fprop(long seed, boolean training, int n) { throw new UnsupportedOperationException(); }

    /**
     * One of two methods to set layer input values. This one is for raw double data, e.g. for scoring
     * @param seed For seeding the RNG inside (for input dropout)
     * @param data Data (training columns and responses) to extract the training columns
     *             from to be mapped into the input neuron layer
     * @param mb Mini-Batch index (which point inside this mini-batch)
     */
    public void setInput(long seed, final double[] data, int mb) {
//      Log.info("Data: " + ArrayUtils.toString(data));
      assert(_dinfo != null);
      double [] nums = MemoryManager.malloc8d(_dinfo._nums); // a bit wasteful - reallocated each time
      int    [] cats = MemoryManager.malloc4(_dinfo._cats); // a bit wasteful - reallocated each time
      int i = 0, ncats = 0;
      for(; i < _dinfo._cats; ++i){
        assert(_dinfo._catMissing[i]); //we now *always* have a categorical level for NAs, just in case.
        if (Double.isNaN(data[i])) {
          cats[ncats] = (_dinfo._catOffsets[i+1]-1); //use the extra level for NAs made during training
        } else {
          int c = (int)data[i];

          if (_dinfo._useAllFactorLevels)
            cats[ncats] = c + _dinfo._catOffsets[i];
          else if (c!=0)
            cats[ncats] = c + _dinfo._catOffsets[i] - 1;

          // If factor level in test set was not seen by training, then turn it into an NA
          if (cats[ncats] >= _dinfo._catOffsets[i+1]) {
            cats[ncats] = (_dinfo._catOffsets[i+1]-1);
          }
        }
        ncats++;
      }
      for(;i < data.length;++i){
        double d = data[i];
        if(_dinfo._normMul != null) d = (d - _dinfo._normSub[i-_dinfo._cats])*_dinfo._normMul[i-_dinfo._cats];
        nums[i-_dinfo._cats] = d; //can be NaN for missing numerical data
      }
      setInput(seed, null, nums, ncats, cats, mb);
    }

    /**
     * The second method used to set input layer values. This one is used directly by FrameTask.processRow() and by the method above.
     * @param seed For seeding the RNG inside (for input dropout)
     * @param nums Array containing numerical values, can be NaN
     * @param numcat Number of horizontalized categorical non-zero values (i.e., those not being the first factor of a class)
     * @param cats Array of indices, the first numcat values are the input layer unit (==column) indices for the non-zero categorical values
     *             (This allows this array to be re-usable by the caller, without re-allocating each time)
     * @param mb Mini-Batch index (which point inside this mini-batch)
     */
    public void setInput(long seed, final int[] numIds, final double[] nums, final int numcat, final int[] cats, int mb) {
      Arrays.fill(_a[mb].raw(), 0f);

      // random projection from fullN down to max_categorical_features
      if (params._max_categorical_features < _dinfo.fullN() - _dinfo._nums) {
        assert(nums.length == _dinfo._nums);
        final int M = nums.length + params._max_categorical_features;
//        final boolean random_projection = false;
//        final boolean hash_trick = true;
//        if (random_projection) {
//          final int N = _dinfo.fullN();
//          assert (_a.size() == M);
//
//          // sparse random projection
//          for (int i = 0; i < M; ++i) {
//            for (int c = 0; c < numcat; ++c) {
//              int j = cats[c];
//              Random rng = RandomUtils.getRNG(params._seed + i * N + j);
//              double val = 0;
//              final float rnd = rng.nextFloat();
//              if (rnd < 1. / 6.) val = Math.sqrt(3);
//              if (rnd > 5. / 6.) val = -Math.sqrt(3);
//              _a.add(i, 1f * val);
//            }
//            Random rng = RandomUtils.getRNG(params._seed + i*N + _dinfo.numStart());
//            for (int n = 0; n < nums.length; ++n) {
//              double val = 0;
//              final float rnd = rng.nextFloat();
//              if (rnd < 1. / 6.) val = Math.sqrt(3);
//              if (rnd > 5. / 6.) val = - Math.sqrt(3);
//              _a.set(i, (Double.isNaN(nums[n]) ? 0f /*Always do MeanImputation during scoring*/ : nums[n]) * val);
//            }
//          }
//        } else if (hash_trick) {
          // Use hash trick for categorical features
          assert (_a[mb].size() == M);
          // hash N-nums.length down to M-nums.length = cM (#categorical slots - always use all numerical features)
          final int cM = params._max_categorical_features;

          assert (_a[mb].size() == M);
          MurmurHash murmur = MurmurHash.getInstance();
          for (int i = 0; i < numcat; ++i) {
            ByteBuffer buf = ByteBuffer.allocate(4);
            int hashval = murmur.hash(buf.putInt(cats[i]).array(), 4, (int)params._seed); // turn horizontalized categorical integer into another integer, based on seed
            _a[mb].add(Math.abs(hashval % cM), 1f); // restrict to limited range
          }
          for (int i = 0; i < nums.length; ++i)
            _a[mb].set(cM + i, Double.isNaN(nums[i]) ? 0f /*Always do MeanImputation during scoring*/ : nums[i]);
//        }
      } else {
        assert(_a[mb].size() == _dinfo.fullN());
        for (int i = 0; i < numcat; ++i) _a[mb].set(cats[i], 1f); // one-hot encode categoricals
        if (numIds != null) {
          //sparse
          for (int i = 0; i < numIds.length; ++i)
            _a[mb].set(numIds[i], Double.isNaN(nums[i]) ? 0f /*Always do MeanImputation during scoring*/ : nums[i]);
        } else {
          //dense
          for (int i = 0; i < nums.length; ++i)
            _a[mb].set(_dinfo.numStart() + i, Double.isNaN(nums[i]) ? 0f /*Always do MeanImputation during scoring*/ : nums[i]);
        }
      }

      // Input Dropout
      if (_dropout == null) return;
      if (params._autoencoder && params._input_dropout_ratio > 0) {
        // copy input into _origa -- needed for reconstruction error
        System.arraycopy(_a[mb].raw(), 0, _origa[mb].raw(), 0, _a[mb].raw().length);
      }
      seed += params._seed + 0x1337B4BE;
      _dropout.randomlySparsifyActivation(_a[mb], seed);
    }

  }

  /**
   * Tanh neurons - most common, most stable
   */
  public static class Tanh extends Neurons {
    public Tanh(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training, int n) {
      // TODO: implement GEMM
      for (int mb=0;mb<n;++mb)
        gemv(_a[mb], _w, _previous._a[mb], _b, _dropout != null ? _dropout.bits() : null);
      final int rows = _a[0].size();
      for (int mb=0;mb<n;++mb)
        for( int row = 0; row < rows; row++ )
          _a[mb].set(row, 1. - 2. / (1. + Math.exp(2*_a[mb].get(row)))); //evals faster than tanh(x), but is slightly less numerically stable - OK
      compute_sparsity();
    }
    // Computing partial derivative g = dE/dnet = dE/dy * dy/dnet, where dE/dy is the backpropagated error
    // dy/dnet = (1 - a^2) for y(net) = tanh(net)
    @Override protected void bprop(int n) {
      assert (_index < _minfo.get_params()._hidden.length);
      float m = _minfo.adaDelta() ? 0 : momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      final int rows = _a[0].size();
      double[] g = new double[n];
      for (int row = 0; row < rows; row++) {
        for (int mb=0;mb<n;++mb)
          g[mb] = _e[mb].get(row) * (1 - _a[mb].get(row) * _a[mb].get(row));
        bprop(row, g, r, m, n);
      }
    }
  }

  /**
   * Tanh neurons with dropout
   */
  public static class TanhDropout extends Tanh {
    public TanhDropout(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training, int n) {
      if (training) {
        seed += params._seed + 0xDA7A6000;
        _dropout.fillBytes(seed);
        super.fprop(seed, true, n);
      }
      else {
        super.fprop(seed, false, n);
        for (int mb=0;mb<n;++mb)
          ArrayUtils.mult(_a[mb].raw(), 1-params._hidden_dropout_ratios[_index]);
      }
    }
  }

  /**
   * Maxout neurons (picks the max out of the k activation_j = sum(A_ij*x_i) + b_j)
   * Requires k times the model parameters (weights/biases) as a "normal" neuron
   */
  public static class Maxout extends Neurons {

    public Maxout(DeepLearningParameters params, short k, int units) { super(units);
      _k = k;
      _maxIncoming=new int[params._mini_batch_size][];
      for (int i=0;i<_maxIncoming.length;++i) _maxIncoming[i]=new int[units];
      if (_k!=2) throw H2O.unimpl("Maxout is currently hardcoded for 2 channels. Trivial to enable k > 2 though.");
    }
    @Override protected void fprop(long seed, boolean training, int n) {
      assert(_b.size() == _a[0].size() * _k);
      assert(_w.size() == _a[0].size() * _previous._a[0].size() * _k);
      final int rows = _a[0].size();
      double[] channel = new double[_k];
      for( int row = 0; row < rows; row++ ) {
        for (int mb=0;mb<n;++mb) {
          _a[mb].set(row, 0);
          if( !training || _dropout == null || _dropout.unit_active(row) ) {
            final int cols = _previous._a[mb].size();
            // For each neuron in the previous layer, there's k channels
            // Each channel has its own weight and bias values
            // The channel leading to the highest incoming value (W*x + b) is the "winner" and will activate this neuron
            short maxK = 0;
            for( short k = 0; k < _k; k++ ) {
              channel[k] = 0;
              for( int col = 0; col < cols; col++ ) {
                channel[k] += _w.raw()[_k*(row * cols + col) + k] * _previous._a[mb].get(col);
              }
              channel[k] += _b.raw()[_k*row+k];
              if (channel[k] > channel[maxK]) maxK=k;
            }
            _maxIncoming[mb][row] = maxK;
            _a[mb].set(row, channel[maxK]);
          }
        }
        compute_sparsity();
      }
    }

    @Override protected void bprop(int n) {
      assert(_index != params._hidden.length);
      float m = _minfo.adaDelta() ? 0 : momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      double[] g = new double[n];
      final int rows = _a[0].size();
      for (int row = 0; row < rows; row++) {
        for (int mb=0;mb<n;++mb)
          g[mb] = _e[mb].get(row);
        bprop(row, g, r, m, n);
      }
    }
  }

  /**
   * Maxout neurons with dropout
   */
  public static class MaxoutDropout extends Maxout {
    public MaxoutDropout(DeepLearningParameters params, short k, int units) { super(params,k,units); }
    @Override protected void fprop(long seed, boolean training, int n) {
      if (training) {
        seed += params._seed + 0x51C8D00D;
        _dropout.fillBytes(seed);
        super.fprop(seed, true, n);
      }
      else {
        super.fprop(seed, false, n);
        for (int mb=0;mb<n;++mb)
          ArrayUtils.mult(_a[mb].raw(), 1-params._hidden_dropout_ratios[_index]);
      }
    }
  }

  /**
   * Rectifier linear unit (ReLU) neurons
   */
  public static class Rectifier extends Neurons {
    public Rectifier(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training, int n) {
      // TODO: implement GEMM
      for (int mb=0;mb<n;++mb)
        gemv(_a[mb], _w, _previous._a[mb], _b, _dropout != null ? _dropout.bits() : null);
      final int rows = _a[0].size();
      for (int mb=0;mb<n;++mb) {
        for( int row = 0; row < rows; row++ ) {
          _a[mb].set(row, 0.5f* (_a[mb].get(row) + Math.abs(_a[mb].get(row)))); //faster than max(a, 0)
//        _a.set(row, Math.max(_a.get(row), 0f));
        }
      }
      compute_sparsity();
    }

    @Override protected void bprop(int n) {
      assert (_index < _minfo.get_params()._hidden.length);
      float m = _minfo.adaDelta() ? 0 : momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      final int rows = _a[0].size();
      double[] g = new double[n];
      for (int row = 0; row < rows; row++) {
        for (int mb=0;mb<n;++mb)
          //(d/dx)(max(0,x)) = 1 if x > 0, otherwise 0
          g[mb] = _a[mb].get(row) > 0f ? _e[mb].get(row) : 0f;
        bprop(row, g, r, m, n);
      }
    }
  }

  /**
   * Rectifier linear unit (ReLU) neurons with dropout
   */
  public static class RectifierDropout extends Rectifier {
    public RectifierDropout(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training, int n) {
      if (training) {
        seed += params._seed + 0x3C71F1ED;
        _dropout.fillBytes(seed);
        super.fprop(seed, true, n);
      }
      else {
        super.fprop(seed, false, n);
        for (int mb=0;mb<n;++mb)
          ArrayUtils.mult(_a[mb].raw(), 1-params._hidden_dropout_ratios[_index]);
      }
    }
  }

  public static class ExpRectifier extends Neurons {
    public ExpRectifier(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training, int n) {
      for (int mb=0;mb<n;++mb)
        gemv(_a[mb], _w, _previous._a[mb], _b, _dropout != null ? _dropout.bits() : null);
      final int rows = _a[0].size();
      for( int row = 0; row < rows; row++ ) {
        for (int mb=0;mb<n;++mb) {
          double x = _a[mb].get(row);
          double val = x >= 0 ? x : Math.exp(x) - 1;
          _a[mb].set(row, val);
        }
      }
      compute_sparsity();
    }
    // Computing partial derivative g = dE/dnet = dE/dy * dy/dnet, where dE/dy is the backpropagated error
    @Override protected void bprop(int n) {
      assert (_index < _minfo.get_params()._hidden.length);
      float m = _minfo.adaDelta() ? 0 : momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      final int rows = _a[0].size();
      for (int row = 0; row < rows; row++) {
        double [] g = new double[n];
        for (int mb=0;mb<n;++mb) {
          double x = _a[mb].get(row);
          double val = x >= 0 ? 1 : Math.exp(x);
          g[mb] = _e[mb].get(row) * val;
        }
        bprop(row, g, r, m, n);
      }
    }
  }

  /**
   * Exponential Rectifier with dropout
   */
  public static class ExpRectifierDropout extends ExpRectifier {
    public ExpRectifierDropout(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training, int n) {
      if (training) {
        seed += params._seed + 0xDA7A6000;
        _dropout.fillBytes(seed);
        super.fprop(seed, true, n);
      }
      else {
        super.fprop(seed, false, n);
        for (int mb=0;mb<n;++mb)
          ArrayUtils.mult(_a[mb].raw(), 1-params._hidden_dropout_ratios[_index]);
      }
    }
  }

  /**
   * Abstract class for Output neurons
   */
  public static abstract class Output extends Neurons {
    Output(int units) { super(units); }
    protected void bprop(int n) { throw new UnsupportedOperationException(); }
  }

  /**
   * Output neurons for classification - Softmax
   */
  public static class Softmax extends Output {
    public Softmax(int units) { super(units); }
    protected void fprop(long seed, boolean training, int n) {
      for (int mb=0;mb<n;++mb)
        gemv(_a[mb], _w, _previous._a[mb], _b, null);
      for (int mb=0;mb<n;++mb) {
        final double max = ArrayUtils.maxValue(_a[mb].raw());
        double scaling = 0;
        final int rows = _a[mb].size();
        for( int row = 0; row < rows; row++ ) {
          _a[mb].set(row, Math.exp(_a[mb].get(row) - max));
          scaling += _a[mb].get(row);
        }
        for( int row = 0; row < rows; row++ ) {
          _a[mb].raw()[row] /= scaling;
        }
      }
    }

    /**
     * Part of backpropagation for classification
     * Update every weight as follows: w += -rate * dE/dw
     * Compute dE/dw via chain rule: dE/dw = dE/dy * dy/dnet * dnet/dw, where net = sum(xi*wi)+b and y = activation function
     * @param target actual class label (integer)
     */
    @Override protected void setOutputLayerGradient(double target, int mb, int n) {
      assert(target == (int)target);
      double g; //partial derivative dE/dy * dy/dnet
      final int rows = _a[mb].size();
      for( int row = 0; row < rows; row++ ) {
        final double t = (row == (int)target ? 1 : 0);
        final double y = _a[mb].get(row);
        //dy/dnet = derivative of softmax = (1-y)*y
        switch(params._loss) {
          case CrossEntropy:
            //shortcut possible -dCE/dy * dy/dnet = target - y
            g = y - t;
            break;
          case ModifiedHuber:
            g = -2*_dist.negHalfGradient(t, y) * (1 - y) * y;
            break;
          case Quadratic:
            g = (y - t) * (1f - y) * y;
            break;
          default:
            throw H2O.unimpl();
        }
        _e[mb].set(row, g/n); //minibatch normalization
      }
    }
  }

  /**
   * Output neurons for regression - Linear units
   */
  public static class Linear extends Output {
    public Linear() {
      super(1);
    }
    protected void fprop(long seed, boolean training, int n) {
      for (int mb=0;mb<n;++mb)
        gemv(_a[mb], _w, _previous._a[mb], _b, _dropout != null ? _dropout.bits() : null);
    }

    /**
     * Backpropagation for regression
     * @param target floating-point target value
     */
    @Override protected void setOutputLayerGradient(double target, int mb, int n) {
      final int row = 0;
      final double y = _a[mb].get(row);
      double g = -2*_dist.negHalfGradient(target, y);
      _e[mb].set(row, g/n); //minibatch normalization
    }
  }

  /**
   * Mat-Vec Plus Add (with optional row dropout)
   * @param res = a*x+y (pre-allocated, will be overwritten)
   * @param a matrix of size rows x cols
   * @param x vector of length cols
   * @param y vector of length rows
   * @param row_bits if not null, check bits of this byte[] to determine whether a row is used or not
   */
  static void gemv_naive(final double[] res, final float[] a, final double[] x, final double[] y, byte[] row_bits) {
    final int cols = x.length;
    final int rows = y.length;
    assert(res.length == rows);
    for(int row = 0; row<rows; row++) {
      res[row] = 0;
      if( row_bits != null && (row_bits[row / 8] & (1 << (row % 8))) == 0) continue;
      for(int col = 0; col<cols; col++)
        res[row] += a[row*cols+col] * x[col];
      res[row] += y[row];
    }
  }

  /**
   * Optimized Mat-Vec Plus Add (with optional row dropout)
   * Optimization: Partial sums can be evaluated in parallel
   * @param res = a*x+y (pre-allocated, will be overwritten)
   * @param a matrix of size rows x cols
   * @param x vector of length cols
   * @param y vector of length rows
   * @param row_bits if not null, check bits of this byte[] to determine whether a row is used or not
   */
  static void gemv_row_optimized(final double[] res, final float[] a, final double[] x, final double[] y, final byte[] row_bits) {
    final int cols = x.length;
    final int rows = y.length;
    assert(res.length == rows);
    final int extra=cols-cols%8;
    final int multiple = (cols/8)*8-1;
    int idx = 0;
    for (int row = 0; row<rows; row++) {
      res[row] = 0;
      if( row_bits == null || (row_bits[row / 8] & (1 << (row % 8))) != 0) {
        double psum0 = 0, psum1 = 0, psum2 = 0, psum3 = 0, psum4 = 0, psum5 = 0, psum6 = 0, psum7 = 0;
        for (int col = 0; col < multiple; col += 8) {
          int off = idx + col;
          psum0 += a[off    ] * x[col    ];
          psum1 += a[off + 1] * x[col + 1];
          psum2 += a[off + 2] * x[col + 2];
          psum3 += a[off + 3] * x[col + 3];
          psum4 += a[off + 4] * x[col + 4];
          psum5 += a[off + 5] * x[col + 5];
          psum6 += a[off + 6] * x[col + 6];
          psum7 += a[off + 7] * x[col + 7];
        }
        res[row] += psum0 + psum1 + psum2 + psum3;
        res[row] += psum4 + psum5 + psum6 + psum7;
        for (int col = extra; col < cols; col++)
          res[row] += a[idx + col] * x[col];
        res[row] += y[row];
      }
      idx += cols;
    }
  }

  /**
   * Helper to do a generic gemv: res = a*x + y
   * @param res Dense result
   * @param a DenseMatrix
   * @param x DenseVector
   * @param y Dense vector to add to result
   * @param row_bits Bit mask for which rows to use
   */
  static void gemv(final Storage.DenseVector res, final Storage.DenseRowMatrix a, final Storage.DenseVector x, final Storage.DenseVector y, byte[] row_bits) {
    gemv_row_optimized(res.raw(), a.raw(), x.raw(), y.raw(), row_bits);
  }

  static void gemv_naive(final Storage.DenseVector res, final Storage.DenseRowMatrix a, final Storage.DenseVector x, final Storage.DenseVector y, byte[] row_bits) {
    gemv_naive(res.raw(), a.raw(), x.raw(), y.raw(), row_bits);
  }
}