package hex;
import water.*;
import water.api.DocGen;
import water.api.Request.API;
import water.fvec.Chunk;
import water.fvec.Vec;
import water.util.Utils;
import java.util.Random;
/**
* Neural network layer.
*
* @author cypof
*/
public abstract class Layer extends Iced {
static final int API_WEAVER = 1;
public static DocGen.FieldDoc[] DOC_FIELDS;
@API(help = "Number of neurons")
@ParamsSearch.Ignore
public int units;
public NeuralNet params;
// Layer state: activity, error
protected transient float[] _a, _e;
// Shared state: weights and biases (and their momenta)
protected transient float[] _w, _wm;
protected transient float[] _b, _bm;
// Previous and input layers
protected transient Layer _previous;
transient Input _input;
// Dropout (for input + hidden layers)
transient Dropout dropout;
/**
* Start of a refactoring that separates the specification from the running state, for layers and trainers.
*/
static abstract class Training {
abstract long processed();
}
transient Training _training;
/**
* We need a way to encode a missing value in the neural net forward/back-propagation scheme.
* For simplicity and performance, we use the largest representable values to encode a missing value.
* If one of these values shows up during regular neural net updates, we are very likely also
* running into overflow problems, which will trigger a NaN somewhere; that NaN is caught and
* leads to automatic job cancellation.
*/
public static final int missing_int_value = Integer.MAX_VALUE; //encode missing label or target
public static final float missing_float_value = Float.MAX_VALUE; //encode missing input
/**
* Helper class for dropout, only to be used from within a Layer
*/
public class Dropout {
private transient Random _rand;
private transient byte[] _bits;
@Override
public String toString() {
StringBuilder sb = new StringBuilder("Dropout: ").append(super.toString());
sb.append("\nRandom: ").append(_rand);
sb.append("\nbits: ");
for (int i = 0; i < _bits.length * 8; ++i) sb.append(unit_active(i) ? '1' : '0');
sb.append('\n');
return sb.toString();
}
Dropout(int units) {
_bits = new byte[(units+7)/8];
_rand = new Random(0);
}
// for input layer
public void randomlySparsifyActivation(float[] a, double rate, long seed) {
if (rate == 0) return;
setSeed(seed);
for( int i = 0; i < a.length; i++ )
if (_rand.nextFloat() < rate) a[i] = 0;
}
// for hidden layers
public void fillBytes(long seed) {
setSeed(seed);
_rand.nextBytes(_bits);
}
public boolean unit_active(int o) {
return (_bits[o / 8] & (1 << (o % 8))) != 0;
}
private void setSeed(long seed) {
if ((seed >>> 32) < 0x0000ffffL) seed |= 0x5b93000000000000L;
if (((seed << 32) >>> 32) < 0x0000ffffL) seed |= 0xdb910000L;
_rand.setSeed(seed);
}
}
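/*
* Sketch of how this helper is typically driven (illustration only, not executed here):
* for each training row, a hidden *Dropout layer re-seeds and redraws its mask, and fprop
* then only computes the surviving units:
*
*   dropout.fillBytes(seed);                 // fresh 0/1 mask for this row
*   for( int o = 0; o < units; o++ )
*     if( dropout.unit_active(o) ) { ... }   // compute activation o, else leave it at 0
*
* The input layer instead zeroes activations directly via randomlySparsifyActivation().
*/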
public final void init(Layer[] ls, int index, NeuralNet p) {
params = (NeuralNet)p.clone();
init(ls, index, true);
}
public void init(Layer[] ls, int index, boolean weights) {
params.rate *= Math.pow(params.rate_decay, index-1);
_a = new float[units];
if (!(this instanceof Output) && !(this instanceof Input)) {
_e = new float[units];
}
_previous = ls[index - 1];
_input = (Input) ls[0];
if (this instanceof MaxoutDropout || this instanceof TanhDropout || this instanceof RectifierDropout) {
dropout = new Dropout(units);
}
if( weights ) {
_w = new float[units * _previous.units];
_b = new float[units];
if( params.momentum_start != 0 || params.momentum_stable != 0 ) {
_wm = new float[_w.length];
_bm = new float[_b.length];
}
}
}
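// Layout note: _w is a row-major [units x _previous.units] matrix, so the weight from
// previous-layer unit i into unit o of this layer lives at _w[o * _previous.units + i]
// (the auto-encoder "Prime" layers read it transposed). _b holds one bias per unit.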
/**
* Helper to initialize the weights.
* Adaptive initialization uses prefactor * sqrt(6 / (units_input_layer + units_this_layer)),
* cf. Glorot & Bengio (2010): http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
*
* @param seed random generator seed to use
* @param prefactor prefactor for initialization (typical value: 1.0)
*/
void randomize(long seed, double prefactor) {
if (_w == null) return;
final Random rng = water.util.Utils.getDeterRNG(seed);
if (params.initial_weight_distribution == NeuralNet.InitialWeightDistribution.UniformAdaptive) {
final double range = prefactor * Math.sqrt(6. / (_previous.units + units));
for( int i = 0; i < _w.length; i++ )
_w[i] = (float)uniformDist(rng, -range, range);
}
else {
if (params.initial_weight_distribution == NeuralNet.InitialWeightDistribution.Uniform) {
for (int i = 0; i < _w.length; i++) {
_w[i] = (float)uniformDist(rng, -params.initial_weight_scale, params.initial_weight_scale);
}
} else if (params.initial_weight_distribution == NeuralNet.InitialWeightDistribution.Normal) {
for (int i = 0; i < _w.length; i++) {
_w[i] = (float) (rng.nextGaussian() * params.initial_weight_scale);
}
}
}
}
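// Example (illustrative numbers): with UniformAdaptive, a 200-unit layer fed by a 784-unit
// layer draws its weights uniformly from +/- prefactor * sqrt(6 / (784 + 200)), i.e. roughly
// +/- 0.078 for prefactor 1.0, following Glorot & Bengio (2010).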
// TODO: Add "subset randomize" function
// int count = Math.min(15, _previous.units);
// double min = -.1f, max = +.1f;
// //double min = -1f, max = +1f;
// for( int o = 0; o < units; o++ ) {
// for( int n = 0; n < count; n++ ) {
// int i = rand.nextInt(_previous.units);
// int w = o * _previous.units + i;
// _w[w] = uniformDist(rand, min, max);
// }
// }
public void close() {
}
protected abstract void fprop(long seed, boolean training);
protected abstract void bprop();
/**
* Apply gradient g to unit u with rate r and momentum m.
*/
final void bprop(int u, float g, float r, float m) {
// skip the update if the gradient is zero and nothing else (momentum, L1/L2) would move the weights
if (params.fast_mode || (_wm == null && params.l1 == 0.0 && params.l2 == 0.0)) {
if (g == 0f) return;
}
final float l1 = (float)params.l1;
final float l2 = (float)params.l2;
double r2 = 0;
final int off = u * _previous._a.length;
for( int i = 0; i < _previous._a.length; i++ ) {
int w = off + i;
if( _previous._e != null ) _previous._e[i] += g * _w[w];
if (params.fast_mode && _previous._a[i] == 0) continue;
float d = g * _previous._a[i] - Math.signum(_w[w]) * l1 - _w[w] * l2;
// TODO finish per-weight acceleration, doesn't help for now
// if( _wp != null && d != 0 ) {
// boolean sign = _wp[w] >= 0;
// double mult = Math.abs(_wp[w]);
// // If the gradient kept its sign, increase
// if( (d >= 0) == sign )
// mult += .05f;
// else {
// if( mult > 1 )
// mult *= .95f;
// else
// sign = !sign;
// }
// d *= mult;
// _wp[w] = sign ? mult : -mult;
// }
if( _wm != null ) {
_wm[w] *= m;
_wm[w] += d;
d = _wm[w];
}
_w[w] += r * d;
if (params.max_w2 != Double.POSITIVE_INFINITY) r2 += _w[w] * _w[w];
}
if( params.max_w2 != Double.POSITIVE_INFINITY && r2 > params.max_w2 ) { // C.f. Improving neural networks by preventing co-adaptation of feature detectors
final float scale = Utils.approxSqrt((float)(params.max_w2 / r2));
for( int i = 0; i < _previous._a.length; i++ ) _w[off + i] *= scale;
}
float d = g;
if( _bm != null ) {
_bm[u] *= m;
_bm[u] += d;
d = _bm[u];
}
_b[u] += r * d;
}
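// In equation form, the update applied above to each incoming weight w of unit u (sketch):
//   d = g * a_prev - sign(w) * l1 - w * l2      // gradient with L1/L2 shrinkage
//   wm = m * wm + d;  w += r * wm               // with momentum
//   w += r * d                                  // without momentum
// followed by rescaling all of unit u's weights if their squared sum exceeds max_w2,
// and the analogous update for the bias b[u].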
public float rate(long n) {
return (float)(params.rate / (1 + params.rate_annealing * n));
}
public float momentum(long n) {
double m = params.momentum_start;
if( params.momentum_ramp > 0 ) {
if( n >= params.momentum_ramp )
m = params.momentum_stable;
else
m += (params.momentum_stable - params.momentum_start) * n / params.momentum_ramp;
}
return (float)m;
}
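// Example (illustrative numbers): with rate 0.01 and rate_annealing 1e-6, the learning rate
// after 1e6 processed rows is 0.01 / (1 + 1e-6 * 1e6) = 0.005. With momentum_start 0.5,
// momentum_stable 0.99 and momentum_ramp 1e6, momentum rises linearly from 0.5 to 0.99 over
// the first 1e6 rows and stays at 0.99 afterwards.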
public static abstract class Input extends Layer {
@ParamsSearch.Ignore
protected long _pos, _len;
@Override public void init(Layer[] ls, int index, boolean weights) {
_a = new float[units];
dropout = new Dropout(units);
}
public void inputDropout(long seed) {
double rate = params.input_dropout_ratio;
seed += params.seed + 0x1337B4BE;
dropout.randomlySparsifyActivation(_a, rate, seed);
}
@Override protected void bprop() {
throw new UnsupportedOperationException();
}
public final long move() {
// advance the row cursor cyclically over [0, _len)
return _pos = _pos == _len - 1 ? 0 : _pos + 1;
}
}
public static class VecsInput extends Input {
static final int API_WEAVER = 1;
public static DocGen.FieldDoc[] DOC_FIELDS;
public Vec[] vecs;
@API(help = "Categorical classes identified on the training set")
int[] categoricals_lens;
@API(help = "Categorical minimums identified on the training set")
int[] categoricals_mins;
@API(help = "Normalisation stats used during training")
double[] subs, muls;
transient Chunk[] _chunks;
@Override public Layer clone() {
VecsInput o = (VecsInput) super.clone();
if( o._chunks != null )
o._chunks = new Chunk[o._chunks.length];
return o;
}
public VecsInput(Vec[] vecs, VecsInput train) {
Init(vecs, train);
}
public void Init(Vec[] vecs, VecsInput train) {
units = train != null ? train.subs.length : expand(vecs);
this.vecs = vecs;
_len = vecs[0].length();
if( train != null ) {
int a = train.categoricals_lens.length;
int b = vecs.length;
assert a == b;
categoricals_lens = train.categoricals_lens;
categoricals_mins = train.categoricals_mins;
assert train.subs.length == units;
subs = train.subs;
muls = train.muls;
} else {
categoricals_lens = new int[vecs.length];
categoricals_mins = new int[vecs.length];
for( int i = 0; i < vecs.length; i++ ) {
categoricals_lens[i] = categories(vecs[i]);
categoricals_mins[i] = (int) vecs[i].min();
}
subs = new double[units];
muls = new double[units];
stats(vecs);
}
}
static int categories(Vec vec) {
String[] dom = vec.domain();
return dom == null ? 1 : dom.length - 1;
}
static int expand(Vec[] vecs) {
int n = 0;
for (Vec vec : vecs) n += categories(vec);
return n;
}
private void stats(Vec[] vecs) {
Stats stats = new Stats();
stats._units = units;
stats._categoricals_lens = categoricals_lens;
stats._categoricals_mins = categoricals_mins;
stats.doAll(vecs);
for( int i = 0; i < units; i++ ) { // fill stats for all expanded (categorical) input columns
subs[i] = stats._means[i];
double sigma = Math.sqrt(stats._sigms[i] / (stats._rows - 1));
muls[i] = sigma > 1e-6 ? 1 / sigma : 1;
}
}
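// The stats computed above are applied in ChunksInput.set() to standardize each expanded
// input column:  x' = (x - subs[i]) * muls[i]  with subs[i] = mean and muls[i] = 1 / stddev
// (muls[i] = 1 for near-constant columns), so inputs are roughly zero-mean and unit-variance
// before entering the first hidden layer.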
@Override protected void fprop(long seed, boolean training) {
if( _chunks == null )
_chunks = new Chunk[vecs.length];
for( int i = 0; i < vecs.length; i++ ) {
Chunk c = _chunks[i];
if( c == null || c._vec != vecs[i] || _pos < c._start || _pos >= c._start + c._len )
_chunks[i] = vecs[i].chunkForRow(_pos);
}
ChunksInput.set(_chunks, _a, (int) (_pos - _chunks[0]._start), subs, muls, categoricals_lens, categoricals_mins);
if (training) inputDropout(seed);
}
}
/**
* Stats with expanded categoricals. Used to normalize the data in the input layer.
*/
static class Stats extends MRTask2<Stats> {
int _units;
int[] _categoricals_lens, _categoricals_mins;
double[] _means, _sigms;
long _rows;
transient double[] _subs, _muls;
@Override protected void setupLocal() {
_subs = new double[_units];
_muls = new double[_units];
for( int i = 0; i < _muls.length; i++ )
_muls[i] = 1;
}
@Override public void map(Chunk[] cs) {
_means = new double[_units];
_sigms = new double[_units];
float[] a = new float[_means.length];
for( int r = 0; r < cs[0]._len; r++ ) {
ChunksInput.set(cs, a, r, _subs, _muls, _categoricals_lens, _categoricals_mins);
for( int c = 0; c < a.length; c++ )
_means[c] += a[c];
}
for( int c = 0; c < a.length; c++ )
_means[c] /= cs[0]._len;
for( int r = 0; r < cs[0]._len; r++ ) {
ChunksInput.set(cs, a, r, _subs, _muls, _categoricals_lens, _categoricals_mins);
for( int c = 0; c < a.length; c++ )
_sigms[c] += (a[c] - _means[c]) * (a[c] - _means[c]);
}
_rows += cs[0]._len;
}
@Override public void reduce(Stats rs) {
reduce(_means, _sigms, _rows, rs._means, rs._sigms, rs._rows);
_rows += rs._rows;
}
static void reduce(double[] ma, double[] sa, long ra, double[] mb, double[] sb, long rb) {
for( int c = 0; c < ma.length; c++ ) {
double delta = ma[c] - mb[c];
ma[c] = (ma[c] * ra + mb[c] * rb) / (ra + rb);
sa[c] = sa[c] + sb[c] + delta * delta * ra * rb / (ra + rb);
}
}
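// The merge above is the standard parallel mean/variance combination (cf. Chan et al.):
// for partitions A and B with row counts ra, rb, means ma, mb and sums of squared
// deviations sa, sb:
//   mean = (ma * ra + mb * rb) / (ra + rb)
//   S    = sa + sb + (ma - mb)^2 * ra * rb / (ra + rb)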
@Override public boolean logVerbose() {
return !H2O.DEBUG;
}
}
/**
* A ChunksInput layer populates the activation values from FVec chunks.
* Missing values lead to a 0 activation value in the input layer, which is equivalent to
* setting them to the *average* column value before normalization. In effect, missing column values are ignored.
*/
static class ChunksInput extends Input {
transient Chunk[] _chunks;
double[] _subs, _muls;
int[] _categoricals_lens;
int[] _categoricals_mins;
public ChunksInput(Chunk[] chunks, VecsInput stats) {
units = stats.subs.length;
_chunks = chunks;
_subs = stats.subs;
_muls = stats.muls;
_categoricals_lens = stats.categoricals_lens;
_categoricals_mins = stats.categoricals_mins;
}
/**
* forward propagation means filling the activation values with all the row's column values
*/
@Override protected void fprop(long seed, boolean training) {
set(_chunks, _a, (int) _pos, _subs, _muls, _categoricals_lens, _categoricals_mins);
if (training) inputDropout(seed);
}
static void set(Chunk[] chunks, float[] a, int row, double[] subs, double[] muls, int[] catLens, int[] catMins) {
int n = 0;
// loop over all columns
for( int i = 0; i < catLens.length; i++ ) {
final boolean missing = chunks[i].isNA0(row);
double d = chunks[i].at0(row);
if( catLens[i] == 1 ) {
//numerical value: normalize
d -= subs[n];
d *= muls[n];
a[n++] = missing ? 0f : (float)d;
} else {
// categorical values: use precomputed stats
int cat = catLens[i];
for( int c = 0; c < cat; c++ )
a[n + c] = missing ? 0f : (float)-subs[n + c];
int c = (int) d - catMins[i] - 1;
if( c >= 0 )
a[n + c] = missing ? 0f : (float)((1 - subs[n + c]) * muls[n + c]);
n += cat;
}
}
assert n == a.length;
}
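// Worked example (illustrative): a categorical column with a 4-level domain {A,B,C,D}
// expands to catLens[i] == 3 indicator slots (domain size minus one). The minimum level A
// leaves all three slots at their baseline value, while B, C and D each switch exactly one
// slot to its "active" value; baseline and active values are shifted/scaled with the
// precomputed subs/muls so the expanded columns stay roughly centered.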
}
public static abstract class Output extends Layer {
static final int API_WEAVER = 1;
public static DocGen.FieldDoc[] DOC_FIELDS;
protected final long pos() {
return _input._pos;
}
}
/**
* The Softmax output layer is used for classification.
* Rows with missing values in the response column are ignored.
**/
public static abstract class Softmax extends Output {
protected abstract int target();
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 4.0f);
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
_a[o] += _b[o];
}
final float max = Utils.maxValue(_a);
float scale = 0;
for( int o = 0; o < _a.length; o++ ) {
_a[o] = (float)Math.exp(_a[o] - max);
scale += _a[o];
}
for( int o = 0; o < _a.length; o++ )
_a[o] /= scale;
}
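// The subtraction of the row maximum above is the numerically stable form of softmax:
//   a_o = exp(z_o - max_k z_k) / sum_j exp(z_j - max_k z_k)
// which is mathematically identical to exp(z_o) / sum_j exp(z_j) but avoids overflow.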
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
int label = target();
if (label == missing_int_value) return; //ignore missing response values
for( int u = 0; u < _a.length; u++ ) {
final float targetval = (u == label ? 1f : 0f);
float g = targetval - _a[u];
if (params.loss == NeuralNet.Loss.CrossEntropy) {
//nothing else needed
} else if (params.loss == NeuralNet.Loss.MeanSquare) {
g *= (1 - _a[u]) * _a[u];
}
bprop(u, g, r, m);
}
}
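// Gradient sketch: with a one-hot target t and softmax output a, the cross-entropy loss has
// dE/dz_u = a_u - t_u, so g = t_u - a_u above is already the complete (negative) gradient
// and needs no extra factor. For MeanSquare the code additionally multiplies by
// a_u * (1 - a_u), the diagonal term of the softmax Jacobian.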
}
public static class VecSoftmax extends Softmax {
public Vec vec;
private Vec _toClose;
VecSoftmax() {
}
public VecSoftmax(Vec vec, VecSoftmax stats) {
// Waiting for Michal's changes; for now the enum must start at 0
// if( vec.domain() == null ) {
// vec = vec.toEnum();
// _toClose = vec;
// }
this.units = stats != null ? stats.units : (int) (vec.max() + 1);
this.vec = vec;
params = stats != null ? (NeuralNet)stats.params.clone() : null;
}
@Override protected int target() {
if( vec.isNA(_input._pos) )
return missing_int_value;
return (int) vec.at8(_input._pos);
}
@Override public void close() {
super.close();
if( _toClose != null )
UKV.remove(_toClose._key);
}
}
static class ChunkSoftmax extends Softmax {
transient Chunk _chunk;
public ChunkSoftmax(Chunk chunk, VecSoftmax stats) {
units = stats.units;
_chunk = chunk;
params = (NeuralNet)stats.params.clone();
}
@Override protected int target() {
if( _chunk.isNA0((int) _input._pos) )
return missing_int_value;
return (int) _chunk.at80((int) _input._pos);
}
}
/**
* The Linear output layer is used for regression.
* Rows with missing values in the response column are ignored.
**/
public static abstract class Linear extends Output {
abstract float[] target();
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
_a[o] += _b[o];
}
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
float[] v = target();
assert(params.loss == NeuralNet.Loss.MeanSquare);
for( int u = 0; u < _a.length; u++ ) {
if (v[u] == missing_float_value) continue; //ignore missing regression targets
float g = v[u] - _a[u];
bprop(u, g, r, m);
}
}
}
public static class VecLinear extends Linear {
Vec _vec;
transient float[] _values;
public VecLinear(Vec vec, VecLinear stats) {
assert(stats == null || stats.units == 1);
units = 1; //regression
_vec = vec;
params = stats != null ? (NeuralNet)stats.params.clone() : null;
}
@Override float[] target() {
if( _values == null )
_values = new float[units];
long pos = _input._pos; //pos is a global index into the vector
_values[0] = _vec.isNA(pos) ? missing_float_value : (float)_vec.at(pos);
return _values;
}
}
static class ChunkLinear extends Linear {
transient Chunk _chunk;
transient float[] _values;
public ChunkLinear(Chunk chunk, VecLinear stats) {
assert(stats == null || stats.units == 1);
units = 1;
_chunk = chunk;
params = (NeuralNet) (stats != null ? stats.params.clone() : null);
}
@Override float[] target() {
if( _values == null )
_values = new float[units];
int pos = (int)_input._pos; //pos is a local index for this chunk
_values[0] = _chunk.isNA0(pos) ? missing_float_value : (float)_chunk.at0(pos);
return _values;
}
}
public static class Tanh extends Layer {
public Tanh(int units) { this.units = units; }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
if( !training || dropout == null || dropout.unit_active(o) ) {
for( int i = 0; i < _previous._a.length; i++ ) {
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
}
_a[o] += _b[o];
_a[o] = 1f - 2f / (1f + (float)Math.exp(2*_a[o])); //evals faster than tanh(x), but is slightly less numerically stable - OK
}
}
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
// Gradient is error * derivative of hyperbolic tangent: (1 - x^2)
float g = _e[u] * (1f - _a[u] * _a[u]);
bprop(u, g, r, m);
}
}
}
public static class TanhDropout extends Tanh {
public TanhDropout(int units) { super(units); }
@Override
protected void fprop(long seed, boolean training) {
if (training) {
seed += params.seed + 0xDA7A6000;
dropout.fillBytes(seed);
super.fprop(seed, true);
}
else {
super.fprop(seed, false);
Utils.div(_a, 2.f);
}
}
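// Note on the division by 2 above: hidden units are dropped with probability 1/2 during
// training (one random bit per unit), so test-time activations are halved to match the
// expected activation seen during training. Example (illustrative): a unit that outputs 0.8
// when kept contributes 0.5 * 0.8 = 0.4 on average while training, and 0.8 / 2 = 0.4 at
// test time.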
}
/**
* Apply tanh to the weights' transpose. Used for auto-encoders.
*/
public static class TanhPrime extends Tanh {
public TanhPrime(int units) {
super(units);
}
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
// Auto encoder has its own bias vector
_b = new float[units];
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[i * _a.length + o] * _previous._a[i];
_a[o] += _b[o];
_a[o] = (float)Math.tanh(_a[o]);
}
}
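// Note the transposed indexing _w[i * _a.length + o] (compared to o * _previous._a.length + i
// elsewhere): the reconstruction layer reads the weight matrix as its transpose, the usual
// tied-weights auto-encoder setup; only the bias vector is private to this layer (see init).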
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int o = 0; o < _a.length; o++ ) {
assert _previous._previous.units == units;
float e = _previous._previous._a[o] - _a[o];
float g = e; // * (1 - _a[o]) * _a[o]; // Square error
for( int i = 0; i < _previous._a.length; i++ ) {
int w = i * _a.length + o;
if( _previous._e != null )
_previous._e[i] += g * _w[w];
_w[w] += r * (g * _previous._a[i] - _w[w] * params.l2 - Math.signum(_w[w]) * params.l1);
}
_b[o] += r * g;
}
}
}
public static class Maxout extends Layer {
public Maxout(int units) { this.units = units; }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
for( int i = 0; i < _b.length; i++ )
_b[i] = index == 1 ? 0.5f : 1f;
}
}
@Override protected void fprop(long seed, boolean training) {
float max = 0;
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
if( !training || dropout == null || dropout.unit_active(o)) {
final int off = o * _previous._a.length;
_a[o] = Float.NEGATIVE_INFINITY;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] = Math.max(_a[o], _w[off+i] * _previous._a[i]);
_a[o] += _b[o];
max = Math.max(_a[o], max);
}
}
if( max > 1 ) Utils.div(_a, max);
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
float g = _e[u];
// if( _a[o] < 0 ) Not sure if we should be using maxout with a hard zero bottom
// g = 0;
bprop(u, g, r, m);
}
}
}
public static class MaxoutDropout extends Maxout {
public MaxoutDropout(int units) { super(units); }
@Override protected void fprop(long seed, boolean training) {
if (training) {
seed += params.seed + 0x51C8D00D;
dropout.fillBytes(seed);
super.fprop(seed, true);
}
else {
super.fprop(seed, false);
Utils.div(_a, 2.f);
}
}
}
public static class Rectifier extends Layer {
public Rectifier(int units) { this.units = units; }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
for( int i = 0; i < _b.length; i++ )
_b[i] = index == 1 ? 0.5f : 1f;
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
if( !training || dropout == null || dropout.unit_active(o) ) {
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
_a[o] += _b[o];
_a[o] = Math.max(_a[o], 0f);
}
}
}
@Override protected void bprop() {
long processed = _training.processed();
final float m = momentum(processed);
final float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
//(d/dx)(max(0,x)) = 1 if x > 0, otherwise 0
final float g = _a[u] > 0 ? _e[u] : 0; // * 1.0 (from derivative of rectifier)
bprop(u, g, r, m);
// otherwise g = _e[u] * 0.0 = 0 and we don't allow other contributions by (and to) weights and momenta
}
}
}
public static class RectifierDropout extends Rectifier {
public RectifierDropout(int units) { super(units); }
@Override protected void fprop(long seed, boolean training) {
if (training) {
seed += params.seed + 0x3C71F1ED;
dropout.fillBytes(seed);
super.fprop(seed, true);
}
else {
super.fprop(seed, false);
Utils.div(_a, 2.f);
}
}
}
public static class RectifierPrime extends Rectifier {
public RectifierPrime(int units) { super(units); }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
// Auto encoder has its own bias vector
_b = new float[units];
for( int i = 0; i < _b.length; i++ )
_b[i] = index == 1 ? 0.5f : 1f;
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[i * _a.length + o] * _previous._a[i];
_a[o] += _b[o];
if( _a[o] < 0 )
_a[o] = 0;
}
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
assert _previous._previous.units == units;
float e = _previous._previous._a[u] - _a[u];
float g = e; // * (1 - _a[u]) * _a[u];
//float g = e * (1 - _a[u]) * _a[u]; // Square error
double r2 = 0;
for( int i = 0; i < _previous._a.length; i++ ) {
int w = i * _a.length + u;
if( _previous._e != null ) _previous._e[i] += g * _w[w];
float d = g * _previous._a[i] - (float)(_w[w] * params.l2) - (float)(Math.signum(_w[w]) * params.l1);
_w[w] += r * d;
if (params.max_w2 != Double.POSITIVE_INFINITY) r2 += _w[w] * _w[w];
}
if( params.max_w2 != Double.POSITIVE_INFINITY && r2 > params.max_w2 ) { // C.f. Improving neural networks by preventing co-adaptation of feature detectors
final double scale = Math.sqrt(params.max_w2 / r2);
for( int i = 0; i < _previous._a.length; i++ ) _w[i * _a.length + u] *= scale;
}
_b[u] += r * g;
}
}
}
@Override public Layer clone() {
Layer l = (Layer) super.clone();
if (dropout != null) l.dropout = new Dropout(units);
return l;
}
public static void shareWeights(Layer src, Layer dst) {
dst._w = src._w;
if (dst._b == null || dst._b.length == src._b.length) dst._b = src._b;
dst._wm = src._wm;
if (dst._bm == null || dst._bm.length == src._bm.length) dst._bm = src._bm;
}
public static void shareWeights(Layer[] src, Layer[] dst) {
for( int y = 1; y < src.length; y++ )
shareWeights(src[y], dst[y]);
}
private static double uniformDist(Random rand, double min, double max) {
return min + rand.nextFloat() * (max - min);
}
@Override public AutoBuffer writeJSON(AutoBuffer bb) {
bb.put1('{');
bb.putJSONStr("type").put1(':').putJSONStr(getClass().getName());
bb.put1(',');
writeJSONFields(bb);
bb.put1('}');
return bb;
}
}