Neurons.java example

Explorer
h2o-2-master
package hex.deeplearning;

import hex.FrameTask;
import hex.deeplearning.DeepLearning.Loss;
import org.apache.hadoop.util.hash.Hash;
import org.apache.hadoop.util.hash.MurmurHash;
import water.Iced;
import water.MemoryManager;
import water.api.Request.API;
import water.util.Log;
import water.util.Utils;

import java.nio.ByteBuffer;
import java.util.*;

/**
 * This class implements the concept of a Neuron layer in a Neural Network
 * During training, every MRTask2 F/J thread is expected to create these neurons for every map call (Cheap to make).
 * These Neurons are NOT sent over the wire.
 * The weights connecting the neurons are in a separate class (DeepLearningModel.DeepLearningModelInfo), and will be shared per node.
 */
public abstract class Neurons {
  @API(help = "Number of neurons")
  protected int units;

  /**
   * Constructor of a Neuron Layer
   * @param units How many neurons are in this layer?
   */
  Neurons(int units) {
    this.units = units;
  }

  /**
   * Print the status of this neuron layer
   * @return populated String
   */
  @Override
  public String toString() {
    String s = this.getClass().getSimpleName();
    s += "\nNumber of Neurons: " + units;
    s += "\nParameters:\n" + params.toString();
    if (_dropout != null) s += "\nDropout:\n" + _dropout.toString();
    return s;
  }

  /**
   * Parameters (deep-cloned() from the user input, can be modified here, e.g. learning rate decay)
   */
  protected transient DeepLearning params;
  protected transient int _index; //which hidden layer it is

  /**
   * Layer state (one per neuron): activity, error
   */
  public transient Vector _a; //can be sparse for input layer
  public transient DenseVector _e;

  /**
   * References for feed-forward connectivity
   */
  public Neurons _previous;
  public Neurons _input;
  DeepLearningModel.DeepLearningModelInfo _minfo; //reference to shared model info
  public Matrix _w;
  public DenseVector _b;

  /**
   * References for momentum training
   */
  Matrix _wm;
  DenseVector _bm;

  /**
   * References for ADADELTA
   */
  Matrix _ada_dx_g;
  DenseVector _bias_ada_dx_g;

  /**
   * For Dropout training
   */
  protected Dropout _dropout;

  /**
   * Helper to shortcut bprop
   */
  private boolean _shortcut = false;

  public DenseVector _avg_a;

  public static final int missing_int_value = Integer.MAX_VALUE; //encode missing label
  public static final Float missing_real_value = Float.NaN; //encode missing regression target

  /**
   * Helper to check sanity of Neuron layers
   * @param training whether training or testing is done
   */
  void sanityCheck(boolean training) {
    if (this instanceof Input) {
      assert(_previous == null);
      assert (!training || _dropout != null);
    } else {
      assert(_previous != null);
      if (_minfo.has_momenta()) {
        assert(_wm != null);
        assert(_bm != null);
        assert(_ada_dx_g == null);
      }
      if (_minfo.adaDelta()) {
        if (params.rho == 0) throw new IllegalArgumentException("rho must be > 0 if epsilon is >0.");
        if (params.epsilon == 0) throw new IllegalArgumentException("epsilon must be > 0 if rho is >0.");
        assert(_minfo.adaDelta());
        assert(_bias_ada_dx_g != null);
        assert(_wm == null);
        assert(_bm == null);
      }
      if (this instanceof MaxoutDropout || this instanceof TanhDropout || this instanceof RectifierDropout) {
        assert (!training || _dropout != null);
      }
    }
  }

  /**
   * Initialization of the parameters and connectivity of a Neuron layer
   * @param neurons Array of all neuron layers, to establish feed-forward connectivity
   * @param index Which layer am I?
   * @param p User-given parameters (Job parental object hierarchy is not used)
   * @param minfo Model information (weights/biases and their momenta)
   * @param training Whether training is done or just testing (no need for dropout)
   */
  public final void init(Neurons[] neurons, int index, DeepLearning p, final DeepLearningModel.DeepLearningModelInfo minfo, boolean training) {
    _index = index-1;
    params = (DeepLearning)p.clone();
    params.rate *= Math.pow(params.rate_decay, index-1);
    _a = new DenseVector(units);
    if (!(this instanceof Output) && !(this instanceof Input)) {
      _e = new DenseVector(units);
    }
    if (training && (this instanceof MaxoutDropout || this instanceof TanhDropout
            || this instanceof RectifierDropout || this instanceof Input) ) {
      _dropout = this instanceof Input ? new Dropout(units, params.input_dropout_ratio) : new Dropout(units, params.hidden_dropout_ratios[_index]);
    }
    if (!(this instanceof Input)) {
      _previous = neurons[_index]; //incoming neurons
      _minfo = minfo;
      _w = minfo.get_weights(_index); //incoming weights
      _b = minfo.get_biases(_index); //bias for this layer (starting at hidden layer)
      if(params.autoencoder && params.sparsity_beta > 0 && _index < params.hidden.length) {
        _avg_a = minfo.get_avg_activations(_index);
      }
      if (minfo.has_momenta()) {
        _wm = minfo.get_weights_momenta(_index); //incoming weights
        _bm = minfo.get_biases_momenta(_index); //bias for this layer (starting at hidden layer)
      }
      if (minfo.adaDelta()) {
        _ada_dx_g = minfo.get_ada_dx_g(_index);
        _bias_ada_dx_g = minfo.get_biases_ada_dx_g(_index);
      }
      _shortcut = (params.fast_mode || (
              // not doing fast mode, but also don't have anything else to update (neither momentum nor ADADELTA history), and no L1/L2
              !params.adaptive_rate && !_minfo.has_momenta() && params.l1 == 0.0 && params.l2 == 0.0));
    }
    sanityCheck(training);
  }

  /**
   * Forward propagation
   * @param seed For seeding the RNG inside (for dropout)
   * @param training Whether training is done or just testing (no need for dropout)
   */
  protected abstract void fprop(long seed, boolean training);

  /**
   *  Back propagation
   */
  protected abstract void bprop();

  void bprop_sparse(float r, float m) {
    SparseVector prev_a = (SparseVector) _previous._a;
    int start = prev_a.begin()._idx;
    int end = prev_a.end()._idx;
    for (int it = start; it < end; ++it) {
      final int col = prev_a._indices[it];
      final float previous_a = prev_a._values[it];
      bprop_col(col, previous_a, r, m);
    }
    final int rows = _a.size();
    final float max_w2 = params.max_w2;
    for (int row = 0; row < rows; row++) {
      if (max_w2 != Float.POSITIVE_INFINITY)
        rescale_weights(_w, row, max_w2);
    }
  }

  /**
   * Backpropagation: w -= rate * dE/dw, where dE/dw = dE/dy * dy/dnet * dnet/dw
   * This method adds the dnet/dw = activation term per unit
   * @param row row index (update weights feeding to this neuron)
   * @param partial_grad partial derivative dE/dnet = dE/dy * dy/net
   * @param rate learning rate
   * @param momentum momentum factor (needed only if ADADELTA isn't used)
   */
  final void bprop(final int row, final float partial_grad, final float rate, final float momentum) {
    // only correct weights if the gradient is large enough
    if (_shortcut && partial_grad == 0f) return;

    if (_w instanceof DenseRowMatrix && _previous._a instanceof DenseVector)
      bprop_dense_row_dense(
              (DenseRowMatrix) _w, (DenseRowMatrix) _wm, (DenseRowMatrix) _ada_dx_g,
              (DenseVector) _previous._a, _previous._e, _b, _bm, row, partial_grad, rate, momentum);
    else if (_w instanceof DenseRowMatrix && _previous._a instanceof SparseVector)
      bprop_dense_row_sparse(
              (DenseRowMatrix)_w, (DenseRowMatrix)_wm, (DenseRowMatrix)_ada_dx_g,
              (SparseVector)_previous._a, _previous._e, _b, _bm, row, partial_grad, rate, momentum);
    else
      throw new UnsupportedOperationException("bprop for types not yet implemented.");
  }

  final void bprop_col(final int col, final float previous_a, final float rate, final float momentum) {
    if (_w instanceof DenseColMatrix && _previous._a instanceof SparseVector)
      bprop_dense_col_sparse(
              (DenseColMatrix)_w, (DenseColMatrix)_wm, (DenseColMatrix)_ada_dx_g,
              (SparseVector)_previous._a, _previous._e, _b, _bm, col, previous_a, rate, momentum);
    else
      throw new UnsupportedOperationException("bprop_col for types not yet implemented.");
  }

  /**
   * Specialization of backpropagation for DenseRowMatrices and DenseVectors
   * @param _w weight matrix
   * @param _wm weight momentum matrix
   * @param adaxg ADADELTA matrix (2 floats per weight)
   * @param prev_a activation of previous layer
   * @param prev_e error of previous layer
   * @param _b bias vector
   * @param _bm bias momentum vector
   * @param row index of the neuron for which we back-propagate
   * @param partial_grad partial derivative dE/dnet = dE/dy * dy/net
   * @param rate learning rate
   * @param momentum momentum factor (needed only if ADADELTA isn't used)
   */
  private void bprop_dense_row_dense(
          final DenseRowMatrix _w, final DenseRowMatrix _wm, final DenseRowMatrix adaxg,
          final DenseVector prev_a, final DenseVector prev_e, final DenseVector _b, final DenseVector _bm,
          final int row, final float partial_grad, float rate, final float momentum)
  {
    final float rho = (float)params.rho;
    final float eps = (float)params.epsilon;
    final float l1 = (float)params.l1;
    final float l2 = (float)params.l2;
    final float max_w2 = params.max_w2;
    final boolean have_momenta = _minfo.has_momenta();
    final boolean have_ada = _minfo.adaDelta();
    final boolean nesterov = params.nesterov_accelerated_gradient;
    final boolean update_prev = prev_e != null;
    final boolean fast_mode = params.fast_mode;
    final int cols = prev_a.size();
    final int idx = row * cols;

    float avg_grad2 = 0;
    for( int col = 0; col < cols; col++ ) {
      final float weight = _w.get(row,col);
      if( update_prev ) prev_e.add(col, partial_grad * weight); // propagate the error dE/dnet to the previous layer, via connecting weights
      final float previous_a = prev_a.get(col);
      if (fast_mode && previous_a == 0) continue;

      //this is the actual gradient dE/dw
      final float grad = partial_grad * previous_a - Math.signum(weight) * l1 - weight * l2;
      final int w = idx + col;

      if (have_ada) {
        assert(!have_momenta);
        final float grad2 = grad*grad;
        avg_grad2 += grad2;
        float brate = computeAdaDeltaRateForWeight(grad, w, adaxg, rho, eps);
        _w.raw()[w] += brate * grad;
      } else {
        if (!nesterov) {
          final float delta = rate * grad;
          _w.raw()[w] += delta;
          if( have_momenta ) {
            _w.raw()[w] += momentum * _wm.raw()[w];
            _wm.raw()[w] = delta;
          }
        } else {
          float tmp = grad;
          if( have_momenta ) {
            _wm.raw()[w] *= momentum;
            _wm.raw()[w] += tmp;
            tmp = _wm.raw()[w];
          }
          _w.raw()[w] += rate * tmp;
        }
      }
    }
    if (max_w2 != Float.POSITIVE_INFINITY)
      rescale_weights(_w, row, max_w2);
    if (have_ada) avg_grad2 /= cols;
    update_bias(_b, _bm, row, partial_grad, avg_grad2, rate, momentum);
  }

  /**
   * Specialization of backpropagation for DenseColMatrices and SparseVector for previous layer's activation and DenseVector for everything else
   * @param w Weight matrix
   * @param wm Momentum matrix
   * @param adaxg ADADELTA matrix (2 floats per weight)
   * @param prev_a sparse activation of previous layer
   * @param prev_e error of previous layer
   * @param b bias
   * @param bm bias momentum
   * @param rate learning rate
   * @param momentum momentum factor (needed only if ADADELTA isn't used)
   */
  private void bprop_dense_col_sparse(
          final DenseColMatrix w, final DenseColMatrix wm, final DenseColMatrix adaxg,
          final SparseVector prev_a, final DenseVector prev_e, final DenseVector b, final DenseVector bm,
          final int col, final float previous_a, float rate, final float momentum)
  {
    final float rho = (float)params.rho;
    final float eps = (float)params.epsilon;
    final float l1 = (float)params.l1;
    final float l2 = (float)params.l2;
    final boolean have_momenta = _minfo.has_momenta();
    final boolean have_ada = _minfo.adaDelta();
    final boolean nesterov = params.nesterov_accelerated_gradient;
    final boolean update_prev = prev_e != null;
    final int cols = prev_a.size();

    final int rows = _a.size();
    for (int row = 0; row < rows; row++) {
      final float partial_grad = _e.get(row) * (1f - _a.get(row) * _a.get(row));
      final float weight = w.get(row,col);
      if( update_prev ) prev_e.add(col, partial_grad * weight); // propagate the error dE/dnet to the previous layer, via connecting weights
      assert (previous_a != 0); //only iterate over non-zeros!

      if (_shortcut && partial_grad == 0f) continue;

      //this is the actual gradient dE/dw
      final float grad = partial_grad * previous_a - Math.signum(weight) * l1 - weight * l2;
      if (have_ada) {
        assert(!have_momenta);
        float brate = computeAdaDeltaRateForWeight(grad, row, col, adaxg, rho, eps);
        w.add(row,col, brate * grad);
      } else {
        if (!nesterov) {
          final float delta = rate * grad;
          w.add(row, col, delta);
//          Log.info("for row = " + row + ", col = " + col + ", partial_grad = " + partial_grad + ", grad = " + grad);
          if( have_momenta ) {
            w.add(row, col, momentum * wm.get(row, col));
            wm.set(row, col, delta);
          }
        } else {
          float tmp = grad;
          if( have_momenta ) {
            float val = wm.get(row, col);
            val *= momentum;
            val += tmp;
            tmp = val;
            wm.set(row, col, val);
          }
          w.add(row, col, rate * tmp);
        }
      }
      //this is called cols times, so we divide the (repeated) contribution by 1/cols
      update_bias(b, bm, row, partial_grad/cols, grad*grad/cols, rate, momentum);
    }
  }

 /**
   * Specialization of backpropagation for DenseRowMatrices and SparseVector for previous layer's activation and DenseVector for everything else
   * @param _w weight matrix
   * @param _wm weight momentum matrix
   * @param adaxg ADADELTA matrix (2 floats per weight)
   * @param prev_a sparse activation of previous layer
   * @param prev_e error of previous layer
   * @param _b bias vector
   * @param _bm bias momentum vector
   * @param row index of the neuron for which we back-propagate
   * @param partial_grad partial derivative dE/dnet = dE/dy * dy/net
   * @param rate learning rate
   * @param momentum momentum factor (needed only if ADADELTA isn't used)
   */
  private void bprop_dense_row_sparse(
          final DenseRowMatrix _w, final DenseRowMatrix _wm, final DenseRowMatrix adaxg,
          final SparseVector prev_a, final DenseVector prev_e, final DenseVector _b, final DenseVector _bm,
          final int row, final float partial_grad, float rate, final float momentum)
  {
    final float rho = (float)params.rho;
    final float eps = (float)params.epsilon;
    final float l1 = (float)params.l1;
    final float l2 = (float)params.l2;
    final float max_w2 = params.max_w2;
    final boolean have_momenta = _minfo.has_momenta();
    final boolean have_ada = _minfo.adaDelta();
    final boolean nesterov = params.nesterov_accelerated_gradient;
    final boolean update_prev = prev_e != null;
    final int cols = prev_a.size();
    final int idx = row * cols;

    float avg_grad2 = 0;
    int start = prev_a.begin()._idx;
    int end = prev_a.end()._idx;
    for (int it = start; it < end; ++it) {
      final int col = prev_a._indices[it];
      final float weight = _w.get(row,col);
      if( update_prev ) prev_e.add(col, partial_grad * weight); // propagate the error dE/dnet to the previous layer, via connecting weights
      final float previous_a = prev_a._values[it];
      assert (previous_a != 0); //only iterate over non-zeros!

      //this is the actual gradient dE/dw
      final float grad = partial_grad * previous_a - Math.signum(weight) * l1 - weight * l2;
      final int w = idx + col;

      if (have_ada) {
        assert(!have_momenta);
        final float grad2 = grad*grad;
        avg_grad2 += grad2;
        float brate = computeAdaDeltaRateForWeight(grad, w, adaxg, rho, eps);
        _w.raw()[w] += brate * grad;
      } else {
        if (!nesterov) {
          final float delta = rate * grad;
          _w.raw()[w] += delta;
          if( have_momenta ) {
            _w.raw()[w] += momentum * _wm.raw()[w];
            _wm.raw()[w] = delta;
          }
        } else {
          float tmp = grad;
          if( have_momenta ) {
            _wm.raw()[w] *= momentum;
            _wm.raw()[w] += tmp;
            tmp = _wm.raw()[w];
          }
          _w.raw()[w] += rate * tmp;
        }
      }
    }
    if (max_w2 != Float.POSITIVE_INFINITY)
      rescale_weights(_w, row, max_w2);
    if (have_ada) avg_grad2 /= prev_a.nnz();
    update_bias(_b, _bm, row, partial_grad, avg_grad2, rate, momentum);
  }

  /**
   * Helper to scale down incoming weights if their squared sum exceeds a given value (by a factor of 10 -> to avoid doing costly rescaling too often)
   * C.f. Improving neural networks by preventing co-adaptation of feature detectors
   * @param row index of the neuron for which to scale the weights
   */
  private static void rescale_weights(final Matrix w, final int row, final float max_w2) {
    final int cols = w.cols();
    if (w instanceof DenseRowMatrix) {
      rescale_weights((DenseRowMatrix)w, row, max_w2);
    } else if (w instanceof DenseColMatrix) {
      float r2 = 0;
      for (int col=0; col<cols;++col)
        r2 += w.get(row,col)*w.get(row,col);
      if( r2 > max_w2) {
        final float scale = Utils.approxSqrt(max_w2 / r2);
        for( int col=0; col < cols; col++ ) w.set(row, col, w.get(row,col) * scale);
      }
    }
    else throw new UnsupportedOperationException("rescale weights for " + w.getClass().getSimpleName() + " not yet implemented.");
  }

  // Specialization for DenseRowMatrix
  private static void rescale_weights(final DenseRowMatrix w, final int row, final float max_w2) {
    final int cols = w.cols();
    final int idx = row * cols;
    float r2 = Utils.sumSquares(w.raw(), idx, idx+cols);
//    float r2 = Utils.approxSumSquares(w.raw(), idx, idx + cols);
    if( r2 > max_w2) {
      final float scale = Utils.approxSqrt(max_w2 / r2);
      for( int c = 0; c < cols; c++ ) w.raw()[idx + c] *= scale;
    }
  }

  /**
   * Helper to compute the reconstruction error for auto-encoders (part of the gradient computation)
   * @param row neuron index
   * @return difference between the output (auto-encoder output layer activation) and the target (input layer activation)
   */
  protected float autoEncoderError(int row) {
    assert (_minfo.get_params().autoencoder && _index == _minfo.get_params().hidden.length);
    assert (params.loss == Loss.MeanSquare);
    return (_input._a.get(row) - _a.get(row));
  }

  /**
   * Compute learning rate with AdaDelta
   * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
   * @param grad gradient
   * @param row which neuron is to be updated
   * @param col weight from which incoming neuron
   * @param ada_dx_g Matrix holding helper values (2 floats per weight)
   * @param rho hyper-parameter #1
   * @param eps hyper-parameter #2
   * @return learning rate
   */
  private static float computeAdaDeltaRateForWeight(final float grad, final int row, final int col,
                                                  final DenseColMatrix ada_dx_g,
                                                  final float rho, final float eps) {
    ada_dx_g.set(2*row+1, col, rho * ada_dx_g.get(2*row+1, col) + (1f - rho) * grad * grad);
    final float rate = Utils.approxSqrt((ada_dx_g.get(2*row, col) + eps)/(ada_dx_g.get(2*row+1, col) + eps));
    ada_dx_g.set(2*row,   col, rho * ada_dx_g.get(2*row, col)   + (1f - rho) * rate * rate * grad * grad);
    return rate;
  }

  /**
   * Compute learning rate with AdaDelta, specialized for DenseRowMatrix
   * @param grad gradient
   * @param w neuron index
   * @param ada_dx_g Matrix holding helper values (2 floats per weight)
   * @param rho hyper-parameter #1
   * @param eps hyper-parameter #2
   * @return learning rate
   */
  private static float computeAdaDeltaRateForWeight(final float grad, final int w,
                                                  final DenseRowMatrix ada_dx_g,
                                                  final float rho, final float eps) {
    ada_dx_g.raw()[2*w+1] = rho * ada_dx_g.raw()[2*w+1] + (1f - rho) * grad * grad;
    final float rate = Utils.approxSqrt((ada_dx_g.raw()[2*w] + eps)/(ada_dx_g.raw()[2*w+1] + eps));
    ada_dx_g.raw()[2*w]   = rho * ada_dx_g.raw()[2*w]   + (1f - rho) * rate * rate * grad * grad;
    return rate;
  }

  /**
   * Compute learning rate with AdaDelta, specialized for DenseVector (Bias)
   * @param grad2 squared gradient
   * @param row neuron index
   * @param bias_ada_dx_g Matrix holding helper values (2 floats per weight)
   * @param rho hyper-parameter #1
   * @param eps hyper-parameter #2
   * @return learning rate
   */
  private static float computeAdaDeltaRateForBias(final float grad2, final int row,
                                                    final DenseVector bias_ada_dx_g,
                                                    final float rho, final float eps) {
    bias_ada_dx_g.raw()[2*row+1] = rho * bias_ada_dx_g.raw()[2*row+1] + (1f - rho) * grad2;
    final float rate = Utils.approxSqrt((bias_ada_dx_g.raw()[2*row  ] + eps)/(bias_ada_dx_g.raw()[2*row+1] + eps));
    bias_ada_dx_g.raw()[2*row]   = rho * bias_ada_dx_g.raw()[2*row  ] + (1f - rho) * rate * rate * grad2;
    return rate;
  }

  /**
   * Helper to enforce learning rule to satisfy sparsity constraint:
   * Computes the (rolling) average activation for each (hidden) neuron.
   */
  void compute_sparsity() {
    if (_avg_a != null) {
      for (int row = 0; row < _avg_a.size(); row++) {
        _avg_a.set(row, (float) 0.999 * (_avg_a.get(row)) + (float) 0.001 * (_a.get(row)));
      }
    }
  }

  /**
   * Helper to update the bias values
   * @param _b bias vector
   * @param _bm bias momentum vector
   * @param row index of the neuron for which we back-propagate
   * @param partial_grad partial derivative dE/dnet = dE/dy * dy/net
   * @param avg_grad2 average squared gradient for this neuron's incoming weights (only for ADADELTA)
   * @param rate learning rate
   * @param momentum momentum factor (needed only if ADADELTA isn't used)
   */
  void update_bias(final DenseVector _b, final DenseVector _bm, final int row,
                   float partial_grad, final float avg_grad2, float rate, final float momentum) {
    final boolean have_momenta = _minfo.has_momenta();
    final boolean have_ada = _minfo.adaDelta();
    final float l1 = (float)params.l1;
    final float l2 = (float)params.l2;
    final float bias = _b.get(row);
    partial_grad -= Math.signum(bias) * l1 + bias * l2;

    if (have_ada) {
      final float rho = (float)params.rho;
      final float eps = (float)params.epsilon;
      rate = computeAdaDeltaRateForBias(avg_grad2, row, _bias_ada_dx_g, rho, eps);
    }
    if (!params.nesterov_accelerated_gradient) {
      final float delta = rate * partial_grad;
      _b.add(row, delta);
      if (have_momenta) {
        _b.add(row, momentum * _bm.get(row));
        _bm.set(row, delta);
      }
    } else {
      float d = partial_grad;
      if (have_momenta) {
        _bm.set(row, _bm.get(row) * momentum);
        _bm.add(row, d);
        d = _bm.get(row);
      }
      _b.add(row, rate * d);
    }
    //update for sparsity constraint
    if (params.autoencoder && params.sparsity_beta > 0 && !(this instanceof Output) && !(this instanceof Input) && (_index != params.hidden.length)) {
      _b.add(row, -(float) (rate * params.sparsity_beta * (_avg_a._data[row] - params.average_activation)));
    }
    if (Float.isInfinite(_b.get(row))) _minfo.set_unstable();
  }


  /**
   * The learning rate
   * @param n The number of training samples seen so far (for rate_annealing greater than 0)
   * @return Learning rate
   */
  public float rate(long n) {
    return (float)(params.rate / (1 + params.rate_annealing * n));
  }

  protected float momentum() {
    return momentum(-1);
  }
  /**
   * The momentum - real number in [0, 1)
   * Can be a linear ramp from momentum_start to momentum_stable, over momentum_ramp training samples
   * @param n The number of training samples seen so far
   * @return momentum
   */
  public float momentum(long n) {
    double m = params.momentum_start;
    if( params.momentum_ramp > 0 ) {
      final long num = n != -1 ? _minfo.get_processed_total() : n;
      if( num >= params.momentum_ramp )
        m = params.momentum_stable;
      else
        m += (params.momentum_stable - params.momentum_start) * num / params.momentum_ramp;
    }
    return (float)m;
  }

  /**
   * Input layer of the Neural Network
   * This layer is different from other layers as it has no incoming weights,
   * but instead gets its activation values from the training points.
   */
  public static class Input extends Neurons {

    private FrameTask.DataInfo _dinfo; //training data
    SparseVector _svec;
    DenseVector _dvec;

    Input(int units, final FrameTask.DataInfo d) {
      super(units);
      _dinfo = d;
      _a = new DenseVector(units);
      _dvec = (DenseVector)_a;
    }

    @Override protected void bprop() { throw new UnsupportedOperationException(); }
    @Override protected void fprop(long seed, boolean training) { throw new UnsupportedOperationException(); }

    /**
     * One of two methods to set layer input values. This one is for raw double data, e.g. for scoring
     * @param seed For seeding the RNG inside (for input dropout)
     * @param data Data (training columns and responses) to extract the training columns
     *             from to be mapped into the input neuron layer
     */
    public void setInput(long seed, final double[] data) {
//      Log.info("Data: " + ArrayUtils.toString(data));
      assert(_dinfo != null);
      double [] nums = MemoryManager.malloc8d(_dinfo._nums); // a bit wasteful - reallocated each time
      int    [] cats = MemoryManager.malloc4(_dinfo._cats); // a bit wasteful - reallocated each time
      int i = 0, ncats = 0;
      for(; i < _dinfo._cats; ++i){
        // This can occur when testing data has categorical levels that are not part of training (or if there's a missing value)
        if (Double.isNaN(data[i])) {
          if (_dinfo._catMissing[i]!=0) cats[ncats++] = (_dinfo._catOffsets[i+1]-1); //use the extra level made during training
          else {
            if (!_dinfo._useAllFactorLevels)
              throw new IllegalArgumentException("Model was built without missing categorical factors in column "
                      + _dinfo.coefNames()[i] + ", but found unknown (or missing) categorical factors during scoring."
                      + "\nThe model needs to be built with use_all_factor_levels=true for this to work.");
            // else just leave all activations at 0, and since all factor levels were enabled,
            // this is OK (missing or new categorical doesn't activate any levels seen during training)
          }
        } else {
          int c = (int)data[i];
          if (_dinfo._useAllFactorLevels)
            cats[ncats++] = c + _dinfo._catOffsets[i];
          else if (c!=0)
            cats[ncats++] = c + _dinfo._catOffsets[i] - 1;
        }
      }
      final int n = data.length; // data contains only input features - no response is included
      for(;i < n;++i){
        double d = data[i];
        if(_dinfo._normMul != null) d = (d - _dinfo._normSub[i-_dinfo._cats])*_dinfo._normMul[i-_dinfo._cats];
        nums[i-_dinfo._cats] = d; //can be NaN for missing numerical data
      }
      setInput(seed, nums, ncats, cats);
    }

    /**
     * The second method used to set input layer values. This one is used directly by FrameTask.processRow() and by the method above.
     * @param seed For seeding the RNG inside (for input dropout)
     * @param nums Array containing numerical values, can be NaN
     * @param numcat Number of horizontalized categorical non-zero values (i.e., those not being the first factor of a class)
     * @param cats Array of indices, the first numcat values are the input layer unit (==column) indices for the non-zero categorical values
     *             (This allows this array to be re-usable by the caller, without re-allocating each time)
     */
    public void setInput(long seed, final double[] nums, final int numcat, final int[] cats) {
      _a = _dvec;
      Arrays.fill(_a.raw(), 0f);

      // random projection from fullN down to max_categorical_features
      if (params.max_categorical_features < _dinfo.fullN() - _dinfo._nums) {
        assert(nums.length == _dinfo._nums);
        final int M = nums.length + params.max_categorical_features;
        final boolean random_projection = false;
        final boolean hash_trick = true;
        if (random_projection) {
          final int N = _dinfo.fullN();
          assert (_a.size() == M);

          // sparse random projection
          for (int i = 0; i < M; ++i) {
            for (int c = 0; c < numcat; ++c) {
              int j = cats[c];
              Random rng = new Random(params.seed + i*N + j);
              float val = 0;
              final float rnd = rng.nextFloat();
              if (rnd < 1. / 6.) val = (float) Math.sqrt(3);
              if (rnd > 5. / 6.) val = -(float) Math.sqrt(3);
              _a.add(i, 1f * val);
            }
            Random rng = new Random(params.seed + i*N + _dinfo.numStart());
            for (int n = 0; n < nums.length; ++n) {
              float val = 0;
              final float rnd = rng.nextFloat();
              if (rnd < 1. / 6.) val = (float) Math.sqrt(3);
              if (rnd > 5. / 6.) val = -(float) Math.sqrt(3);
              _a.set(i, (Double.isNaN(nums[n]) ? 0f /*Always do MeanImputation during scoring*/ : (float) nums[n]) * val);
            }
          }
        } else if (hash_trick) {
          // Use hash trick for categorical features
          assert (_a.size() == M);
          // hash N-nums.length down to M-nums.length = cM (#categorical slots - always use all numerical features)
          final int cM = params.max_categorical_features;

          assert (_a.size() == M);
          Hash murmur = MurmurHash.getInstance();
          for (int i = 0; i < numcat; ++i) {
            ByteBuffer buf = ByteBuffer.allocate(4);
            int hashval = murmur.hash(buf.putInt(cats[i]).array(), 4, (int)params.seed); // turn horizontalized categorical integer into another integer, based on seed
//            int hashval = cats[i] ^ (int)params.seed; // turn horizontalized categorical integer into another integer, based on seed
            _a.add(Math.abs(hashval % cM), 1f); // restrict to limited range
          }
          for (int i = 0; i < nums.length; ++i)
            _a.set(cM + i, Double.isNaN(nums[i]) ? 0f /*Always do MeanImputation during scoring*/ : (float) nums[i]);

        }
      } else {
        for (int i = 0; i < numcat; ++i) _a.set(cats[i], 1f); // one-hot encode categoricals
        for (int i = 0; i < nums.length; ++i)
          _a.set(_dinfo.numStart() + i, Double.isNaN(nums[i]) ? 0f /*Always do MeanImputation during scoring*/ : (float) nums[i]);
      }
//      Log.info("Input Layer: " + ArrayUtils.toString(_a.raw()));

      // Input Dropout
      if (_dropout == null) return;
      seed += params.seed + 0x1337B4BE;
      _dropout.randomlySparsifyActivation(_a, seed);

      if (params.sparse) {
        _svec = new SparseVector(_dvec);
        _a = _svec;
      }
    }

  }

  /**
   * Tanh neurons - most common, most stable
   */
  public static class Tanh extends Neurons {
    public Tanh(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training) {
      gemv((DenseVector)_a, _w, _previous._a, _b, _dropout != null ? _dropout.bits() : null);
      final int rows = _a.size();
      for( int row = 0; row < rows; row++ )
        _a.set(row, 1f - 2f / (1f + (float)Math.exp(2*_a.get(row)))); //evals faster than tanh(x), but is slightly less numerically stable - OK
      compute_sparsity();
    }
    // Computing partial derivative g = dE/dnet = dE/dy * dy/dnet, where dE/dy is the backpropagated error
    // dy/dnet = (1 - a^2) for y(net) = tanh(net)
    @Override protected void bprop() {
      float m = momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      if (_w instanceof DenseRowMatrix) {
        final int rows = _a.size();
        for (int row = 0; row < rows; row++) {
          if (_minfo.get_params().autoencoder && _index == _minfo.get_params().hidden.length)
            _e.set(row, autoEncoderError(row));
          float g = _e.get(row) * (1f - _a.get(row) * _a.get(row));
          bprop(row, g, r, m);
        }
      }
      else {
        bprop_sparse(r, m);
      }
    }
  }

  /**
   * Tanh neurons with dropout
   */
  public static class TanhDropout extends Tanh {
    public TanhDropout(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training) {
      if (training) {
        seed += params.seed + 0xDA7A6000;
        _dropout.fillBytes(seed);
        super.fprop(seed, true);
      }
      else {
        super.fprop(seed, false);
        Utils.mult(_a.raw(), (float)(1-params.hidden_dropout_ratios[_index]));
      }
    }
  }

  /**
   * Maxout neurons
   */
  public static class Maxout extends Neurons {
    public Maxout(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training) {
      float max = 0;
      final int rows = _a.size();
      if (_previous._a instanceof DenseVector) {
        for( int row = 0; row < rows; row++ ) {
          _a.set(row, 0);
          if( !training || _dropout == null || _dropout.unit_active(row) ) {
            _a.set(row, Float.NEGATIVE_INFINITY);
            for( int i = 0; i < _previous._a.size(); i++ )
              _a.set(row, Math.max(_a.get(row), _w.get(row, i) * _previous._a.get(i)));
            if (Float.isInfinite(-_a.get(row))) _a.set(row, 0); //catch the case where there is dropout (and/or input sparsity) -> no max found!
            _a.add(row, _b.get(row));
            max = Math.max(_a.get(row), max);
          }
        }
        if( max > 1 ) Utils.div(_a.raw(), max);
      }
      else {
        SparseVector x = (SparseVector)_previous._a;
        for( int row = 0; row < _a.size(); row++ ) {
          _a.set(row, 0);
          if( !training || _dropout == null || _dropout.unit_active(row) ) {
//            _a.set(row, Float.NEGATIVE_INFINITY);
//            for( int i = 0; i < _previous._a.size(); i++ )
//              _a.set(row, Math.max(_a.get(row), _w.get(row, i) * _previous._a.get(i)));
            float mymax = Float.NEGATIVE_INFINITY;
            int start = x.begin()._idx;
            int end = x.end()._idx;
            for (int it = start; it < end; ++it) {
              mymax = Math.max(mymax, _w.get(row, x._indices[it]) * x._values[it]);
            }
            _a.set(row, mymax);
            if (Float.isInfinite(-_a.get(row))) _a.set(row, 0); //catch the case where there is dropout (and/or input sparsity) -> no max found!
            _a.add(row, _b.get(row));
            max = Math.max(_a.get(row), max);
          }
        }
        if( max > 1f ) Utils.div(_a.raw(), max);
      }
      compute_sparsity();
    }
    @Override protected void bprop() {
      float m = momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      if (_w instanceof DenseRowMatrix) {
        final int rows = _a.size();
        for( int row = 0; row < rows; row++ ) {
          assert (!_minfo.get_params().autoencoder);
//          if (_minfo.get_params().autoencoder && _index == _minfo.get_params().hidden.length)
//            _e.set(row, autoEncoderError(row));
          float g = _e.get(row);
//                if( _a[o] < 0 )   Not sure if we should be using maxout with a hard zero bottom
//                    g = 0;
          bprop(row, g, r, m);
        }
      }
      else {
        bprop_sparse(r, m);
      }
    }
  }

  /**
   * Maxout neurons with dropout
   */
  public static class MaxoutDropout extends Maxout {
    public MaxoutDropout(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training) {
      if (training) {
        seed += params.seed + 0x51C8D00D;
        _dropout.fillBytes(seed);
        super.fprop(seed, true);
      }
      else {
        super.fprop(seed, false);
        Utils.mult(_a.raw(), (float)(1-params.hidden_dropout_ratios[_index]));
      }
    }
  }

  /**
   * Rectifier linear unit (ReLU) neurons
   */
  public static class Rectifier extends Neurons {
    public Rectifier(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training) {
      gemv((DenseVector)_a, _w, _previous._a, _b, _dropout != null ? _dropout.bits() : null);
      final int rows = _a.size();
      for( int row = 0; row < rows; row++ ) {
        _a.set(row, Math.max(_a.get(row), 0f));
        compute_sparsity();
      }
    }

    @Override protected void bprop() {
      float m = momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      final int rows = _a.size();
      if (_w instanceof DenseRowMatrix) {
        for (int row = 0; row < rows; row++) {
          //(d/dx)(max(0,x)) = 1 if x > 0, otherwise 0
          if (_minfo.get_params().autoencoder && _index == _minfo.get_params().hidden.length)
            _e.set(row, autoEncoderError(row));
          float g = _a.get(row) > 0f ? _e.get(row) : 0f;
          bprop(row, g, r, m);
        }
      }
      else {
        bprop_sparse(r, m);
      }
    }
  }

  /**
   * Rectifier linear unit (ReLU) neurons with dropout
   */
  public static class RectifierDropout extends Rectifier {
    public RectifierDropout(int units) { super(units); }
    @Override protected void fprop(long seed, boolean training) {
      if (training) {
        seed += params.seed + 0x3C71F1ED;
        _dropout.fillBytes(seed);
        super.fprop(seed, true);
      }
      else {
        super.fprop(seed, false);
        Utils.mult(_a.raw(), (float)(1-params.hidden_dropout_ratios[_index]));
      }
    }
  }

  /**
   * Abstract class for Output neurons
   */
  public static abstract class Output extends Neurons {
    Output(int units) { super(units); }
    protected void fprop(long seed, boolean training) { throw new UnsupportedOperationException(); }
    protected void bprop() { throw new UnsupportedOperationException(); }
  }

  /**
   * Output neurons for classification - Softmax
   */
  public static class Softmax extends Output {
    public Softmax(int units) { super(units); }
    protected void fprop() {
      gemv((DenseVector) _a, (DenseRowMatrix) _w, (DenseVector) _previous._a, _b, null);
      final float max = Utils.maxValue(_a.raw());
      float scale = 0f;
      final float rows = _a.size();
      for( int row = 0; row < rows; row++ ) {
        _a.set(row, (float)Math.exp(_a.get(row) - max));
        scale += _a.get(row);
      }
      for( int row = 0; row < rows; row++ ) {
        _a.raw()[row] /= scale;
        if (Float.isNaN(_a.get(row))) {
          _minfo.set_unstable();
          throw new RuntimeException("Numerical instability, predicted NaN.");
        }
      }
    }

    /**
     * Backpropagation for classification
     * Update every weight as follows: w += -rate * dE/dw
     * Compute dE/dw via chain rule: dE/dw = dE/dy * dy/dnet * dnet/dw, where net = sum(xi*wi)+b and y = activation function
     * @param target actual class label
     */
    protected void bprop(int target) {
      assert (target != missing_int_value); // no correction of weights/biases for missing label
      float m = momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      float g; //partial derivative dE/dy * dy/dnet
      final float rows = _a.size();
      for( int row = 0; row < rows; row++ ) {
        final float t = (row == target ? 1f : 0f);
        final float y = _a.get(row);
        //dy/dnet = derivative of softmax = (1-y)*y
        if (params.loss == Loss.CrossEntropy) {
          //nothing else needed, -dCE/dy * dy/dnet = target - y
          //cf. http://www.stanford.edu/group/pdplab/pdphandbook/handbookch6.html
          g = t - y;
        } else {
          assert(params.loss == Loss.MeanSquare);
          //-dMSE/dy = target-y
          g = (t - y) * (1f - y) * y;
        }
        // this call expects dE/dnet
        bprop(row, g, r, m);
      }
    }
  }

  /**
   * Output neurons for regression - Softmax
   */
  public static class Linear extends Output {
    public Linear(int units) { super(units); }
    protected void fprop() {
      gemv((DenseVector)_a, _w, _previous._a, _b, _dropout != null ? _dropout.bits() : null);
    }

    /**
     * Backpropagation for regression
     * @param target floating-point target value
     */
    protected void bprop(float target) {
      assert (target != missing_real_value);
      if (params.loss != Loss.MeanSquare) throw new UnsupportedOperationException("Regression is only implemented for MeanSquare error.");
      final int row = 0;
      // Computing partial derivative: dE/dnet = dE/dy * dy/dnet = dE/dy * 1
      final float g = target - _a.get(row); //for MSE -dMSE/dy = target-y
      float m = momentum();
      float r = _minfo.adaDelta() ? 0 : rate(_minfo.get_processed_total()) * (1f - m);
      bprop(row, g, r, m);
    }
  }

  /**
   * Mat-Vec Plus Add (with optional row dropout)
   * @param res = a*x+y (pre-allocated, will be overwritten)
   * @param a matrix of size rows x cols
   * @param x vector of length cols
   * @param y vector of length rows
   * @param row_bits if not null, check bits of this byte[] to determine whether a row is used or not
   */
  static void gemv_naive(final float[] res, final float[] a, final float[] x, final float[] y, byte[] row_bits) {
    final int cols = x.length;
    final int rows = y.length;
    assert(res.length == rows);
    for(int row = 0; row<rows; row++) {
      res[row] = 0;
      if( row_bits != null && (row_bits[row / 8] & (1 << (row % 8))) == 0) continue;
      for(int col = 0; col<cols; col++)
        res[row] += a[row*cols+col] * x[col];
      res[row] += y[row];
    }
  }

  /**
   * Optimized Mat-Vec Plus Add (with optional row dropout)
   * Optimization: Partial sums can be evaluated in parallel
   * @param res = a*x+y (pre-allocated, will be overwritten)
   * @param a matrix of size rows x cols
   * @param x vector of length cols
   * @param y vector of length rows
   * @param row_bits if not null, check bits of this byte[] to determine whether a row is used or not
   */
  static void gemv_row_optimized(final float[] res, final float[] a, final float[] x, final float[] y, final byte[] row_bits) {
    final int cols = x.length;
    final int rows = y.length;
    assert(res.length == rows);
    final int extra=cols-cols%8;
    final int multiple = (cols/8)*8-1;
    int idx = 0;
    for (int row = 0; row<rows; row++) {
      res[row] = 0;
      if( row_bits == null || (row_bits[row / 8] & (1 << (row % 8))) != 0) {
        float psum0 = 0, psum1 = 0, psum2 = 0, psum3 = 0, psum4 = 0, psum5 = 0, psum6 = 0, psum7 = 0;
        for (int col = 0; col < multiple; col += 8) {
          int off = idx + col;
          psum0 += a[off    ] * x[col    ];
          psum1 += a[off + 1] * x[col + 1];
          psum2 += a[off + 2] * x[col + 2];
          psum3 += a[off + 3] * x[col + 3];
          psum4 += a[off + 4] * x[col + 4];
          psum5 += a[off + 5] * x[col + 5];
          psum6 += a[off + 6] * x[col + 6];
          psum7 += a[off + 7] * x[col + 7];
        }
        res[row] += psum0 + psum1 + psum2 + psum3;
        res[row] += psum4 + psum5 + psum6 + psum7;
        for (int col = extra; col < cols; col++)
          res[row] += a[idx + col] * x[col];
        res[row] += y[row];
      }
      idx += cols;
    }
  }

  /**
   * Helper to do a generic gemv: res = a*x + y
   * @param res Dense result
   * @param a Matrix (sparse or dense)
   * @param x Vector (sparse or dense)
   * @param y Dense vector to add to result
   * @param row_bits Bit mask for which rows to use
   */
  static void gemv(final DenseVector res, final Matrix a, final Vector x, final DenseVector y, byte[] row_bits) {
    if (a instanceof DenseRowMatrix && x instanceof DenseVector)
      gemv(res, (DenseRowMatrix)a, (DenseVector)x, y, row_bits); //default
    else if (a instanceof DenseColMatrix && x instanceof SparseVector)
      gemv(res, (DenseColMatrix)a, (SparseVector)x, y, row_bits); //fast for really sparse
    else if (a instanceof DenseRowMatrix && x instanceof SparseVector)
      gemv(res, (DenseRowMatrix) a, (SparseVector) x, y, row_bits); //try
    else if (a instanceof DenseColMatrix && x instanceof DenseVector)
      gemv(res, (DenseColMatrix) a, (DenseVector) x, y, row_bits); //try
    else throw new UnsupportedOperationException("gemv for matrix " + a.getClass().getSimpleName() + " and vector + " + x.getClass().getSimpleName() + " not yet implemented.");
  }

  static void gemv(final DenseVector res, final DenseRowMatrix a, final DenseVector x, final DenseVector y, byte[] row_bits) {
    gemv_row_optimized(res.raw(), a.raw(), x.raw(), y.raw(), row_bits);
  }

  static void gemv_naive(final DenseVector res, final DenseRowMatrix a, final DenseVector x, final DenseVector y, byte[] row_bits) {
    gemv_naive(res.raw(), a.raw(), x.raw(), y.raw(), row_bits);
  }

  //TODO: make optimized version for col matrix
  static void gemv(final DenseVector res, final DenseColMatrix a, final DenseVector x, final DenseVector y, byte[] row_bits) {
    final int cols = x.size();
    final int rows = y.size();
    assert(res.size() == rows);
    for(int r = 0; r<rows; r++) {
      res.set(r, 0);
    }
    for(int c = 0; c<cols; c++) {
      final float val = x.get(c);
      for(int r = 0; r<rows; r++) {
        if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
        res.add(r, a.get(r,c) * val);
      }
    }
    for(int r = 0; r<rows; r++) {
      if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
      res.add(r, y.get(r));
    }
  }

  static void gemv(final DenseVector res, final DenseRowMatrix a, final SparseVector x, final DenseVector y, byte[] row_bits) {
    final int rows = y.size();
    assert(res.size() == rows);
    for(int r = 0; r<rows; r++) {
      res.set(r, 0);
      if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
      int start = x.begin()._idx;
      int end = x.end()._idx;
      for (int it = start; it < end; ++it) {
        res.add(r, a.get(r, x._indices[it]) * x._values[it]);
      }
      res.add(r, y.get(r));
    }
  }

  static void gemv(final DenseVector res, final DenseColMatrix a, final SparseVector x, final DenseVector y, byte[] row_bits) {
    final int rows = y.size();
    assert(res.size() == rows);
    for(int r = 0; r<rows; r++) {
      res.set(r, 0);
    }
    int start = x.begin()._idx;
    int end = x.end()._idx;
    for (int it = start; it < end; ++it) {
      final float val = x._values[it];
      if (val == 0f) continue;
      for(int r = 0; r<rows; r++) {
        if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
        res.add(r, a.get(r,x._indices[it]) * val);
      }
    }
    for(int r = 0; r<rows; r++) {
      if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
      res.add(r, y.get(r));
    }
  }

  static void gemv(final DenseVector res, final SparseRowMatrix a, final SparseVector x, final DenseVector y, byte[] row_bits) {
    final int rows = y.size();
    assert(res.size() == rows);
    for(int r = 0; r<rows; r++) {
      res.set(r, 0);
      if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
      // iterate over all non-empty columns for this row
      TreeMap<Integer, Float> row = a.row(r);
      Set<Map.Entry<Integer,Float>> set = row.entrySet();

      for (Map.Entry<Integer,Float> e : set) {
        final float val = x.get(e.getKey());
        if (val != 0f) res.add(r, e.getValue() * val); //TODO: iterate over both iterators and only add where there are matching indices
      }
      res.add(r, y.get(r));
    }
  }

  static void gemv(final DenseVector res, final SparseColMatrix a, final SparseVector x, final DenseVector y, byte[] row_bits) {
    final int rows = y.size();
    assert(res.size() == rows);
    for(int r = 0; r<rows; r++) {
      res.set(r, 0);
    }
    for(int c = 0; c<a.cols(); c++) {
      TreeMap<Integer, Float> col = a.col(c);
      final float val = x.get(c);
      if (val == 0f) continue;
      for (Map.Entry<Integer,Float> e : col.entrySet()) {
        final int r = e.getKey();
        if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
        // iterate over all non-empty columns for this row
        res.add(r, e.getValue() * val);
      }
    }
    for(int r = 0; r<rows; r++) {
      if( row_bits != null && (row_bits[r / 8] & (1 << (r % 8))) == 0) continue;
      res.add(r, y.get(r));
    }
  }

  /**
   * Abstract vector interface
   */
  public abstract interface Vector {
    public abstract float get(int i);
    public abstract void set(int i, float val);
    public abstract void add(int i, float val);
    public abstract int size();
    public abstract float[] raw();
  }

  /**
   * Dense vector implementation
   */
  public static class DenseVector extends Iced implements Vector {
    private float[] _data;
    DenseVector(int len) { _data = MemoryManager.malloc4f(len); }
    DenseVector(float[] v) { _data = v; }
    @Override public float get(int i) { return _data[i]; }
    @Override public void set(int i, float val) { _data[i] = val; }
    @Override public void add(int i, float val) { _data[i] += val; }
    @Override public int size() { return _data.length; }
    @Override public float[] raw() { return _data; }
  }

  /**
   * Sparse vector implementation
   */
  public static class SparseVector extends Iced implements Vector {
    private int[] _indices;
    private float[] _values;
    private int _size;
    private int _nnz;

    @Override public int size() { return _size; }
    public int nnz() { return _nnz; }

    SparseVector(float[] v) { this(new DenseVector(v)); }
    SparseVector(final DenseVector dv) {
      _size = dv.size();
      // first count non-zeros
      for (int i=0; i<dv._data.length; ++i) {
        if (dv.get(i) != 0.0f) {
          _nnz++;
        }
      }
      // only allocate what's needed
      _indices = MemoryManager.malloc4(_nnz);
      _values = MemoryManager.malloc4f(_nnz);
      // fill values
      int idx = 0;
      for (int i=0; i<dv._data.length; ++i) {
        if (dv.get(i) != 0.0f) {
          _indices[idx] = i;
          _values[idx] = dv.get(i);
          idx++;
        }
      }
      assert(idx == nnz());
    }

    /**
     * Slow path access to i-th element
     * @param i element index
     * @return real value
     */
    @Override public float get(int i) {
      final int idx = Arrays.binarySearch(_indices, i);
      return idx < 0 ? 0f : _values[idx];
    }

    @Override
    public void set(int i, float val) {
      throw new UnsupportedOperationException("setting values in a sparse vector is not implemented.");
    }

    @Override
    public void add(int i, float val) {
      throw new UnsupportedOperationException("adding values in a sparse vector is not implemented.");
    }

    @Override
    public float[] raw() {
      throw new UnsupportedOperationException("raw access to the data in a sparse vector is not implemented.");
    }

    /**
     * Iterator over a sparse vector
     */
    public class Iterator {
      int _idx; //which nnz
      Iterator(int id) { _idx = id; }
      Iterator next() {
        _idx++;
        return this;
      }
//      boolean hasNext() {
//        return _idx < _indices.length-1;
//      }
      boolean equals(Iterator other) {
        return _idx == other._idx;
      }
      @Override
      public String toString() {
        return index() + " -> " + value();
      }
      float value() { return _values[_idx]; }
      int index() { return _indices[_idx]; }
      void setValue(float val) { _values[_idx] = val; }
    }

    public Iterator begin() { return new Iterator(0); }
    public Iterator end() { return new Iterator(_indices.length); }
  }

  /**
   * Abstract matrix interface
   */
  public abstract interface Matrix {
    abstract float get(int row, int col);
    abstract void set(int row, int col, float val);
    abstract void add(int row, int col, float val);
    abstract int cols();
    abstract int rows();
    abstract long size();
    abstract float[] raw();
  }

  /**
   * Dense row matrix implementation
   */
  public final static class DenseRowMatrix extends Iced implements Matrix {
    private float[] _data;
    private int _cols;
    private int _rows;
    DenseRowMatrix(int rows, int cols) { this(MemoryManager.malloc4f(cols*rows), rows, cols); }
    DenseRowMatrix(float[] v, int rows, int cols) { _data = v; _rows = rows; _cols = cols; }
    @Override public float get(int row, int col) { assert(row<_rows && col<_cols); return _data[row*_cols + col]; }
    @Override public void set(int row, int col, float val) { assert(row<_rows && col<_cols); _data[row*_cols + col] = val; }
    @Override public void add(int row, int col, float val) { assert(row<_rows && col<_cols); _data[row*_cols + col] += val; }
    @Override public int cols() { return _cols; }
    @Override public int rows() { return _rows; }
    @Override public long size() { return (long)_rows*(long)_cols; }
    public float[] raw() { return _data; }
  }

  /**
   * Dense column matrix implementation
   */
  public final static class DenseColMatrix extends Iced implements Matrix {
    private float[] _data;
    private int _cols;
    private int _rows;
    DenseColMatrix(int rows, int cols) { this(MemoryManager.malloc4f(cols*rows), rows, cols); }
    DenseColMatrix(float[] v, int rows, int cols) { _data = v; _rows = rows; _cols = cols; }
    DenseColMatrix(DenseRowMatrix m, int rows, int cols) { this(rows, cols); for (int row=0;row<rows;++row) for (int col=0;col<cols;++col) set(row,col, m.get(row,col)); }
    @Override public float get(int row, int col) { assert(row<_rows && col<_cols); return _data[col*_rows + row]; }
    @Override public void set(int row, int col, float val) { assert(row<_rows && col<_cols); _data[col*_rows + row] = val; }
    @Override public void add(int row, int col, float val) { assert(row<_rows && col<_cols); _data[col*_rows + row] += val; }
    @Override public int cols() { return _cols; }
    @Override public int rows() { return _rows; }
    @Override public long size() { return (long)_rows*(long)_cols; }
    public float[] raw() { return _data; }
  }

  /**
   * Sparse row matrix implementation
   */
  public final static class SparseRowMatrix implements Matrix {
    private TreeMap<Integer, Float>[] _rows;
    private int _cols;
    SparseRowMatrix(int rows, int cols) { this(null, rows, cols); }
    SparseRowMatrix(Matrix v, int rows, int cols) {
      _rows = new TreeMap[rows];
      for (int row=0;row<rows;++row) _rows[row] = new TreeMap<Integer, Float>();
      _cols = cols;
      if (v!=null)
        for (int row=0;row<rows;++row)
          for (int col=0;col<cols;++col)
            if (v.get(row,col) != 0f)
              add(row,col, v.get(row,col));
    }
    @Override public float get(int row, int col) { Float v = _rows[row].get(col); if (v == null) return 0f; else return v; }
    @Override public void add(int row, int col, float val) { set(row,col,get(row,col)+val); }
    @Override public void set(int row, int col, float val) { _rows[row].put(col, val); }
    @Override public int cols() { return _cols; }
    @Override public int rows() { return _rows.length; }
    @Override public long size() { return (long)_rows.length*(long)_cols; }
    TreeMap<Integer, Float> row(int row) { return _rows[row]; }
    public float[] raw() { throw new UnsupportedOperationException("raw access to the data in a sparse matrix is not implemented."); }
  }

  /**
   * Sparse column matrix implementation
   */
  static final class SparseColMatrix implements Matrix {
    private TreeMap<Integer, Float>[] _cols;
    private int _rows;
    SparseColMatrix(int rows, int cols) { this(null, rows, cols); }
    SparseColMatrix(Matrix v, int rows, int cols) {
      _rows = rows;
      _cols = new TreeMap[cols];
      for (int col=0;col<cols;++col) _cols[col] = new TreeMap<Integer, Float>();
      if (v!=null)
        for (int row=0;row<rows;++row)
          for (int col=0;col<cols;++col)
            if (v.get(row,col) != 0f)
              add(row,col, v.get(row,col));
    }
    @Override public float get(int row, int col) { Float v = _cols[col].get(row); if (v == null) return 0f; else return v; }
    @Override public void add(int row, int col, float val) { set(row,col,get(row,col)+val); }
    @Override public void set(int row, int col, float val) { _cols[col].put(row, val); }
    @Override public int cols() { return _cols.length; }
    @Override public int rows() { return _rows; }
    @Override public long size() { return (long)_rows*(long)_cols.length; }
    TreeMap<Integer, Float> col(int col) { return _cols[col]; }
    public float[] raw() { throw new UnsupportedOperationException("raw access to the data in a sparse matrix is not implemented."); }
  }
}