package hex;
import water.*;
import water.api.DocGen;
import water.api.Request.API;
import water.fvec.Chunk;
import water.fvec.Vec;
import water.util.Utils;
import java.util.Random;
/**
* Neural network layer.
*
* @author cypof
*/
public abstract class Layer extends Iced {
static final int API_WEAVER = 1;
public static DocGen.FieldDoc[] DOC_FIELDS;
@API(help = "Number of neurons")
@ParamsSearch.Ignore
public int units;
public NeuralNet params;
// Layer state: activity, error
protected transient float[] _a, _e;
// Shared state: weights and biases (and their momenta)
protected transient float[] _w, _wm;
protected transient float[] _b, _bm;
// Previous and input layers
protected transient Layer _previous;
transient Input _input;
// Dropout (for input + hidden layers)
transient Dropout dropout;
/**
* Start of a refactoring that separates the specification from the running state, for layers and trainers.
*/
static abstract class Training {
abstract long processed();
}
transient Training _training;
/**
* We need a way to encode a missing value in the neural net forward/back-propagation scheme.
* For simplicity and performance, we use the largest representable values to encode a missing value.
* If one of these values shows up during regular neural net updates, we are very likely also
* running into overflow problems, which will trigger a NaN somewhere; that NaN is caught and
* leads to automatic job cancellation.
*/
public static final int missing_int_value = Integer.MAX_VALUE; //encode missing label or target
public static final float missing_float_value = Float.MAX_VALUE; //encode missing input
/**
* Helper class for dropout, only to be used from within a Layer
*/
public class Dropout {
private transient Random _rand;
private transient byte[] _bits;
@Override
public String toString() {
StringBuilder sb = new StringBuilder("Dropout: ").append(super.toString());
sb.append("\nRandom: ").append(_rand);
sb.append("\nbits: ");
for (int i = 0; i < _bits.length * 8; ++i) sb.append(unit_active(i) ? '1' : '0');
sb.append('\n');
return sb.toString();
}
Dropout(int units) {
_bits = new byte[(units+7)/8];
_rand = new Random(0);
}
// for input layer
public void randomlySparsifyActivation(float[] a, double rate, long seed) {
if (rate == 0) return;
setSeed(seed);
for( int i = 0; i < a.length; i++ )
if (_rand.nextFloat() < rate) a[i] = 0;
}
// for hidden layers
public void fillBytes(long seed) {
setSeed(seed);
_rand.nextBytes(_bits);
}
public boolean unit_active(int o) {
return (_bits[o / 8] & (1 << (o % 8))) != 0;
}
private void setSeed(long seed) {
if ((seed >>> 32) < 0x0000ffffL) seed |= 0x5b93000000000000L;
if (((seed << 32) >>> 32) < 0x0000ffffL) seed |= 0xdb910000L;
_rand.setSeed(seed);
}
}
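/*
* Sketch of how this helper is typically driven (illustration only, not executed here):
* for each training row, a hidden *Dropout layer re-seeds and redraws its mask, and fprop
* then only computes the surviving units:
*
*   dropout.fillBytes(seed);                 // fresh 0/1 mask for this row
*   for( int o = 0; o < units; o++ )
*     if( dropout.unit_active(o) ) { ... }   // compute activation o, else leave it at 0
*
* The input layer instead zeroes activations directly via randomlySparsifyActivation().
*/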
public final void init(Layer[] ls, int index, NeuralNet p) {
params = (NeuralNet)p.clone();
init(ls, index, true);
}
public void init(Layer[] ls, int index, boolean weights) {
params.rate *= Math.pow(params.rate_decay, index-1);
_a = new float[units];
if (!(this instanceof Output) && !(this instanceof Input)) {
_e = new float[units];
}
_previous = ls[index - 1];
_input = (Input) ls[0];
if (this instanceof MaxoutDropout || this instanceof TanhDropout || this instanceof RectifierDropout) {
dropout = new Dropout(units);
}
if( weights ) {
_w = new float[units * _previous.units];
_b = new float[units];
if( params.momentum_start != 0 || params.momentum_stable != 0 ) {
_wm = new float[_w.length];
_bm = new float[_b.length];
}
}
}
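// Layout note: _w is a row-major [units x _previous.units] matrix, so the weight from
// previous-layer unit i into unit o of this layer lives at _w[o * _previous.units + i]
// (the auto-encoder "Prime" layers read it transposed). _b holds one bias per unit.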
/**
* Helper to initialize the weights.
* Adaptive initialization uses prefactor * sqrt(6 / (units_input_layer + units_this_layer)),
* cf. Glorot & Bengio (2010): http://machinelearning.wustl.edu/mlpapers/paper_files/AISTATS2010_GlorotB10.pdf
*
* @param seed random generator seed to use
* @param prefactor prefactor for initialization (typical value: 1.0)
*/
void randomize(long seed, double prefactor) {
if (_w == null) return;
final Random rng = water.util.Utils.getDeterRNG(seed);
if (params.initial_weight_distribution == NeuralNet.InitialWeightDistribution.UniformAdaptive) {
final double range = prefactor * Math.sqrt(6. / (_previous.units + units));
for( int i = 0; i < _w.length; i++ )
_w[i] = (float)uniformDist(rng, -range, range);
}
else {
if (params.initial_weight_distribution == NeuralNet.InitialWeightDistribution.Uniform) {
for (int i = 0; i < _w.length; i++) {
_w[i] = (float)uniformDist(rng, -params.initial_weight_scale, params.initial_weight_scale);
}
} else if (params.initial_weight_distribution == NeuralNet.InitialWeightDistribution.Normal) {
for (int i = 0; i < _w.length; i++) {
_w[i] = (float) (rng.nextGaussian() * params.initial_weight_scale);
}
}
}
}
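// Example (illustrative numbers): with UniformAdaptive, a 200-unit layer fed by a 784-unit
// layer draws its weights uniformly from +/- prefactor * sqrt(6 / (784 + 200)), i.e. roughly
// +/- 0.078 for prefactor 1.0, following Glorot & Bengio (2010).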
// TODO: Add "subset randomize" function
// int count = Math.min(15, _previous.units);
// double min = -.1f, max = +.1f;
// //double min = -1f, max = +1f;
// for( int o = 0; o < units; o++ ) {
// for( int n = 0; n < count; n++ ) {
// int i = rand.nextInt(_previous.units);
// int w = o * _previous.units + i;
// _w[w] = uniformDist(rand, min, max);
// }
// }
public void close() {
}
protected abstract void fprop(long seed, boolean training);
protected abstract void bprop();
/**
* Apply gradient g to unit u with rate r and momentum m.
*/
final void bprop(int u, float g, float r, float m) {
// skip the update if the gradient is zero and nothing else (momentum, L1/L2) would move the weights
if (params.fast_mode || (_wm == null && params.l1 == 0.0 && params.l2 == 0.0)) {
if (g == 0f) return;
}
final float l1 = (float)params.l1;
final float l2 = (float)params.l2;
double r2 = 0;
final int off = u * _previous._a.length;
for( int i = 0; i < _previous._a.length; i++ ) {
int w = off + i;
if( _previous._e != null ) _previous._e[i] += g * _w[w];
if (params.fast_mode && _previous._a[i] == 0) continue;
float d = g * _previous._a[i] - Math.signum(_w[w]) * l1 - _w[w] * l2;
// TODO finish per-weight acceleration, doesn't help for now
// if( _wp != null && d != 0 ) {
// boolean sign = _wp[w] >= 0;
// double mult = Math.abs(_wp[w]);
// // If the gradient kept its sign, increase
// if( (d >= 0) == sign )
// mult += .05f;
// else {
// if( mult > 1 )
// mult *= .95f;
// else
// sign = !sign;
// }
// d *= mult;
// _wp[w] = sign ? mult : -mult;
// }
if( _wm != null ) {
_wm[w] *= m;
_wm[w] += d;
d = _wm[w];
}
_w[w] += r * d;
if (params.max_w2 != Double.POSITIVE_INFINITY) r2 += _w[w] * _w[w];
}
if( params.max_w2 != Double.POSITIVE_INFINITY && r2 > params.max_w2 ) { // C.f. Improving neural networks by preventing co-adaptation of feature detectors
final float scale = Utils.approxSqrt((float)(params.max_w2 / r2));
for( int i = 0; i < _previous._a.length; i++ ) _w[off + i] *= scale;
}
float d = g;
if( _bm != null ) {
_bm[u] *= m;
_bm[u] += d;
d = _bm[u];
}
_b[u] += r * d;
}
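// In equation form, the update applied above to each incoming weight w of unit u (sketch):
//   d = g * a_prev - sign(w) * l1 - w * l2      // gradient with L1/L2 shrinkage
//   wm = m * wm + d;  w += r * wm               // with momentum
//   w += r * d                                  // without momentum
// followed by rescaling all of unit u's weights if their squared sum exceeds max_w2,
// and the analogous update for the bias b[u].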
public float rate(long n) {
return (float)(params.rate / (1 + params.rate_annealing * n));
}
public float momentum(long n) {
double m = params.momentum_start;
if( params.momentum_ramp > 0 ) {
if( n >= params.momentum_ramp )
m = params.momentum_stable;
else
m += (params.momentum_stable - params.momentum_start) * n / params.momentum_ramp;
}
return (float)m;
}
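// Example (illustrative numbers): with rate 0.01 and rate_annealing 1e-6, the learning rate
// after 1e6 processed rows is 0.01 / (1 + 1e-6 * 1e6) = 0.005. With momentum_start 0.5,
// momentum_stable 0.99 and momentum_ramp 1e6, momentum rises linearly from 0.5 to 0.99 over
// the first 1e6 rows and stays at 0.99 afterwards.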
public static abstract class Input extends Layer {
@ParamsSearch.Ignore
protected long _pos, _len;
@Override public void init(Layer[] ls, int index, boolean weights) {
_a = new float[units];
dropout = new Dropout(units);
}
public void inputDropout(long seed) {
double rate = params.input_dropout_ratio;
seed += params.seed + 0x1337B4BE;
dropout.randomlySparsifyActivation(_a, rate, seed);
}
@Override protected void bprop() {
throw new UnsupportedOperationException();
}
public final long move() {
// advance the row cursor cyclically over [0, _len)
return _pos = _pos == _len - 1 ? 0 : _pos + 1;
}
}
public static class VecsInput extends Input {
static final int API_WEAVER = 1;
public static DocGen.FieldDoc[] DOC_FIELDS;
public Vec[] vecs;
@API(help = "Categorical classes identified on the training set")
int[] categoricals_lens;
@API(help = "Categorical minimums identified on the training set")
int[] categoricals_mins;
@API(help = "Normalisation stats used during training")
double[] subs, muls;
transient Chunk[] _chunks;
@Override public Layer clone() {
VecsInput o = (VecsInput) super.clone();
if( o._chunks != null )
o._chunks = new Chunk[o._chunks.length];
return o;
}
public VecsInput(Vec[] vecs, VecsInput train) {
Init(vecs, train);
}
public void Init(Vec[] vecs, VecsInput train) {
units = train != null ? train.subs.length : expand(vecs);
this.vecs = vecs;
_len = vecs[0].length();
if( train != null ) {
int a = train.categoricals_lens.length;
int b = vecs.length;
assert a == b;
categoricals_lens = train.categoricals_lens;
categoricals_mins = train.categoricals_mins;
assert train.subs.length == units;
subs = train.subs;
muls = train.muls;
} else {
categoricals_lens = new int[vecs.length];
categoricals_mins = new int[vecs.length];
for( int i = 0; i < vecs.length; i++ ) {
categoricals_lens[i] = categories(vecs[i]);
categoricals_mins[i] = (int) vecs[i].min();
}
subs = new double[units];
muls = new double[units];
stats(vecs);
}
}
static int categories(Vec vec) {
String[] dom = vec.domain();
return dom == null ? 1 : dom.length - 1;
}
static int expand(Vec[] vecs) {
int n = 0;
for (Vec vec : vecs) n += categories(vec);
return n;
}
private void stats(Vec[] vecs) {
Stats stats = new Stats();
stats._units = units;
stats._categoricals_lens = categoricals_lens;
stats._categoricals_mins = categoricals_mins;
stats.doAll(vecs);
for( int i = 0; i < units; i++ ) { // fill stats for all expanded (categorical) input columns
subs[i] = stats._means[i];
double sigma = Math.sqrt(stats._sigms[i] / (stats._rows - 1));
muls[i] = sigma > 1e-6 ? 1 / sigma : 1;
}
}
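// The stats computed above are applied in ChunksInput.set() to standardize each expanded
// input column:  x' = (x - subs[i]) * muls[i]  with subs[i] = mean and muls[i] = 1 / stddev
// (muls[i] = 1 for near-constant columns), so inputs are roughly zero-mean and unit-variance
// before entering the first hidden layer.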
@Override protected void fprop(long seed, boolean training) {
if( _chunks == null )
_chunks = new Chunk[vecs.length];
for( int i = 0; i < vecs.length; i++ ) {
Chunk c = _chunks[i];
if( c == null || c._vec != vecs[i] || _pos < c._start || _pos >= c._start + c._len )
_chunks[i] = vecs[i].chunkForRow(_pos);
}
ChunksInput.set(_chunks, _a, (int) (_pos - _chunks[0]._start), subs, muls, categoricals_lens, categoricals_mins);
if (training) inputDropout(seed);
}
}
/**
* Stats with expanded categoricals. Used to normalize the data in the input layer.
*/
static class Stats extends MRTask2<Stats> {
int _units;
int[] _categoricals_lens, _categoricals_mins;
double[] _means, _sigms;
long _rows;
transient double[] _subs, _muls;
@Override protected void setupLocal() {
_subs = new double[_units];
_muls = new double[_units];
for( int i = 0; i < _muls.length; i++ )
_muls[i] = 1;
}
@Override public void map(Chunk[] cs) {
_means = new double[_units];
_sigms = new double[_units];
float[] a = new float[_means.length];
for( int r = 0; r < cs[0]._len; r++ ) {
ChunksInput.set(cs, a, r, _subs, _muls, _categoricals_lens, _categoricals_mins);
for( int c = 0; c < a.length; c++ )
_means[c] += a[c];
}
for( int c = 0; c < a.length; c++ )
_means[c] /= cs[0]._len;
for( int r = 0; r < cs[0]._len; r++ ) {
ChunksInput.set(cs, a, r, _subs, _muls, _categoricals_lens, _categoricals_mins);
for( int c = 0; c < a.length; c++ )
_sigms[c] += (a[c] - _means[c]) * (a[c] - _means[c]);
}
_rows += cs[0]._len;
}
@Override public void reduce(Stats rs) {
reduce(_means, _sigms, _rows, rs._means, rs._sigms, rs._rows);
_rows += rs._rows;
}
static void reduce(double[] ma, double[] sa, long ra, double[] mb, double[] sb, long rb) {
for( int c = 0; c < ma.length; c++ ) {
double delta = ma[c] - mb[c];
ma[c] = (ma[c] * ra + mb[c] * rb) / (ra + rb);
sa[c] = sa[c] + sb[c] + delta * delta * ra * rb / (ra + rb);
}
}
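// The merge above is the standard parallel mean/variance combination (cf. Chan et al.):
// for partitions A and B with row counts ra, rb, means ma, mb and sums of squared
// deviations sa, sb:
//   mean = (ma * ra + mb * rb) / (ra + rb)
//   S    = sa + sb + (ma - mb)^2 * ra * rb / (ra + rb)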
@Override public boolean logVerbose() {
return !H2O.DEBUG;
}
}
/**
* A ChunksInput layer populates the activation values from FVec chunks.
* Missing values lead to a 0 activation value in the input layer, which is equivalent to
* setting them to the *average* column value before normalization. In effect, missing column values are ignored.
*/
static class ChunksInput extends Input {
transient Chunk[] _chunks;
double[] _subs, _muls;
int[] _categoricals_lens;
int[] _categoricals_mins;
public ChunksInput(Chunk[] chunks, VecsInput stats) {
units = stats.subs.length;
_chunks = chunks;
_subs = stats.subs;
_muls = stats.muls;
_categoricals_lens = stats.categoricals_lens;
_categoricals_mins = stats.categoricals_mins;
}
/**
* forward propagation means filling the activation values with all the row's column values
*/
@Override protected void fprop(long seed, boolean training) {
set(_chunks, _a, (int) _pos, _subs, _muls, _categoricals_lens, _categoricals_mins);
if (training) inputDropout(seed);
}
static void set(Chunk[] chunks, float[] a, int row, double[] subs, double[] muls, int[] catLens, int[] catMins) {
int n = 0;
// loop over all columns
for( int i = 0; i < catLens.length; i++ ) {
final boolean missing = chunks[i].isNA0(row);
double d = chunks[i].at0(row);
if( catLens[i] == 1 ) {
//numerical value: normalize
d -= subs[n];
d *= muls[n];
a[n++] = missing ? 0f : (float)d;
} else {
// categorical values: use precomputed stats
int cat = catLens[i];
for( int c = 0; c < cat; c++ )
a[n + c] = missing ? 0f : (float)-subs[n + c];
int c = (int) d - catMins[i] - 1;
if( c >= 0 )
a[n + c] = missing ? 0f : (float)((1 - subs[n + c]) * muls[n + c]);
n += cat;
}
}
assert n == a.length;
}
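// Worked example (illustrative): a categorical column with a 4-level domain {A,B,C,D}
// expands to catLens[i] == 3 indicator slots (domain size minus one). The minimum level A
// leaves all three slots at their baseline value, while B, C and D each switch exactly one
// slot to its "active" value; baseline and active values are shifted/scaled with the
// precomputed subs/muls so the expanded columns stay roughly centered.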
}
public static abstract class Output extends Layer {
static final int API_WEAVER = 1;
public static DocGen.FieldDoc[] DOC_FIELDS;
protected final long pos() {
return _input._pos;
}
}
/**
* The Softmax output layer is used for classification.
* Rows with missing values in the response column are ignored.
**/
public static abstract class Softmax extends Output {
protected abstract int target();
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 4.0f);
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
_a[o] += _b[o];
}
final float max = Utils.maxValue(_a);
float scale = 0;
for( int o = 0; o < _a.length; o++ ) {
_a[o] = (float)Math.exp(_a[o] - max);
scale += _a[o];
}
for( int o = 0; o < _a.length; o++ )
_a[o] /= scale;
}
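// The subtraction of the row maximum above is the numerically stable form of softmax:
//   a_o = exp(z_o - max_k z_k) / sum_j exp(z_j - max_k z_k)
// which is mathematically identical to exp(z_o) / sum_j exp(z_j) but avoids overflow.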
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
int label = target();
if (label == missing_int_value) return; //ignore missing response values
for( int u = 0; u < _a.length; u++ ) {
final float targetval = (u == label ? 1f : 0f);
float g = targetval - _a[u];
if (params.loss == NeuralNet.Loss.CrossEntropy) {
//nothing else needed
} else if (params.loss == NeuralNet.Loss.MeanSquare) {
g *= (1 - _a[u]) * _a[u];
}
bprop(u, g, r, m);
}
}
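// Gradient sketch: with a one-hot target t and softmax output a, the cross-entropy loss has
// dE/dz_u = a_u - t_u, so g = t_u - a_u above is already the complete (negative) gradient
// and needs no extra factor. For MeanSquare the code additionally multiplies by
// a_u * (1 - a_u), the diagonal term of the softmax Jacobian.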
}
public static class VecSoftmax extends Softmax {
public Vec vec;
private Vec _toClose;
VecSoftmax() {
}
public VecSoftmax(Vec vec, VecSoftmax stats) {
// Waiting for Michal's changes; for now the enum must start at 0
// if( vec.domain() == null ) {
// vec = vec.toEnum();
// _toClose = vec;
// }
this.units = stats != null ? stats.units : (int) (vec.max() + 1);
this.vec = vec;
params = stats != null ? (NeuralNet)stats.params.clone() : null;
}
@Override protected int target() {
if( vec.isNA(_input._pos) )
return missing_int_value;
return (int) vec.at8(_input._pos);
}
@Override public void close() {
super.close();
if( _toClose != null )
UKV.remove(_toClose._key);
}
}
static class ChunkSoftmax extends Softmax {
transient Chunk _chunk;
public ChunkSoftmax(Chunk chunk, VecSoftmax stats) {
units = stats.units;
_chunk = chunk;
params = (NeuralNet)stats.params.clone();
}
@Override protected int target() {
if( _chunk.isNA0((int) _input._pos) )
return missing_int_value;
return (int) _chunk.at80((int) _input._pos);
}
}
/**
* The Linear output layer is used for regression.
* Rows with missing values in the response column are ignored.
**/
public static abstract class Linear extends Output {
abstract float[] target();
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
_a[o] += _b[o];
}
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
float[] v = target();
assert(params.loss == NeuralNet.Loss.MeanSquare);
for( int u = 0; u < _a.length; u++ ) {
if (v[u] == missing_float_value) continue; //ignore missing regression targets
float g = v[u] - _a[u];
bprop(u, g, r, m);
}
}
}
public static class VecLinear extends Linear {
Vec _vec;
transient float[] _values;
public VecLinear(Vec vec, VecLinear stats) {
assert(stats == null || stats.units == 1);
units = 1; //regression
_vec = vec;
params = stats != null ? (NeuralNet)stats.params.clone() : null;
}
@Override float[] target() {
if( _values == null )
_values = new float[units];
long pos = _input._pos; //pos is a global index into the vector
_values[0] = _vec.isNA(pos) ? missing_float_value : (float)_vec.at(pos);
return _values;
}
}
static class ChunkLinear extends Linear {
transient Chunk _chunk;
transient float[] _values;
public ChunkLinear(Chunk chunk, VecLinear stats) {
assert(stats == null || stats.units == 1);
units = 1;
_chunk = chunk;
params = (NeuralNet) (stats != null ? stats.params.clone() : null);
}
@Override float[] target() {
if( _values == null )
_values = new float[units];
int pos = (int)_input._pos; //pos is a local index for this chunk
_values[0] = _chunk.isNA0(pos) ? missing_float_value : (float)_chunk.at0(pos);
return _values;
}
}
public static class Tanh extends Layer {
public Tanh(int units) { this.units = units; }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
if( !training || dropout == null || dropout.unit_active(o) ) {
for( int i = 0; i < _previous._a.length; i++ ) {
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
}
_a[o] += _b[o];
_a[o] = 1f - 2f / (1f + (float)Math.exp(2*_a[o])); //evals faster than tanh(x), but is slightly less numerically stable - OK
}
}
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
// Gradient is error * derivative of hyperbolic tangent: (1 - x^2)
float g = _e[u] * (1f - _a[u] * _a[u]);
bprop(u, g, r, m);
}
}
}
public static class TanhDropout extends Tanh {
public TanhDropout(int units) { super(units); }
@Override
protected void fprop(long seed, boolean training) {
if (training) {
seed += params.seed + 0xDA7A6000;
dropout.fillBytes(seed);
super.fprop(seed, true);
}
else {
super.fprop(seed, false);
Utils.div(_a, 2.f);
}
}
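// Note on the division by 2 above: hidden units are dropped with probability 1/2 during
// training (one random bit per unit), so test-time activations are halved to match the
// expected activation seen during training. Example (illustrative): a unit that outputs 0.8
// when kept contributes 0.5 * 0.8 = 0.4 on average while training, and 0.8 / 2 = 0.4 at
// test time.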
}
/**
* Apply tanh to the weights' transpose. Used for auto-encoders.
*/
public static class TanhPrime extends Tanh {
public TanhPrime(int units) {
super(units);
}
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
// Auto encoder has its own bias vector
_b = new float[units];
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[i * _a.length + o] * _previous._a[i];
_a[o] += _b[o];
_a[o] = (float)Math.tanh(_a[o]);
}
}
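// Note the transposed indexing _w[i * _a.length + o] (compared to o * _previous._a.length + i
// elsewhere): the reconstruction layer reads the weight matrix as its transpose, the usual
// tied-weights auto-encoder setup; only the bias vector is private to this layer (see init).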
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int o = 0; o < _a.length; o++ ) {
assert _previous._previous.units == units;
float e = _previous._previous._a[o] - _a[o];
float g = e; // * (1 - _a[o]) * _a[o]; // Square error
for( int i = 0; i < _previous._a.length; i++ ) {
int w = i * _a.length + o;
if( _previous._e != null )
_previous._e[i] += g * _w[w];
_w[w] += r * (g * _previous._a[i] - _w[w] * params.l2 - Math.signum(_w[w]) * params.l1);
}
_b[o] += r * g;
}
}
}
public static class Maxout extends Layer {
public Maxout(int units) { this.units = units; }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
for( int i = 0; i < _b.length; i++ )
_b[i] = index == 1 ? 0.5f : 1f;
}
}
@Override protected void fprop(long seed, boolean training) {
float max = 0;
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
if( !training || dropout == null || dropout.unit_active(o)) {
final int off = o * _previous._a.length;
_a[o] = Float.NEGATIVE_INFINITY;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] = Math.max(_a[o], _w[off+i] * _previous._a[i]);
_a[o] += _b[o];
max = Math.max(_a[o], max);
}
}
if( max > 1 ) Utils.div(_a, max);
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
float g = _e[u];
// if( _a[o] < 0 ) Not sure if we should be using maxout with a hard zero bottom
// g = 0;
bprop(u, g, r, m);
}
}
}
public static class MaxoutDropout extends Maxout {
public MaxoutDropout(int units) { super(units); }
@Override protected void fprop(long seed, boolean training) {
if (training) {
seed += params.seed + 0x51C8D00D;
dropout.fillBytes(seed);
super.fprop(seed, true);
}
else {
super.fprop(seed, false);
Utils.div(_a, 2.f);
}
}
}
public static class Rectifier extends Layer {
public Rectifier(int units) { this.units = units; }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
if( weights ) {
randomize(params.seed + 0xBAD5EED + index, 1.0f);
for( int i = 0; i < _b.length; i++ )
_b[i] = index == 1 ? 0.5f : 1f;
}
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
if( !training || dropout == null || dropout.unit_active(o) ) {
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[o * _previous._a.length + i] * _previous._a[i];
_a[o] += _b[o];
_a[o] = Math.max(_a[o], 0f);
}
}
}
@Override protected void bprop() {
long processed = _training.processed();
final float m = momentum(processed);
final float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
//(d/dx)(max(0,x)) = 1 if x > 0, otherwise 0
final float g = _a[u] > 0 ? _e[u] : 0; // * 1.0 (from derivative of rectifier)
bprop(u, g, r, m);
// otherwise g = _e[u] * 0.0 = 0 and we don't allow other contributions by (and to) weights and momenta
}
}
}
public static class RectifierDropout extends Rectifier {
public RectifierDropout(int units) { super(units); }
@Override protected void fprop(long seed, boolean training) {
if (training) {
seed += params.seed + 0x3C71F1ED;
dropout.fillBytes(seed);
super.fprop(seed, true);
}
else {
super.fprop(seed, false);
Utils.div(_a, 2.f);
}
}
}
public static class RectifierPrime extends Rectifier {
public RectifierPrime(int units) { super(units); }
@Override public void init(Layer[] ls, int index, boolean weights) {
super.init(ls, index, weights);
// Auto encoder has its own bias vector
_b = new float[units];
for( int i = 0; i < _b.length; i++ )
_b[i] = index == 1 ? 0.5f : 1f;
}
@Override protected void fprop(long seed, boolean training) {
for( int o = 0; o < _a.length; o++ ) {
_a[o] = 0;
for( int i = 0; i < _previous._a.length; i++ )
_a[o] += _w[i * _a.length + o] * _previous._a[i];
_a[o] += _b[o];
if( _a[o] < 0 )
_a[o] = 0;
}
}
@Override protected void bprop() {
long processed = _training.processed();
float m = momentum(processed);
float r = rate(processed) * (1 - m);
for( int u = 0; u < _a.length; u++ ) {
assert _previous._previous.units == units;
float e = _previous._previous._a[u] - _a[u];
float g = e; // * (1 - _a[u]) * _a[u];
//float g = e * (1 - _a[u]) * _a[u]; // Square error
double r2 = 0;
for( int i = 0; i < _previous._a.length; i++ ) {
int w = i * _a.length + u;
if( _previous._e != null ) _previous._e[i] += g * _w[w];
float d = g * _previous._a[i] - (float)(_w[w] * params.l2) - (float)(Math.signum(_w[w]) * params.l1);
_w[w] += r * d;
if (params.max_w2 != Double.POSITIVE_INFINITY) r2 += _w[w] * _w[w];
}
if( params.max_w2 != Double.POSITIVE_INFINITY && r2 > params.max_w2 ) { // C.f. Improving neural networks by preventing co-adaptation of feature detectors
final double scale = Math.sqrt(params.max_w2 / r2);
for( int i = 0; i < _previous._a.length; i++ ) _w[i * _a.length + u] *= scale;
}
_b[u] += r * g;
}
}
}
@Override public Layer clone() {
Layer l = (Layer) super.clone();
if (dropout != null) l.dropout = new Dropout(units);
return l;
}
public static void shareWeights(Layer src, Layer dst) {
dst._w = src._w;
if (dst._b == null || dst._b.length == src._b.length) dst._b = src._b;
dst._wm = src._wm;
if (dst._bm == null || dst._bm.length == src._bm.length) dst._bm = src._bm;
}
public static void shareWeights(Layer[] src, Layer[] dst) {
for( int y = 1; y < src.length; y++ )
shareWeights(src[y], dst[y]);
}
private static double uniformDist(Random rand, double min, double max) {
return min + rand.nextFloat() * (max - min);
}
@Override public AutoBuffer writeJSON(AutoBuffer bb) {
bb.put1('{');
bb.putJSONStr("type").put1(':').putJSONStr(getClass().getName());
bb.put1(',');
writeJSONFields(bb);
bb.put1('}');
return bb;
}
}