package org.deeplearning4j.gradientcheck;
import org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.ComputationGraphConfiguration;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.Updater;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.distribution.UniformDistribution;
import org.deeplearning4j.nn.conf.inputs.InputType;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.graph.ComputationGraph;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.nn.weights.WeightInit;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.util.DataTypeUtil;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.dataset.api.preprocessor.DataNormalization;
import org.nd4j.linalg.dataset.api.preprocessor.NormalizerMinMaxScaler;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.lossfunctions.LossFunctions;
import java.util.Random;
import static org.junit.Assert.assertTrue;
/**
 * Gradient checks for networks containing {@link BatchNormalization} layers: dense and
 * convolutional inputs, MultiLayerNetwork and ComputationGraph configurations, and both
 * learned and fixed (locked) gamma/beta parameters.
 */
public class BNGradientCheckTest {
private static final boolean PRINT_RESULTS = true;
private static final boolean RETURN_ON_FIRST_FAILURE = false;
private static final double DEFAULT_EPS = 1e-5;
private static final double DEFAULT_MAX_REL_ERROR = 1e-5;
private static final double DEFAULT_MIN_ABS_ERROR = 1e-9;
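//Gradient checks compare each analytic gradient against a central-difference numerical
//estimate: dL/dw ~= (score(w + eps) - score(w - eps)) / (2 * eps). This is only reliable
//in double precision, hence the data type override in the static initializer below.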
static {
DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE);
}
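/**
 * Gradient check for batch normalization on 2d input: dense (identity) -> batch norm ->
 * tanh activation -> softmax/MCXENT output, using the full Iris dataset scaled to [0,1].
 */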
@Test
public void testGradient2dSimple() {
DataNormalization scaler = new NormalizerMinMaxScaler();
DataSetIterator iter = new IrisDataSetIterator(150, 150);
scaler.fit(iter);
iter.setPreProcessor(scaler);
DataSet ds = iter.next();
INDArray input = ds.getFeatures();
INDArray labels = ds.getLabels();
MultiLayerConfiguration.Builder builder =
new NeuralNetConfiguration.Builder().learningRate(1.0).regularization(false)
.updater(Updater.NONE).seed(12345L).weightInit(WeightInit.DISTRIBUTION)
.dist(new NormalDistribution(0, 1)).list()
.layer(0, new DenseLayer.Builder().nIn(4).nOut(3)
.activation(Activation.IDENTITY).build())
.layer(1, new BatchNormalization.Builder().nOut(3).build())
.layer(2, new ActivationLayer.Builder().activation(Activation.TANH).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nIn(3).nOut(3).build())
.pretrain(false).backprop(true);
MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
mln.init();
if (PRINT_RESULTS) {
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
}
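/**
 * Gradient check for batch normalization on 4d (convolutional) input: convolution ->
 * batch norm -> tanh activation -> softmax/MCXENT output, with random input and one-hot labels.
 */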
@Test
public void testGradientCnnSimple() {
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int depth = 1;
int hw = 4;
int nOut = 4;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().learningRate(1.0)
.regularization(false).updater(Updater.NONE).seed(12345L).weightInit(WeightInit.DISTRIBUTION)
.dist(new NormalDistribution(0, 2)).list()
.layer(0, new ConvolutionLayer.Builder().kernelSize(2, 2).stride(1, 1).nIn(depth).nOut(2)
.activation(Activation.IDENTITY).build())
.layer(1, new BatchNormalization.Builder().build())
.layer(2, new ActivationLayer.Builder().activation(Activation.TANH).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutional(hw, hw, depth)).pretrain(false).backprop(true);
MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
mln.init();
if (PRINT_RESULTS) {
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
}
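/**
 * Parameterized gradient check for batch normalization in a CNN with subsampling, with BN
 * layers placed both directly after the convolution and after the max-pooling layer.
 */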
@Test
public void testGradientBNWithCNNandSubsampling() {
//Parameterized test, testing combinations of:
// (a) activation function
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
// (d) l1 and l2 values
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
double[] l2vals = {0.0, 0.1, 0.1};
double[] l1vals = {0.0, 0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int depth = 2;
int hw = 5;
int nOut = 3;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
DataSet ds = new DataSet(input, labels);
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int j = 0; j < l2vals.length; j++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().seed(12345)
.regularization(l1vals[j] > 0 || l2vals[j] > 0).l1(l1vals[j]).l2(l2vals[j])
.optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT)
.updater(Updater.NONE).weightInit(WeightInit.DISTRIBUTION)
.dist(new UniformDistribution(-2, 2)).list()
.layer(0, new ConvolutionLayer.Builder(2, 2).stride(1, 1).nOut(3)
.activation(afn).build())
.layer(1, new BatchNormalization.Builder().build())
.layer(2, new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX)
.kernelSize(2, 2).stride(1, 1).build())
.layer(3, new BatchNormalization())
.layer(4, new ActivationLayer.Builder().activation(afn).build())
.layer(5, new OutputLayer.Builder(lf).activation(outputActivation).nOut(nOut)
.build())
.setInputType(InputType.convolutional(hw, hw, depth)).pretrain(false)
.backprop(true);
MultiLayerConfiguration conf = builder.build();
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
String name = new Object() {}.getClass().getEnclosingMethod().getName();
if (doLearningFirst) {
//Run a number of iterations of learning
mln.setInput(ds.getFeatures());
mln.setLabels(ds.getLabels());
mln.computeGradientAndScore();
double scoreBefore = mln.score();
for (int k = 0; k < 5; k++)
mln.fit(ds);
mln.computeGradientAndScore();
double scoreAfter = mln.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = name
+ " - score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst=" + doLearningFirst + " (scoreBefore=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.9 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(name + " - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]);
for (int k = 0; k < mln.getnLayers(); k++)
System.out.println("Layer " + k + " # params: " + mln.getLayer(k).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
}
}
}
}
}
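/**
 * Parameterized gradient check for batch normalization between dense layers, covering the
 * same activation/loss/regularization combinations as the CNN test above.
 */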
@Test
public void testGradientDense() {
//Parameterized test, testing combinations of:
// (a) activation function
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
// (d) l1 and l2 values
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
double[] l2vals = {0.0, 0.1, 0.1};
double[] l1vals = {0.0, 0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int nIn = 5;
int nOut = 3;
INDArray input = Nd4j.rand(new int[] {minibatch, nIn});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
DataSet ds = new DataSet(input, labels);
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int j = 0; j < l2vals.length; j++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
MultiLayerConfiguration.Builder builder =
new NeuralNetConfiguration.Builder()
.regularization(l1vals[j] > 0 || l2vals[j] > 0).l1(l1vals[j])
.l2(l2vals[j])
.optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT)
.updater(Updater.NONE).weightInit(WeightInit.DISTRIBUTION)
.dist(new UniformDistribution(-2, 2)).seed(12345L).list()
.layer(0, new DenseLayer.Builder().nIn(nIn).nOut(4)
.activation(afn).build())
.layer(1, new BatchNormalization.Builder().build())
.layer(2, new DenseLayer.Builder().nIn(4).nOut(4).build())
.layer(3, new BatchNormalization())
.layer(4, new OutputLayer.Builder(lf)
.activation(outputActivation).nOut(nOut)
.build())
.pretrain(false).backprop(true);
MultiLayerConfiguration conf = builder.build();
MultiLayerNetwork mln = new MultiLayerNetwork(conf);
mln.init();
String name = new Object() {}.getClass().getEnclosingMethod().getName();
if (doLearningFirst) {
//Run a number of iterations of learning
mln.setInput(ds.getFeatures());
mln.setLabels(ds.getLabels());
mln.computeGradientAndScore();
double scoreBefore = mln.score();
for (int k = 0; k < 10; k++)
mln.fit(ds);
mln.computeGradientAndScore();
double scoreAfter = mln.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = name
+ " - score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst=" + doLearningFirst + " (scoreBefore=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(name + " - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]);
for (int k = 0; k < mln.getnLayers(); k++)
System.out.println("Layer " + k + " # params: " + mln.getLayer(k).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
}
}
}
}
}
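/**
 * Gradient check for 2d batch normalization with lockGammaBeta(true): gamma and beta are
 * fixed at 2.0 and 0.5 rather than learned.
 */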
@Test
public void testGradient2dFixedGammaBeta() {
DataNormalization scaler = new NormalizerMinMaxScaler();
DataSetIterator iter = new IrisDataSetIterator(150, 150);
scaler.fit(iter);
iter.setPreProcessor(scaler);
DataSet ds = iter.next();
INDArray input = ds.getFeatures();
INDArray labels = ds.getLabels();
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().learningRate(1.0)
.regularization(false).updater(Updater.NONE).seed(12345L).weightInit(WeightInit.DISTRIBUTION)
.dist(new NormalDistribution(0, 1)).list()
.layer(0, new DenseLayer.Builder().nIn(4).nOut(3).activation(Activation.IDENTITY).build())
.layer(1, new BatchNormalization.Builder().lockGammaBeta(true).gamma(2.0).beta(0.5).nOut(3)
.build())
.layer(2, new ActivationLayer.Builder().activation(Activation.TANH).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nIn(3).nOut(3).build())
.pretrain(false).backprop(true);
MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
mln.init();
if (PRINT_RESULTS) {
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
}
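/**
 * As above, but with fixed (locked) gamma/beta batch normalization on 4d convolutional
 * input rather than 2d dense input.
 */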
@Test
public void testGradientCnnFixedGammaBeta() {
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int depth = 1;
int hw = 4;
int nOut = 4;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
MultiLayerConfiguration.Builder builder = new NeuralNetConfiguration.Builder().learningRate(1.0)
.regularization(false).updater(Updater.NONE).seed(12345L).weightInit(WeightInit.DISTRIBUTION)
.dist(new NormalDistribution(0, 2)).list()
.layer(0, new ConvolutionLayer.Builder().kernelSize(2, 2).stride(1, 1).nIn(depth).nOut(2)
.activation(Activation.IDENTITY).build())
.layer(1, new BatchNormalization.Builder().lockGammaBeta(true).gamma(2.0).beta(0.5).build())
.layer(2, new ActivationLayer.Builder().activation(Activation.TANH).build())
.layer(3, new OutputLayer.Builder(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(nOut).build())
.setInputType(InputType.convolutional(hw, hw, depth)).pretrain(false).backprop(true);
MultiLayerNetwork mln = new MultiLayerNetwork(builder.build());
mln.init();
if (PRINT_RESULTS) {
for (int j = 0; j < mln.getnLayers(); j++)
System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);
assertTrue(gradOK);
}
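/**
 * Minimal ComputationGraph gradient check: a single batch normalization layer on
 * convolutional input, feeding directly into a softmax/MCXENT output layer.
 */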
@Test
public void testBatchNormCompGraphSimple() {
int numClasses = 2;
int height = 3;
int width = 3;
int channels = 1;
long seed = 123;
int minibatchSize = 3;
ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(seed).updater(Updater.NONE)
.weightInit(WeightInit.XAVIER).regularization(false).graphBuilder().addInputs("in")
.setInputTypes(InputType.convolutional(height, width, channels))
.addLayer("bn", new BatchNormalization.Builder().build(), "in")
.addLayer("out", new OutputLayer.Builder().lossFunction(LossFunctions.LossFunction.MCXENT)
.activation(Activation.SOFTMAX).nOut(numClasses).build(), "bn")
.setOutputs("out").backprop(true).pretrain(false).build();
ComputationGraph net = new ComputationGraph(conf);
net.init();
Random r = new Random(12345);
INDArray input = Nd4j.rand(new int[] {minibatchSize, channels, height, width}); //Order: examples, channels, height, width
INDArray labels = Nd4j.zeros(minibatchSize, numClasses);
for (int i = 0; i < minibatchSize; i++) {
labels.putScalar(new int[] {i, r.nextInt(numClasses)}, 1.0);
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, new INDArray[] {input},
new INDArray[] {labels});
assertTrue(gradOK);
}
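/**
 * ComputationGraph version of testGradientBNWithCNNandSubsampling, running the same
 * parameterized combinations through the graph builder API.
 */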
@Test
public void testGradientBNWithCNNandSubsamplingCompGraph() {
//Parameterized test, testing combinations of:
// (a) activation function
// (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
// (c) Loss function (with specified output activations)
// (d) l1 and l2 values
Activation[] activFns = {Activation.SIGMOID, Activation.TANH, Activation.IDENTITY};
boolean[] characteristic = {false, true}; //If true: run some backprop steps first
LossFunctions.LossFunction[] lossFunctions =
{LossFunctions.LossFunction.NEGATIVELOGLIKELIHOOD, LossFunctions.LossFunction.MSE};
Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here
double[] l2vals = {0.0, 0.1, 0.1};
double[] l1vals = {0.0, 0.0, 0.2}; //i.e., use l2vals[j] with l1vals[j]
Nd4j.getRandom().setSeed(12345);
int minibatch = 10;
int depth = 2;
int hw = 5;
int nOut = 3;
INDArray input = Nd4j.rand(new int[] {minibatch, depth, hw, hw});
INDArray labels = Nd4j.zeros(minibatch, nOut);
Random r = new Random(12345);
for (int i = 0; i < minibatch; i++) {
labels.putScalar(i, r.nextInt(nOut), 1.0);
}
DataSet ds = new DataSet(input, labels);
for (Activation afn : activFns) {
for (boolean doLearningFirst : characteristic) {
for (int i = 0; i < lossFunctions.length; i++) {
for (int j = 0; j < l2vals.length; j++) {
LossFunctions.LossFunction lf = lossFunctions[i];
Activation outputActivation = outputActivations[i];
ComputationGraphConfiguration conf = new NeuralNetConfiguration.Builder().seed(12345)
.regularization(l1vals[j] > 0 || l2vals[j] > 0).l1(l1vals[j]).l2(l2vals[j])
.optimizationAlgo(OptimizationAlgorithm.LINE_GRADIENT_DESCENT)
.updater(Updater.NONE).weightInit(WeightInit.DISTRIBUTION)
.dist(new UniformDistribution(-2, 2)).graphBuilder()
.addInputs("in")
.addLayer("0", new ConvolutionLayer.Builder(2, 2).stride(1, 1).nOut(3)
.activation(afn).build(), "in")
.addLayer("1", new BatchNormalization.Builder().build(), "0")
.addLayer("2", new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX)
.kernelSize(2, 2).stride(1, 1).build(), "1")
.addLayer("3", new BatchNormalization(), "2")
.addLayer("4", new ActivationLayer.Builder().activation(afn).build(), "3")
.addLayer("5", new OutputLayer.Builder(lf).activation(outputActivation)
.nOut(nOut).build(), "4")
.setOutputs("5").setInputTypes(InputType.convolutional(hw, hw, depth))
.pretrain(false).backprop(true).build();
ComputationGraph net = new ComputationGraph(conf);
net.init();
String name = new Object() {}.getClass().getEnclosingMethod().getName();
if (doLearningFirst) {
//Run a number of iterations of learning
net.setInput(0, ds.getFeatures());
net.setLabels(ds.getLabels());
net.computeGradientAndScore();
double scoreBefore = net.score();
for (int k = 0; k < 5; k++)
net.fit(ds);
net.computeGradientAndScore();
double scoreAfter = net.score();
//Can't test in 'characteristic mode of operation' if not learning
String msg = name
+ " - score did not (sufficiently) decrease during learning - activationFn="
+ afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
+ ", doLearningFirst=" + doLearningFirst + " (scoreBefore=" + scoreBefore
+ ", scoreAfter=" + scoreAfter + ")";
assertTrue(msg, scoreAfter < 0.9 * scoreBefore);
}
if (PRINT_RESULTS) {
System.out.println(name + " - activationFn=" + afn + ", lossFn=" + lf
+ ", outputActivation=" + outputActivation + ", doLearningFirst="
+ doLearningFirst + ", l1=" + l1vals[j] + ", l2=" + l2vals[j]);
for (int k = 0; k < net.getNumLayers(); k++)
System.out.println("Layer " + k + " # params: " + net.getLayer(k).numParams());
}
boolean gradOK = GradientCheckUtil.checkGradients(net, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE,
new INDArray[] {input}, new INDArray[] {labels});
assertTrue(gradOK);
}
}
}
}
}
}