package org.deeplearning4j.gradientcheck;

import org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator;
import org.deeplearning4j.nn.api.OptimizationAlgorithm;
import org.deeplearning4j.nn.conf.MultiLayerConfiguration;
import org.deeplearning4j.nn.conf.NeuralNetConfiguration;
import org.deeplearning4j.nn.conf.Updater;
import org.deeplearning4j.nn.conf.distribution.NormalDistribution;
import org.deeplearning4j.nn.conf.layers.*;
import org.deeplearning4j.nn.multilayer.MultiLayerNetwork;
import org.deeplearning4j.nn.weights.WeightInit;
import org.junit.Test;
import org.nd4j.linalg.activations.Activation;
import org.nd4j.linalg.api.buffer.DataBuffer;
import org.nd4j.linalg.api.buffer.util.DataTypeUtil;
import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.dataset.DataSet;
import org.nd4j.linalg.dataset.api.iterator.DataSetIterator;
import org.nd4j.linalg.dataset.api.preprocessor.DataNormalization;
import org.nd4j.linalg.dataset.api.preprocessor.NormalizerMinMaxScaler;
import org.nd4j.linalg.dataset.api.preprocessor.NormalizerStandardize;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction;

import java.util.Random;

import static org.junit.Assert.assertTrue;

/**
 * @author Alex Black 14 Aug 2015
 */
public class GradientCheckTests {

    private static final boolean PRINT_RESULTS = true;
    private static final boolean RETURN_ON_FIRST_FAILURE = false;
    private static final double DEFAULT_EPS = 1e-6;
    private static final double DEFAULT_MAX_REL_ERROR = 1e-3;
    private static final double DEFAULT_MIN_ABS_ERROR = 1e-8;

    static {
        //Force double precision: numerical gradients with eps=1e-6 are unreliable in float32
        Nd4j.zeros(1);
        DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE);
    }

    @Test
    public void testGradientMLP2LayerIrisSimple() {
        //Parameterized test, testing combinations of:
        // (a) activation function
        // (b) Whether to test at random initialization, or after some learning (i.e., 'characteristic mode of operation')
        // (c) Loss function (with specified output activations)
        String[] activFns = {"sigmoid", "tanh", "softplus"}; //activation functions such as relu and hardtanh may randomly fail due to discontinuities
        boolean[] characteristic = {false, true}; //If true: run some backprop steps first

        LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE};
        String[] outputActivations = {"softmax", "tanh"}; //i.e., lossFunctions[i] used with outputActivations[i] here

        DataNormalization scaler = new NormalizerMinMaxScaler();
        DataSetIterator iter = new IrisDataSetIterator(150, 150);
        scaler.fit(iter);
        iter.setPreProcessor(scaler);
        DataSet ds = iter.next();

        INDArray input = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();

        for (String afn : activFns) {
            for (boolean doLearningFirst : characteristic) {
                for (int i = 0; i < lossFunctions.length; i++) {
                    LossFunction lf = lossFunctions[i];
                    String outputActivation = outputActivations[i];

                    MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                                    .regularization(false)
                                    .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT)
                                    .learningRate(1.0).seed(12345L)
                                    .list()
                                    .layer(0, new DenseLayer.Builder().nIn(4).nOut(3)
                                                    .weightInit(WeightInit.DISTRIBUTION)
                                                    .dist(new NormalDistribution(0, 1))
                                                    .activation(afn).updater(Updater.SGD).build())
                                    .layer(1, new OutputLayer.Builder(lf).activation(outputActivation).nIn(3).nOut(3)
                                                    .weightInit(WeightInit.DISTRIBUTION)
                                                    .dist(new NormalDistribution(0, 1))
                                                    .updater(Updater.SGD).build())
                                    .pretrain(false).backprop(true).build();

                    MultiLayerNetwork mln = new MultiLayerNetwork(conf);
                    mln.init();

                    if (doLearningFirst) {
                        //Run a number of iterations of learning
                        mln.setInput(ds.getFeatures());
                        mln.setLabels(ds.getLabels());
                        mln.computeGradientAndScore();
                        double scoreBefore = mln.score();
                        for (int j = 0; j < 10; j++)
                            mln.fit(ds);
                        mln.computeGradientAndScore();
                        double scoreAfter = mln.score();
                        //Can't test in 'characteristic mode of operation' if not learning
                        String msg = "testGradientMLP2LayerIrisSimple() - score did not (sufficiently) decrease during learning - activationFn="
                                        + afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
                                        + ", doLearningFirst=" + doLearningFirst + " (before=" + scoreBefore
                                        + ", scoreAfter=" + scoreAfter + ")";
                        assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
                    }

                    if (PRINT_RESULTS) {
                        System.out.println("testGradientMLP2LayerIrisSimple() - activationFn=" + afn + ", lossFn="
                                        + lf + ", outputActivation=" + outputActivation + ", doLearningFirst="
                                        + doLearningFirst);
                        for (int j = 0; j < mln.getnLayers(); j++)
                            System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
                    }

                    boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                                    DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

                    String msg = "testGradientMLP2LayerIrisSimple() - activationFn=" + afn + ", lossFn=" + lf
                                    + ", outputActivation=" + outputActivation + ", doLearningFirst=" + doLearningFirst;
                    assertTrue(msg, gradOK);
                }
            }
        }
    }
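    /**
     * Background, as an illustrative sketch only: GradientCheckUtil estimates each partial
     * derivative numerically with a central difference,
     *   dL/dw ≈ (score(w + eps) - score(w - eps)) / (2 * eps),
     * using DEFAULT_EPS above, and compares that estimate against the analytic gradient from
     * backprop. The helper below is a hypothetical, self-contained illustration of that
     * estimate for a single scalar parameter; it is NOT the GradientCheckUtil implementation
     * and is not called by any test in this class.
     */
    private static double centralDifferenceSketch(java.util.function.DoubleUnaryOperator score, double w,
                    double eps) {
        //Symmetric difference quotient: O(eps^2) truncation error, vs O(eps) for a one-sided difference
        return (score.applyAsDouble(w + eps) - score.applyAsDouble(w - eps)) / (2.0 * eps);
    }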
    @Test
    public void testGradientMLP2LayerIrisL1L2Simple() {
        //As above (testGradientMLP2LayerIrisSimple()) but with L2, L1, and both L2/L1 applied
        //Need to run gradient through updater, so that L2 can be applied

        String[] activFns = {"sigmoid", "tanh"};
        boolean[] characteristic = {false, true}; //If true: run some backprop steps first

        LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE};
        String[] outputActivations = {"softmax", "tanh"}; //i.e., lossFunctions[i] used with outputActivations[i] here

        DataNormalization scaler = new NormalizerMinMaxScaler();
        DataSetIterator iter = new IrisDataSetIterator(150, 150);
        scaler.fit(iter);
        iter.setPreProcessor(scaler);
        DataSet ds = iter.next();

        INDArray input = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();

        //use l2vals[i] with l1vals[i]
        double[] l2vals = {0.4, 0.0, 0.4, 0.4};
        double[] l1vals = {0.0, 0.0, 0.5, 0.0};
        double[] biasL2 = {0.0, 0.0, 0.0, 0.2};
        double[] biasL1 = {0.0, 0.0, 0.6, 0.0};

        for (String afn : activFns) {
            for (boolean doLearningFirst : characteristic) {
                for (int i = 0; i < lossFunctions.length; i++) {
                    for (int k = 0; k < l2vals.length; k++) {
                        LossFunction lf = lossFunctions[i];
                        String outputActivation = outputActivations[i];
                        double l2 = l2vals[k];
                        double l1 = l1vals[k];

                        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                                        .regularization(true).l2(l2).l1(l1)
                                        .l2Bias(biasL2[k]).l1Bias(biasL1[k])
                                        .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT)
                                        .seed(12345L)
                                        .list()
                                        .layer(0, new DenseLayer.Builder().nIn(4).nOut(3)
                                                        .weightInit(WeightInit.DISTRIBUTION)
                                                        .dist(new NormalDistribution(0, 1))
                                                        .updater(Updater.NONE).activation(afn).build())
                                        .layer(1, new OutputLayer.Builder(lf).nIn(3).nOut(3)
                                                        .weightInit(WeightInit.DISTRIBUTION)
                                                        .dist(new NormalDistribution(0, 1))
                                                        .updater(Updater.NONE).activation(outputActivation).build())
                                        .pretrain(false).backprop(true).build();

                        MultiLayerNetwork mln = new MultiLayerNetwork(conf);
                        mln.init();

                        if (doLearningFirst) {
                            //Run a number of iterations of learning
                            mln.setInput(ds.getFeatures());
                            mln.setLabels(ds.getLabels());
                            mln.computeGradientAndScore();
                            double scoreBefore = mln.score();
                            for (int j = 0; j < 10; j++)
                                mln.fit(ds);
                            mln.computeGradientAndScore();
                            double scoreAfter = mln.score();
                            //Can't test in 'characteristic mode of operation' if not learning
                            String msg = "testGradientMLP2LayerIrisL1L2Simple() - score did not (sufficiently) decrease during learning - activationFn="
                                            + afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
                                            + ", doLearningFirst=" + doLearningFirst + ", l2=" + l2 + ", l1=" + l1
                                            + " (before=" + scoreBefore + ", scoreAfter=" + scoreAfter + ")";
                            assertTrue(msg, scoreAfter < 0.8 * scoreBefore);
                        }

                        if (PRINT_RESULTS) {
                            System.out.println("testGradientMLP2LayerIrisL1L2Simple() - activationFn=" + afn
                                            + ", lossFn=" + lf + ", outputActivation=" + outputActivation
                                            + ", doLearningFirst=" + doLearningFirst + ", l2=" + l2 + ", l1=" + l1);
                            for (int j = 0; j < mln.getnLayers(); j++)
                                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
                        }

                        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                                        DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

                        String msg = "testGradientMLP2LayerIrisL1L2Simple() - activationFn=" + afn + ", lossFn=" + lf
                                        + ", outputActivation=" + outputActivation + ", doLearningFirst="
                                        + doLearningFirst + ", l2=" + l2 + ", l1=" + l1;
                        assertTrue(msg, gradOK);
                    }
                }
            }
        }
    }
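    /**
     * Background, as an illustrative sketch only: with regularization enabled, the penalty
     * terms must appear identically in both the score and the analytic gradient for the check
     * above to pass. For a single weight w, the commonly used penalties contribute roughly
     * l2 * w (from an L2 term) and l1 * sign(w) (a subgradient of the L1 term) to the
     * gradient. This hypothetical helper is not part of DL4J and exists only to make that
     * relationship concrete; the exact scaling conventions inside DL4J may differ.
     */
    private static double regularizationGradientSketch(double w, double l2, double l1) {
        //d/dw [0.5 * l2 * w^2] = l2 * w;  d/dw [l1 * |w|] = l1 * sign(w) for w != 0
        return l2 * w + l1 * Math.signum(w);
    }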
    @Test
    public void testEmbeddingLayerSimple() {
        Random r = new Random(12345);
        int nExamples = 5;
        INDArray input = Nd4j.zeros(nExamples, 1);
        INDArray labels = Nd4j.zeros(nExamples, 3);
        for (int i = 0; i < nExamples; i++) {
            input.putScalar(i, r.nextInt(4));
            labels.putScalar(new int[] {i, r.nextInt(3)}, 1.0);
        }

        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                        .regularization(true).l2(0.2).l1(0.1)
                        .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT).seed(12345L)
                        .list()
                        .layer(0, new EmbeddingLayer.Builder().nIn(4).nOut(3)
                                        .weightInit(WeightInit.XAVIER).dist(new NormalDistribution(0, 1))
                                        .updater(Updater.NONE).activation(Activation.TANH).build())
                        .layer(1, new OutputLayer.Builder(LossFunction.MCXENT).nIn(3).nOut(3)
                                        .weightInit(WeightInit.XAVIER).dist(new NormalDistribution(0, 1))
                                        .updater(Updater.NONE).activation(Activation.SOFTMAX).build())
                        .pretrain(false).backprop(true).build();

        MultiLayerNetwork mln = new MultiLayerNetwork(conf);
        mln.init();

        if (PRINT_RESULTS) {
            System.out.println("testEmbeddingLayerSimple");
            for (int j = 0; j < mln.getnLayers(); j++)
                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
        }

        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                        DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

        String msg = "testEmbeddingLayerSimple";
        assertTrue(msg, gradOK);
    }
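    /**
     * Background, as an illustrative sketch only: an EmbeddingLayer with one integer index per
     * example is mathematically equivalent to feeding a one-hot vector into a DenseLayer - the
     * forward pass just selects one row of the weight matrix - which is why the same gradient
     * check machinery applies to it. This hypothetical helper illustrates the lookup; it is
     * not the DL4J implementation and is not called by the tests.
     */
    private static INDArray embeddingForwardSketch(INDArray weights, int index) {
        //oneHot(index) * weights == row 'index' of the weight matrix
        return weights.getRow(index);
    }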
    @Test
    public void testRbm() {
        //As above (testGradientMLP2LayerIrisSimple()) but with L2, L1, and both L2/L1 applied
        //Need to run gradient through updater, so that L2 can be applied

        RBM.HiddenUnit[] hiddenFunc = {RBM.HiddenUnit.BINARY, RBM.HiddenUnit.RECTIFIED};
        boolean[] characteristic = {false, true}; //If true: run some backprop steps first

        LossFunction[] lossFunctions = {LossFunction.MSE, LossFunction.KL_DIVERGENCE};
        String[] outputActivations = {"softmax", "sigmoid"}; //i.e., lossFunctions[i] used with outputActivations[i] here

        DataNormalization scaler = new NormalizerMinMaxScaler();
        DataSetIterator iter = new IrisDataSetIterator(150, 150);
        scaler.fit(iter);
        iter.setPreProcessor(scaler);
        DataSet ds = iter.next();

        INDArray input = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();

        double[] l2vals = {0.4, 0.0, 0.4};
        double[] l1vals = {0.0, 0.5, 0.5}; //i.e., use l2vals[i] with l1vals[i]

        for (RBM.HiddenUnit hidunit : hiddenFunc) {
            for (boolean doLearningFirst : characteristic) {
                for (int i = 0; i < lossFunctions.length; i++) {
                    for (int k = 0; k < l2vals.length; k++) {
                        LossFunction lf = lossFunctions[i];
                        String outputActivation = outputActivations[i];
                        double l2 = l2vals[k];
                        double l1 = l1vals[k];

                        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                                        .regularization(true).l2(l2).l1(l1).learningRate(1.0)
                                        .optimizationAlgo(OptimizationAlgorithm.STOCHASTIC_GRADIENT_DESCENT)
                                        .seed(12345L)
                                        .list()
                                        .layer(0, new RBM.Builder(hidunit, RBM.VisibleUnit.BINARY).nIn(4).nOut(3)
                                                        .weightInit(WeightInit.UNIFORM).updater(Updater.SGD).build())
                                        .layer(1, new OutputLayer.Builder(lf).nIn(3).nOut(3)
                                                        .weightInit(WeightInit.XAVIER).updater(Updater.SGD)
                                                        .activation(outputActivation).build())
                                        .pretrain(false).backprop(true).build();

                        MultiLayerNetwork mln = new MultiLayerNetwork(conf);
                        mln.init();

                        if (doLearningFirst) {
                            //Run a number of iterations of learning
                            mln.setInput(ds.getFeatures());
                            mln.setLabels(ds.getLabels());
                            mln.computeGradientAndScore();
                            double scoreBefore = mln.score();
                            for (int j = 0; j < 10; j++)
                                mln.fit(ds);
                            mln.computeGradientAndScore();
                            double scoreAfter = mln.score();
                            //Can't test in 'characteristic mode of operation' if not learning
                            String msg = "testRbm() - score did not (sufficiently) decrease during learning - activationFn="
                                            + hidunit.toString() + ", lossFn=" + lf + ", outputActivation="
                                            + outputActivation + ", doLearningFirst=" + doLearningFirst + ", l2=" + l2
                                            + ", l1=" + l1 + " (before=" + scoreBefore + ", scoreAfter=" + scoreAfter
                                            + ")";
                            assertTrue(msg, scoreAfter < scoreBefore);
                        }

                        if (PRINT_RESULTS) {
                            System.out.println("testRbm() - activationFn=" + hidunit.toString() + ", lossFn=" + lf
                                            + ", outputActivation=" + outputActivation + ", doLearningFirst="
                                            + doLearningFirst + ", l2=" + l2 + ", l1=" + l1);
                            for (int j = 0; j < mln.getnLayers(); j++)
                                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
                        }

                        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                                        DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

                        String msg = "testRbm() - activationFn=" + hidunit.toString() + ", lossFn=" + lf
                                        + ", outputActivation=" + outputActivation + ", doLearningFirst="
                                        + doLearningFirst + ", l2=" + l2 + ", l1=" + l1;
                        assertTrue(msg, gradOK);
                    }
                }
            }
        }
    }
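    /**
     * Background, as an illustrative sketch only: the pass/fail decision for each parameter
     * compares analytic and numeric gradients via a relative error of the form
     *   |analytic - numeric| / (|analytic| + |numeric|),
     * checked against DEFAULT_MAX_REL_ERROR, with DEFAULT_MIN_ABS_ERROR acting as an absolute
     * floor so near-zero gradients do not fail on numerical noise alone. This hypothetical
     * helper mirrors that logic for a single parameter; consult GradientCheckUtil for the
     * actual implementation details.
     */
    private static boolean gradientsMatchSketch(double analytic, double numeric, double maxRelError,
                    double minAbsError) {
        double absDiff = Math.abs(analytic - numeric);
        if (absDiff < minAbsError)
            return true; //both gradients effectively zero; relative error would be unstable
        double relError = absDiff / (Math.abs(analytic) + Math.abs(numeric));
        return relError < maxRelError;
    }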
    @Test
    public void testAutoEncoder() {
        //As above (testGradientMLP2LayerIrisSimple()) but with L2, L1, and both L2/L1 applied
        //Need to run gradient through updater, so that L2 can be applied

        String[] activFns = {"sigmoid", "tanh"};
        boolean[] characteristic = {false, true}; //If true: run some backprop steps first

        LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE};
        String[] outputActivations = {"softmax", "tanh"}; //i.e., lossFunctions[i] used with outputActivations[i] here

        DataNormalization scaler = new NormalizerMinMaxScaler();
        DataSetIterator iter = new IrisDataSetIterator(150, 150);
        scaler.fit(iter);
        iter.setPreProcessor(scaler);
        DataSet ds = iter.next();

        INDArray input = ds.getFeatureMatrix();
        INDArray labels = ds.getLabels();

        NormalizerStandardize norm = new NormalizerStandardize();
        norm.fit(ds);
        norm.transform(ds);

        double[] l2vals = {0.2, 0.0, 0.2};
        double[] l1vals = {0.0, 0.3, 0.3}; //i.e., use l2vals[i] with l1vals[i]

        for (String afn : activFns) {
            for (boolean doLearningFirst : characteristic) {
                for (int i = 0; i < lossFunctions.length; i++) {
                    for (int k = 0; k < l2vals.length; k++) {
                        LossFunction lf = lossFunctions[i];
                        String outputActivation = outputActivations[i];
                        double l2 = l2vals[k];
                        double l1 = l1vals[k];

                        Nd4j.getRandom().setSeed(12345);
                        MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder()
                                        .regularization(true).learningRate(1.0).l2(l2).l1(l1)
                                        .optimizationAlgo(OptimizationAlgorithm.CONJUGATE_GRADIENT)
                                        .seed(12345L)
                                        .weightInit(WeightInit.DISTRIBUTION).dist(new NormalDistribution(0, 1))
                                        .updater(Updater.SGD)
                                        .list()
                                        .layer(0, new AutoEncoder.Builder().nIn(4).nOut(3).activation(afn).build())
                                        .layer(1, new OutputLayer.Builder(lf).nIn(3).nOut(3)
                                                        .activation(outputActivation).build())
                                        .pretrain(true).backprop(true).build();

                        MultiLayerNetwork mln = new MultiLayerNetwork(conf);
                        mln.init();

                        String msg;
                        if (doLearningFirst) {
                            //Run a number of iterations of learning
                            mln.setInput(ds.getFeatures());
                            mln.setLabels(ds.getLabels());
                            mln.computeGradientAndScore();
                            double scoreBefore = mln.score();
                            for (int j = 0; j < 10; j++)
                                mln.fit(ds);
                            mln.computeGradientAndScore();
                            double scoreAfter = mln.score();
                            //Can't test in 'characteristic mode of operation' if not learning
                            msg = "testAutoEncoder() - score did not (sufficiently) decrease during learning - activationFn="
                                            + afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation
                                            + ", doLearningFirst=" + doLearningFirst + ", l2=" + l2 + ", l1=" + l1
                                            + " (before=" + scoreBefore + ", scoreAfter=" + scoreAfter + ")";
                            assertTrue(msg, scoreAfter < scoreBefore);
                        }

                        msg = "testAutoEncoder() - activationFn=" + afn + ", lossFn=" + lf + ", outputActivation="
                                        + outputActivation + ", doLearningFirst=" + doLearningFirst + ", l2=" + l2
                                        + ", l1=" + l1;

                        if (PRINT_RESULTS) {
                            System.out.println(msg);
                            for (int j = 0; j < mln.getnLayers(); j++)
                                System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams());
                        }

                        boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR,
                                        DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels);

                        assertTrue(msg, gradOK);
                    }
                }
            }
        }
    }
}