package org.deeplearning4j.gradientcheck; import org.deeplearning4j.datasets.iterator.impl.IrisDataSetIterator; import org.deeplearning4j.nn.api.OptimizationAlgorithm; import org.deeplearning4j.nn.conf.MultiLayerConfiguration; import org.deeplearning4j.nn.conf.NeuralNetConfiguration; import org.deeplearning4j.nn.conf.Updater; import org.deeplearning4j.nn.conf.distribution.NormalDistribution; import org.deeplearning4j.nn.conf.distribution.UniformDistribution; import org.deeplearning4j.nn.conf.inputs.InputType; import org.deeplearning4j.nn.conf.layers.*; import org.deeplearning4j.nn.conf.preprocessor.RnnToCnnPreProcessor; import org.deeplearning4j.nn.multilayer.MultiLayerNetwork; import org.deeplearning4j.nn.weights.WeightInit; import org.junit.Test; import org.nd4j.linalg.activations.Activation; import org.nd4j.linalg.api.buffer.DataBuffer; import org.nd4j.linalg.api.buffer.util.DataTypeUtil; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.dataset.DataSet; import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; import org.nd4j.linalg.dataset.api.preprocessor.DataNormalization; import org.nd4j.linalg.dataset.api.preprocessor.NormalizerMinMaxScaler; import org.nd4j.linalg.dataset.api.preprocessor.NormalizerStandardize; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.lossfunctions.LossFunctions.LossFunction; import java.util.Random; import static org.junit.Assert.assertTrue; /** * @author Alex Black 14 Aug 2015 */ public class LSTMGradientCheckTests { private static final boolean PRINT_RESULTS = true; private static final boolean RETURN_ON_FIRST_FAILURE = false; private static final double DEFAULT_EPS = 1e-6; private static final double DEFAULT_MAX_REL_ERROR = 1e-3; private static final double DEFAULT_MIN_ABS_ERROR = 1e-8; static { DataTypeUtil.setDTypeForContext(DataBuffer.Type.DOUBLE); } @Test public void testLSTMBasicMultiLayer() { //Basic test of GravesLSTM layer Nd4j.getRandom().setSeed(12345L); int timeSeriesLength = 4; int nIn = 2; int layerSize = 2; int nOut = 2; int miniBatchSize = 5; boolean[] gravesLSTM = new boolean[]{true, false}; for( boolean graves : gravesLSTM ) { Layer l0; Layer l1; if(graves) { l0 = new GravesLSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.SIGMOID) .weightInit(WeightInit.DISTRIBUTION).dist(new NormalDistribution(0, 1.0)) .updater(Updater.NONE).build(); l1 = new GravesLSTM.Builder().nIn(layerSize).nOut(layerSize).activation(Activation.SIGMOID) .weightInit(WeightInit.DISTRIBUTION).dist( new NormalDistribution(0, 1.0)) .updater(Updater.NONE).build(); } else { l0 = new LSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.SIGMOID) .weightInit(WeightInit.DISTRIBUTION).dist(new NormalDistribution(0, 1.0)) .updater(Updater.NONE).build(); l1 = new LSTM.Builder().nIn(layerSize).nOut(layerSize).activation(Activation.SIGMOID) .weightInit(WeightInit.DISTRIBUTION).dist( new NormalDistribution(0, 1.0)) .updater(Updater.NONE).build(); } MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().regularization(false).seed(12345L).list() .layer(0, l0) .layer(1, l1) .layer(2, new RnnOutputLayer.Builder(LossFunction.MCXENT).activation(Activation.SOFTMAX) .nIn(layerSize).nOut(nOut).weightInit(WeightInit.DISTRIBUTION) .dist(new NormalDistribution(0, 1.0)).updater(Updater.NONE).build()) .pretrain(false).backprop(true).build(); MultiLayerNetwork mln = new MultiLayerNetwork(conf); mln.init(); Random r = new Random(12345L); INDArray input = Nd4j.zeros(miniBatchSize, nIn, timeSeriesLength); for (int i = 0; i < miniBatchSize; i++) { for (int j = 0; j < nIn; j++) { for (int k = 0; k < timeSeriesLength; k++) { input.putScalar(new int[]{i, j, k}, r.nextDouble() - 0.5); } } } INDArray labels = Nd4j.zeros(miniBatchSize, nOut, timeSeriesLength); for (int i = 0; i < miniBatchSize; i++) { for (int j = 0; j < timeSeriesLength; j++) { int idx = r.nextInt(nOut); labels.putScalar(new int[]{i, idx, j}, 1.0); } } String testName = "testLSTMBasic(" + (graves ? "GravesLSTM" : "LSTM") + ")"; if (PRINT_RESULTS) { System.out.println(testName); for (int j = 0; j < mln.getnLayers(); j++) System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); assertTrue(testName, gradOK); } } @Test public void testGradientLSTMFull() { Activation[] activFns = {Activation.TANH, Activation.SOFTSIGN}; LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE}; Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here int timeSeriesLength = 8; int nIn = 7; int layerSize = 9; int nOut = 4; int miniBatchSize = 6; boolean[] gravesLSTM = new boolean[]{true, false}; for( boolean graves : gravesLSTM ) { Random r = new Random(12345L); INDArray input = Nd4j.zeros(miniBatchSize, nIn, timeSeriesLength); for (int i = 0; i < miniBatchSize; i++) { for (int j = 0; j < nIn; j++) { for (int k = 0; k < timeSeriesLength; k++) { input.putScalar(new int[]{i, j, k}, r.nextDouble() - 0.5); } } } INDArray labels = Nd4j.zeros(miniBatchSize, nOut, timeSeriesLength); for (int i = 0; i < miniBatchSize; i++) { for (int j = 0; j < timeSeriesLength; j++) { int idx = r.nextInt(nOut); labels.putScalar(new int[]{i, idx, j}, 1.0f); } } //use l2vals[i] with l1vals[i] double[] l2vals = {0.4, 0.0, 0.4, 0.4}; double[] l1vals = {0.0, 0.0, 0.5, 0.0}; double[] biasL2 = {0.0, 0.0, 0.0, 0.2}; double[] biasL1 = {0.0, 0.0, 0.6, 0.0}; for (Activation afn : activFns) { for (int i = 0; i < lossFunctions.length; i++) { for (int k = 0; k < l2vals.length; k++) { LossFunction lf = lossFunctions[i]; Activation outputActivation = outputActivations[i]; double l2 = l2vals[k]; double l1 = l1vals[k]; NeuralNetConfiguration.Builder conf = new NeuralNetConfiguration.Builder() .regularization(l1 > 0.0 || l2 > 0.0).seed(12345L) .weightInit(WeightInit.DISTRIBUTION) .dist(new NormalDistribution(0, 1)) .updater(Updater.NONE); if (l1 > 0.0) conf.l1(l1); if (l2 > 0.0) conf.l2(l2); if (biasL2[k] > 0) conf.l2Bias(biasL2[k]); if (biasL1[k] > 0) conf.l1Bias(biasL1[k]); Layer layer; if(graves){ layer = new GravesLSTM.Builder().nIn(nIn).nOut(layerSize) .activation(afn).build(); } else { layer = new LSTM.Builder().nIn(nIn).nOut(layerSize) .activation(afn).build(); } NeuralNetConfiguration.ListBuilder conf2 = conf .list() .layer(0, layer) .layer(1, new RnnOutputLayer.Builder(lf).activation(outputActivation).nIn(layerSize) .nOut(nOut).build()) .pretrain(false).backprop(true); MultiLayerNetwork mln = new MultiLayerNetwork(conf2.build()); mln.init(); String testName = "testGradientLSTMFull(" + (graves ? "GravesLSTM" : "LSTM") + " - activationFn=" + afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", l2=" + l2 + ", l1=" + l1; if (PRINT_RESULTS) { System.out.println(testName); for (int j = 0; j < mln.getnLayers(); j++) System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); assertTrue(testName, gradOK); } } } } } @Test public void testGradientLSTMEdgeCases() { //Edge cases: T=1, miniBatchSize=1, both int[] timeSeriesLength = {1, 5, 1}; int[] miniBatchSize = {7, 1, 1}; int nIn = 7; int layerSize = 9; int nOut = 4; boolean[] gravesLSTM = new boolean[]{true, false}; for( boolean graves : gravesLSTM ) { for (int i = 0; i < timeSeriesLength.length; i++) { Random r = new Random(12345L); INDArray input = Nd4j.zeros(miniBatchSize[i], nIn, timeSeriesLength[i]); for (int m = 0; m < miniBatchSize[i]; m++) { for (int j = 0; j < nIn; j++) { for (int k = 0; k < timeSeriesLength[i]; k++) { input.putScalar(new int[]{m, j, k}, r.nextDouble() - 0.5); } } } INDArray labels = Nd4j.zeros(miniBatchSize[i], nOut, timeSeriesLength[i]); for (int m = 0; m < miniBatchSize[i]; m++) { for (int j = 0; j < timeSeriesLength[i]; j++) { int idx = r.nextInt(nOut); labels.putScalar(new int[]{m, idx, j}, 1.0f); } } Layer layer; if(graves){ layer = new GravesLSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH).build(); } else { layer = new LSTM.Builder().nIn(nIn).nOut(layerSize).activation(Activation.TANH).build(); } MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder() .seed(12345L) .regularization(false) .weightInit(WeightInit.DISTRIBUTION) .dist(new NormalDistribution(0, 1)).updater(Updater.NONE) .list() .layer(0, layer) .layer(1, new RnnOutputLayer.Builder(LossFunction.MCXENT).activation(Activation.SOFTMAX) .nIn(layerSize).nOut(nOut).build()) .pretrain(false).backprop(true).build(); MultiLayerNetwork mln = new MultiLayerNetwork(conf); mln.init(); String msg = "testGradientLSTMEdgeCases(" + (graves ? "GravesLSTM" : "LSTM") + " - timeSeriesLength=" + timeSeriesLength[i] + ", miniBatchSize=" + miniBatchSize[i]; System.out.println(msg); boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); assertTrue(msg, gradOK); } } } @Test public void testGradientGravesBidirectionalLSTMFull() { Activation[] activFns = {Activation.TANH, Activation.SOFTSIGN}; LossFunction[] lossFunctions = {LossFunction.MCXENT, LossFunction.MSE}; Activation[] outputActivations = {Activation.SOFTMAX, Activation.TANH}; //i.e., lossFunctions[i] used with outputActivations[i] here int timeSeriesLength = 4; int nIn = 2; int layerSize = 2; int nOut = 2; int miniBatchSize = 3; Random r = new Random(12345L); INDArray input = Nd4j.zeros(miniBatchSize, nIn, timeSeriesLength); for (int i = 0; i < miniBatchSize; i++) { for (int j = 0; j < nIn; j++) { for (int k = 0; k < timeSeriesLength; k++) { input.putScalar(new int[] {i, j, k}, r.nextDouble() - 0.5); } } } INDArray labels = Nd4j.zeros(miniBatchSize, nOut, timeSeriesLength); for (int i = 0; i < miniBatchSize; i++) { for (int j = 0; j < timeSeriesLength; j++) { int idx = r.nextInt(nOut); labels.putScalar(new int[] {i, idx, j}, 1.0f); } } //use l2vals[i] with l1vals[i] double[] l2vals = {0.4, 0.0, 0.4, 0.4}; double[] l1vals = {0.0, 0.0, 0.5, 0.0}; double[] biasL2 = {0.0, 0.0, 0.0, 0.2}; double[] biasL1 = {0.0, 0.0, 0.6, 0.0}; for (Activation afn : activFns) { for (int i = 0; i < lossFunctions.length; i++) { for (int k = 0; k < l2vals.length; k++) { LossFunction lf = lossFunctions[i]; Activation outputActivation = outputActivations[i]; double l2 = l2vals[k]; double l1 = l1vals[k]; NeuralNetConfiguration.Builder conf = new NeuralNetConfiguration.Builder().regularization(l1 > 0.0 || l2 > 0.0); if (l1 > 0.0) conf.l1(l1); if (l2 > 0.0) conf.l2(l2); if (biasL2[k] > 0) conf.l2Bias(biasL2[k]); if (biasL1[k] > 0) conf.l1Bias(biasL1[k]); MultiLayerConfiguration mlc = conf.seed(12345L) .list().layer(0, new GravesBidirectionalLSTM.Builder().nIn(nIn).nOut(layerSize) .weightInit(WeightInit.DISTRIBUTION) .dist(new NormalDistribution(0, 1)) .activation(afn).updater( Updater.NONE) .build()) .layer(1, new RnnOutputLayer.Builder(lf).activation(outputActivation).nIn(layerSize) .nOut(nOut).weightInit(WeightInit.DISTRIBUTION) .dist(new NormalDistribution(0, 1)).updater(Updater.NONE).build()) .pretrain(false).backprop(true).build(); MultiLayerNetwork mln = new MultiLayerNetwork(mlc); mln.init(); if (PRINT_RESULTS) { System.out.println("testGradientGravesBidirectionalLSTMFull() - activationFn=" + afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", l2=" + l2 + ", l1=" + l1); for (int j = 0; j < mln.getnLayers(); j++) System.out.println("Layer " + j + " # params: " + mln.getLayer(j).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); String msg = "testGradientGravesLSTMFull() - activationFn=" + afn + ", lossFn=" + lf + ", outputActivation=" + outputActivation + ", l2=" + l2 + ", l1=" + l1; assertTrue(msg, gradOK); } } } } @Test public void testGradientGravesBidirectionalLSTMEdgeCases() { //Edge cases: T=1, miniBatchSize=1, both int[] timeSeriesLength = {1, 5, 1}; int[] miniBatchSize = {7, 1, 1}; int nIn = 7; int layerSize = 9; int nOut = 4; for (int i = 0; i < timeSeriesLength.length; i++) { Random r = new Random(12345L); INDArray input = Nd4j.zeros(miniBatchSize[i], nIn, timeSeriesLength[i]); for (int m = 0; m < miniBatchSize[i]; m++) { for (int j = 0; j < nIn; j++) { for (int k = 0; k < timeSeriesLength[i]; k++) { input.putScalar(new int[] {m, j, k}, r.nextDouble() - 0.5); } } } INDArray labels = Nd4j.zeros(miniBatchSize[i], nOut, timeSeriesLength[i]); for (int m = 0; m < miniBatchSize[i]; m++) { for (int j = 0; j < timeSeriesLength[i]; j++) { int idx = r.nextInt(nOut); labels.putScalar(new int[] {m, idx, j}, 1.0f); } } MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().regularization(false).seed(12345L) .list() .layer(0, new GravesBidirectionalLSTM.Builder().nIn(nIn).nOut(layerSize) .weightInit(WeightInit.DISTRIBUTION) .dist(new NormalDistribution(0, 1)).updater( Updater.NONE) .build()) .layer(1, new RnnOutputLayer.Builder(LossFunction.MCXENT).activation(Activation.SOFTMAX) .nIn(layerSize).nOut(nOut).weightInit(WeightInit.DISTRIBUTION) .dist(new NormalDistribution(0, 1)).updater(Updater.NONE).build()) .pretrain(false).backprop(true).build(); MultiLayerNetwork mln = new MultiLayerNetwork(conf); mln.init(); boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); String msg = "testGradientGravesLSTMEdgeCases() - timeSeriesLength=" + timeSeriesLength[i] + ", miniBatchSize=" + miniBatchSize[i]; assertTrue(msg, gradOK); } } @Test public void testGradientCnnFfRnn() { //Test gradients with CNN -> FF -> LSTM -> RnnOutputLayer //time series input/output (i.e., video classification or similar) int nChannelsIn = 3; int inputSize = 10 * 10 * nChannelsIn; //10px x 10px x 3 channels int miniBatchSize = 4; int timeSeriesLength = 10; int nClasses = 3; //Generate Nd4j.getRandom().setSeed(12345); INDArray input = Nd4j.rand(new int[] {miniBatchSize, inputSize, timeSeriesLength}); INDArray labels = Nd4j.zeros(miniBatchSize, nClasses, timeSeriesLength); Random r = new Random(12345); for (int i = 0; i < miniBatchSize; i++) { for (int j = 0; j < timeSeriesLength; j++) { int idx = r.nextInt(nClasses); labels.putScalar(new int[] {i, idx, j}, 1.0); } } MultiLayerConfiguration conf = new NeuralNetConfiguration.Builder().updater(Updater.NONE).seed(12345) .weightInit(WeightInit.DISTRIBUTION).dist(new UniformDistribution(-2, 2)).list() .layer(0, new ConvolutionLayer.Builder(5, 5).nIn(3).nOut(5).stride(1, 1) .activation(Activation.TANH).build()) //Out: (10-5)/1+1 = 6 -> 6x6x5 .layer(1, new SubsamplingLayer.Builder(SubsamplingLayer.PoolingType.MAX).kernelSize(2, 2) .stride(1, 1).build()) //Out: (6-2)/1+1 = 5 -> 5x5x5 .layer(2, new DenseLayer.Builder().nIn(5 * 5 * 5).nOut(4).activation(Activation.TANH).build()) .layer(3, new GravesLSTM.Builder().nIn(4).nOut(3).activation(Activation.TANH).build()) .layer(4, new RnnOutputLayer.Builder().lossFunction(LossFunction.MCXENT).nIn(3).nOut(nClasses) .activation(Activation.SOFTMAX).build()) .setInputType(InputType.convolutional(10, 10, 3)).pretrain(false).backprop(true).build(); //Here: ConvolutionLayerSetup in config builder doesn't know that we are expecting time series input, not standard FF input -> override it here conf.getInputPreProcessors().put(0, new RnnToCnnPreProcessor(10, 10, 3)); MultiLayerNetwork mln = new MultiLayerNetwork(conf); mln.init(); System.out.println("Params per layer:"); for (int i = 0; i < mln.getnLayers(); i++) { System.out.println("layer " + i + "\t" + mln.getLayer(i).numParams()); } boolean gradOK = GradientCheckUtil.checkGradients(mln, DEFAULT_EPS, DEFAULT_MAX_REL_ERROR, DEFAULT_MIN_ABS_ERROR, PRINT_RESULTS, RETURN_ON_FIRST_FAILURE, input, labels); assertTrue(gradOK); } }