package org.nd4j.linalg.dataset; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; import org.nd4j.linalg.BaseNd4jTest; import org.nd4j.linalg.api.ndarray.INDArray; import org.nd4j.linalg.dataset.api.iterator.DataSetIterator; import org.nd4j.linalg.dataset.api.iterator.TestDataSetIterator; import org.nd4j.linalg.dataset.api.preprocessor.NormalizerStandardize; import org.nd4j.linalg.factory.Nd4j; import org.nd4j.linalg.factory.Nd4jBackend; import org.nd4j.linalg.ops.transforms.Transforms; import static org.junit.Assert.*; /** * Created by susaneraly on 5/25/16. */ @RunWith(Parameterized.class) public class NormalizerStandardizeTest extends BaseNd4jTest { public NormalizerStandardizeTest(Nd4jBackend backend) { super(backend); } @Test public void testBruteForce() { /* This test creates a dataset where feature values are multiples of consecutive natural numbers The obtained values are compared to the theoretical mean and std dev */ double tolerancePerc = 0.01; // 0.01% of correct value int nSamples = 5120; int x = 1, y = 2, z = 3; INDArray featureX = Nd4j.linspace(1, nSamples, nSamples).reshape(nSamples, 1).mul(x); INDArray featureY = featureX.mul(y); INDArray featureZ = featureX.mul(z); INDArray featureSet = Nd4j.concat(1, featureX, featureY, featureZ); INDArray labelSet = Nd4j.zeros(nSamples, 1); DataSet sampleDataSet = new DataSet(featureSet, labelSet); double meanNaturalNums = (nSamples + 1) / 2.0; INDArray theoreticalMean = Nd4j.create(new double[] {meanNaturalNums * x, meanNaturalNums * y, meanNaturalNums * z}); double stdNaturalNums = Math.sqrt((nSamples * nSamples - 1) / 12.0); INDArray theoreticalStd = Nd4j.create(new double[] {stdNaturalNums * x, stdNaturalNums * y, stdNaturalNums * z}); NormalizerStandardize myNormalizer = new NormalizerStandardize(); myNormalizer.fit(sampleDataSet); INDArray meanDelta = Transforms.abs(theoreticalMean.sub(myNormalizer.getMean())); INDArray meanDeltaPerc = meanDelta.div(theoreticalMean).mul(100); double maxMeanDeltaPerc = meanDeltaPerc.max(1).getDouble(0, 0); assertTrue(maxMeanDeltaPerc < tolerancePerc); INDArray stdDelta = Transforms.abs(theoreticalStd.sub(myNormalizer.getStd())); INDArray stdDeltaPerc = stdDelta.div(theoreticalStd).mul(100); double maxStdDeltaPerc = stdDeltaPerc.max(1).getDouble(0, 0); assertTrue(maxStdDeltaPerc < tolerancePerc); // SAME TEST WITH THE ITERATOR int bSize = 10; tolerancePerc = 0.1; // 0.1% of correct value DataSetIterator sampleIter = new TestDataSetIterator(sampleDataSet, bSize); myNormalizer.fit(sampleIter); meanDelta = Transforms.abs(theoreticalMean.sub(myNormalizer.getMean())); meanDeltaPerc = meanDelta.div(theoreticalMean).mul(100); maxMeanDeltaPerc = meanDeltaPerc.max(1).getDouble(0, 0); assertTrue(maxMeanDeltaPerc < tolerancePerc); stdDelta = Transforms.abs(theoreticalStd.sub(myNormalizer.getStd())); stdDeltaPerc = stdDelta.div(theoreticalStd).mul(100); maxStdDeltaPerc = stdDeltaPerc.max(1).getDouble(0, 0); assertTrue(maxStdDeltaPerc < tolerancePerc); } @Test public void testTransform() { /*Random dataset is generated such that AX + B where X is from a normal distribution with mean 0 and std 1 The mean of above will be B and std A Obtained mean and std dev are compared to theoretical Transformed values should be the same as X with the same seed. */ long randSeed = 41732786; int nFeatures = 2; int nSamples = 6400; int bsize = 8; int a = 5; int b = 100; INDArray sampleMean, sampleStd, sampleMeanDelta, sampleStdDelta, delta, deltaPerc; double maxDeltaPerc, sampleMeanSEM; genRandomDataSet normData = new genRandomDataSet(nSamples, nFeatures, a, b, randSeed); DataSet genRandExpected = normData.theoreticalTransform; genRandomDataSet expectedData = new genRandomDataSet(nSamples, nFeatures, 1, 0, randSeed); genRandomDataSet beforeTransformData = new genRandomDataSet(nSamples, nFeatures, a, b, randSeed); NormalizerStandardize myNormalizer = new NormalizerStandardize(); DataSetIterator normIterator = normData.getIter(bsize); DataSetIterator genRandExpectedIter = new TestDataSetIterator(genRandExpected, bsize); DataSetIterator expectedIterator = expectedData.getIter(bsize); DataSetIterator beforeTransformIterator = beforeTransformData.getIter(bsize); myNormalizer.fit(normIterator); double tolerancePerc = 0.10; //within 0.1% sampleMean = myNormalizer.getMean(); sampleMeanDelta = Transforms.abs(sampleMean.sub(normData.theoreticalMean)); assertTrue(sampleMeanDelta.mul(100).div(normData.theoreticalMean).max(1).getDouble(0, 0) < tolerancePerc); //sanity check to see if it's within the theoretical standard error of mean sampleMeanSEM = sampleMeanDelta.div(normData.theoreticalSEM).max(1).getDouble(0, 0); assertTrue(sampleMeanSEM < 2.6); //99% of the time it should be within this many SEMs tolerancePerc = 1; //within 1% - std dev value sampleStd = myNormalizer.getStd(); sampleStdDelta = Transforms.abs(sampleStd.sub(normData.theoreticalStd)); assertTrue(sampleStdDelta.div(normData.theoreticalStd).max(1).mul(100).getDouble(0, 0) < tolerancePerc); tolerancePerc = 1; //within 1% normIterator.setPreProcessor(myNormalizer); while (normIterator.hasNext()) { INDArray before = beforeTransformIterator.next().getFeatures(); INDArray origBefore = genRandExpectedIter.next().getFeatures(); INDArray after = normIterator.next().getFeatures(); INDArray expected = expectedIterator.next().getFeatures(); delta = Transforms.abs(after.sub(expected)); deltaPerc = delta.div(Transforms.abs(before.sub(expected))); deltaPerc.muli(100); maxDeltaPerc = deltaPerc.max(0, 1).getDouble(0, 0); /* System.out.println("=== BEFORE ==="); System.out.println(before); System.out.println("=== ORIG BEFORE ==="); System.out.println(origBefore); System.out.println("=== AFTER ==="); System.out.println(after); System.out.println("=== SHOULD BE ==="); System.out.println(expected); System.out.println("% diff, "+ maxDeltaPerc); */ assertTrue(maxDeltaPerc < tolerancePerc); } } @Test public void testDifferentBatchSizes() { // Create 6x1 matrix of the numbers 1 through 6 INDArray values = Nd4j.linspace(1, 6, 6).transpose(); DataSet dataSet = new DataSet(values, values); // Test fitting a DataSet NormalizerStandardize norm1 = new NormalizerStandardize(); norm1.fit(dataSet); assertEquals(3.5f, norm1.getMean().getFloat(0), 1e-6); assertEquals(1.70783f, norm1.getStd().getFloat(0), 1e-4); // Test fitting an iterator with equal batch sizes DataSetIterator testIter1 = new TestDataSetIterator(dataSet, 3); // Will yield 2 batches of 3 rows NormalizerStandardize norm2 = new NormalizerStandardize(); norm2.fit(testIter1); assertEquals(3.5f, norm2.getMean().getFloat(0), 1e-6); assertEquals(1.70783f, norm2.getStd().getFloat(0), 1e-4); // Test fitting an iterator with varying batch sizes DataSetIterator testIter2 = new TestDataSetIterator(dataSet, 4); // Will yield batch of 4 and batch of 2 rows NormalizerStandardize norm3 = new NormalizerStandardize(); norm3.fit(testIter2); assertEquals(3.5f, norm3.getMean().getFloat(0), 1e-6); assertEquals(1.70783f, norm3.getStd().getFloat(0), 1e-4); // Test fitting an iterator with batches of single rows DataSetIterator testIter3 = new TestDataSetIterator(dataSet, 1); // Will yield 6 batches of 1 row NormalizerStandardize norm4 = new NormalizerStandardize(); norm4.fit(testIter3); assertEquals(3.5f, norm4.getMean().getFloat(0), 1e-6); assertEquals(1.70783f, norm4.getStd().getFloat(0), 1e-4); } @Test public void testUnderOverflow() { // This dataset will be basically constant with a small std deviation // And the constant is large. Checking if algorithm can handle double tolerancePerc = 1; //Within 1 % double toleranceAbs = 0.0005; int nSamples = 1000; int bSize = 10; int x = -1000000, y = 1000000; double z = 1000000; INDArray featureX = Nd4j.rand(nSamples, 1).mul(1).add(x); INDArray featureY = Nd4j.rand(nSamples, 1).mul(2).add(y); INDArray featureZ = Nd4j.rand(nSamples, 1).mul(3).add(z); INDArray featureSet = Nd4j.concat(1, featureX, featureY, featureZ); INDArray labelSet = Nd4j.zeros(nSamples, 1); DataSet sampleDataSet = new DataSet(featureSet, labelSet); DataSetIterator sampleIter = new TestDataSetIterator(sampleDataSet, bSize); INDArray theoreticalMean = Nd4j.create(new double[] {x, y, z}); NormalizerStandardize myNormalizer = new NormalizerStandardize(); myNormalizer.fit(sampleIter); INDArray meanDelta = Transforms.abs(theoreticalMean.sub(myNormalizer.getMean())); INDArray meanDeltaPerc = meanDelta.mul(100).div(theoreticalMean); assertTrue(meanDeltaPerc.max(1).getDouble(0, 0) < tolerancePerc); //this just has to not barf //myNormalizer.transform(sampleIter); myNormalizer.transform(sampleDataSet); } @Test public void testRevert() { double tolerancePerc = 0.01; // 0.01% of correct value int nSamples = 500; int nFeatures = 3; INDArray featureSet = Nd4j.randn(nSamples, nFeatures); INDArray labelSet = Nd4j.zeros(nSamples, 1); DataSet sampleDataSet = new DataSet(featureSet, labelSet); NormalizerStandardize myNormalizer = new NormalizerStandardize(); myNormalizer.fit(sampleDataSet); DataSet transformed = sampleDataSet.copy(); myNormalizer.transform(transformed); //System.out.println(transformed.getFeatures()); myNormalizer.revert(transformed); //System.out.println(transformed.getFeatures()); INDArray delta = Transforms.abs(transformed.getFeatures().sub(sampleDataSet.getFeatures())) .div(sampleDataSet.getFeatures()); double maxdeltaPerc = delta.max(0, 1).mul(100).getDouble(0, 0); assertTrue(maxdeltaPerc < tolerancePerc); } @Test public void testConstant() { double tolerancePerc = 10.0; // 10% of correct value int nSamples = 500; int nFeatures = 3; int constant = 100; INDArray featureSet = Nd4j.zeros(nSamples, nFeatures).add(constant); INDArray labelSet = Nd4j.zeros(nSamples, 1); DataSet sampleDataSet = new DataSet(featureSet, labelSet); NormalizerStandardize myNormalizer = new NormalizerStandardize(); myNormalizer.fit(sampleDataSet); //Checking if we gets nans assertFalse(Double.isNaN(myNormalizer.getStd().getDouble(0))); myNormalizer.transform(sampleDataSet); //Checking if we gets nans, because std dev is zero assertFalse(Double.isNaN(sampleDataSet.getFeatures().min(0, 1).getDouble(0))); //Checking to see if transformed values are close enough to zero assertEquals(Transforms.abs(sampleDataSet.getFeatures()).max(0, 1).getDouble(0, 0), 0, constant * tolerancePerc / 100.0); myNormalizer.revert(sampleDataSet); //Checking if we gets nans, because std dev is zero assertFalse(Double.isNaN(sampleDataSet.getFeatures().min(0, 1).getDouble(0))); assertEquals(Transforms.abs(sampleDataSet.getFeatures().sub(featureSet)).min(0, 1).getDouble(0), 0, constant * tolerancePerc / 100.0); } public class genRandomDataSet { /* generate random dataset from normally distributed mean 0, std 1 based on given seed and scaling constants */ DataSet sampleDataSet; INDArray theoreticalMean; INDArray theoreticalStd; INDArray theoreticalSEM; DataSet theoreticalTransform; public genRandomDataSet(int nSamples, int nFeatures, int a, int b, long randSeed) { /* if a =1 and b = 0,normal distribution otherwise with some random mean and some random distribution */ int i = 0; // Randomly generate scaling constants and add offsets // to get aA and bB INDArray aA = a == 1 ? Nd4j.ones(1, nFeatures) : Nd4j.rand(1, nFeatures, randSeed).mul(a); //a = 1, don't scale INDArray bB = Nd4j.rand(1, nFeatures, randSeed).mul(b); //b = 0 this zeros out // transform ndarray as X = aA + bB * X INDArray randomFeatures = Nd4j.zeros(nSamples, nFeatures); INDArray randomFeaturesTransform = Nd4j.zeros(nSamples, nFeatures); while (i < nFeatures) { INDArray randomSlice = Nd4j.randn(nSamples, 1, randSeed); randomFeaturesTransform.putColumn(i, randomSlice); randomSlice.muli(aA.getScalar(0, i)); randomSlice.addi(bB.getScalar(0, i)); randomFeatures.putColumn(i, randomSlice); i++; } INDArray randomLabels = Nd4j.zeros(nSamples, 1); this.sampleDataSet = new DataSet(randomFeatures, randomLabels); this.theoreticalTransform = new DataSet(randomFeaturesTransform, randomLabels); this.theoreticalMean = bB; this.theoreticalStd = aA; this.theoreticalSEM = this.theoreticalStd.div(Math.sqrt(nSamples)); } public DataSetIterator getIter(int bsize) { return new TestDataSetIterator(sampleDataSet, bsize); } } @Override public char ordering() { return 'c'; } }