package hex.deeplearning;

import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import hex.DataInfo;
import hex.Distribution;
import hex.FrameTask;
import hex.ModelMetricsRegression;
import hex.genmodel.utils.DistributionFamily;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;
import water.util.PrettyPrint;

import java.util.Random;

public class DeepLearningGradientCheck extends TestUtil {
  @BeforeClass public static void stall() { stall_till_cloudsize(1); }

  static final float MAX_TOLERANCE = 2e-2f;
  static final float MAX_FAILED_COUNT = 30;
  static final float SAMPLE_RATE = 0.01f;

  @Test
  public void gradientCheck() {
    Frame tfr = null;
    DeepLearningModel dl = null;

    try {
      tfr = parse_test_file("smalldata/glm_test/cancar_logIn.csv");
      for (String s : new String[]{
              "Merit", "Class"
      }) {
        Vec f = tfr.vec(s).toCategoricalVec();
        tfr.remove(s).remove();
        tfr.add(s, f);
      }
      DKV.put(tfr);
      tfr.add("Binary", tfr.anyVec().makeZero());
      new MRTask() {
        public void map(Chunk[] c) {
          for (int i = 0; i < c[0]._len; ++i)
            if (c[0].at8(i) == 1) c[1].set(i, 1);
        }
      }.doAll(tfr.vecs(new String[]{"Class", "Binary"}));
      Vec cv = tfr.vec("Binary").toCategoricalVec();
      tfr.remove("Binary").remove();
      tfr.add("Binary", cv);
      DKV.put(tfr);

      Random rng = new Random(0xDECAF);
      int count = 0;
      int failedcount = 0;
      double maxRelErr = 0;
      double meanRelErr = 0;
      for (DistributionFamily dist : new DistributionFamily[]{
              DistributionFamily.gaussian,
              DistributionFamily.laplace,
              DistributionFamily.quantile,
              DistributionFamily.huber,
              // DistributionFamily.modified_huber,
              DistributionFamily.gamma,
              DistributionFamily.poisson,
              DistributionFamily.AUTO,
              DistributionFamily.tweedie,
              DistributionFamily.multinomial,
              DistributionFamily.bernoulli,
      }) {
        for (DeepLearningParameters.Activation act : new DeepLearningParameters.Activation[]{
                // DeepLearningParameters.Activation.ExpRectifier,
                DeepLearningParameters.Activation.Tanh,
                DeepLearningParameters.Activation.Rectifier,
                // DeepLearningParameters.Activation.Maxout,
        }) {
          for (String response : new String[]{
                  "Binary", //binary classification
                  "Class",  //multi-class
                  "Cost",   //regression
          }) {
            for (boolean adaptive : new boolean[]{
                    true,
                    false
            }) {
              for (int miniBatchSize : new int[]{
                      1
              }) {
                if (response.equals("Class")) {
                  if (dist != DistributionFamily.multinomial && dist != DistributionFamily.AUTO)
                    continue;
                } else if (response.equals("Binary")) {
                  if (dist != DistributionFamily.modified_huber && dist != DistributionFamily.bernoulli && dist != DistributionFamily.AUTO)
                    continue;
                } else {
                  if (dist == DistributionFamily.multinomial || dist == DistributionFamily.modified_huber || dist == DistributionFamily.bernoulli)
                    continue;
                }

                DeepLearningParameters parms = new DeepLearningParameters();
                parms._huber_alpha = rng.nextDouble() + 0.1;
                parms._tweedie_power = 1.01 + rng.nextDouble() * 0.9;
                parms._quantile_alpha = 0.05 + rng.nextDouble() * 0.9;
                parms._train = tfr._key;
                parms._epochs = 100; //converge to a reasonable model to avoid too large gradients
                parms._l1 = 1e-3;
                parms._l2 = 1e-3;
                parms._force_load_balance = false;
                parms._hidden = new int[]{10, 10, 10};
                parms._fast_mode = false; //otherwise we introduce small bprop errors
                parms._response_column = response;
                parms._distribution = dist;
                parms._max_w2 = 10;
                parms._seed = 0xaaabbb;
                parms._activation = act;
                parms._adaptive_rate = adaptive;
                parms._rate = 1e-4;
                parms._momentum_start = 0.9;
                parms._momentum_stable = 0.99;
                parms._mini_batch_size = miniBatchSize;
                // DeepLearningModelInfo.gradientCheck = null;
                DeepLearningModelInfo.gradientCheck = new DeepLearningModelInfo.GradientCheck(0, 0, 0); //tell it what gradient to collect

                // Build a first model; all remaining models should be equal
                DeepLearning job = new DeepLearning(parms);
                try {
                  dl = job.trainModel().get();

                  boolean classification = response.equals("Class") || response.equals("Binary");
                  if (!classification) {
                    Frame p = dl.score(tfr);
                    hex.ModelMetrics mm = hex.ModelMetrics.getFromDKV(dl, tfr);
                    double resdev = ((ModelMetricsRegression) mm)._mean_residual_deviance;
                    Log.info("Mean residual deviance: " + resdev);
                    p.delete();
                  }

                  DeepLearningModelInfo modelInfo = IcedUtils.deepCopy(dl.model_info()); //golden version
                  // Log.info(modelInfo.toStringAll());
                  long before = dl.model_info().checksum_impl();

                  float meanLoss = 0;

                  // loop over every row in the dataset and check that the back-propagated gradients
                  // match the finite-difference approximation
                  for (int rId = 0; rId < tfr.numRows(); rId += 1 /*miniBatchSize*/) {
                    // start from scratch - with a clean model
                    dl.set_model_info(IcedUtils.deepCopy(modelInfo));

                    final DataInfo di = dl.model_info().data_info();

                    // populate miniBatch (consecutive rows)
                    final DataInfo.Row[] rowsMiniBatch = new DataInfo.Row[miniBatchSize];
                    for (int i = 0; i < rowsMiniBatch.length; ++i) {
                      if (0 <= rId + i && rId + i < tfr.numRows()) {
                        rowsMiniBatch[i] = new FrameTask.ExtractDenseRow(di, rId + i).doAll(di._adaptedFrame)._row;
                      }
                    }

                    // loss at the current weights
                    long cs = dl.model_info().checksum_impl();
                    double loss = dl.meanLoss(rowsMiniBatch);
                    assert (cs == before);
                    assert (before == dl.model_info().checksum_impl());
                    meanLoss += loss;

                    for (int layer = 0; layer <= parms._hidden.length; ++layer) {
                      int rows = dl.model_info().get_weights(layer).rows();
                      assert (dl.model_info().get_biases(layer).size() == rows);
                      for (int row = 0; row < rows; ++row) {

                        //check bias
                        if (true) {
                          // start from scratch - with a clean model
                          dl.set_model_info(IcedUtils.deepCopy(modelInfo));

                          // do one forward propagation pass (and fill the mini-batch gradients -> set training=true)
                          Neurons[] neurons = DeepLearningTask.makeNeuronsForTraining(dl.model_info());
                          double[] responses = new double[miniBatchSize];
                          double[] offsets = new double[miniBatchSize];
                          int n = 0;
                          for (DataInfo.Row myRow : rowsMiniBatch) {
                            if (myRow == null) continue;
                            ((Neurons.Input) neurons[0]).setInput(-1, myRow.numIds, myRow.numVals, myRow.nBins, myRow.binIds, n);
                            responses[n] = myRow.response(0);
                            offsets[n] = myRow.offset;
                            n++;
                          }
                          DeepLearningTask.fpropMiniBatch(-1 /*seed doesn't matter*/, neurons, dl.model_info(), null, true /*training*/, responses, offsets, n);

                          // check that we didn't change the model's weights/biases
                          long after = dl.model_info().checksum_impl();
                          assert (after == before);

                          // record the gradient since gradientChecking is enabled
                          DeepLearningModelInfo.gradientCheck = new DeepLearningModelInfo.GradientCheck(layer, row, -1); //tell it what gradient to collect
                          DeepLearningTask.bpropMiniBatch(neurons, n); //update the weights and biases
                          assert (before != dl.model_info().checksum_impl());

                          // reset the model back to the trained model
                          dl.set_model_info(IcedUtils.deepCopy(modelInfo));
                          assert (before == dl.model_info().checksum_impl());

                          double bpropGradient = DeepLearningModelInfo.gradientCheck.gradient;

                          // FIXME: re-enable this once the loss is computed from the de-standardized prediction/response
                          // double actualResponse=myRow.response[0];
                          // double predResponseLinkSpace = neurons[neurons.length-1]._a.get(0);
                          // if (di._normRespMul != null) {
                          //   bpropGradient /= di._normRespMul[0]; //no shift for gradient
                          //   actualResponse = (actualResponse / di._normRespMul[0] + di._normRespSub[0]);
                          //   predResponseLinkSpace = (predResponseLinkSpace / di._normRespMul[0] + di._normRespSub[0]);
                          // }
                          // bpropGradient *= new Distribution(parms._distribution).gradient(actualResponse, predResponseLinkSpace);

                          final double bias = dl.model_info().get_biases(layer).get(row);
                          double eps = 1e-4 * Math.abs(bias); //don't make the weight deltas too small, or the float weights "won't notice"
                          if (eps == 0)
                            eps = 1e-6;

                          // loss at bias + eps
                          dl.model_info().get_biases(layer).set(row, bias + eps);
                          double up = dl.meanLoss(rowsMiniBatch);

                          // loss at bias - eps
                          dl.model_info().get_biases(layer).set(row, bias - eps);
                          double down = dl.meanLoss(rowsMiniBatch);

                          if (Math.abs(up - down) / Math.abs(up + down) < 1e-8) {
                            continue; //relative change in loss function is too small -> skip
                          }

                          double gradient = ((up - down) / (2. * eps));

                          double relError = 2 * Math.abs(bpropGradient - gradient) / (Math.abs(gradient) + Math.abs(bpropGradient));

                          count++;

                          // if either gradient is tiny, check if both are tiny
                          if (Math.abs(gradient) < 1e-7 || Math.abs(bpropGradient) < 1e-7) {
                            if (Math.abs(bpropGradient - gradient) < 1e-7) continue; //all good
                          }

                          meanRelErr += relError;

                          // if both gradients are tiny - numerically unstable relative error computation is not needed, since absolute error is small
                          if (relError > MAX_TOLERANCE) {
                            Log.info("\nDistribution: " + dl._parms._distribution);
                            Log.info("\nRow: " + rId);
                            Log.info("bias (layer " + layer + ", row " + row + "): " + bias + " +/- " + eps);
                            Log.info("loss: " + loss);
                            Log.info("losses up/down: " + up + " / " + down);
                            Log.info("=> Finite differences gradient: " + gradient);
                            Log.info("=> Back-propagation gradient  : " + bpropGradient);
                            Log.info("=> Relative error             : " + PrettyPrint.formatPct(relError));
                            failedcount++;
                          }
                        }

                        //check weights
                        int cols = dl.model_info().get_weights(layer).cols();
                        for (int col = 0; col < cols; ++col) {
                          if (rng.nextFloat() >= SAMPLE_RATE) continue;

                          // start from scratch - with a clean model
                          dl.set_model_info(IcedUtils.deepCopy(modelInfo));

                          // do one forward propagation pass (and fill the mini-batch gradients -> set training=true)
                          Neurons[] neurons = DeepLearningTask.makeNeuronsForTraining(dl.model_info());
                          double[] responses = new double[miniBatchSize];
                          double[] offsets = new double[miniBatchSize];
                          int n = 0;
                          for (DataInfo.Row myRow : rowsMiniBatch) {
                            if (myRow == null) continue;
                            ((Neurons.Input) neurons[0]).setInput(-1, myRow.numIds, myRow.numVals, myRow.nBins, myRow.binIds, n);
                            responses[n] = myRow.response(0);
                            offsets[n] = myRow.offset;
                            n++;
                          }
                          DeepLearningTask.fpropMiniBatch(-1 /*seed doesn't matter*/, neurons, dl.model_info(), null, true /*training*/, responses, offsets, n);

                          // check that we didn't change the model's weights/biases
                          long after = dl.model_info().checksum_impl();
                          assert (after == before);

                          // record the gradient since gradientChecking is enabled
                          DeepLearningModelInfo.gradientCheck = new DeepLearningModelInfo.GradientCheck(layer, row, col); //tell it what gradient to collect
                          DeepLearningTask.bpropMiniBatch(neurons, n); //update the weights
                          assert (before != dl.model_info().checksum_impl());

                          // reset the model back to the trained model
                          dl.set_model_info(IcedUtils.deepCopy(modelInfo));
                          assert (before == dl.model_info().checksum_impl());

                          double bpropGradient = DeepLearningModelInfo.gradientCheck.gradient;

                          // FIXME: re-enable this once the loss is computed from the de-standardized prediction/response
                          // double actualResponse=myRow.response[0];
                          // double predResponseLinkSpace = neurons[neurons.length-1]._a.get(0);
                          // if (di._normRespMul != null) {
                          //   bpropGradient /= di._normRespMul[0]; //no shift for gradient
                          //   actualResponse = (actualResponse / di._normRespMul[0] + di._normRespSub[0]);
                          //   predResponseLinkSpace = (predResponseLinkSpace / di._normRespMul[0] + di._normRespSub[0]);
                          // }
                          // bpropGradient *= new Distribution(parms._distribution).gradient(actualResponse, predResponseLinkSpace);

                          final float weight = dl.model_info().get_weights(layer).get(row, col);

                          double eps = 1e-4 * Math.abs(weight); //don't make the weight deltas too small, or the float weights "won't notice"
                          if (eps == 0)
                            eps = 1e-6;

                          // loss at weight + eps
                          dl.model_info().get_weights(layer).set(row, col, (float) (weight + eps));
                          double up = dl.meanLoss(rowsMiniBatch);

                          // loss at weight - eps
                          dl.model_info().get_weights(layer).set(row, col, (float) (weight - eps));
                          double down = dl.meanLoss(rowsMiniBatch);

                          if (Math.abs(up - down) / Math.abs(up + down) < 1e-8) {
                            continue; //relative change in loss function is too small -> skip
                          }

                          double gradient = ((up - down) / (2. * eps));

                          double relError = 2 * Math.abs(bpropGradient - gradient) / (Math.abs(gradient) + Math.abs(bpropGradient));

                          count++;

                          // if either gradient is tiny, check if both are tiny
                          if (Math.abs(gradient) < 1e-7 || Math.abs(bpropGradient) < 1e-7) {
                            if (Math.abs(bpropGradient - gradient) < 1e-7) continue; //all good
                          }

                          meanRelErr += relError;

                          // if both gradients are tiny - numerically unstable relative error computation is not needed, since absolute error is small
                          if (relError > MAX_TOLERANCE) {
                            Log.info("\nDistribution: " + dl._parms._distribution);
                            Log.info("\nRow: " + rId);
                            Log.info("weight (layer " + layer + ", row " + row + ", col " + col + "): " + weight + " +/- " + eps);
                            Log.info("loss: " + loss);
                            Log.info("losses up/down: " + up + " / " + down);
                            Log.info("=> Finite differences gradient: " + gradient);
                            Log.info("=> Back-propagation gradient  : " + bpropGradient);
                            Log.info("=> Relative error             : " + PrettyPrint.formatPct(relError));
                            failedcount++;
                          }
                          // Assert.assertTrue(failedcount==0);

                          maxRelErr = Math.max(maxRelErr, relError);
                          assert (!Double.isNaN(maxRelErr));
                        }
                      }
                    }
                  }
                  meanLoss /= tfr.numRows();
                  Log.info("Mean loss: " + meanLoss);

                  // // FIXME: re-enable this
                  // if (parms._l1 == 0 && parms._l2 == 0) {
                  //   assert(Math.abs(meanLoss-resdev)/Math.abs(resdev) < 1e-5);
                  // }
                } catch (RuntimeException ex) {
                  dl = DKV.getGet(job.dest());
                  if (dl != null)
                    Assert.assertTrue(dl.model_info().isUnstable());
                  else
                    Assert.assertTrue(job.isStopped());
                } finally {
                  if (dl != null) dl.delete();
                }
              }
            }
          }
        }
      }
      Log.info("Number of tests: " + count);
      Log.info("Number of failed tests: " + failedcount);
      Log.info("Mean relative error: " + meanRelErr / count);
      Log.info("Max relative error: " + PrettyPrint.formatPct(maxRelErr));
      Assert.assertTrue("Error too large: " + maxRelErr + " >= " + MAX_TOLERANCE, maxRelErr < MAX_TOLERANCE);
      Assert.assertTrue("Failed count too large: " + failedcount + " > " + MAX_FAILED_COUNT, failedcount <= MAX_FAILED_COUNT);

    } finally {
      if (tfr != null) tfr.remove();
    }
  }

  @Test
  public void checkDistributionGradients() {
    Random rng = new Random(0xDECAF);
    for (DistributionFamily dist : new DistributionFamily[]{
            DistributionFamily.AUTO,
            DistributionFamily.gaussian,
            DistributionFamily.laplace,
            DistributionFamily.quantile,
            DistributionFamily.huber,
            DistributionFamily.gamma,
            DistributionFamily.poisson,
            DistributionFamily.tweedie,
            DistributionFamily.bernoulli,
            // DistributionFamily.modified_huber,
            // DistributionFamily.multinomial, //no gradient/deviance implemented
    }) {
      DeepLearningParameters p = new DeepLearningParameters();
      p._distribution = dist;
      int N = 1000;
      double eps = 1. / (10. * N);
      for (double y : new double[]{0, 1}) { //actual response - tailored for binomial, but should work for regression too
        // scan the range -5..5 in function approximation space (link space)
        for (int i = -5 * N; i < 5 * N; ++i) {
          p._huber_alpha = rng.nextDouble() + 0.1;
          p._tweedie_power = 1.01 + rng.nextDouble() * 0.9;
          p._quantile_alpha = 0.05 + rng.nextDouble() * 0.9;
          Distribution d = new Distribution(p);
          double f = (i + 0.5) / N; // avoid issues at 0
          double grad = -2 * d.negHalfGradient(y, f); //f in link space (model space)
          double w = rng.nextDouble() * 10;
          double approxgrad = (d.deviance(w, y, d.linkInv(f + eps)) - d.deviance(w, y, d.linkInv(f - eps))) / (2 * eps * w); //deviance in real space
          assert (Math.abs(grad - approxgrad) <= 1e-4);
        }
      }
    }
  }
}
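
/**
 * Minimal illustrative sketch, not part of the original H2O test: it reduces the
 * central-difference gradient approximation and the symmetric relative-error metric
 * used by the checks above to a self-contained example. The class name
 * CentralDifferenceSketch and its methods are hypothetical additions for
 * documentation purposes only and are not referenced by the test code.
 */
class CentralDifferenceSketch {
  /** Central difference: d(loss)/d(theta) ~= (loss(theta + eps) - loss(theta - eps)) / (2 * eps). */
  static double numericGradient(java.util.function.DoubleUnaryOperator loss, double theta, double eps) {
    return (loss.applyAsDouble(theta + eps) - loss.applyAsDouble(theta - eps)) / (2. * eps);
  }

  /** Symmetric relative error, the same form compared against MAX_TOLERANCE above. */
  static double relativeError(double analytic, double numeric) {
    return 2 * Math.abs(analytic - numeric) / (Math.abs(analytic) + Math.abs(numeric));
  }

  /** Usage example: for loss(theta) = theta^2 the analytic gradient at theta = 3 is 6. */
  public static void main(String[] args) {
    double numeric = numericGradient(t -> t * t, 3.0, 1e-4);
    System.out.println("numeric gradient: " + numeric
            + ", relative error vs. analytic 6.0: " + relativeError(6.0, numeric));
  }
}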