package hex.deeplearning;
import hex.deeplearning.DeepLearningModel.DeepLearningParameters;
import hex.DataInfo;
import hex.Distribution;
import hex.FrameTask;
import hex.ModelMetricsRegression;
import hex.genmodel.utils.DistributionFamily;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.Log;
import water.util.PrettyPrint;
import java.util.Random;
public class DeepLearningGradientCheck extends TestUtil {
@BeforeClass public static void stall() { stall_till_cloudsize(1); }
static final float MAX_TOLERANCE = 2e-2f;
static final int MAX_FAILED_COUNT = 30;
static final float SAMPLE_RATE = 0.01f;
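// Gradient check for Deep Learning back-propagation:
// train a small model, then for sampled weights/biases perturb the coefficient by +/- eps,
// recompute the mean loss, and compare the central finite-difference gradient
// (L(w+eps) - L(w-eps)) / (2*eps) against the gradient recorded during back-propagation.
// The test fails if the largest relative error reaches MAX_TOLERANCE, or if more than
// MAX_FAILED_COUNT coefficients exceed the tolerance.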
@Test
public void gradientCheck() {
Frame tfr = null;
DeepLearningModel dl = null;
try {
tfr = parse_test_file("smalldata/glm_test/cancar_logIn.csv");
for (String s : new String[]{
"Merit", "Class"
}) {
Vec f = tfr.vec(s).toCategoricalVec();
tfr.remove(s).remove();
tfr.add(s, f);
}
DKV.put(tfr);
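// derive a binary 0/1 response from "Class" (1 where the category index is 1) for the binomial families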
tfr.add("Binary", tfr.anyVec().makeZero());
new MRTask() {
@Override public void map(Chunk[] c) {
for (int i=0;i<c[0]._len;++i)
if (c[0].at8(i)==1) c[1].set(i,1);
}
}.doAll(tfr.vecs(new String[]{"Class","Binary"}));
Vec cv = tfr.vec("Binary").toCategoricalVec();
tfr.remove("Binary").remove();
tfr.add("Binary", cv);
DKV.put(tfr);
Random rng = new Random(0xDECAF);
int count=0;
int failedcount=0;
double maxRelErr = 0;
double meanRelErr = 0;
for (DistributionFamily dist : new DistributionFamily[]{
DistributionFamily.gaussian,
DistributionFamily.laplace,
DistributionFamily.quantile,
DistributionFamily.huber,
// DistributionFamily.modified_huber,
DistributionFamily.gamma,
DistributionFamily.poisson,
DistributionFamily.AUTO,
DistributionFamily.tweedie,
DistributionFamily.multinomial,
DistributionFamily.bernoulli,
}) {
for (DeepLearningParameters.Activation act : new DeepLearningParameters.Activation[]{
// DeepLearningParameters.Activation.ExpRectifier,
DeepLearningParameters.Activation.Tanh,
DeepLearningParameters.Activation.Rectifier,
// DeepLearningParameters.Activation.Maxout,
}) {
for (String response : new String[]{
"Binary", //binary classification
"Class", //multi-class
"Cost", //regression
}) {
for (boolean adaptive : new boolean[]{
true,
false
}) {
for (int miniBatchSize : new int[]{
1
}) {
if (response.equals("Class")) {
if (dist != DistributionFamily.multinomial && dist != DistributionFamily.AUTO)
continue;
}
else if (response.equals("Binary")) {
if (dist != DistributionFamily.modified_huber && dist != DistributionFamily.bernoulli && dist != DistributionFamily.AUTO)
continue;
}
else {
if (dist == DistributionFamily.multinomial || dist == DistributionFamily.modified_huber || dist == DistributionFamily.bernoulli) continue;
}
DeepLearningParameters parms = new DeepLearningParameters();
parms._huber_alpha = rng.nextDouble()+0.1;
parms._tweedie_power = 1.01 + rng.nextDouble()*0.9;
parms._quantile_alpha = 0.05 + rng.nextDouble()*0.9;
parms._train = tfr._key;
parms._epochs = 100; //converge to a reasonable model to avoid too large gradients
parms._l1 = 1e-3;
parms._l2 = 1e-3;
parms._force_load_balance = false;
parms._hidden = new int[]{10, 10, 10};
parms._fast_mode = false; //otherwise we introduce small bprop errors
parms._response_column = response;
parms._distribution = dist;
parms._max_w2 = 10;
parms._seed = 0xaaabbb;
parms._activation = act;
parms._adaptive_rate = adaptive;
parms._rate = 1e-4;
parms._momentum_start = 0.9;
parms._momentum_stable = 0.99;
parms._mini_batch_size = miniBatchSize;
// DeepLearningModelInfo.gradientCheck = null;
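// gradientCheck is a static hook: when set, back-propagation records the gradient of the requested (layer, row, col) coefficient in gradientCheck.gradient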
DeepLearningModelInfo.gradientCheck = new DeepLearningModelInfo.GradientCheck(0, 0, 0); //tell it what gradient to collect
// Build a first model; all remaining models should be equal
DeepLearning job = new DeepLearning(parms);
try {
dl = job.trainModel().get();
boolean classification = response.equals("Class") || response.equals("Binary");
if (!classification) {
Frame p = dl.score(tfr);
hex.ModelMetrics mm = hex.ModelMetrics.getFromDKV(dl, tfr);
double resdev = ((ModelMetricsRegression) mm)._mean_residual_deviance;
Log.info("Mean residual deviance: " + resdev);
p.delete();
}
DeepLearningModelInfo modelInfo = IcedUtils.deepCopy(dl.model_info()); //golden version
// Log.info(modelInfo.toStringAll());
long before = dl.model_info().checksum_impl();
float meanLoss = 0;
// loop over every row in the dataset and compare the back-propagated gradients against finite differences
for (int rId = 0; rId < tfr.numRows(); rId+=1 /*miniBatchSize*/) {
// start from scratch - with a clean model
dl.set_model_info(IcedUtils.deepCopy(modelInfo));
final DataInfo di = dl.model_info().data_info();
// populate miniBatch (consecutive rows)
final DataInfo.Row[] rowsMiniBatch = new DataInfo.Row[miniBatchSize];
for (int i=0; i<rowsMiniBatch.length; ++i) {
if (0 <= rId+i && rId+i < tfr.numRows()) {
rowsMiniBatch[i] = new FrameTask.ExtractDenseRow(di, rId+i).doAll(di._adaptedFrame)._row;
}
}
// loss at weight
long cs = dl.model_info().checksum_impl();
double loss = dl.meanLoss(rowsMiniBatch);
assert(cs == before);
assert(before == dl.model_info().checksum_impl());
meanLoss += loss;
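// layer indices 0..hidden.length cover all weight matrices/bias vectors, from input->hidden1 through hiddenN->output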
for (int layer = 0; layer <= parms._hidden.length; ++layer) {
int rows = dl.model_info().get_weights(layer).rows();
assert(dl.model_info().get_biases(layer).size()==rows);
for (int row = 0; row < rows; ++row) {
//check bias
{
// start from scratch - with a clean model
dl.set_model_info(IcedUtils.deepCopy(modelInfo));
// do one forward propagation pass (and fill the mini-batch gradients -> set training=true)
Neurons[] neurons = DeepLearningTask.makeNeuronsForTraining(dl.model_info());
double[] responses = new double[miniBatchSize];
double[] offsets = new double[miniBatchSize];
int n = 0;
for (DataInfo.Row myRow : rowsMiniBatch) {
if (myRow == null) continue;
((Neurons.Input) neurons[0]).setInput(-1, myRow.numIds, myRow.numVals, myRow.nBins, myRow.binIds, n);
responses[n] = myRow.response(0);
offsets[n] = myRow.offset;
n++;
}
DeepLearningTask.fpropMiniBatch(-1 /*seed doesn't matter*/, neurons, dl.model_info(), null, true /*training*/, responses, offsets, n);
// check that we didn't change the model's weights/biases
long after = dl.model_info().checksum_impl();
assert (after == before);
// record the gradient since gradientChecking is enabled
DeepLearningModelInfo.gradientCheck = new DeepLearningModelInfo.GradientCheck(layer, row, -1); //tell it what gradient to collect
DeepLearningTask.bpropMiniBatch(neurons, n); //update the weights and biases
assert (before != dl.model_info().checksum_impl());
// reset the model back to the trained model
dl.set_model_info(IcedUtils.deepCopy(modelInfo));
assert (before == dl.model_info().checksum_impl());
double bpropGradient = DeepLearningModelInfo.gradientCheck.gradient;
// FIXME: re-enable this once the loss is computed from the de-standardized prediction/response
// double actualResponse=myRow.response[0];
// double predResponseLinkSpace = neurons[neurons.length-1]._a.get(0);
// if (di._normRespMul != null) {
// bpropGradient /= di._normRespMul[0]; //no shift for gradient
// actualResponse = (actualResponse / di._normRespMul[0] + di._normRespSub[0]);
// predResponseLinkSpace = (predResponseLinkSpace / di._normRespMul[0] + di._normRespSub[0]);
// }
// bpropGradient *= new Distribution(parms._distribution).gradient(actualResponse, predResponseLinkSpace);
final double bias = dl.model_info().get_biases(layer).get(row);
double eps = 1e-4 * Math.abs(bias); //don't make the bias delta too small, or the float biases "won't notice"
if (eps == 0)
eps = 1e-6;
// loss at bias + eps
dl.model_info().get_biases(layer).set(row, bias + eps);
double up = dl.meanLoss(rowsMiniBatch);
// loss at bias - eps
dl.model_info().get_biases(layer).set(row, bias - eps);
double down = dl.meanLoss(rowsMiniBatch);
if (Math.abs(up - down) / Math.abs(up + down) < 1e-8) {
continue; //relative change in loss function is too small -> skip
}
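// central finite difference: dL/dbias ~= (L(bias+eps) - L(bias-eps)) / (2*eps), truncation error O(eps^2)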
double gradient = ((up - down) / (2. * eps));
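// symmetric relative error 2|a-b| / (|a|+|b|): scale-invariant and bounded by 2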
double relError = 2 * Math.abs(bpropGradient - gradient) / (Math.abs(gradient) + Math.abs(bpropGradient));
count++;
// if either gradient is tiny, the relative error is numerically unstable; just require that both are tiny (a small absolute error is good enough)
if (Math.abs(gradient) < 1e-7 || Math.abs(bpropGradient) < 1e-7) {
if (Math.abs(bpropGradient - gradient) < 1e-7) continue; //both tiny -> all good
}
meanRelErr += relError;
if (relError > MAX_TOLERANCE) {
Log.info("\nDistribution: " + dl._parms._distribution);
Log.info("\nRow: " + rId);
Log.info("bias (layer " + layer + ", row " + row + "): " + bias + " +/- " + eps);
Log.info("loss: " + loss);
Log.info("losses up/down: " + up + " / " + down);
Log.info("=> Finite differences gradient: " + gradient);
Log.info("=> Back-propagation gradient : " + bpropGradient);
Log.info("=> Relative error : " + PrettyPrint.formatPct(relError));
failedcount++;
}
maxRelErr = Math.max(maxRelErr, relError);
assert(!Double.isNaN(maxRelErr));
}
int cols = dl.model_info().get_weights(layer).cols();
for (int col = 0; col < cols; ++col) {
if (rng.nextFloat() >= SAMPLE_RATE) continue;
// start from scratch - with a clean model
dl.set_model_info(IcedUtils.deepCopy(modelInfo));
// do one forward propagation pass (and fill the mini-batch gradients -> set training=true)
Neurons[] neurons = DeepLearningTask.makeNeuronsForTraining(dl.model_info());
double [] responses = new double[miniBatchSize];
double [] offsets = new double[miniBatchSize];
int n=0;
for (DataInfo.Row myRow : rowsMiniBatch) {
if (myRow == null) continue;
((Neurons.Input) neurons[0]).setInput(-1, myRow.numIds, myRow.numVals, myRow.nBins, myRow.binIds, n);
responses[n] = myRow.response(0);
offsets[n] = myRow.offset;
n++;
}
DeepLearningTask.fpropMiniBatch(-1 /*seed doesn't matter*/, neurons, dl.model_info(), null, true /*training*/, responses, offsets, n);
// check that we didn't change the model's weights/biases
long after = dl.model_info().checksum_impl();
assert (after == before);
// record the gradient since gradientChecking is enabled
DeepLearningModelInfo.gradientCheck = new DeepLearningModelInfo.GradientCheck(layer, row, col); //tell it what gradient to collect
DeepLearningTask.bpropMiniBatch(neurons, n); //update the weights and biases
assert (before != dl.model_info().checksum_impl());
// reset the model back to the trained model
dl.set_model_info(IcedUtils.deepCopy(modelInfo));
assert (before == dl.model_info().checksum_impl());
double bpropGradient = DeepLearningModelInfo.gradientCheck.gradient;
// FIXME: re-enable this once the loss is computed from the de-standardized prediction/response
// double actualResponse=myRow.response[0];
// double predResponseLinkSpace = neurons[neurons.length-1]._a.get(0);
// if (di._normRespMul != null) {
// bpropGradient /= di._normRespMul[0]; //no shift for gradient
// actualResponse = (actualResponse / di._normRespMul[0] + di._normRespSub[0]);
// predResponseLinkSpace = (predResponseLinkSpace / di._normRespMul[0] + di._normRespSub[0]);
// }
// bpropGradient *= new Distribution(parms._distribution).gradient(actualResponse, predResponseLinkSpace);
final float weight = dl.model_info().get_weights(layer).get(row, col);
double eps = 1e-4 * Math.abs(weight); //don't make the weight deltas too small, or the float weights "won't notice"
if (eps == 0)
eps = 1e-6;
// loss at weight + eps
dl.model_info().get_weights(layer).set(row, col, (float)(weight + eps));
double up = dl.meanLoss(rowsMiniBatch);
// loss at weight - eps
dl.model_info().get_weights(layer).set(row, col, (float)(weight - eps));
double down = dl.meanLoss(rowsMiniBatch);
if (Math.abs(up-down)/Math.abs(up+down) < 1e-8) {
continue; //relative change in loss function is too small -> skip
}
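// same central-difference vs. back-propagation comparison as for the biases above, now for the weight at (row, col)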
double gradient = ((up - down) / (2. * eps));
double relError = 2 * Math.abs(bpropGradient - gradient) / (Math.abs(gradient) + Math.abs(bpropGradient));
count++;
// if either gradient is tiny, the relative error is numerically unstable; just require that both are tiny (a small absolute error is good enough)
if (Math.abs(gradient) < 1e-7 || Math.abs(bpropGradient) < 1e-7) {
if (Math.abs(bpropGradient-gradient) < 1e-7) continue; //both tiny -> all good
}
meanRelErr += relError;
if (relError > MAX_TOLERANCE) {
Log.info("\nDistribution: " + dl._parms._distribution);
Log.info("\nRow: " + rId);
Log.info("weight (layer " + layer + ", row " + row + ", col " + col + "): " + weight + " +/- " + eps);
Log.info("loss: " + loss);
Log.info("losses up/down: " + up + " / " + down);
Log.info("=> Finite differences gradient: " + gradient);
Log.info("=> Back-propagation gradient : " + bpropGradient);
Log.info("=> Relative error : " + PrettyPrint.formatPct(relError));
failedcount++;
}
// Assert.assertTrue(failedcount==0);
maxRelErr = Math.max(maxRelErr, relError);
assert(!Double.isNaN(maxRelErr));
}
}
}
}
meanLoss /= tfr.numRows();
Log.info("Mean loss: " + meanLoss);
// // FIXME: re-enable this
// if (parms._l1 == 0 && parms._l2 == 0) {
// assert(Math.abs(meanLoss-resdev)/Math.abs(resdev) < 1e-5);
// }
} catch(RuntimeException ex) {
dl = DKV.getGet(job.dest());
if (dl != null)
Assert.assertTrue(dl.model_info().isUnstable());
else
Assert.assertTrue(job.isStopped());
} finally {
if (dl != null) dl.delete();
}
}
}
}
}
}
Log.info("Number of tests: " + count);
Log.info("Number of failed tests: " + failedcount);
Log.info("Mean relative error: " + meanRelErr/count);
Log.info("Max. relative error: " + PrettyPrint.formatPct(maxRelErr));
Assert.assertTrue("Error too large: " + maxRelErr + " >= " + MAX_TOLERANCE, maxRelErr < MAX_TOLERANCE);
Assert.assertTrue("Failed count too large: " + failedcount + " > " + MAX_FAILED_COUNT, failedcount <= MAX_FAILED_COUNT);
} finally {
if (tfr != null) tfr.remove();
}
}
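// Check each DistributionFamily's analytic gradient against a finite-difference derivative of its deviance:
// for a prediction f in link space, d/df deviance(w, y, linkInv(f)) should equal -2 * w * negHalfGradient(y, f).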
@Test public void checkDistributionGradients() {
Random rng = new Random(0xDECAF);
for (DistributionFamily dist : new DistributionFamily[]{
DistributionFamily.AUTO,
DistributionFamily.gaussian,
DistributionFamily.laplace,
DistributionFamily.quantile,
DistributionFamily.huber,
DistributionFamily.gamma,
DistributionFamily.poisson,
DistributionFamily.tweedie,
DistributionFamily.bernoulli,
// DistributionFamily.modified_huber,
// DistributionFamily.multinomial, //no gradient/deviance implemented
}) {
DeepLearningParameters p = new DeepLearningParameters();
p._distribution = dist;
int N=1000;
double eps=1./(10.*N);
for (double y : new double[]{0,1}) { //actual response - tailored for binomial, but should work for regression too
// scan the range -5..5 in function approximation space (link space)
for (int i=-5*N; i<5*N; ++i) {
p._huber_alpha = rng.nextDouble()+0.1;
p._tweedie_power = 1.01 + rng.nextDouble()*0.9;
p._quantile_alpha = 0.05 + rng.nextDouble()*0.9;
Distribution d = new Distribution(p);
double f = (i+0.5)/N; // avoid issues at 0
double grad = -2*d.negHalfGradient(y, f); //f in link space (model space)
double w = rng.nextDouble()*10;
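// finite difference of the deviance through linkInv (deviance is defined in real space); divide by w because the per-row deviance is weighted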
double approxgrad = (d.deviance(w,y,d.linkInv(f+eps)) - d.deviance(w,y,d.linkInv(f-eps)))/(2*eps*w); //deviance in real space
assert(Math.abs(grad - approxgrad) <= 1e-4);
}
}
}
}
}