package hex;

import hex.deeplearning.DeepLearning;
import hex.deeplearning.DeepLearningModel;
import hex.genmodel.utils.DistributionFamily;
import hex.glm.GLM;
import hex.glm.GLMModel;
import hex.tree.drf.DRF;
import hex.tree.drf.DRFModel;
import hex.tree.gbm.GBM;
import hex.tree.gbm.GBMModel;
import org.junit.BeforeClass;
import org.junit.Test;
import water.*;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.Vec;
import water.rapids.ast.prims.advmath.AstKFold;
import water.util.ArrayUtils;

import static org.junit.Assert.assertEquals;

/**
 * This test corroborates the documented behavior of cross-validated predictions produced
 * during model building. The per-fold holdout prediction frames have identifiers of the
 * form *_cv_1, *_cv_2, ..., *_cv_n.
 *
 * The test builds GBM, DRF, GLM, and DL models with a randomized fold column, and checks
 * that each *_cv_n frame contains predictions consistent with the fold column on the
 * original frame (rows outside a fold's holdout set must have no predictions).
 */
public class XValPredictionsCheck extends TestUtil {
  @BeforeClass()
  public static void setup() { stall_till_cloudsize(1); }

  @Test
  public void testXValPredictions() {
    final int nfolds = 3;
    Frame tfr = null;
    try {
      // Load data and append a randomized fold column
      tfr = parse_test_file("smalldata/iris/iris_wheader.csv");
      Frame foldId = new Frame(new String[]{"foldId"},
          new Vec[]{AstKFold.kfoldColumn(tfr.vec("class").makeZero(), nfolds, 543216789)});
      tfr.add(foldId);
      DKV.put(tfr);

      // GBM
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "class";
      parms._ntrees = 1;
      parms._max_depth = 1;
      parms._fold_column = "foldId";
      parms._distribution = DistributionFamily.multinomial;
      parms._keep_cross_validation_predictions = true;
      GBM job = new GBM(parms);
      GBMModel gbm = job.trainModel().get();
      checkModel(gbm, foldId.anyVec(), 3);

      // DRF
      DRFModel.DRFParameters parmsDRF = new DRFModel.DRFParameters();
      parmsDRF._train = tfr._key;
      parmsDRF._response_column = "class";
      parmsDRF._ntrees = 1;
      parmsDRF._max_depth = 1;
      parmsDRF._fold_column = "foldId";
      parmsDRF._distribution = DistributionFamily.multinomial;
      parmsDRF._keep_cross_validation_predictions = true;
      DRF drfJob = new DRF(parmsDRF);
      DRFModel drf = drfJob.trainModel().get();
      checkModel(drf, foldId.anyVec(), 3);

      // GLM
      GLMModel.GLMParameters parmsGLM = new GLMModel.GLMParameters();
      parmsGLM._train = tfr._key;
      parmsGLM._response_column = "sepal_len";
      parmsGLM._fold_column = "foldId";
      parmsGLM._keep_cross_validation_predictions = true;
      GLM glmJob = new GLM(parmsGLM);
      GLMModel glm = glmJob.trainModel().get();
      checkModel(glm, foldId.anyVec(), 1);

      // DL
      DeepLearningModel.DeepLearningParameters parmsDL = new DeepLearningModel.DeepLearningParameters();
      parmsDL._train = tfr._key;
      parmsDL._response_column = "class";
      parmsDL._hidden = new int[]{1};
      parmsDL._epochs = 1;
      parmsDL._fold_column = "foldId";
      parmsDL._keep_cross_validation_predictions = true;
      DeepLearning dlJob = new DeepLearning(parmsDL);
      DeepLearningModel dl = dlJob.trainModel().get();
      checkModel(dl, foldId.anyVec(), 3);
    } finally {
      if (tfr != null) tfr.remove();
    }
  }

  void checkModel(Model m, Vec foldId, int nclass) {
    // DRF reports out-of-bag metrics instead of true training metrics, so nobs may differ
    if (!(m instanceof DRFModel))
      assertEquals(m._output._training_metrics._nobs, m._output._cross_validation_metrics._nobs);
    m.delete();
    m.deleteCrossValidationModels();
    Key[] xvalKeys = m._output._cross_validation_predictions;
    Key xvalKey = m._output._cross_validation_holdout_predictions_frame_id;
    final int[] id = new int[1];
    for (Key k : xvalKeys) {
      Frame preds = DKV.getGet(k);
      assert preds.numRows() == foldId.length();
      // Column 0 is the fold assignment; columns 1..nclass are the per-class predictions
      Vec[] vecs = new Vec[nclass + 1];
      vecs[0] = foldId;
      if (nclass == 1)
        vecs[1] = preds.anyVec();
      else
        System.arraycopy(preds.vecs(ArrayUtils.range(1, nclass)), 0, vecs, 1, nclass);
      new MRTask() {
        @Override
        public void map(Chunk[] cs) {
          Chunk foldId = cs[0];
          for (int r = 0; r < cs[0]._len; ++r)
            if (foldId.at8(r) != id[0])
              for (int i = 1; i < cs.length; ++i)
                assert cs[i].atd(r) == 0; // no prediction for this row!
        }
      }.doAll(vecs);
      id[0]++;
      preds.delete();
    }
    xvalKey.remove();
  }
}