package hex.pca;

import hex.DataInfo;
import hex.SplitFrame;
import hex.pca.PCAModel.PCAParameters;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.DKV;
import water.Key;
import water.Scope;
import water.TestUtil;
import water.fvec.Frame;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.FrameUtils;

import java.util.concurrent.ExecutionException;

/**
 * End-to-end tests for {@link PCA}: training against reference standard deviations /
 * eigenvectors computed in R, POJO scoring consistency, missing-value imputation,
 * Gram-matrix helpers, and key-leak regression checks.
 *
 * Expected values (eigenvectors may be sign-flipped per column; see
 * {@code TestUtil.checkEigvec}) come from R's {@code prcomp} on the same data sets.
 */
public class PCATest extends TestUtil {
  /** Absolute tolerance used when comparing against R reference results. */
  public static final double TOLERANCE = 1e-6;

  @BeforeClass
  public static void setup() {
    stall_till_cloudsize(1);
  }

  /**
   * Trains PCA (Power method) on USArrests with DEMEAN and STANDARDIZE transforms
   * and compares standard deviations and eigenvectors against R reference values.
   */
  @Test
  public void testArrests() throws InterruptedException, ExecutionException {
    // Results with de-meaned training frame
    double[] stddev = new double[] {83.732400, 14.212402, 6.489426, 2.482790};
    double[][] eigvec = ard(ard(0.04170432, -0.04482166, 0.07989066, -0.99492173),
                            ard(0.99522128, -0.05876003, -0.06756974, 0.03893830),
                            ard(0.04633575, 0.97685748, -0.20054629, -0.05816914),
                            ard(0.07515550, 0.20071807, 0.97408059, 0.07232502));

    // Results with standardized training frame
    double[] stddev_std = new double[] {1.5748783, 0.9948694, 0.5971291, 0.4164494};
    double[][] eigvec_std = ard(ard(-0.5358995, 0.4181809, -0.3412327, 0.64922780),
                                ard(-0.5831836, 0.1879856, -0.2681484, -0.74340748),
                                ard(-0.2781909, -0.8728062, -0.3780158, 0.13387773),
                                ard(-0.5434321, -0.1673186, 0.8177779, 0.08902432));

    Frame train = null;
    try {
      train = parse_test_file(Key.make("arrests.hex"), "smalldata/pca_test/USArrests.csv");  // TODO: Move this outside loop
      for (DataInfo.TransformType std : new DataInfo.TransformType[] {
          DataInfo.TransformType.DEMEAN, DataInfo.TransformType.STANDARDIZE }) {
        PCAModel model = null;
        try {
          PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
          parms._train = train._key;
          parms._k = 4;
          parms._transform = std;
          parms._max_iterations = 1000;
          parms._pca_method = PCAParameters.Method.Power;

          model = new PCA(parms).trainModel().get();

          if (std == DataInfo.TransformType.DEMEAN) {
            TestUtil.checkStddev(stddev, model._output._std_deviation, TOLERANCE);
            TestUtil.checkEigvec(eigvec, model._output._eigenvectors, TOLERANCE);
          } else if (std == DataInfo.TransformType.STANDARDIZE) {
            TestUtil.checkStddev(stddev_std, model._output._std_deviation, TOLERANCE);
            TestUtil.checkEigvec(eigvec_std, model._output._eigenvectors, TOLERANCE);
          }
        } finally {
          if( model != null ) model.delete();
        }
      }
    } finally {
      if(train != null) train.delete();
    }
  }

  /**
   * Trains PCA (GramSVD, no transform) on USArrests, checks against R reference
   * values, verifies projections against an R-scored frame (accounting for
   * per-column sign flips), and validates POJO scoring.
   */
  @Test
  public void testArrestsScoring() throws InterruptedException, ExecutionException {
    // Results with original training frame
    double[] stddev = new double[] {202.7230564, 27.8322637, 6.5230482, 2.5813652};
    double[][] eigvec = ard(ard(-0.04239181, 0.01616262, -0.06588426, 0.99679535),
                            ard(-0.94395706, 0.32068580, 0.06655170, -0.04094568),
                            ard(-0.30842767, -0.93845891, 0.15496743, 0.01234261),
                            ard(-0.10963744, -0.12725666, -0.98347101, -0.06760284));

    PCAModel model = null;
    Frame train = null, score = null, scoreR = null;
    try {
      train = parse_test_file(Key.make("arrests.hex"), "smalldata/pca_test/USArrests.csv");
      PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
      parms._train = train._key;
      parms._k = 4;
      parms._transform = DataInfo.TransformType.NONE;
      parms._pca_method = PCAParameters.Method.GramSVD;

      model = new PCA(parms).trainModel().get();
      TestUtil.checkStddev(stddev, model._output._std_deviation, 1e-5);
      boolean[] flippedEig = TestUtil.checkEigvec(eigvec, model._output._eigenvectors, 1e-5);

      score = model.score(train);
      scoreR = parse_test_file(Key.make("scoreR.hex"), "smalldata/pca_test/USArrests_PCAscore.csv");
      TestUtil.checkProjection(scoreR, score, TOLERANCE, flippedEig);    // Flipped cols must match those from eigenvectors

      // Build a POJO, validate same results
      Assert.assertTrue(model.testJavaScoring(train, score, 1e-5));
    } finally {
      if (train != null) train.delete();
      if (score != null) score.delete();
      if (scoreR != null) scoreR.delete();
      if (model != null) model.delete();
    }
  }

  /**
   * Trains PCA (Power method, all factor levels) on iris — mixed numeric and
   * categorical columns — and validates stddev/eigenvectors, R-scored projections,
   * and POJO scoring.
   */
  @Test
  public void testIrisScoring() throws InterruptedException, ExecutionException {
    // Results with original training frame
    double[] stddev = new double[] {7.88175203, 1.56002774, 0.59189816, 0.25917329, 0.15415273, 0.09381276, 0.04768590};
    double[][] eigvec = ard(ard(-0.03169051, -0.32305860, 0.185100382, -0.12336685, -0.14867156, 0.75932119, -0.496462912),
                            ard(-0.04289677, 0.04037565, -0.780961964, 0.19727933, 0.07251338, -0.12216945, -0.572298338),
                            ard(-0.05019689, 0.16836717, 0.551432201, -0.07122329, 0.08454116, -0.48327010, -0.647522462),
                            ard(-0.74915107, -0.26629420, -0.101102186, -0.48920057, 0.32458460, -0.09176909, 0.067412858),
                            ard(-0.37877011, -0.50636060, 0.142219195, 0.69081642, -0.26312992, -0.17811871, 0.041411296),
                            ard(-0.51177078, 0.65945159, -0.005079934, 0.04881900, -0.52128288, 0.17038367, 0.006223427),
                            ard(-0.16742875, 0.32166036, 0.145893901, 0.47102115, 0.72052968, 0.32523458, 0.020389463));

    PCAModel model = null;
    Frame train = null, score = null, scoreR = null;
    try {
      train = parse_test_file(Key.make("iris.hex"), "smalldata/iris/iris_wheader.csv");
      PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
      parms._train = train._key;
      parms._k = 7;
      parms._transform = DataInfo.TransformType.NONE;
      parms._use_all_factor_levels = true;
      parms._pca_method = PCAParameters.Method.Power;

      model = new PCA(parms).trainModel().get();
      TestUtil.checkStddev(stddev, model._output._std_deviation, 1e-5);
      boolean[] flippedEig = TestUtil.checkEigvec(eigvec, model._output._eigenvectors, 1e-5);

      score = model.score(train);
      scoreR = parse_test_file(Key.make("scoreR.hex"), "smalldata/pca_test/iris_PCAscore.csv");
      TestUtil.checkProjection(scoreR, score, TOLERANCE, flippedEig);    // Flipped cols must match those from eigenvectors

      // Build a POJO, validate same results
      Assert.assertTrue(model.testJavaScoring(train, score, 1e-5));
    } finally {
      if (train != null) train.delete();
      if (score != null) score.delete();
      if (scoreR != null) scoreR.delete();
      if (model != null) model.delete();
    }
  }

  /**
   * Splits iris 50/50 into train/validation frames, trains PCA (GramSVD) with a
   * validation frame, scores the held-out split, and validates POJO scoring.
   */
  @Test
  public void testIrisSplitScoring() throws InterruptedException, ExecutionException {
    PCAModel model = null;
    Frame fr = null, fr2 = null;
    Frame tr = null, te = null;
    try {
      fr = parse_test_file("smalldata/iris/iris_wheader.csv");
      SplitFrame sf = new SplitFrame(fr, new double[] { 0.5, 0.5 },
                                     new Key[] { Key.make("train.hex"), Key.make("test.hex")});

      // Invoke the job
      sf.exec().get();
      Key[] ksplits = sf._destination_frames;
      tr = DKV.get(ksplits[0]).get();
      te = DKV.get(ksplits[1]).get();

      PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
      parms._train = ksplits[0];
      parms._valid = ksplits[1];
      parms._k = 4;
      parms._max_iterations = 1000;
      parms._pca_method = PCAParameters.Method.GramSVD;
      model = new PCA(parms).trainModel().get();

      // Done building model; produce a score column with cluster choices
      fr2 = model.score(te);
      Assert.assertTrue(model.testJavaScoring(te, fr2, 1e-5));
    } finally {
      if( fr  != null ) fr .delete();
      if( fr2 != null ) fr2.delete();
      if( tr  != null ) tr .delete();
      if( te  != null ) te .delete();
      if (model != null) model.delete();
    }
  }

  /**
   * Inserts 75% missing values into USArrests and verifies PCA (GramSVD) trains
   * successfully with mean imputation ({@code _impute_missing = true}) instead of
   * skipping rows with NAs.
   */
  @Test
  public void testImputeMissing() throws InterruptedException, ExecutionException {
    Frame train = null;
    double missing_fraction = 0.75;
    long seed = 12345;

    try {
      train = parse_test_file(Key.make("arrests.hex"), "smalldata/pca_test/USArrests.csv");

      // Add missing values to the training data
      if (missing_fraction > 0) {
        Frame frtmp = new Frame(Key.<Frame>make(), train.names(), train.vecs());
        DKV.put(frtmp._key, frtmp);   // Need to put the frame (to be modified) into DKV for MissingInserter to pick up
        FrameUtils.MissingInserter j = new FrameUtils.MissingInserter(frtmp._key, seed, missing_fraction);
        j.execImpl().get();           // MissingInserter is non-blocking, must block here explicitly
        DKV.remove(frtmp._key);       // Delete the frame header (not the data)
      }

      PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
      parms._train = train._key;
      parms._k = 4;
      parms._transform = DataInfo.TransformType.NONE;
      parms._pca_method = PCAModel.PCAParameters.Method.GramSVD;
      parms._impute_missing = true;   // Don't skip rows with NA entries, but impute using mean of column
      parms._seed = seed;

      // Remove the model in a finally block so it cannot leak if training throws.
      PCAModel pca = null;
      try {
        pca = new PCA(parms).trainModel().get();
      } finally {
        if (pca != null) pca.remove();
      }
    } finally {
      if (train != null) train.delete();
    }
  }

  /** Checks {@link ArrayUtils#formGram} for both X'X and XX' against hand-computed values. */
  @Test
  public void testGram() {
    double[][] x = ard(ard(1, 2, 3), ard(4, 5, 6));
    double[][] xgram  = ard(ard(17, 22, 27), ard(22, 29, 36), ard(27, 36, 45));  // X'X
    double[][] xtgram = ard(ard(14, 32), ard(32, 77));                           // (X')'X' = XX'

    double[][] xgram_glrm  = ArrayUtils.formGram(x, false);
    double[][] xtgram_glrm = ArrayUtils.formGram(x, true);
    Assert.assertArrayEquals(xgram, xgram_glrm);
    Assert.assertArrayEquals(xtgram, xtgram_glrm);
  }

  /* Make sure POJO works if the model is only built from categorical variables (no numeric columns) */
  @Test
  public void testCatOnlyPUBDEV3988() throws InterruptedException, ExecutionException {
    PCAModel model = null;
    Frame train = null, score = null;
    try {
      train = parse_test_file(Key.make("prostate_cat.hex"), "smalldata/prostate/prostate_cat.csv");
      // Drop every non-categorical column. NOTE(review): the loop stops at i > 0,
      // so column 0 is never examined — confirm this is intentional (e.g. col 0 is
      // known to be categorical in this data set).
      for (int i = train.numCols() - 1; i > 0; i--) {
        Vec v = train.vec(i);
        if (v.get_type() != Vec.T_CAT) {
          train.remove(i);
          Vec.remove(v._key);
        }
      }
      DKV.put(train);

      PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
      parms._train = train._key;
      parms._k = 2;
      parms._transform = DataInfo.TransformType.STANDARDIZE;
      parms._use_all_factor_levels = true;
      parms._pca_method = PCAParameters.Method.GramSVD;
      parms._impute_missing = false;
      parms._seed = 12345;

      PCA pcaParms = new PCA(parms);
      model = pcaParms.trainModel().get(); // get normal data
      score = model.score(train);

      // Build a POJO, check results with original PCA
      Assert.assertTrue(model.testJavaScoring(train, score, TOLERANCE));
    } finally {
      if (train != null) train.delete();
      if (score != null) score.delete();
      if (model != null) model.delete();
    }
  }

  /* Quick test to make sure changes made to PCA for rank deficient matrices do not cause leakage. */
  @Test
  public void testPUBDEV3500NoLeakage() throws InterruptedException, ExecutionException {
    Scope.enter();
    Frame train = null;
    try {
      train = parse_test_file(Key.make("prostate_cat.hex"), "smalldata/prostate/prostate_cat.csv");
      Scope.track(train);

      PCAModel.PCAParameters parms = new PCAModel.PCAParameters();
      parms._train = train._key;
      parms._k = 3;
      parms._transform = DataInfo.TransformType.NONE;
      parms._pca_method = PCAModel.PCAParameters.Method.Randomized;
      parms._impute_missing = true;   // Don't skip rows with NA entries, but impute using mean of column
      parms._seed = 12345;
      parms._use_all_factor_levels = true;

      PCAModel pca = null;
      pca = new PCA(parms).trainModel().get();
      Scope.track_generic(pca);
      // assertEquals reports both values on failure, unlike assertTrue(a == b).
      Assert.assertEquals(pca._parms._k, pca._output._std_deviation.length);
    } finally {
      Scope.exit();
    }
  }
}