package hex.deeplearning; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import water.*; import water.fvec.*; import water.parser.ParseDataset; import water.util.FileUtils; import water.util.Log; import hex.deeplearning.DeepLearningModel.DeepLearningParameters; import java.util.HashSet;

/**
 * End-to-end test of the Deep Learning autoencoder on the ECG discord anomaly data.
 * Trains an autoencoder (with and without a sparsity constraint), checks that the
 * self-reported reconstruction MSE matches a manually recomputed per-row L2 error,
 * verifies POJO scoring consistency and deep-feature extraction shapes, and asserts
 * that exactly the three known anomalous test rows (20, 21, 22) are flagged as outliers.
 */
public class DeepLearningAutoEncoderTest extends TestUtil {
  /*
   Visualize outliers with the following R code (from smalldata/anomaly dir):
     train <- scan("ecg_discord_train.csv", sep=",")
     test <- scan("ecg_discord_test.csv", sep=",")
     plot.ts(train)
     plot.ts(test)
   */
  static final String PATH = "smalldata/anomaly/ecg_discord_train.csv"; //first 20 points
  static final String PATH2 = "smalldata/anomaly/ecg_discord_test.csv"; //first 22 points

  @BeforeClass()
  public static void setup() {
    stall_till_cloudsize(1);
  }

  @Test
  public void run() {
    long seed = 0xDECAF; // fixed seed for reproducible training
    Frame train = null, test = null;
    try {
      // Parse train/test CSVs into H2O frames.
      NFSFileVec nfs = TestUtil.makeNfsFileVec(PATH);
      train = ParseDataset.parse(Key.make("train.hex"), nfs._key);
      NFSFileVec nfs2 = TestUtil.makeNfsFileVec(PATH2);
      test = ParseDataset.parse(Key.make("test.hex"), nfs2._key);

      // Run the whole scenario twice: without and with a sparsity penalty.
      for (float sparsity_beta : new float[]{0, 0.1f}) {
        DeepLearningParameters p = new DeepLearningParameters();
        p._train = train._key;
        p._valid = test._key;
        p._autoencoder = true;
        p._response_column = train.names()[train.names().length - 1];
        p._seed = seed;
        p._hidden = new int[]{37, 12}; // layer sizes checked again below via scoreDeepFeatures
        p._adaptive_rate = true;
        p._train_samples_per_iteration = -1;
        p._sparsity_beta = sparsity_beta;
        p._average_activation = -0.7;
        p._l1 = 1e-4;
        p._activation = DeepLearningParameters.Activation.TanhWithDropout;
        p._loss = DeepLearningParameters.Loss.Absolute;
        p._epochs = 13.3;
        p._force_load_balance = true;
        p._elastic_averaging = false;

        DeepLearning dl = new DeepLearning(p);
        DeepLearningModel mymodel = dl.trainModel().get();

        Frame l2_frame_train = null, l2_frame_test = null;

        // Verification of results
        StringBuilder sb = new StringBuilder();
        try {
          sb.append("Verifying results.\n");

          // Training data
          // Reconstruct data using the same helper functions and verify that self-reported MSE agrees
          double quantile = 0.95;
          l2_frame_test = mymodel.scoreAutoEncoder(test, Key.make(), true);
          sb.append("Reconstruction error per feature (test): ").append(l2_frame_test.toString()).append("\n");
          l2_frame_test.remove();
          l2_frame_test = mymodel.scoreAutoEncoder(test, Key.make(), false);
          Vec l2_test = l2_frame_test.anyVec();
          sb.append("Mean reconstruction error (test): ").append(l2_test.mean()).append("\n");
          // Model-reported MSE must agree with the mean of the per-row reconstruction error.
          Assert.assertEquals(mymodel.mse(), l2_test.mean(), 1e-7);
          Assert.assertTrue("too big a reconstruction error: " + l2_test.mean(), l2_test.mean() < 2.0);
          l2_test.remove();

          // Manually recompute the mean L2 reconstruction error in normalized space.
          Frame reconstr = mymodel.score(train); // this creates real values in original space
          Assert.assertTrue(mymodel.testJavaScoring(train, reconstr, 1e-6));
          l2_frame_train = mymodel.scoreAutoEncoder(train, Key.make(), false);
          final Vec l2_train = l2_frame_train.anyVec();
          double mean_l2 = 0;
          for (long r = 0; r < reconstr.numRows(); ++r) {
            double my_l2 = 0;
            for (int c = 0; c < reconstr.numCols(); ++c) {
              // undo normalization here
              my_l2 += Math.pow((reconstr.vec(c).at(r) - train.vec(c).at(r)) * mymodel.model_info().data_info()._normMul[c], 2);
            }
            my_l2 /= reconstr.numCols();
            mean_l2 += my_l2;
          }
          mean_l2 /= reconstr.numRows();
          reconstr.delete();
          sb.append("Mean reconstruction error (train): ").append(l2_train.mean()).append("\n");
          Assert.assertEquals(mymodel._output.errors.scored_train._mse, mean_l2, 1e-7);

          // Print stats and potential outliers among the training rows.
          sb.append("The following training points are reconstructed with an error above the ").append(quantile * 100).append("-th percentile - check for \"goodness\" of training data.\n");
          double thresh_train = mymodel.calcOutlierThreshold(l2_train, quantile);
          for (long i = 0; i < l2_train.length(); i++) {
            if (l2_train.at(i) > thresh_train) {
              sb.append(String.format("row %d : l2_train error = %5f\n", i, l2_train.at(i)));
            }
          }

          // Test data
          // Reconstruct data using the same helper functions and verify that self-reported MSE agrees
          l2_frame_test.remove();
          l2_frame_test = mymodel.scoreAutoEncoder(test, Key.make(), false);
          l2_test = l2_frame_test.anyVec();
          double mult = 10;
          double thresh_test = mult * thresh_train;
          sb.append("\nFinding outliers.\n");
          sb.append("Mean reconstruction error (test): ").append(l2_test.mean()).append("\n");

          // Deep-feature frames must match the configured hidden-layer sizes.
          Frame df1 = mymodel.scoreDeepFeatures(test, 0);
          Assert.assertEquals(37, df1.numCols());
          Assert.assertEquals(test.numRows(), df1.numRows());
          df1.delete();

          Frame df2 = mymodel.scoreDeepFeatures(test, 1);
          Assert.assertEquals(12, df2.numCols());
          Assert.assertEquals(test.numRows(), df2.numRows());
          df2.delete();

          // Print stats and potential outliers among the test rows.
          sb.append("The following test points are reconstructed with an error greater than ").append(mult).append(" times the mean reconstruction error of the training data:\n");
          HashSet<Long> outliers = new HashSet<>();
          for (long i = 0; i < l2_test.length(); i++) {
            if (l2_test.at(i) > thresh_test) {
              outliers.add(i);
              sb.append(String.format("row %d : l2 error = %5f\n", i, l2_test.at(i)));
            }
          }

          // Check that all the known anomalies are found (and nothing else).
          Assert.assertTrue(outliers.contains(20L));
          Assert.assertTrue(outliers.contains(21L));
          Assert.assertTrue(outliers.contains(22L));
          Assert.assertEquals(3, outliers.size());
        } finally {
          Log.info(sb);
          // cleanup
          if (mymodel != null) mymodel.delete();
          if (l2_frame_train != null) l2_frame_train.delete();
          if (l2_frame_test != null) l2_frame_test.delete();
        }
      }
    } finally {
      if (train != null) train.delete();
      if (test != null) test.delete();
    }
  }
}