package hex.deeplearning; import hex.deeplearning.DeepLearningModel.DeepLearningParameters; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import water.Key; import water.TestUtil; import water.fvec.Frame; import water.fvec.NFSFileVec; import water.fvec.Vec; import water.parser.ParseDataset; import water.util.FileUtils; import water.util.Log; public class DeepLearningAutoEncoderCategoricalTest extends TestUtil { static final String PATH = "smalldata/airlines/AirlinesTrain.csv.zip"; @BeforeClass() public static void setup() { stall_till_cloudsize(1); } @Test public void run() { long seed = 0xDECAF; NFSFileVec nfs = TestUtil.makeNfsFileVec(PATH); Frame train = ParseDataset.parse(Key.make("train.hex"), nfs._key); DeepLearningParameters p = new DeepLearningParameters(); p._train = train._key; p._autoencoder = true; p._response_column = train.names()[train.names().length-1]; p._seed = seed; p._hidden = new int[]{10, 5, 3}; p._adaptive_rate = true; // String[] n = train.names(); // p._ignored_columns = new String[]{n[0],n[1],n[2],n[3],n[6],n[7],n[8],n[10]}; //Optional: ignore all categoricals // p._ignored_columns = new String[]{train.names()[4], train.names()[5], train.names()[9]}; //Optional: ignore all numericals p._l1 = 1e-4; p._activation = DeepLearningParameters.Activation.Tanh; p._max_w2 = 10; p._train_samples_per_iteration = -1; p._loss = DeepLearningParameters.Loss.Huber; p._epochs = 0.2; p._force_load_balance = true; p._score_training_samples = 0; p._score_validation_samples = 0; p._reproducible = true; DeepLearning dl = new DeepLearning(p); DeepLearningModel mymodel = dl.trainModel().get(); // Verification of results StringBuilder sb = new StringBuilder(); sb.append("Verifying results.\n"); sb.append("Reported mean reconstruction error: " + mymodel.mse() + "\n"); // Training data // Reconstruct data using the same helper functions and verify that self-reported MSE agrees final Frame rec = mymodel.scoreAutoEncoder(train, Key.make(), true); sb.append("Reconstruction error per feature: " + rec.toString() + "\n"); rec.remove(); final Frame l2 = mymodel.scoreAutoEncoder(train, Key.make(), false); final Vec l2vec = l2.anyVec(); sb.append("Actual mean reconstruction error: " + l2vec.mean() + "\n"); // print stats and potential outliers double quantile = 1 - 5. / train.numRows(); sb.append("The following training points are reconstructed with an error above the " + quantile * 100 + "-th percentile - potential \"outliers\" in testing data.\n"); double thresh = mymodel.calcOutlierThreshold(l2vec, quantile); for (long i = 0; i < l2vec.length(); i++) { if (l2vec.at(i) > thresh) { sb.append(String.format("row %d : l2vec error = %5f\n", i, l2vec.at(i))); } } Log.info(sb.toString()); Assert.assertEquals(l2vec.mean(), mymodel.mse(), 1e-8*mymodel.mse()); // Create reconstruction Log.info("Creating full reconstruction."); final Frame recon_train = mymodel.score(train); Assert.assertTrue(mymodel.testJavaScoring(train,recon_train,1e-5)); Frame df1 = mymodel.scoreDeepFeatures(train, 0); Assert.assertTrue(df1.numCols() == 10); Assert.assertTrue(df1.numRows() == train.numRows()); df1.delete(); Frame df2 = mymodel.scoreDeepFeatures(train, 1); Assert.assertTrue(df2.numCols() == 5); Assert.assertTrue(df2.numRows() == train.numRows()); df2.delete(); Frame df3 = mymodel.scoreDeepFeatures(train, 2); Assert.assertTrue(df3.numCols() == 3); Assert.assertTrue(df3.numRows() == train.numRows()); df3.delete(); // cleanup recon_train.delete(); train.delete(); mymodel.delete(); l2.delete(); } }