package hex.tree.gbm; import hex.*; import hex.genmodel.utils.DistributionFamily; import hex.tree.SharedTreeModel; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; import water.*; import water.api.StreamingSchema; import water.exceptions.H2OModelBuilderIllegalArgumentException; import water.fvec.Chunk; import water.fvec.Frame; import water.fvec.RebalanceDataSet; import water.fvec.Vec; import water.parser.ParseDataset; import water.util.*; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; import static hex.genmodel.utils.DistributionFamily.*; import static org.junit.Assert.assertArrayEquals; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import static water.fvec.FVecTest.makeByteVec; public class GBMTest extends TestUtil { @BeforeClass public static void stall() { stall_till_cloudsize(1); } private abstract class PrepData { abstract int prep(Frame fr); } static final String ignored_aircols[] = new String[] { "DepTime", "ArrTime", "AirTime", "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut", "Cancelled", "CancellationCode", "Diverted", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsDepDelayed"}; @Test public void testGBMRegressionGaussian() { GBMModel gbm = null; Frame fr = null, fr2 = null; try { fr = parse_test_file("./smalldata/gbm_test/Mfgdata_gaussian_GBM_testing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = fr._key; parms._distribution = gaussian; parms._response_column = fr._names[1]; // Row in col 0, dependent in col 1, predictor in col 2 parms._ntrees = 1; parms._max_depth = 1; parms._min_rows = 1; parms._nbins = 20; // Drop ColV2 0 (row), keep 1 (response), keep col 2 (only predictor), drop remaining cols String[] xcols = parms._ignored_columns = new String[fr.numCols()-2]; xcols[0] = fr._names[0]; System.arraycopy(fr._names,3,xcols,1,fr.numCols()-3); parms._learn_rate = 1.0f; parms._score_each_iteration=true; GBM job = new GBM(parms); gbm = job.trainModel().get(); Assert.assertTrue(job.isStopped()); //HEX-1817 // Done building model; produce a score column with predictions fr2 = gbm.score(fr); //job.response() can be used in place of fr.vecs()[1] but it has been rebalanced double sq_err = new MathUtils.SquareError().doAll(fr.vecs()[1],fr2.vecs()[0])._sum; double mse = sq_err/fr2.numRows(); assertEquals(79152.12337641386,mse,0.1); assertEquals(79152.12337641386,gbm._output._scored_train[1]._mse,0.1); assertEquals(79152.12337641386,gbm._output._scored_train[1]._mean_residual_deviance,0.1); } finally { if( fr != null ) fr .remove(); if( fr2 != null ) fr2.remove(); if( gbm != null ) gbm.remove(); } } @Test public void testBasicGBM() { // Regression tests basicGBM("./smalldata/junit/cars.csv", new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }}, false, gaussian); basicGBM("./smalldata/junit/cars.csv", new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }}, false, DistributionFamily.poisson); basicGBM("./smalldata/junit/cars.csv", new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }}, false, DistributionFamily.gamma); basicGBM("./smalldata/junit/cars.csv", new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }}, false, DistributionFamily.tweedie); // Classification tests basicGBM("./smalldata/junit/test_tree.csv", new PrepData() { int prep(Frame fr) { return 1; } }, false, DistributionFamily.multinomial); basicGBM("./smalldata/junit/test_tree_minmax.csv", new PrepData() { int prep(Frame fr) { return fr.find("response"); } }, false, DistributionFamily.bernoulli); basicGBM("./smalldata/logreg/prostate.csv", new PrepData() { int prep(Frame fr) { fr.remove("ID").remove(); return fr.find("CAPSULE"); } }, false, DistributionFamily.bernoulli); basicGBM("./smalldata/logreg/prostate.csv", new PrepData() { int prep(Frame fr) { fr.remove("ID").remove(); return fr.find("CAPSULE"); } }, false, DistributionFamily.multinomial); basicGBM("./smalldata/junit/cars.csv", new PrepData() { int prep(Frame fr) { fr.remove("name").remove(); return fr.find("cylinders"); } }, false, DistributionFamily.multinomial); basicGBM("./smalldata/gbm_test/alphabet_cattest.csv", new PrepData() { int prep(Frame fr) { return fr.find("y"); } }, false, DistributionFamily.bernoulli); // basicGBM("./smalldata/gbm_test/alphabet_cattest.csv", // new PrepData() { int prep(Frame fr) { return fr.find("y"); } // }, // false, DistributionFamily.modified_huber); basicGBM("./smalldata/airlines/allyears2k_headers.zip", new PrepData() { int prep(Frame fr) { for( String s : ignored_aircols ) fr.remove(s).remove(); return fr.find("IsArrDelayed"); } }, false, DistributionFamily.bernoulli); // // Bigger Tests // basicGBM("../datasets/98LRN.CSV", // new PrepData() { int prep(Frame fr ) { // fr.remove("CONTROLN").remove(); // fr.remove("TARGET_D").remove(); // return fr.find("TARGET_B"); }}); // basicGBM("../datasets/UCI/UCI-large/covtype/covtype.data", // new PrepData() { int prep(Frame fr) { return fr.numCols()-1; } }); } @Test public void testBasicGBMFamily() { Scope.enter(); // Classification with Bernoulli family basicGBM("./smalldata/logreg/prostate.csv", new PrepData() { int prep(Frame fr) { fr.remove("ID").remove(); // Remove not-predictive ID int ci = fr.find("RACE"); // Change RACE to categorical Scope.track(fr.replace(ci,fr.vecs()[ci].toCategoricalVec())); return fr.find("CAPSULE"); // Prostate: predict on CAPSULE } }, false, DistributionFamily.bernoulli); Scope.exit(); } // ========================================================================== public GBMModel.GBMOutput basicGBM(String fname, PrepData prep, boolean validation, DistributionFamily family) { GBMModel gbm = null; Frame fr = null, fr2= null, vfr=null; try { Scope.enter(); fr = parse_test_file(fname); int idx = prep.prep(fr); // hack frame per-test if (family == DistributionFamily.bernoulli || family == DistributionFamily.multinomial || family == DistributionFamily.modified_huber) { if (!fr.vecs()[idx].isCategorical()) { Scope.track(fr.replace(idx, fr.vecs()[idx].toCategoricalVec())); } } DKV.put(fr); // Update frame after hacking it GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); if( idx < 0 ) idx = ~idx; parms._train = fr._key; parms._response_column = fr._names[idx]; parms._ntrees = 5; parms._distribution = family; parms._max_depth = 4; parms._min_rows = 1; parms._nbins = 50; parms._learn_rate = .2f; parms._score_each_iteration = true; if( validation ) { // Make a validation frame that's a clone of the training data vfr = new Frame(fr); DKV.put(vfr); parms._valid = vfr._key; } GBM job = new GBM(parms); gbm = job.trainModel().get(); // Done building model; produce a score column with predictions fr2 = gbm.score(fr); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(fr,fr2,1e-15)); Assert.assertTrue(job.isStopped()); //HEX-1817 return gbm._output; } finally { if( fr != null ) fr .remove(); if( fr2 != null ) fr2.remove(); if( vfr != null ) vfr.remove(); if( gbm != null ) gbm.delete(); Scope.exit(); } } // Test-on-Train. Slow test, needed to build a good model. @Test public void testGBMTrainTest() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); try { Scope.enter(); parms._valid = parse_test_file("smalldata/gbm_test/ecology_eval.csv")._key; Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv"); train.remove("Site").remove(); // Remove unique ID int ci = train.find("Angaus"); // Convert response to categorical Scope.track(train.replace(ci, train.vecs()[ci].toCategoricalVec())); DKV.put(train); // Update frame after hacking it parms._train = train._key; parms._response_column = "Angaus"; // Train on the outcome parms._ntrees = 5; parms._max_depth = 5; parms._min_rows = 10; parms._nbins = 100; parms._learn_rate = .2f; parms._distribution = DistributionFamily.multinomial; gbm = new GBM(parms).trainModel().get(); hex.ModelMetricsBinomial mm = hex.ModelMetricsBinomial.getFromDKV(gbm,parms.valid()); double auc = mm._auc._auc; Assert.assertTrue(0.83 <= auc && auc < 0.87); // Sanely good model double[][] cm = mm._auc.defaultCM(); Assert.assertArrayEquals(ard(ard(349, 44), ard(43, 64)), cm); } finally { parms._train.remove(); parms._valid.remove(); if( gbm != null ) gbm.delete(); Scope.exit(); } } // Predict with no actual, after training @Test public void testGBMPredict() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame pred=null, res=null; Scope.enter(); try { Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv"); train.remove("Site").remove(); // Remove unique ID int ci = train.find("Angaus"); Scope.track(train.replace(ci, train.vecs()[ci].toCategoricalVec())); // Convert response 'Angaus' to categorical DKV.put(train); // Update frame after hacking it parms._train = train._key; parms._response_column = "Angaus"; // Train on the outcome parms._distribution = DistributionFamily.multinomial; gbm = new GBM(parms).trainModel().get(); pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv" ); pred.remove("Angaus").remove(); // No response column during scoring res = gbm.score(pred); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(pred, res, 1e-15)); } finally { parms._train.remove(); if( gbm != null ) gbm .delete(); if( pred != null ) pred.remove(); if( res != null ) res .remove(); Scope.exit(); } } // Scoring should output original probabilities and probabilities calibrated by Platt Scaling @Test public void testGBMPredictWithCalibration() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Scope.enter(); try { Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv"); Frame calib = parse_test_file("smalldata/gbm_test/ecology_eval.csv"); // Fix training set train.remove("Site").remove(); // Remove unique ID Scope.track(train.vec("Angaus")); train.replace(train.find("Angaus"), train.vecs()[train.find("Angaus")].toCategoricalVec()); Scope.track(train); DKV.put(train); // Update frame after hacking it // Fix calibration set (the same way as training) Scope.track(calib.vec("Angaus")); calib.replace(calib.find("Angaus"), calib.vecs()[calib.find("Angaus")].toCategoricalVec()); Scope.track(calib); DKV.put(calib); // Update frame after hacking it parms._train = train._key; parms._calibrate_model = true; parms._calibration_frame = calib._key; parms._response_column = "Angaus"; // Train on the outcome parms._distribution = DistributionFamily.multinomial; gbm = new GBM(parms).trainModel().get(); Frame pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv"); pred.remove("Angaus").remove(); // No response column during scoring Scope.track(pred); Frame res = Scope.track(gbm.score(pred)); assertArrayEquals(new String[]{"predict", "p0", "p1", "cal_p0", "cal_p1"}, res._names); assertEquals(res.vec("cal_p0").mean(), 0.7860, 1e-4); assertEquals(res.vec("cal_p1").mean(), 0.2139, 1e-4); } finally { if (gbm != null) gbm.remove(); Scope.exit(); } } // Adapt a trained model to a test dataset with different categoricals @Test public void testModelAdaptMultinomial() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); try { Scope.enter(); Frame v; parms._train = ( parse_test_file("smalldata/junit/mixcat_train.csv"))._key; parms._valid = (v=parse_test_file("smalldata/junit/mixcat_test.csv" ))._key; parms._response_column = "Response"; // Train on the outcome parms._ntrees = 1; // Build a CART tree - 1 tree, full learn rate, down to 1 row parms._learn_rate = 1.0f; parms._min_rows = 1; parms._distribution = DistributionFamily.multinomial; gbm = new GBM(parms).trainModel().get(); Frame res = gbm.score(v); int[] ps = new int[(int)v.numRows()]; Vec.Reader vr = res.vecs()[0].new Reader(); for( int i=0; i<ps.length; i++ ) ps[i] = (int)vr.at8(i); // Expected predictions are X,X,Y,Y,X,Y,Z,X,Y // Never predicts W, the extra class in the test set. // Badly predicts Z because 1 tree does not pick up that feature#2 can also // be used to predict Z, and instead relies on factor C which does not appear // in the test set. Assert.assertArrayEquals("", ps, new int[]{1, 1, 2, 2, 1, 2, 3, 1, 2}); hex.ModelMetricsMultinomial mm = hex.ModelMetricsMultinomial.getFromDKV(gbm,parms.valid()); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(v,res,1e-15)); res.remove(); } finally { parms._train.remove(); parms._valid.remove(); if( gbm != null ) gbm.delete(); Scope.exit(); } } // A test of locking the input dataset during model building. @Test public void testModelLock() { GBM gbm=null; Frame fr=null; Scope.enter(); try { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); fr = parse_test_file("smalldata/gbm_test/ecology_model.csv"); fr.remove("Site").remove(); // Remove unique ID int ci = fr.find("Angaus"); Scope.track(fr.replace(ci, fr.vecs()[ci].toCategoricalVec())); // Convert response 'Angaus' to categorical DKV.put(fr); // Update after hacking parms._train = fr._key; parms._response_column = "Angaus"; // Train on the outcome parms._ntrees = 10; parms._max_depth = 10; parms._min_rows = 1; parms._nbins = 20; parms._learn_rate = .2f; parms._distribution = DistributionFamily.multinomial; gbm = new GBM(parms); gbm.trainModel(); try { Thread.sleep(100); } catch( Exception ignore ) { } try { Log.info("Trying illegal frame delete."); fr.delete(); // Attempted delete while model-build is active Assert.fail("Should toss IAE instead of reaching here"); } catch( IllegalArgumentException ignore ) { } catch( RuntimeException re ) { assertTrue( re.getCause() instanceof IllegalArgumentException); } Log.info("Getting model"); GBMModel model = gbm.get(); Assert.assertTrue(gbm.isStopped()); //HEX-1817 if( model != null ) model.delete(); } finally { if( fr != null ) fr .remove(); Scope.exit(); } } // MSE generated by GBM with/without validation dataset should be same @Test public void testModelScoreKeeperEqualityOnProstateBernoulli() { final PrepData prostatePrep = new PrepData() { @Override int prep(Frame fr) { fr.remove("ID").remove(); return fr.find("CAPSULE"); } }; ScoreKeeper[] scoredWithoutVal = basicGBM("./smalldata/logreg/prostate.csv", prostatePrep, false, DistributionFamily.bernoulli)._scored_train; ScoreKeeper[] scoredWithVal = basicGBM("./smalldata/logreg/prostate.csv", prostatePrep, true , DistributionFamily.bernoulli)._scored_valid; Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", scoredWithoutVal, scoredWithVal); } @Test public void testModelScoreKeeperEqualityOnProstateGaussian() { final PrepData prostatePrep = new PrepData() { @Override int prep(Frame fr) { fr.remove("ID").remove(); return ~fr.find("CAPSULE"); } }; ScoreKeeper[] scoredWithoutVal = basicGBM("./smalldata/logreg/prostate.csv", prostatePrep, false, gaussian)._scored_train; ScoreKeeper[] scoredWithVal = basicGBM("./smalldata/logreg/prostate.csv", prostatePrep, true , gaussian)._scored_valid; Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", scoredWithoutVal, scoredWithVal); } @Test public void testModelScoreKeeperEqualityOnProstateMultinomial() { final PrepData prostatePrep = new PrepData() { @Override int prep(Frame fr) { fr.remove("ID").remove(); return fr.find("RACE"); } }; ScoreKeeper[] scoredWithoutVal = basicGBM("./smalldata/logreg/prostate.csv", prostatePrep, false, DistributionFamily.multinomial)._scored_train; ScoreKeeper[] scoredWithVal = basicGBM("./smalldata/logreg/prostate.csv", prostatePrep, true , DistributionFamily.multinomial)._scored_valid; Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", scoredWithoutVal, scoredWithVal); } @Test public void testModelScoreKeeperEqualityOnTitanicGaussian() { final PrepData titanicPrep = new PrepData() { @Override int prep(Frame fr) { return fr.find("age"); } }; ScoreKeeper[] scoredWithoutVal = basicGBM("./smalldata/junit/titanic_alt.csv", titanicPrep, false, gaussian)._scored_train; ScoreKeeper[] scoredWithVal = basicGBM("./smalldata/junit/titanic_alt.csv", titanicPrep, true , gaussian)._scored_valid; Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", scoredWithoutVal, scoredWithVal); } @Test public void testModelScoreKeeperEqualityOnTitanicBernoulli() { final PrepData titanicPrep = new PrepData() { @Override int prep(Frame fr) { return fr.find("survived"); } }; ScoreKeeper[] scoredWithoutVal = basicGBM("./smalldata/junit/titanic_alt.csv", titanicPrep, false, DistributionFamily.bernoulli)._scored_train; ScoreKeeper[] scoredWithVal = basicGBM("./smalldata/junit/titanic_alt.csv", titanicPrep, true , DistributionFamily.bernoulli)._scored_valid; Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", scoredWithoutVal, scoredWithVal); } @Test public void testModelScoreKeeperEqualityOnTitanicMultinomial() { final PrepData titanicPrep = new PrepData() { @Override int prep(Frame fr) { return fr.find("survived"); } }; ScoreKeeper[] scoredWithoutVal = basicGBM("./smalldata/junit/titanic_alt.csv", titanicPrep, false, DistributionFamily.multinomial)._scored_train; ScoreKeeper[] scoredWithVal = basicGBM("./smalldata/junit/titanic_alt.csv", titanicPrep, true , DistributionFamily.multinomial)._scored_valid; Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", scoredWithoutVal, scoredWithVal); } @Test public void testBigCat() { final PrepData prep = new PrepData() { @Override int prep(Frame fr) { return fr.find("y"); } }; basicGBM("./smalldata/gbm_test/50_cattest_test.csv" , prep, false, DistributionFamily.bernoulli); basicGBM("./smalldata/gbm_test/50_cattest_train.csv", prep, false, DistributionFamily.bernoulli); basicGBM("./smalldata/gbm_test/swpreds_1000x3.csv", prep, false, DistributionFamily.bernoulli); } // Test uses big data and is too slow for a pre-push @Test @Ignore public void testKDDTrees() { Frame tfr=null, vfr=null; String[] cols = new String[] {"DOB", "LASTGIFT", "TARGET_D"}; try { // Load data, hack frames Frame inF1 = parse_test_file("bigdata/laptop/usecases/cup98LRN_z.csv"); Frame inF2 = parse_test_file("bigdata/laptop/usecases/cup98VAL_z.csv"); tfr = inF1.subframe(cols); // Just the columns to train on vfr = inF2.subframe(cols); inF1.remove(cols).remove(); // Toss all the rest away inF2.remove(cols).remove(); tfr.replace(0, tfr.vec("DOB").toCategoricalVec()); // Convert 'DOB' to categorical vfr.replace(0, vfr.vec("DOB").toCategoricalVec()); DKV.put(tfr); DKV.put(vfr); // Same parms for all GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._valid = vfr._key; parms._response_column = "TARGET_D"; parms._ntrees = 3; parms._distribution = gaussian; // Build a first model; all remaining models should be equal GBM job1 = new GBM(parms); GBMModel gbm1 = job1.trainModel().get(); // Validation MSE should be equal ScoreKeeper[] firstScored = gbm1._output._scored_valid; // Build 10 more models, checking for equality for( int i=0; i<10; i++ ) { GBM job2 = new GBM(parms); GBMModel gbm2 = job2.trainModel().get(); ScoreKeeper[] secondScored = gbm2._output._scored_valid; // Check that MSE's from both models are equal int j; for( j=0; j<firstScored.length; j++ ) if (firstScored[j] != secondScored[j]) break; // Not Equals Enough // Report on unequal if( j < firstScored.length ) { System.out.println("=== =============== ==="); System.out.println("=== ORIGINAL MODEL ==="); for( int t=0; t<parms._ntrees; t++ ) System.out.println(gbm1._output.toStringTree(t,0)); System.out.println("=== DIFFERENT MODEL ==="); for( int t=0; t<parms._ntrees; t++ ) System.out.println(gbm2._output.toStringTree(t,0)); System.out.println("=== =============== ==="); Assert.assertArrayEquals("GBM should have the exact same MSEs for identical parameters", firstScored, secondScored); } gbm2.delete(); } gbm1.delete(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); } } // Test uses big data and is too slow for a pre-push @Test @Ignore public void testMNIST() { Frame tfr=null, vfr=null; Scope.enter(); try { // Load data, hack frames tfr = parse_test_file("bigdata/laptop/mnist/train.csv.gz"); Scope.track(tfr.replace(784, tfr.vecs()[784].toCategoricalVec())); // Convert response 'C785' to categorical DKV.put(tfr); vfr = parse_test_file("bigdata/laptop/mnist/test.csv.gz"); Scope.track(vfr.replace(784, vfr.vecs()[784].toCategoricalVec())); // Convert response 'C785' to categorical DKV.put(vfr); // Same parms for all GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._valid = vfr._key; parms._response_column = "C785"; parms._ntrees = 2; parms._max_depth = 4; parms._distribution = DistributionFamily.multinomial; // Build a first model; all remaining models should be equal GBMModel gbm = new GBM(parms).trainModel().get(); Frame pred = gbm.score(vfr); double sq_err = new MathUtils.SquareError().doAll(vfr.lastVec(),pred.vecs()[0])._sum; double mse = sq_err/pred.numRows(); assertEquals(3.0199, mse, 1e-15); //same results gbm.delete(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); Scope.exit(); } } // HEXDEV-194: Check reproducibility for the same # of chunks (i.e., same # of nodes) and same parameters @Test public void testReprodubility() { Frame tfr=null; final int N = 5; double[] mses = new double[N]; Scope.enter(); try { // Load data, hack frames tfr = parse_test_file("smalldata/covtype/covtype.20k.data"); // rebalance to 256 chunks Key dest = Key.make("df.rebalanced.hex"); RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256); H2O.submitTask(rb); rb.join(); tfr.delete(); tfr = DKV.get(dest).get(); // Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key); // DKV.put(tfr); for (int i=0; i<N; ++i) { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "C55"; parms._nbins = 1000; parms._ntrees = 5; parms._max_depth = 8; parms._learn_rate = 0.1f; parms._min_rows = 10; // parms._distribution = Family.multinomial; parms._distribution = gaussian; // Build a first model; all remaining models should be equal GBMModel gbm = new GBM(parms).trainModel().get(); assertEquals(gbm._output._ntrees, parms._ntrees); mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse; gbm.delete(); } } finally{ if (tfr != null) tfr.remove(); } Scope.exit(); for( double mse : mses ) System.out.println(mse); for( double mse : mses ) assertEquals(mse, mses[0], 1e-15); } // PUBDEV-557: Test dependency on # nodes (for small number of bins, but fixed number of chunks) @Test public void testReprodubilityAirline() { Frame tfr=null; final int N = 5; double[] mses = new double[N]; Scope.enter(); try { // Load data, hack frames tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip"); // rebalance to fixed number of chunks Key dest = Key.make("df.rebalanced.hex"); RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256); H2O.submitTask(rb); rb.join(); tfr.delete(); tfr = DKV.get(dest).get(); // Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key); // DKV.put(tfr); for (String s : new String[]{ "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) { tfr.remove(s).remove(); } DKV.put(tfr); for (int i=0; i<N; ++i) { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "IsDepDelayed"; parms._nbins = 10; parms._nbins_cats = 500; parms._ntrees = 7; parms._max_depth = 5; parms._min_rows = 10; parms._distribution = DistributionFamily.bernoulli; parms._balance_classes = true; parms._seed = 0; // Build a first model; all remaining models should be equal GBMModel gbm = new GBM(parms).trainModel().get(); assertEquals(gbm._output._ntrees, parms._ntrees); mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse; gbm.delete(); } } finally { if (tfr != null) tfr.remove(); } Scope.exit(); System.out.println("MSEs start"); for(double d:mses) System.out.println(d); System.out.println("MSEs End"); System.out.flush(); for( double mse : mses ) assertEquals(0.21694215729861027, mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks), mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks) } @Test public void testReprodubilityAirlineSingleNode() { Frame tfr=null; final int N = 10; double[] mses = new double[N]; Scope.enter(); try { // Load data, hack frames tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip"); // rebalance to fixed number of chunks Key dest = Key.make("df.rebalanced.hex"); RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256); H2O.submitTask(rb); rb.join(); tfr.delete(); tfr = DKV.get(dest).get(); // Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key); // DKV.put(tfr); for (String s : new String[]{ "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) { tfr.remove(s).remove(); } DKV.put(tfr); for (int i=0; i<N; ++i) { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "IsDepDelayed"; parms._nbins = 10; parms._nbins_cats = 500; parms._ntrees = 7; parms._max_depth = 5; parms._min_rows = 10; parms._distribution = DistributionFamily.bernoulli; parms._balance_classes = true; parms._seed = 0; parms._build_tree_one_node = true; // Build a first model; all remaining models should be equal GBMModel gbm = new GBM(parms).trainModel().get(); assertEquals(gbm._output._ntrees, parms._ntrees); mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse; gbm.delete(); } } finally { if (tfr != null) tfr.remove(); } Scope.exit(); System.out.println("MSE"); for(double d:mses) System.out.println(d); for( double mse : mses ) assertEquals(0.21694215729861027, mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks) } // HEXDEV-223 @Test public void testCategorical() { Frame tfr=null; final int N = 1; double[] mses = new double[N]; Scope.enter(); try { tfr = parse_test_file("smalldata/gbm_test/alphabet_cattest.csv"); Scope.track(tfr.replace(1, tfr.vecs()[1].toCategoricalVec())); DKV.put(tfr); for (int i=0; i<N; ++i) { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "y"; parms._ntrees = 1; parms._max_depth = 1; parms._learn_rate = 1; parms._distribution = DistributionFamily.bernoulli; // Build a first model; all remaining models should be equal GBMModel gbm = new GBM(parms).trainModel().get(); assertEquals(gbm._output._ntrees, parms._ntrees); hex.ModelMetricsBinomial mm = hex.ModelMetricsBinomial.getFromDKV(gbm,parms.train()); double auc = mm._auc._auc; Assert.assertTrue(1 == auc); mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse; gbm.delete(); } } finally{ if (tfr != null) tfr.remove(); } Scope.exit(); for( double mse : mses ) assertEquals(0.0142093, mse, 1e-6); } // Test uses big data and is too slow for a pre-push @Test @Ignore public void testCUST_A() { Frame tfr=null, vfr=null, t_pred=null, v_pred=null; GBMModel gbm=null; Scope.enter(); try { // Load data, hack frames tfr = parse_test_file("./bigdata/covktr.csv"); vfr = parse_test_file("./bigdata/covkts.csv"); int idx = tfr.find("V55"); Scope.track(tfr.replace(idx, tfr.vecs()[idx].toCategoricalVec())); Scope.track(vfr.replace(idx, vfr.vecs()[idx].toCategoricalVec())); DKV.put(tfr); DKV.put(vfr); // Build model GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._valid = vfr._key; parms._response_column = "V55"; parms._ntrees = 10; parms._max_depth = 1; parms._nbins = 20; parms._min_rows = 10; parms._learn_rate = 0.01f; parms._distribution = DistributionFamily.multinomial; gbm = new GBM(parms).trainModel().get(); // Report AUC from training hex.ModelMetricsBinomial tmm = hex.ModelMetricsBinomial.getFromDKV(gbm,tfr); hex.ModelMetricsBinomial vmm = hex.ModelMetricsBinomial.getFromDKV(gbm,vfr); double t_auc = tmm._auc._auc; double v_auc = vmm._auc._auc; System.out.println("train_AUC= "+t_auc+" , validation_AUC= "+v_auc); // Report AUC from scoring t_pred = gbm.score(tfr); v_pred = gbm.score(vfr); hex.ModelMetricsBinomial tmm2 = hex.ModelMetricsBinomial.getFromDKV(gbm,tfr); hex.ModelMetricsBinomial vmm2 = hex.ModelMetricsBinomial.getFromDKV(gbm,vfr); assert tmm != tmm2; assert vmm != vmm2; double t_auc2 = tmm._auc._auc; double v_auc2 = vmm._auc._auc; System.out.println("train_AUC2= "+t_auc2+" , validation_AUC2= "+v_auc2); t_pred.remove(); v_pred.remove(); // Compute the perfect AUC double t_auc3 = AUC2.perfectAUC(t_pred.vecs()[2], tfr.vec("V55")); double v_auc3 = AUC2.perfectAUC(v_pred.vecs()[2], vfr.vec("V55")); System.out.println("train_AUC3= "+t_auc3+" , validation_AUC3= "+v_auc3); Assert.assertEquals(t_auc3, t_auc , 1e-6); Assert.assertEquals(t_auc3, t_auc2, 1e-6); Assert.assertEquals(v_auc3, v_auc , 1e-6); Assert.assertEquals(v_auc3, v_auc2, 1e-6); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if( t_pred != null ) t_pred.remove(); if( v_pred != null ) v_pred.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } static double _AUC = 1; static double _MSE = 0.24850374695598948; static double _LogLoss = 0.690155; @Test public void testNoRowWeights() { Frame tfr = null, vfr = null; GBMModel gbm = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/no_weights.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._seed = 0xdecaf; parms._min_rows = 1; parms._ntrees = 3; parms._learn_rate = 1e-3f; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics; assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm.mse(), 1e-8); assertEquals(_LogLoss, mm.logloss(), 1e-6); Frame pred = gbm.score(parms.train()); hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train()); assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm2.mse(), 1e-8); assertEquals(_LogLoss, mm2.logloss(), 1e-6); pred.remove(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } @Test public void testRowWeightsOne() { Frame tfr = null, vfr = null; Scope.enter(); GBMModel gbm = null; try { tfr = parse_test_file("smalldata/junit/weights_all_ones.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._weights_column = "weight"; parms._seed = 0xdecaf; parms._min_rows = 1; parms._max_depth = 2; parms._ntrees = 3; parms._learn_rate = 1e-3f; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics; assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm.mse(), 1e-8); assertEquals(_LogLoss, mm.logloss(), 1e-6); Frame pred = gbm.score(parms.train()); hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train()); assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm2.mse(), 1e-8); assertEquals(_LogLoss, mm2.logloss(), 1e-6); pred.remove(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } @Test public void testRowWeightsTwo() { Frame tfr = null, vfr = null; Scope.enter(); GBMModel gbm = null; try { tfr = parse_test_file("smalldata/junit/weights_all_twos.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._weights_column = "weight"; parms._seed = 0xdecaf; parms._min_rows = 2; //Must be adapted to the weights parms._max_depth = 2; parms._ntrees = 3; parms._learn_rate = 1e-3f; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics; assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm.mse(), 1e-8); assertEquals(_LogLoss, mm.logloss(), 1e-6); Frame pred = gbm.score(parms.train()); hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train()); assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm2.mse(), 1e-8); assertEquals(_LogLoss, mm2.logloss(), 1e-6); pred.remove(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } @Test public void testRowWeightsTiny() { Frame tfr = null, vfr = null; Scope.enter(); GBMModel gbm = null; try { tfr = parse_test_file("smalldata/junit/weights_all_tiny.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._weights_column = "weight"; parms._seed = 0xdecaf; parms._min_rows = 0.01242; //Must be adapted to the weights parms._max_depth = 2; parms._ntrees = 3; parms._learn_rate = 1e-3f; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics; assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm.mse(), 1e-8); assertEquals(_LogLoss, mm.logloss(), 1e-6); Frame pred = gbm.score(parms.train()); hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train()); assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm2.mse(), 1e-8); assertEquals(_LogLoss, mm2.logloss(), 1e-6); pred.remove(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } @Test public void testNoRowWeightsShuffled() { Frame tfr = null, vfr = null; GBMModel gbm = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/no_weights_shuffled.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._seed = 0xdecaf; parms._min_rows = 1; parms._max_depth = 2; parms._ntrees = 3; parms._learn_rate = 1e-3f; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics; assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm.mse(), 1e-8); assertEquals(_LogLoss, mm.logloss(), 1e-6); Frame pred = gbm.score(parms.train()); hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train()); assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm2.mse(), 1e-8); assertEquals(_LogLoss, mm2.logloss(), 1e-6); pred.remove(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } @Test public void testRowWeights() { Frame tfr = null, vfr = null; GBMModel gbm = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/weights.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._weights_column = "weight"; parms._seed = 0xdecaf; parms._min_rows = 1; parms._max_depth = 2; parms._ntrees = 3; parms._learn_rate = 1e-3f; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics; assertEquals(_AUC, mm.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm.mse(), 1e-8); assertEquals(_LogLoss, mm.logloss(), 1e-6); Frame pred = gbm.score(parms.train()); hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train()); assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8); assertEquals(_MSE, mm2.mse(), 1e-8); assertEquals(_LogLoss, mm2.logloss(), 1e-6); pred.remove(); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } @Test public void testNFold() { Frame tfr = null, vfr = null; GBMModel gbm = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/weights.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._weights_column = "weight"; parms._seed = 123; parms._min_rows = 1; parms._max_depth = 2; parms._nfolds = 2; parms._ntrees = 3; parms._learn_rate = 1e-3f; parms._keep_cross_validation_predictions = true; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._cross_validation_metrics; assertEquals(0.6296296296296297, mm.auc_obj()._auc, 1e-8); assertEquals(0.28640022521234304, mm.mse(), 1e-8); assertEquals(0.7674117059335286, mm.logloss(), 1e-6); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) { gbm.deleteCrossValidationModels(); gbm.delete(); for (Key k : gbm._output._cross_validation_predictions) k.remove(); gbm._output._cross_validation_holdout_predictions_frame_id.remove(); } Scope.exit(); } } @Test public void testNfoldsOneVsRest() { Frame tfr = null; GBMModel gbm1 = null; GBMModel gbm2 = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/weights.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._min_rows = 1; parms._max_depth = 2; parms._nfolds = (int) tfr.numRows(); parms._fold_assignment = Model.Parameters.FoldAssignmentScheme.Modulo; parms._ntrees = 3; parms._seed = 12345; parms._learn_rate = 1e-3f; gbm1 = new GBM(parms).trainModel().get(); //parms._nfolds = (int) tfr.numRows() + 1; //This is now an error gbm2 = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm1 = (ModelMetricsBinomial)gbm1._output._cross_validation_metrics; ModelMetricsBinomial mm2 = (ModelMetricsBinomial)gbm2._output._cross_validation_metrics; assertEquals(mm1.auc_obj()._auc, mm2.auc_obj()._auc, 1e-12); assertEquals(mm1.mse(), mm2.mse(), 1e-12); //assertEquals(mm1.r2(), mm2.r2(), 1e-12); assertEquals(mm1.logloss(), mm2.logloss(), 1e-12); //TODO: add check: the correct number of individual models were built. PUBDEV-1690 } finally { if (tfr != null) tfr.remove(); if (gbm1 != null) { gbm1.deleteCrossValidationModels(); gbm1.delete(); } if (gbm2 != null) { gbm2.deleteCrossValidationModels(); gbm2.delete(); } Scope.exit(); } } @Test public void testNfoldsInvalidValues() { Frame tfr = null; GBMModel gbm1 = null; GBMModel gbm2 = null; GBMModel gbm3 = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/weights.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "response"; parms._min_rows = 1; parms._seed = 12345; parms._max_depth = 2; parms._ntrees = 3; parms._learn_rate = 1e-3f; parms._nfolds = 0; gbm1 = new GBM(parms).trainModel().get(); parms._nfolds = 1; try { Log.info("Trying nfolds==1."); gbm2 = new GBM(parms).trainModel().get(); Assert.fail("Should toss H2OModelBuilderIllegalArgumentException instead of reaching here"); } catch(H2OModelBuilderIllegalArgumentException e) {} parms._nfolds = -99; try { Log.info("Trying nfolds==-99."); gbm3 = new GBM(parms).trainModel().get(); Assert.fail("Should toss H2OModelBuilderIllegalArgumentException instead of reaching here"); } catch(H2OModelBuilderIllegalArgumentException e) {} } finally { if (tfr != null) tfr.remove(); if (gbm1 != null) gbm1.delete(); if (gbm2 != null) gbm2.delete(); if (gbm3 != null) gbm3.delete(); Scope.exit(); } } @Test public void testNfoldsCVAndValidation() { Frame tfr = null, vfr = null; GBMModel gbm = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/weights.csv"); vfr = parse_test_file("smalldata/junit/weights.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._valid = vfr._key; parms._response_column = "response"; parms._seed = 12345; parms._min_rows = 1; parms._max_depth = 2; parms._nfolds = 3; parms._ntrees = 3; parms._learn_rate = 1e-3f; try { Log.info("Trying N-fold cross-validation AND Validation dataset provided."); gbm = new GBM(parms).trainModel().get(); } catch(H2OModelBuilderIllegalArgumentException e) { Assert.fail("Should not toss H2OModelBuilderIllegalArgumentException."); } } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) { gbm.deleteCrossValidationModels(); gbm.delete(); } Scope.exit(); } } @Test public void testNfoldsConsecutiveModelsSame() { Frame tfr = null; Vec old = null; GBMModel gbm1 = null; GBMModel gbm2 = null; Scope.enter(); try { tfr = parse_test_file("smalldata/junit/cars_20mpg.csv"); tfr.remove("name").remove(); // Remove unique id tfr.remove("economy").remove(); old = tfr.remove("economy_20mpg"); tfr.add("economy_20mpg", old.toCategoricalVec()); // response to last column DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "economy_20mpg"; parms._min_rows = 1; parms._seed = 12345; parms._max_depth = 2; parms._nfolds = 3; parms._ntrees = 3; parms._learn_rate = 1e-3f; gbm1 = new GBM(parms).trainModel().get(); gbm2 = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm1 = (ModelMetricsBinomial)gbm1._output._cross_validation_metrics; ModelMetricsBinomial mm2 = (ModelMetricsBinomial)gbm2._output._cross_validation_metrics; assertEquals(mm1.auc_obj()._auc, mm2.auc_obj()._auc, 1e-12); assertEquals(mm1.mse(), mm2.mse(), 1e-12); //assertEquals(mm1.r2(), mm2.r2(), 1e-12); assertEquals(mm1.logloss(), mm2.logloss(), 1e-12); } finally { if (tfr != null) tfr.remove(); if (old != null) old.remove(); if (gbm1 != null) { gbm1.deleteCrossValidationModels(); gbm1.delete(); } if (gbm2 != null) { gbm2.deleteCrossValidationModels(); gbm2.delete(); } Scope.exit(); } } @Test public void testNfoldsColumn() { Frame tfr = null; GBMModel gbm1 = null; try { tfr = parse_test_file("smalldata/junit/cars_20mpg.csv"); tfr.remove("name").remove(); // Remove unique id DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "economy_20mpg"; parms._fold_column = "cylinders"; Vec old = tfr.remove("cylinders"); tfr.add("cylinders",old.toCategoricalVec()); DKV.put(tfr); parms._ntrees = 10; parms._keep_cross_validation_fold_assignment = true; GBM job1 = new GBM(parms); gbm1 = job1.trainModel().get(); Assert.assertTrue(gbm1._output._cross_validation_models.length == 5); old.remove(); } finally { if (tfr != null) tfr.remove(); if (gbm1 != null) { gbm1.deleteCrossValidationModels(); gbm1.delete(); gbm1._output._cross_validation_fold_assignment_frame_id.remove(); } } } @Test public void testNfoldsColumnNumbersFrom0() { Frame tfr = null; Vec old = null; GBMModel gbm1 = null; try { tfr = parse_test_file("smalldata/junit/cars_20mpg.csv"); tfr.remove("name").remove(); // Remove unique id new MRTask() { @Override public void map(Chunk c) { for (int i=0;i<c.len();++i) { if (c.at8(i) == 3) c.set(i, 0); if (c.at8(i) == 4) c.set(i, 1); if (c.at8(i) == 5) c.set(i, 2); if (c.at8(i) == 6) c.set(i, 3); if (c.at8(i) == 8) c.set(i, 4); } } }.doAll(tfr.vec("cylinders")); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "economy_20mpg"; parms._fold_column = "cylinders"; parms._ntrees = 10; GBM job1 = new GBM(parms); gbm1 = job1.trainModel().get(); Assert.assertTrue(gbm1._output._cross_validation_models.length == 5); } finally { if (tfr != null) tfr.remove(); if (old != null) old.remove(); if (gbm1 != null) { gbm1.deleteCrossValidationModels(); gbm1.delete(); } } } @Test public void testNfoldsColumnCategorical() { Frame tfr = null; Vec old = null; GBMModel gbm1 = null; try { tfr = parse_test_file("smalldata/junit/cars_20mpg.csv"); tfr.remove("name").remove(); // Remove unique id old = tfr.remove("cylinders"); tfr.add("folds", old.toCategoricalVec()); old.remove(); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "economy_20mpg"; parms._fold_column = "folds"; parms._ntrees = 10; GBM job1 = new GBM(parms); gbm1 = job1.trainModel().get(); Assert.assertTrue(gbm1._output._cross_validation_models.length == 5); } finally { if (tfr != null) tfr.remove(); if (old != null) old.remove(); if (gbm1 != null) { gbm1.deleteCrossValidationModels(); gbm1.delete(); } } } @Test public void testNFoldAirline() { Frame tfr = null, vfr = null; GBMModel gbm = null; Scope.enter(); try { tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip"); for (String s : new String[]{ "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) { tfr.remove(s).remove(); } DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "IsDepDelayed"; parms._seed = 234; parms._min_rows = 2; parms._nfolds = 3; parms._max_depth = 5; parms._ntrees = 5; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._cross_validation_metrics; assertEquals(0.7309795467719639, mm.auc_obj()._auc, 1e-4); // 1 node assertEquals(0.22511756378273942, mm.mse(), 1e-4); assertEquals(0.6425515048581261, mm.logloss(), 1e-4); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) { gbm.deleteCrossValidationModels(); gbm.delete(); } Scope.exit(); } } // just a simple sanity check - not a golden test @Test public void testDistributions() { Frame tfr = null, vfr = null, res= null; GBMModel gbm = null; for (DistributionFamily dist : new DistributionFamily[]{ DistributionFamily.AUTO, gaussian, DistributionFamily.poisson, DistributionFamily.gamma, DistributionFamily.tweedie }) { Scope.enter(); try { tfr = parse_test_file("smalldata/glm_test/cancar_logIn.csv"); vfr = parse_test_file("smalldata/glm_test/cancar_logIn.csv"); for (String s : new String[]{ "Merit", "Class" }) { Scope.track(tfr.replace(tfr.find(s), tfr.vec(s).toCategoricalVec())); Scope.track(vfr.replace(vfr.find(s), vfr.vec(s).toCategoricalVec())); } DKV.put(tfr); DKV.put(vfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "Cost"; parms._seed = 0xdecaf; parms._distribution = dist; parms._min_rows = 1; parms._ntrees = 30; // parms._offset_column = "logInsured"; //POJO scoring not supported for offsets parms._learn_rate = 1e-3f; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); res = gbm.score(vfr); Assert.assertTrue(gbm.testJavaScoring(vfr,res,1e-15)); res.remove(); ModelMetricsRegression mm = (ModelMetricsRegression)gbm._output._training_metrics; } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (res != null) res.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } } @Test public void testStochasticGBM() { Frame tfr = null, vfr = null; GBMModel gbm = null; float[] sample_rates = new float[]{0.2f, 0.4f, 0.6f, 0.8f, 1.0f}; float[] col_sample_rates = new float[]{0.2f, 0.4f, 0.6f, 0.8f, 1.0f}; Map<Double, Pair<Float,Float>> hm = new TreeMap<>(); for (float sample_rate : sample_rates) { for (float col_sample_rate : col_sample_rates) { Scope.enter(); try { tfr = parse_test_file("./smalldata/gbm_test/ecology_model.csv"); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "Angaus"; //regression parms._seed = 123; parms._min_rows = 2; parms._max_depth = 10; parms._ntrees = 2; parms._col_sample_rate = col_sample_rate; parms._sample_rate = sample_rate; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); ModelMetricsRegression mm = (ModelMetricsRegression)gbm._output._training_metrics; hm.put(mm.mse(), new Pair<>(sample_rate, col_sample_rate)); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) gbm.delete(); Scope.exit(); } } } double fullDataMSE = hm.entrySet().iterator().next().getKey(); Iterator<Map.Entry<Double, Pair<Float, Float>>> it; int i=0; Pair<Float, Float> last = null; // iterator over results (min to max MSE) - best to worst for (it=hm.entrySet().iterator(); it.hasNext(); ++i) { Map.Entry<Double, Pair<Float,Float>> n = it.next(); if (i>0) Assert.assertTrue(n.getKey() > fullDataMSE); //any sampling should make training set MSE worse Log.info( "MSE: " + n.getKey() + ", " + ", row sample: " + ((Pair)n.getValue())._1() + ", col sample: " + ((Pair)n.getValue())._2()); last=n.getValue(); } // worst training MSE should belong to the most sampled case Assert.assertTrue(last._1()==sample_rates[0]); Assert.assertTrue(last._2()==col_sample_rates[0]); } @Test public void testStochasticGBMHoldout() { Frame tfr = null; Key[] ksplits = new Key[0]; try{ tfr=parse_test_file("./smalldata/gbm_test/ecology_model.csv"); SplitFrame sf = new SplitFrame(tfr,new double[] { 0.5, 0.5 },new Key[] { Key.make("train.hex"), Key.make("test.hex")}); // Invoke the job sf.exec().get(); ksplits = sf._destination_frames; GBMModel gbm = null; float[] sample_rates = new float[]{0.2f, 0.4f, 0.8f, 1.0f}; float[] col_sample_rates = new float[]{0.4f, 0.8f, 1.0f}; float[] col_sample_rates_per_tree = new float[]{0.4f, 0.6f, 1.0f}; Map<Double, Triple<Float>> hm = new TreeMap<>(); for (float sample_rate : sample_rates) { for (float col_sample_rate : col_sample_rates) { for (float col_sample_rate_per_tree : col_sample_rates_per_tree) { Scope.enter(); try { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = ksplits[0]; parms._valid = ksplits[1]; parms._response_column = "Angaus"; //regression parms._seed = 42; parms._min_rows = 2; parms._max_depth = 12; parms._ntrees = 6; parms._col_sample_rate = col_sample_rate; parms._col_sample_rate_per_tree = col_sample_rate_per_tree; parms._sample_rate = sample_rate; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); // too slow, but passes (now) // // Build a POJO, validate same results // Frame pred = gbm.score(tfr); // Assert.assertTrue(gbm.testJavaScoring(tfr,pred,1e-15)); // pred.remove(); ModelMetricsRegression mm = (ModelMetricsRegression)gbm._output._validation_metrics; hm.put(mm.mse(), new Triple<>(sample_rate, col_sample_rate, col_sample_rate_per_tree)); } finally { if (gbm != null) gbm.delete(); Scope.exit(); } } } } Iterator<Map.Entry<Double, Triple<Float>>> it; Triple<Float> last = null; // iterator over results (min to max MSE) - best to worst for (it=hm.entrySet().iterator(); it.hasNext();) { Map.Entry<Double, Triple<Float>> n = it.next(); Log.info( "MSE: " + n.getKey() + ", row sample: " + n.getValue().v1 + ", col sample: " + n.getValue().v2 + ", col sample per tree: " + n.getValue().v3); last=n.getValue(); } // worst validation MSE should belong to the most overfit case (1.0, 1.0, 1.0) // Assert.assertTrue(last.v1==sample_rates[sample_rates.length-1]); // Assert.assertTrue(last.v2==col_sample_rates[col_sample_rates.length-1]); // Assert.assertTrue(last.v3==col_sample_rates_per_tree[col_sample_rates_per_tree.length-1]); } finally { if (tfr != null) tfr.remove(); for (Key k : ksplits) if (k!=null) k.remove(); } } // PUBDEV-2476 Check reproducibility for the same # of chunks (i.e., same # of nodes) and same parameters @Test public void testChunks() { Frame tfr; int[] chunks = new int[]{1,2,2,39,39,500}; final int N = chunks.length; double[] mses = new double[N]; for (int i=0; i<N; ++i) { Scope.enter(); // Load data, hack frames tfr = parse_test_file("smalldata/covtype/covtype.20k.data"); // rebalance to a given number of chunks Key dest = Key.make("df.rebalanced.hex"); RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, chunks[i]); H2O.submitTask(rb); rb.join(); tfr.delete(); tfr = DKV.get(dest).get(); assertEquals(tfr.vec(0).nChunks(), chunks[i]); // Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key); DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "C55"; parms._seed = 1234; parms._auto_rebalance = false; parms._col_sample_rate_per_tree = 0.5f; parms._col_sample_rate = 0.3f; parms._ntrees = 5; parms._max_depth = 5; // Build a first model; all remaining models should be equal GBM job = new GBM(parms); GBMModel drf = job.trainModel().get(); assertEquals(drf._output._ntrees, parms._ntrees); mses[i] = drf._output._scored_train[drf._output._scored_train.length-1]._mse; drf.delete(); if (tfr != null) tfr.remove(); Scope.exit(); } for (int i=0; i<mses.length; ++i) { Log.info("trial: " + i + " -> MSE: " + mses[i]); } for(double mse : mses) assertEquals(mse, mses[0], 1e-10); } @Test public void testLaplace2() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame pred=null, res=null; Scope.enter(); try { Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv"); train.remove("Site").remove(); // Remove unique ID train.remove("Method").remove(); // Remove categorical DKV.put(train); // Update frame after hacking it parms._train = train._key; parms._response_column = "DSDist"; // Train on the outcome parms._distribution = laplace; parms._sample_rate = 0.6f; parms._col_sample_rate = 0.8f; parms._col_sample_rate_per_tree = 0.8f; parms._seed = 1234; GBM job = new GBM(parms); gbm = job.trainModel().get(); pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv" ); res = gbm.score(pred); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(pred, res, 1e-15)); Assert.assertTrue(Math.abs(((ModelMetricsRegression)gbm._output._training_metrics)._mean_residual_deviance - 23.05805) < 1e-4); } finally { parms._train.remove(); if( gbm != null ) gbm .delete(); if( pred != null ) pred.remove(); if( res != null ) res .remove(); Scope.exit(); } } @Test public void testQuantileRegression() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame pred=null, res=null; Scope.enter(); try { Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv"); train.remove("Site").remove(); // Remove unique ID train.remove("Method").remove(); // Remove categorical DKV.put(train); // Update frame after hacking it parms._train = train._key; parms._response_column = "DSDist"; // Train on the outcome parms._distribution = DistributionFamily.quantile; parms._quantile_alpha = 0.4; parms._sample_rate = 0.6f; parms._col_sample_rate = 0.8f; parms._col_sample_rate_per_tree = 0.8f; parms._seed = 1234; GBM job = new GBM(parms); gbm = job.trainModel().get(); pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv" ); res = gbm.score(pred); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(pred, res, 1e-15)); Assert.assertTrue(Math.abs(((ModelMetricsRegression)gbm._output._training_metrics)._mean_residual_deviance - 10.69611) < 1e-4); } finally { parms._train.remove(); if( gbm != null ) gbm .delete(); if( pred != null ) pred.remove(); if( res != null ) res .remove(); Scope.exit(); } } @Test public void missingAndUnseenValues() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame train=null, test=null, train_preds=null, test_preds=null; Scope.enter(); try { { CreateFrame cf = new CreateFrame(); cf.rows = 100; cf.cols = 10; cf.integer_range = 1000; cf.categorical_fraction = 1.0; cf.integer_fraction = 0.0; cf.binary_fraction = 0.0; cf.time_fraction = 0.0; cf.string_fraction = 0.0; cf.binary_ones_fraction = 0.0; cf.missing_fraction = 0.2; cf.factors = 3; cf.response_factors = 2; cf.positive_response = false; cf.has_response = true; cf.seed = 1235; cf.seed_for_column_types = 1234; train = cf.execImpl().get(); } { CreateFrame cf = new CreateFrame(); cf.rows = 100; cf.cols = 10; cf.integer_range = 1000; cf.categorical_fraction = 1.0; cf.integer_fraction = 0.0; cf.binary_fraction = 0.0; cf.time_fraction = 0.0; cf.string_fraction = 0.0; cf.binary_ones_fraction = 0.0; cf.missing_fraction = 0.2; cf.factors = 3; cf.response_factors = 2; cf.positive_response = false; cf.has_response = true; cf.seed = 4321; //different test set cf.seed_for_column_types = 1234; test = cf.execImpl().get(); } parms._train = train._key; parms._response_column = "response"; // Train on the outcome parms._distribution = DistributionFamily.multinomial; parms._max_depth = 20; parms._min_rows = 1; parms._ntrees = 5; parms._seed = 1; GBM job = new GBM(parms); gbm = job.trainModel().get(); train_preds = gbm.score(train); test_preds = gbm.score(test); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(train, train_preds, 1e-15)); Key old = gbm._key; gbm._key = Key.make(gbm._key + "ha"); Assert.assertTrue(gbm.testJavaScoring(test, test_preds, 1e-15)); DKV.remove(old); } finally { if( gbm != null ) gbm .delete(); if( train != null ) train.remove(); if( test != null ) test.remove(); if( train_preds != null ) train_preds .remove(); if( test_preds != null ) test_preds .remove(); Scope.exit(); } } @Test public void minSplitImprovement() { Frame tfr = null; Key[] ksplits = null; GBMModel gbm = null; try { Scope.enter(); tfr = parse_test_file("smalldata/covtype/covtype.20k.data"); int resp = 54; // tfr = parse_test_file("bigdata/laptop/mnist/train.csv.gz"); // int resp = 784; Scope.track(tfr.replace(resp, tfr.vecs()[resp].toCategoricalVec())); DKV.put(tfr); SplitFrame sf = new SplitFrame(tfr, new double[]{0.5, 0.5}, new Key[]{Key.make("train.hex"), Key.make("valid.hex")}); // Invoke the job sf.exec().get(); ksplits = sf._destination_frames; double[] msi = new double[]{0, 1e-1}; final int N = msi.length; double[] loglosses = new double[N]; for (int i = 0; i < N; ++i) { // Load data, hack frames GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = ksplits[0]; parms._valid = ksplits[1]; parms._response_column = tfr.names()[resp]; parms._learn_rate = 0.05f; parms._min_split_improvement = msi[i]; parms._ntrees = 10; parms._score_tree_interval = parms._ntrees; parms._max_depth = 5; GBM job = new GBM(parms); gbm = job.trainModel().get(); loglosses[i] = gbm._output._scored_valid[gbm._output._scored_valid.length - 1]._logloss; if (gbm!=null) gbm.delete(); } for (int i = 0; i < msi.length; ++i) { Log.info("min_split_improvement: " + msi[i] + " -> validation logloss: " + loglosses[i]); } int idx = ArrayUtils.minIndex(loglosses); Log.info("Optimal min_split_improvement: " + msi[idx]); assertTrue(0 == idx); } finally { if (gbm!=null) gbm.delete(); if (tfr!=null) tfr.delete(); if (ksplits[0]!=null) ksplits[0].remove(); if (ksplits[1]!=null) ksplits[1].remove(); Scope.exit(); } } @Test public void histoTypes() { Frame tfr = null; Key[] ksplits = null; GBMModel gbm = null; try { Scope.enter(); tfr = parse_test_file("smalldata/covtype/covtype.20k.data"); int resp = 54; // tfr = parse_test_file("bigdata/laptop/mnist/train.csv.gz"); // int resp = 784; Scope.track(tfr.replace(resp, tfr.vecs()[resp].toCategoricalVec())); DKV.put(tfr); SplitFrame sf = new SplitFrame(tfr, new double[]{0.5, 0.5}, new Key[]{Key.make("train.hex"), Key.make("valid.hex")}); // Invoke the job sf.exec().get(); ksplits = sf._destination_frames; SharedTreeModel.SharedTreeParameters.HistogramType[] histoType = SharedTreeModel.SharedTreeParameters.HistogramType.values(); final int N = histoType.length; double[] loglosses = new double[N]; for (int i = 0; i < N; ++i) { // Load data, hack frames GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = ksplits[0]; parms._valid = ksplits[1]; parms._response_column = tfr.names()[resp]; parms._learn_rate = 0.05f; parms._histogram_type = histoType[i]; parms._ntrees = 10; parms._score_tree_interval = parms._ntrees; parms._max_depth = 5; parms._seed = 0xDECAFFEE; GBM job = new GBM(parms); gbm = job.trainModel().get(); loglosses[i] = gbm._output._scored_valid[gbm._output._scored_valid.length - 1]._logloss; if (gbm!=null) gbm.delete(); } for (int i = 0; i < histoType.length; ++i) { Log.info("histoType: " + histoType[i] + " -> validation logloss: " + loglosses[i]); } int idx = ArrayUtils.minIndex(loglosses); Log.info("Optimal randomization: " + histoType[idx]); assertTrue(4 == idx); } finally { if (tfr!=null) tfr.delete(); if (ksplits[0]!=null) ksplits[0].remove(); if (ksplits[1]!=null) ksplits[1].remove(); Scope.exit(); } } @Test public void sampleRatePerClass() { Frame tfr = null; Key[] ksplits = null; GBMModel gbm = null; try { Scope.enter(); tfr = parse_test_file("smalldata/covtype/covtype.20k.data"); int resp = 54; Scope.track(tfr.replace(resp, tfr.vecs()[resp].toCategoricalVec())); DKV.put(tfr); SplitFrame sf = new SplitFrame(tfr, new double[]{0.5, 0.5}, new Key[]{Key.make("train.hex"), Key.make("valid.hex")}); // Invoke the job sf.exec().get(); ksplits = sf._destination_frames; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = ksplits[0]; parms._valid = ksplits[1]; parms._response_column = tfr.names()[resp]; parms._learn_rate = 0.05f; parms._min_split_improvement = 1e-5; parms._ntrees = 10; parms._score_tree_interval = parms._ntrees; parms._max_depth = 5; parms._sample_rate_per_class = new double[]{0.1f,0.1f,0.2f,0.4f,1f,0.3f,0.2f}; GBM job = new GBM(parms); gbm = job.trainModel().get(); if (gbm!=null) gbm.delete(); } finally { if (gbm!=null) gbm.delete(); if (tfr!=null) tfr.delete(); if (ksplits[0]!=null) ksplits[0].remove(); if (ksplits[1]!=null) ksplits[1].remove(); Scope.exit(); } } // PUBDEV-2822 @Test public void testNA() { String xy = ",0\n1,0\n2,0\n3,0\n4,-10\n,0"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - -10) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testNARight() { String xy = ",10\n1,0\n2,0\n3,0\n4,10\n,10"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(preds.vec(0).at(0) == 10); Assert.assertTrue(preds.vec(0).at(1) == 0); Assert.assertTrue(preds.vec(0).at(2) == 0); Assert.assertTrue(preds.vec(0).at(3) == 0); Assert.assertTrue(preds.vec(0).at(4) == 10); Assert.assertTrue(preds.vec(0).at(5) == 10); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testNALeft() { String xy = ",0\n1,0\n2,0\n3,0\n4,10\n,0"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 10) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testNAvsRest() { String xy = ",5\n1,0\n2,0\n3,0\n4,0\n,3"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 4) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 4) < 1e-6); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testOnevsRest() { String xy = "-9,5\n1,0\n2,0\n3,0\n4,0\n-9,3"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 4) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 4) < 1e-6); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testNACategorical() { String xy = ",0\nA,0\nB,0\nA,0\nD,-10\n,0"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Log.info(df.toTwoDimTable()); Frame preds = gbm.score(df); Log.info(preds.toTwoDimTable()); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - -10) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testNARightCategorical() { String xy = ",10\nA,0\nB,0\nA,0\n4,10\n,10"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(preds.vec(0).at(0) == 10); Assert.assertTrue(preds.vec(0).at(1) == 0); Assert.assertTrue(preds.vec(0).at(2) == 0); Assert.assertTrue(preds.vec(0).at(3) == 0); Assert.assertTrue(preds.vec(0).at(4) == 10); Assert.assertTrue(preds.vec(0).at(5) == 10); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testNALeftCategorical() { String xy = ",0\nA,0\nB,0\nA,0\nD,10\n,0"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 10) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testNAvsRestCategorical() { String xy = ",5\nA,0\nB,0\nA,0\nD,0\n,3"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Frame preds = gbm.score(df); Log.info(df); Log.info(preds); Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 4) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 4) < 1e-6); preds.remove(); gbm.remove(); df.remove(); } // PUBDEV-2822 @Test public void testUnseenNACategorical() { String xy = "B,-5\nA,0\nB,0\nA,0\nD,0\nA,3"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); String test = ",5\n,0\nB,0\n,0\nE,0\n,3"; Key te = Key.make("test"); Frame df2 = ParseDataset.parse(te, makeByteVec(Key.make("te"), test)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Scope.enter(); //AdaptTestTrain leaks when it does inplace Vec adaptation, need a Scope to catch that stuff Frame preds = gbm.score(df); Frame preds2 = gbm.score(df2); Log.info(df); Log.info(preds); Log.info(df2); Log.info(preds2); Assert.assertTrue(gbm.testJavaScoring(df, preds, 1e-15)); Assert.assertTrue(gbm.testJavaScoring(df2, preds2, 1e-15)); Assert.assertTrue(Math.abs(preds.vec(0).at(0) - -2.5) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 1) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(2) - -2.5) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 1) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6); Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 1) < 1e-6); preds.remove(); preds2.remove(); gbm.remove(); df.remove(); df2.remove(); Scope.exit(); } @Test public void unseenMissing() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame train=null, test=null, train_preds=null, test_preds=null; Scope.enter(); try { { CreateFrame cf = new CreateFrame(); cf.rows = 100; cf.cols = 10; cf.integer_range = 1000; cf.categorical_fraction = 1.0; cf.integer_fraction = 0.0; cf.binary_fraction = 0.0; cf.time_fraction = 0.0; cf.string_fraction = 0.0; cf.binary_ones_fraction = 0.0; cf.missing_fraction = 0.0; cf.factors = 3; cf.response_factors = 2; cf.positive_response = false; cf.has_response = true; cf.seed = 1235; cf.seed_for_column_types = 1234; train = cf.execImpl().get(); } { CreateFrame cf = new CreateFrame(); cf.rows = 100; cf.cols = 10; cf.integer_range = 1000; cf.categorical_fraction = 1.0; cf.integer_fraction = 0.0; cf.binary_fraction = 0.0; cf.time_fraction = 0.0; cf.string_fraction = 0.0; cf.binary_ones_fraction = 0.0; cf.missing_fraction = 0.8; cf.factors = 3; cf.response_factors = 2; cf.positive_response = false; cf.has_response = true; cf.seed = 4321; //different test set cf.seed_for_column_types = 1234; test = cf.execImpl().get(); } parms._train = train._key; parms._response_column = "response"; // Train on the outcome parms._distribution = DistributionFamily.multinomial; parms._max_depth = 20; parms._min_rows = 1; parms._ntrees = 5; parms._seed = 1; GBM job = new GBM(parms); gbm = job.trainModel().get(); train_preds = gbm.score(train); test_preds = gbm.score(test); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(train, train_preds, 1e-15)); Key old = gbm._key; gbm._key = Key.make(gbm._key + "ha"); Assert.assertTrue(gbm.testJavaScoring(test, test_preds, 1e-15)); DKV.remove(old); } finally { if( gbm != null ) gbm .delete(); if( train != null ) train.remove(); if( test != null ) test.remove(); if( train_preds != null ) train_preds .remove(); if( test_preds != null ) test_preds .remove(); Scope.exit(); } } //PUBDEV-3066 @Test public void testAnnealingStop() { Frame tfr=null; final int N = 1; Scope.enter(); try { // Load data, hack frames tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip"); for (String s : new String[]{ "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) { tfr.remove(s).remove(); } DKV.put(tfr); for (int i=0; i<N; ++i) { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "IsDepDelayed"; parms._nbins = 10; parms._nbins_cats = 500; parms._ntrees = 100; parms._learn_rate_annealing = 0.5; parms._max_depth = 5; parms._min_rows = 10; parms._distribution = DistributionFamily.bernoulli; parms._balance_classes = true; parms._seed = 0; // Build a first model; all remaining models should be equal GBMModel gbm = new GBM(parms).trainModel().get(); Assert.assertNotEquals(gbm._output._ntrees, parms._ntrees); gbm.delete(); } } finally { if (tfr != null) tfr.remove(); } Scope.exit(); } @Ignore public void testModifiedHuber() { Frame tfr = null, vfr = null; GBMModel gbm = null; Scope.enter(); try { tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip"); for (String s : new String[]{ "DepTime", "ArrTime", "ActualElapsedTime", "AirTime", "ArrDelay", "DepDelay", "Cancelled", "CancellationCode", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed" }) { tfr.remove(s).remove(); } DKV.put(tfr); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = "IsDepDelayed"; parms._seed = 1234; parms._distribution = DistributionFamily.modified_huber; parms._min_rows = 1; parms._learn_rate = .1; parms._max_depth = 5; parms._ntrees = 10; // Build a first model; all remaining models should be equal gbm = new GBM(parms).trainModel().get(); Frame train_preds = gbm.score(tfr); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(tfr, train_preds, 1e-15)); train_preds.remove(); ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics; // assertEquals(0.59998, mm.auc_obj()._auc, 1e-4); // 1 node // assertEquals(0.31692, mm.mse(), 1e-4); // assertEquals(0.79069, mm.logloss(), 1e-4); } finally { if (tfr != null) tfr.remove(); if (vfr != null) vfr.remove(); if (gbm != null) { gbm.deleteCrossValidationModels(); gbm.delete(); } Scope.exit(); } } @Ignore public void testModifiedHuberStability() { String xy = "A,Y\nB,N\nA,N\nB,N\nA,Y\nA,Y"; Key tr = Key.make("train"); Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy)); String test = "A,Y\nB,N\nA,N\nB,N\nA,Y\nA,Y"; Key te = Key.make("test"); Frame df2 = ParseDataset.parse(te, makeByteVec(Key.make("te"), test)); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tr; parms._response_column = "C2"; parms._min_rows = 1; parms._learn_rate = 1; parms._distribution = DistributionFamily.modified_huber; parms._ntrees = 1; GBM job = new GBM(parms); GBMModel gbm = job.trainModel().get(); Scope.enter(); //AdaptTestTrain leaks when it does inplace Vec adaptation, need a Scope to catch that stuff Frame preds = gbm.score(df); Frame preds2 = gbm.score(df2); Log.info(df); Log.info(preds); Log.info(df2); Log.info(preds2); Assert.assertTrue(gbm.testJavaScoring(df, preds, 1e-15)); Assert.assertTrue(gbm.testJavaScoring(df2, preds2, 1e-15)); // Assert.assertTrue(Math.abs(preds.vec(0).at(0) - -2.5) < 1e-6); // Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 1) < 1e-6); // Assert.assertTrue(Math.abs(preds.vec(0).at(2) - -2.5) < 1e-6); // Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 1) < 1e-6); // Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6); // Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 1) < 1e-6); preds.remove(); preds2.remove(); gbm.remove(); df.remove(); df2.remove(); Scope.exit(); } @Test public void testHuber2() { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame pred=null, res=null; Scope.enter(); try { Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv"); train.remove("Site").remove(); // Remove unique ID train.remove("Method").remove(); // Remove categorical DKV.put(train); // Update frame after hacking it parms._train = train._key; parms._response_column = "DSDist"; // Train on the outcome parms._distribution = huber; parms._huber_alpha = 0.5; parms._sample_rate = 0.6f; parms._col_sample_rate = 0.8f; parms._col_sample_rate_per_tree = 0.8f; parms._seed = 1234; GBM job = new GBM(parms); gbm = job.trainModel().get(); pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv" ); res = gbm.score(pred); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(pred, res, 1e-15)); Assert.assertTrue(Math.abs(((ModelMetricsRegression)gbm._output._training_metrics)._MSE - 1485) < 1); Assert.assertTrue(Math.abs(((ModelMetricsRegression)gbm._output._training_metrics)._mean_residual_deviance - 256.88) < 1); } finally { parms._train.remove(); if( gbm != null ) gbm .delete(); if( pred != null ) pred.remove(); if( res != null ) res .remove(); Scope.exit(); } } @Test public void testLaplace() { Frame tfr = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._seed = 0xdecaf; parms._distribution = laplace; gbm = new GBM(parms).trainModel().get(); Assert.assertEquals(8.05716257,((ModelMetricsRegression)gbm._output._training_metrics)._MSE,1e-5); Assert.assertEquals(1.42298/*MAE*/,((ModelMetricsRegression)gbm._output._training_metrics)._mean_residual_deviance,1e-5); } finally { if (tfr != null) tfr.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } @Test public void testGaussian() { Frame tfr = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._seed = 0xdecaf; parms._distribution = gaussian; gbm = new GBM(parms).trainModel().get(); Assert.assertEquals(2.9423857564,((ModelMetricsRegression) gbm._output._training_metrics)._MSE,1e-5); Assert.assertEquals(2.9423857564,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-5); } finally { if (tfr != null) tfr.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } @Test public void testHuberDeltaLarge() { Frame tfr = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._seed = 0xdecaf; parms._distribution = huber; parms._huber_alpha = 1; // nothing is an outlier - same as gaussian gbm = new GBM(parms).trainModel().get(); Assert.assertEquals(2.9423857564,((ModelMetricsRegression) gbm._output._training_metrics)._MSE,1e-2); // huber loss with delta -> max(error) goes to MSE Assert.assertEquals(2.9423857564,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-2); } finally { if (tfr != null) tfr.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } @Test public void testHuberDeltaTiny() { Frame tfr = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._seed = 0xdecaf; parms._distribution = huber; parms._huber_alpha = 1e-2; //everything is an outlier and we should get laplace loss gbm = new GBM(parms).trainModel().get(); Assert.assertEquals(8.05716257,((ModelMetricsRegression)gbm._output._training_metrics)._MSE,0.3); // Huber loss can be derived from MAE since no obs weights double delta = 0.0047234; //hardcoded from output double MAE = 1.42298; //see laplace above Assert.assertEquals((2*MAE-delta)*delta,((ModelMetricsRegression)gbm._output._training_metrics)._mean_residual_deviance,2e-4); } finally { if (tfr != null) tfr.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } @Test public void testHuber() { Frame tfr = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._seed = 0xdecaf; parms._distribution = huber; parms._huber_alpha = 0.9; //that's the default gbm = new GBM(parms).trainModel().get(); Assert.assertEquals(4.447062185,((ModelMetricsRegression)gbm._output._training_metrics)._MSE,1e-5); Assert.assertEquals(1.962926332,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-4); } finally { if (tfr != null) tfr.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } @Test public void testHuberNoise() { Frame tfr = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._seed = 0xdecaf; parms._distribution = huber; parms._huber_alpha = 0.9; //that's the default parms._pred_noise_bandwidth = 0.2; gbm = new GBM(parms).trainModel().get(); Assert.assertEquals(4.8056900203,((ModelMetricsRegression)gbm._output._training_metrics)._MSE,1e-5); Assert.assertEquals(2.0080997,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-4); } finally { if (tfr != null) tfr.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } @Test public void testDeviances() { for (DistributionFamily dist : DistributionFamily.values()) { if (dist == modified_huber) continue; Frame tfr = null; Frame res = null; Frame preds = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; String resp = tfr.lastVecName(); if (dist==modified_huber || dist==bernoulli || dist==multinomial) { resp = dist==multinomial?"rad":"chas"; Vec v = tfr.remove(resp); tfr.add(resp, v.toCategoricalVec()); v.remove(); DKV.put(tfr); } parms._response_column = resp; parms._distribution = dist; gbm = new GBM(parms).trainModel().get(); preds = gbm.score(tfr); res = gbm.computeDeviances(tfr,preds,"myDeviances"); double meanDeviance = res.anyVec().mean(); if (gbm._output.nclasses()==2) Assert.assertEquals(meanDeviance,((ModelMetricsBinomial) gbm._output._training_metrics)._logloss,1e-6*Math.abs(meanDeviance)); else if (gbm._output.nclasses()>2) Assert.assertEquals(meanDeviance,((ModelMetricsMultinomial) gbm._output._training_metrics)._logloss,1e-6*Math.abs(meanDeviance)); else Assert.assertEquals(meanDeviance,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-6*Math.abs(meanDeviance)); } finally { if (tfr != null) tfr.delete(); if (res != null) res.delete(); if (preds != null) preds.delete(); if (gbm != null) gbm.delete(); } } } @Test public void testCatEncoding() { for (Model.Parameters.CategoricalEncodingScheme c : Model.Parameters.CategoricalEncodingScheme.values()) { if (c == Model.Parameters.CategoricalEncodingScheme.OneHotInternal) continue; Frame tfr = null; GBMModel gbm = null; Frame fr2 = null; try { tfr = parse_test_file("./smalldata/junit/weather.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._ntrees = 5; parms._categorical_encoding = c; gbm = new GBM(parms).trainModel().get(); // Done building model; produce a score column with predictions fr2 = gbm.score(tfr); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(tfr,fr2,1e-15)); } finally { if (tfr != null) tfr.delete(); if (fr2 != null) fr2.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } } @Test public void testCatEncodingCV() { for (Model.Parameters.CategoricalEncodingScheme c : Model.Parameters.CategoricalEncodingScheme.values()) { if (c == Model.Parameters.CategoricalEncodingScheme.OneHotInternal) continue; Frame tfr = null; GBMModel gbm = null; try { tfr = parse_test_file("./smalldata/junit/weather.csv"); GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); parms._train = tfr._key; parms._response_column = tfr.lastVecName(); parms._ntrees = 5; parms._categorical_encoding = c; parms._nfolds = 3; gbm = new GBM(parms).trainModel().get(); } finally { if (tfr != null) tfr.delete(); if (gbm != null) gbm.deleteCrossValidationModels(); if (gbm != null) gbm.delete(); } } } // A test of the validity of categorical splits @Test public void testCategoricalSplits() throws FileNotFoundException { Frame fr=null; GBMModel model = null; Scope.enter(); try { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); fr = parse_test_file("smalldata/gbm_test/ecology_model.csv"); fr.remove("Site").remove(); fr.remove("SegSumT").remove(); fr.remove("SegTSeas").remove(); fr.remove("SegLowFlow").remove(); fr.remove("DSDist").remove(); fr.remove("DSMaxSlope").remove(); fr.remove("USAvgT").remove(); fr.remove("USRainDays").remove(); fr.remove("USSlope").remove(); // fr.remove("USNative").remove(); fr.remove("DSDam").remove(); // fr.remove("LocSed").remove(); fr.remove("Method").remove(); int ci = fr.find("Angaus"); Scope.track(fr.replace(ci, fr.vecs()[ci].toCategoricalVec())); // Convert response 'Angaus' to categorical DKV.put(fr); parms._train = fr._key; parms._response_column = "Angaus"; parms._ntrees = 1; parms._min_rows = 10; parms._max_depth = 13; parms._distribution = DistributionFamily.multinomial; model = new GBM(parms).trainModel().get(); // StreamingSchema ss = new StreamingSchema(model.getMojo(), "model.zip"); // FileOutputStream fos = new FileOutputStream("model.zip"); // ss.getStreamWriter().writeTo(fos); } finally { if( model != null ) model.delete(); if( fr != null ) fr.remove(); Scope.exit(); } } // A test of the validity of categorical splits @Test public void testCategoricalSplits2() throws FileNotFoundException { Frame fr=null; GBMModel model = null; Scope.enter(); try { GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); fr = parse_test_file("smalldata/airlines/allyears2k_headers.zip"); Frame fr2 = new Frame(Key.<Frame>make(), new String[]{"C","R"}, new Vec[]{fr.vec("Origin"),fr.vec("IsDepDelayed")}); int ci = fr2.find("R"); Scope.track(fr2.replace(ci, fr2.vecs()[ci].toCategoricalVec())); // Convert response 'Angaus' to categorical DKV.put(fr2); parms._train = fr2._key; parms._response_column = "R"; parms._ntrees = 1; parms._min_rows = 1000; parms._max_depth = 4; parms._distribution = DistributionFamily.bernoulli; model = new GBM(parms).trainModel().get(); DKV.remove(fr2._key); // StreamingSchema ss = new StreamingSchema(model.getMojo(), "model.zip"); // FileOutputStream fos = new FileOutputStream("model.zip"); // ss.getStreamWriter().writeTo(fos); } finally { if( model != null ) model.delete(); if( fr != null ) fr.remove(); Scope.exit(); } } @Test public void highCardinalityLowNbinsCats() { highCardinality(2000); } @Test public void highCardinalityHighNbinsCats() { highCardinality(6000); } public void highCardinality(int nbins_cats) { GBMModel gbm = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame train=null, test=null, train_preds=null, test_preds=null; Scope.enter(); try { { CreateFrame cf = new CreateFrame(); cf.rows = 10000; cf.cols = 10; cf.integer_range = 1000; cf.categorical_fraction = 1.0; cf.integer_fraction = 0.0; cf.binary_fraction = 0.0; cf.time_fraction = 0.0; cf.string_fraction = 0.0; cf.binary_ones_fraction = 0.0; cf.missing_fraction = 0.2; cf.factors = 3000; cf.response_factors = 2; cf.positive_response = false; cf.has_response = true; cf.seed = 1235; cf.seed_for_column_types = 1234; train = cf.execImpl().get(); } { CreateFrame cf = new CreateFrame(); cf.rows = 10000; cf.cols = 10; cf.integer_range = 1000; cf.categorical_fraction = 1.0; cf.integer_fraction = 0.0; cf.binary_fraction = 0.0; cf.time_fraction = 0.0; cf.string_fraction = 0.0; cf.binary_ones_fraction = 0.0; cf.missing_fraction = 0.2; cf.factors = 5000; cf.response_factors = 2; cf.positive_response = false; cf.has_response = true; cf.seed = 5321; cf.seed_for_column_types = 1234; test = cf.execImpl().get(); } parms._train = train._key; parms._response_column = "response"; // Train on the outcome parms._max_depth = 20; //allow it to overfit parms._min_rows = 1; parms._ntrees = 1; parms._nbins_cats = nbins_cats; parms._seed = 0x2834234; GBM job = new GBM(parms); gbm = job.trainModel().get(); train_preds = gbm.score(train); test_preds = gbm.score(test); new MRTask() { public void map(Chunk c) { for (int i=0;i<c._len;++i) if (c.isNA(i)) c.set(i, 0.5); } }.doAll(train.vec("response")); new MRTask() { public void map(Chunk c) { for (int i=0;i<c._len;++i) if (c.isNA(i)) c.set(i, 0.5); } }.doAll(test.vec("response")); Log.info("Train AUC: " + ModelMetricsBinomial.make(train_preds.vec(2), train.vec("response")).auc()); Log.info("Test AUC: " + ModelMetricsBinomial.make(test_preds.vec(2), test.vec("response")).auc()); // Build a POJO, validate same results Assert.assertTrue(gbm.testJavaScoring(train, train_preds, 1e-15)); Key old = gbm._key; gbm._key = Key.make(gbm._key + "ha"); Assert.assertTrue(gbm.testJavaScoring(test, test_preds, 1e-15)); DKV.remove(old); } finally { if( gbm != null ) gbm .delete(); if( train != null ) train.remove(); if( test != null ) test.remove(); if( train_preds != null ) train_preds .remove(); if( test_preds != null ) test_preds .remove(); Scope.exit(); } } @Test public void lowCardinality() throws IOException { for (boolean sort_cats : new boolean[]{true, false}) { int[] vals = new int[]{2,10,20,25,26,27,100}; double[] maes = new double[vals.length]; int i=0; for (int nbins_cats : vals) { GBMModel model = null; GBMModel.GBMParameters parms = new GBMModel.GBMParameters(); Frame train, train_preds=null; Scope.enter(); train = parse_test_file("smalldata/gbm_test/alphabet_cattest.csv"); try { parms._train = train._key; parms._response_column = "y"; // Train on the outcome parms._max_depth = 2; parms._min_rows = 1; parms._ntrees = 1; parms._learn_rate = 1; parms._nbins_cats = nbins_cats; if (sort_cats) parms._categorical_encoding = Model.Parameters.CategoricalEncodingScheme.SortByResponse; GBM job = new GBM(parms); model = job.trainModel().get(); StreamingSchema ss = new StreamingSchema(model.getMojo(), "model.zip"); FileOutputStream fos = new FileOutputStream("model.zip"); ss.getStreamWriter().writeTo(fos); train_preds = model.score(train); Assert.assertTrue(model.testJavaScoring(train, train_preds, 1e-15)); double mae = ModelMetricsRegression.make(train_preds.vec(0), train.vec("y"), gaussian).mae(); Log.info("Train MAE: " + mae); maes[i++] = mae; if (nbins_cats >= 25 || sort_cats) Assert.assertEquals(0, mae, 1e-8); // sorting of categoricals is enough else Assert.assertTrue(mae > 0); } finally { if( model != null ) model.delete(); if( train != null ) train.remove(); if( train_preds != null ) train_preds .remove(); new File("model.zip").delete(); Scope.exit(); } } Log.info(Arrays.toString(vals)); Log.info(Arrays.toString(maes)); } } }