package hex.tree.gbm;
import hex.*;
import hex.genmodel.utils.DistributionFamily;
import hex.tree.SharedTreeModel;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import water.*;
import water.api.StreamingSchema;
import water.exceptions.H2OModelBuilderIllegalArgumentException;
import water.fvec.Chunk;
import water.fvec.Frame;
import water.fvec.RebalanceDataSet;
import water.fvec.Vec;
import water.parser.ParseDataset;
import water.util.*;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import static hex.genmodel.utils.DistributionFamily.*;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static water.fvec.FVecTest.makeByteVec;
public class GBMTest extends TestUtil {
// Block all tests until a 1-node H2O cloud has formed.
@BeforeClass public static void stall() { stall_till_cloudsize(1); }
// Per-test frame "hack" hook: mutates the parsed frame in place and returns the
// response-column index; a bitwise-complemented (~idx, negative) value marks a
// regression response (see basicGBM, which undoes it with idx = ~idx).
private abstract class PrepData { abstract int prep(Frame fr); }
// Airline columns dropped before training in testBasicGBM's airline case.
static final String ignored_aircols[] = new String[] { "DepTime", "ArrTime", "AirTime", "ArrDelay", "DepDelay", "TaxiIn", "TaxiOut", "Cancelled", "CancellationCode", "Diverted", "CarrierDelay", "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsDepDelayed"};
// Single-tree, depth-1 gaussian GBM on a tiny 3-column file (row id, response,
// one predictor). Verifies the MSE computed by re-scoring the training frame
// matches the model's own reported training MSE and mean residual deviance.
@Test public void testGBMRegressionGaussian() {
GBMModel gbm = null;
Frame fr = null, fr2 = null;
try {
fr = parse_test_file("./smalldata/gbm_test/Mfgdata_gaussian_GBM_testing.csv");
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = fr._key;
parms._distribution = gaussian;
parms._response_column = fr._names[1]; // Row in col 0, dependent in col 1, predictor in col 2
parms._ntrees = 1;
parms._max_depth = 1;
parms._min_rows = 1;
parms._nbins = 20;
// Drop ColV2 0 (row), keep 1 (response), keep col 2 (only predictor), drop remaining cols
String[] xcols = parms._ignored_columns = new String[fr.numCols()-2];
xcols[0] = fr._names[0];
System.arraycopy(fr._names,3,xcols,1,fr.numCols()-3);
parms._learn_rate = 1.0f;
parms._score_each_iteration=true;
GBM job = new GBM(parms);
gbm = job.trainModel().get();
Assert.assertTrue(job.isStopped()); //HEX-1817
// Done building model; produce a score column with predictions
fr2 = gbm.score(fr);
//job.response() can be used in place of fr.vecs()[1] but it has been rebalanced
double sq_err = new MathUtils.SquareError().doAll(fr.vecs()[1],fr2.vecs()[0])._sum;
double mse = sq_err/fr2.numRows();
// Externally computed MSE must agree with the scoring history (index 1 = after tree 1).
assertEquals(79152.12337641386,mse,0.1);
assertEquals(79152.12337641386,gbm._output._scored_train[1]._mse,0.1);
assertEquals(79152.12337641386,gbm._output._scored_train[1]._mean_residual_deviance,0.1);
} finally {
if( fr != null ) fr .remove();
if( fr2 != null ) fr2.remove();
if( gbm != null ) gbm.remove();
}
}
// Smoke-test basicGBM across datasets and distribution families: regression
// (gaussian/poisson/gamma/tweedie on cars) and classification
// (multinomial/bernoulli on several small datasets plus the airline zip).
@Test public void testBasicGBM() {
// Regression tests
basicGBM("./smalldata/junit/cars.csv",
new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }},
false, gaussian);
basicGBM("./smalldata/junit/cars.csv",
new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }},
false, DistributionFamily.poisson);
basicGBM("./smalldata/junit/cars.csv",
new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }},
false, DistributionFamily.gamma);
basicGBM("./smalldata/junit/cars.csv",
new PrepData() { int prep(Frame fr ) {fr.remove("name").remove(); return ~fr.find("economy (mpg)"); }},
false, DistributionFamily.tweedie);
// Classification tests
basicGBM("./smalldata/junit/test_tree.csv",
new PrepData() { int prep(Frame fr) { return 1; }
},
false, DistributionFamily.multinomial);
basicGBM("./smalldata/junit/test_tree_minmax.csv",
new PrepData() { int prep(Frame fr) { return fr.find("response"); }
},
false, DistributionFamily.bernoulli);
basicGBM("./smalldata/logreg/prostate.csv",
new PrepData() { int prep(Frame fr) { fr.remove("ID").remove(); return fr.find("CAPSULE"); }
},
false, DistributionFamily.bernoulli);
basicGBM("./smalldata/logreg/prostate.csv",
new PrepData() { int prep(Frame fr) { fr.remove("ID").remove(); return fr.find("CAPSULE"); }
},
false, DistributionFamily.multinomial);
basicGBM("./smalldata/junit/cars.csv",
new PrepData() { int prep(Frame fr) { fr.remove("name").remove(); return fr.find("cylinders"); }
},
false, DistributionFamily.multinomial);
basicGBM("./smalldata/gbm_test/alphabet_cattest.csv",
new PrepData() { int prep(Frame fr) { return fr.find("y"); }
},
false, DistributionFamily.bernoulli);
// basicGBM("./smalldata/gbm_test/alphabet_cattest.csv",
// new PrepData() { int prep(Frame fr) { return fr.find("y"); }
// },
// false, DistributionFamily.modified_huber);
basicGBM("./smalldata/airlines/allyears2k_headers.zip",
new PrepData() { int prep(Frame fr) {
for( String s : ignored_aircols ) fr.remove(s).remove();
return fr.find("IsArrDelayed"); }
},
false, DistributionFamily.bernoulli);
// // Bigger Tests
// basicGBM("../datasets/98LRN.CSV",
// new PrepData() { int prep(Frame fr ) {
// fr.remove("CONTROLN").remove();
// fr.remove("TARGET_D").remove();
// return fr.find("TARGET_B"); }});
// basicGBM("../datasets/UCI/UCI-large/covtype/covtype.data",
// new PrepData() { int prep(Frame fr) { return fr.numCols()-1; } });
}
// Bernoulli GBM on prostate with RACE converted to categorical; exercises
// mixed numeric/categorical predictors through the shared basicGBM driver.
@Test public void testBasicGBMFamily() {
Scope.enter();
// Classification with Bernoulli family
basicGBM("./smalldata/logreg/prostate.csv",
new PrepData() {
int prep(Frame fr) {
fr.remove("ID").remove(); // Remove not-predictive ID
int ci = fr.find("RACE"); // Change RACE to categorical
Scope.track(fr.replace(ci,fr.vecs()[ci].toCategoricalVec()));
return fr.find("CAPSULE"); // Prostate: predict on CAPSULE
}
}, false, DistributionFamily.bernoulli);
Scope.exit();
}
// ==========================================================================
/**
 * Shared GBM driver: parse {@code fname}, let {@code prep} hack the frame and
 * pick the response column (a negative ~idx value marks a regression response),
 * train a 5-tree GBM of the given {@code family}, re-score the training frame,
 * and validate POJO scoring against in-H2O scoring.
 *
 * @param fname      dataset path to parse
 * @param prep       frame mutator; returns response index (or ~index for regression)
 * @param validation if true, also validate on a clone of the training frame
 * @param family     distribution family to train with
 * @return the trained model's output (scoring history, metrics, ...)
 */
public GBMModel.GBMOutput basicGBM(String fname, PrepData prep, boolean validation, DistributionFamily family) {
GBMModel gbm = null;
Frame fr = null, fr2= null, vfr=null;
try {
Scope.enter();
fr = parse_test_file(fname);
int idx = prep.prep(fr); // hack frame per-test
// Classification families need a categorical response column.
if (family == DistributionFamily.bernoulli || family == DistributionFamily.multinomial || family == DistributionFamily.modified_huber) {
if (!fr.vecs()[idx].isCategorical()) {
Scope.track(fr.replace(idx, fr.vecs()[idx].toCategoricalVec()));
}
}
DKV.put(fr); // Update frame after hacking it
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
if( idx < 0 ) idx = ~idx; // undo the regression marker from prep()
parms._train = fr._key;
parms._response_column = fr._names[idx];
parms._ntrees = 5;
parms._distribution = family;
parms._max_depth = 4;
parms._min_rows = 1;
parms._nbins = 50;
parms._learn_rate = .2f;
parms._score_each_iteration = true;
if( validation ) { // Make a validation frame that's a clone of the training data
vfr = new Frame(fr);
DKV.put(vfr);
parms._valid = vfr._key;
}
GBM job = new GBM(parms);
gbm = job.trainModel().get();
// Done building model; produce a score column with predictions
fr2 = gbm.score(fr);
// Build a POJO, validate same results
Assert.assertTrue(gbm.testJavaScoring(fr,fr2,1e-15));
Assert.assertTrue(job.isStopped()); //HEX-1817
return gbm._output;
} finally {
if( fr != null ) fr .remove();
if( fr2 != null ) fr2.remove();
if( vfr != null ) vfr.remove();
if( gbm != null ) gbm.delete();
Scope.exit();
}
}
// Test-on-Train. Slow test, needed to build a good model.
// Test-on-Train. Slow test, needed to build a good model.
// Trains on ecology_model, validates on ecology_eval, and sanity-checks the
// validation AUC range plus the default confusion matrix.
@Test public void testGBMTrainTest() {
GBMModel gbm = null;
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
try {
Scope.enter();
parms._valid = parse_test_file("smalldata/gbm_test/ecology_eval.csv")._key;
Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv");
train.remove("Site").remove(); // Remove unique ID
int ci = train.find("Angaus"); // Convert response to categorical
Scope.track(train.replace(ci, train.vecs()[ci].toCategoricalVec()));
DKV.put(train); // Update frame after hacking it
parms._train = train._key;
parms._response_column = "Angaus"; // Train on the outcome
parms._ntrees = 5;
parms._max_depth = 5;
parms._min_rows = 10;
parms._nbins = 100;
parms._learn_rate = .2f;
parms._distribution = DistributionFamily.multinomial;
gbm = new GBM(parms).trainModel().get();
hex.ModelMetricsBinomial mm = hex.ModelMetricsBinomial.getFromDKV(gbm,parms.valid());
double auc = mm._auc._auc;
Assert.assertTrue(0.83 <= auc && auc < 0.87); // Sanely good model
double[][] cm = mm._auc.defaultCM();
Assert.assertArrayEquals(ard(ard(349, 44), ard(43, 64)), cm);
} finally {
parms._train.remove();
parms._valid.remove();
if( gbm != null ) gbm.delete();
Scope.exit();
}
}
// Predict with no actual, after training
// Predict with no actual, after training: score a frame whose response column
// has been dropped, and validate POJO scoring matches in-H2O scoring.
@Test public void testGBMPredict() {
GBMModel gbm = null;
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
Frame pred=null, res=null;
Scope.enter();
try {
Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv");
train.remove("Site").remove(); // Remove unique ID
int ci = train.find("Angaus");
Scope.track(train.replace(ci, train.vecs()[ci].toCategoricalVec())); // Convert response 'Angaus' to categorical
DKV.put(train); // Update frame after hacking it
parms._train = train._key;
parms._response_column = "Angaus"; // Train on the outcome
parms._distribution = DistributionFamily.multinomial;
gbm = new GBM(parms).trainModel().get();
pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv" );
pred.remove("Angaus").remove(); // No response column during scoring
res = gbm.score(pred);
// Build a POJO, validate same results
Assert.assertTrue(gbm.testJavaScoring(pred, res, 1e-15));
} finally {
parms._train.remove();
if( gbm != null ) gbm .delete();
if( pred != null ) pred.remove();
if( res != null ) res .remove();
Scope.exit();
}
}
// Scoring should output original probabilities and probabilities calibrated by Platt Scaling
// Scoring should output original probabilities and probabilities calibrated by
// Platt Scaling: checks the extra cal_p0/cal_p1 columns and their means.
@Test public void testGBMPredictWithCalibration() {
GBMModel gbm = null;
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
Scope.enter();
try {
Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv");
Frame calib = parse_test_file("smalldata/gbm_test/ecology_eval.csv");
// Fix training set
train.remove("Site").remove(); // Remove unique ID
Scope.track(train.vec("Angaus"));
train.replace(train.find("Angaus"), train.vecs()[train.find("Angaus")].toCategoricalVec());
Scope.track(train);
DKV.put(train); // Update frame after hacking it
// Fix calibration set (the same way as training)
Scope.track(calib.vec("Angaus"));
calib.replace(calib.find("Angaus"), calib.vecs()[calib.find("Angaus")].toCategoricalVec());
Scope.track(calib);
DKV.put(calib); // Update frame after hacking it
parms._train = train._key;
parms._calibrate_model = true;
parms._calibration_frame = calib._key;
parms._response_column = "Angaus"; // Train on the outcome
parms._distribution = DistributionFamily.multinomial;
gbm = new GBM(parms).trainModel().get();
Frame pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv");
pred.remove("Angaus").remove(); // No response column during scoring
Scope.track(pred);
Frame res = Scope.track(gbm.score(pred));
// Calibrated columns are appended after the raw probability columns.
assertArrayEquals(new String[]{"predict", "p0", "p1", "cal_p0", "cal_p1"}, res._names);
assertEquals(res.vec("cal_p0").mean(), 0.7860, 1e-4);
assertEquals(res.vec("cal_p1").mean(), 0.2139, 1e-4);
} finally {
if (gbm != null)
gbm.remove();
Scope.exit();
}
}
// Adapt a trained model to a test dataset with different categoricals
// Adapt a trained model to a test dataset with different categoricals:
// a 1-tree CART-style model must still produce the expected class predictions
// on a test set containing an unseen extra class.
@Test public void testModelAdaptMultinomial() {
GBMModel gbm = null;
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
try {
Scope.enter();
Frame v;
parms._train = ( parse_test_file("smalldata/junit/mixcat_train.csv"))._key;
parms._valid = (v=parse_test_file("smalldata/junit/mixcat_test.csv" ))._key;
parms._response_column = "Response"; // Train on the outcome
parms._ntrees = 1; // Build a CART tree - 1 tree, full learn rate, down to 1 row
parms._learn_rate = 1.0f;
parms._min_rows = 1;
parms._distribution = DistributionFamily.multinomial;
gbm = new GBM(parms).trainModel().get();
Frame res = gbm.score(v);
int[] ps = new int[(int)v.numRows()];
Vec.Reader vr = res.vecs()[0].new Reader();
for( int i=0; i<ps.length; i++ ) ps[i] = (int)vr.at8(i);
// Expected predictions are X,X,Y,Y,X,Y,Z,X,Y
// Never predicts W, the extra class in the test set.
// Badly predicts Z because 1 tree does not pick up that feature#2 can also
// be used to predict Z, and instead relies on factor C which does not appear
// in the test set.
Assert.assertArrayEquals("", ps, new int[]{1, 1, 2, 2, 1, 2, 3, 1, 2});
hex.ModelMetricsMultinomial mm = hex.ModelMetricsMultinomial.getFromDKV(gbm,parms.valid());
// Build a POJO, validate same results
Assert.assertTrue(gbm.testJavaScoring(v,res,1e-15));
res.remove();
} finally {
parms._train.remove();
parms._valid.remove();
if( gbm != null ) gbm.delete();
Scope.exit();
}
}
// A test of locking the input dataset during model building.
// A test of locking the input dataset during model building: deleting the
// training frame while a GBM build is in flight must throw (possibly an
// IllegalArgumentException wrapped in a RuntimeException).
@Test public void testModelLock() {
GBM gbm=null;
Frame fr=null;
Scope.enter();
try {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
fr = parse_test_file("smalldata/gbm_test/ecology_model.csv");
fr.remove("Site").remove(); // Remove unique ID
int ci = fr.find("Angaus");
Scope.track(fr.replace(ci, fr.vecs()[ci].toCategoricalVec())); // Convert response 'Angaus' to categorical
DKV.put(fr); // Update after hacking
parms._train = fr._key;
parms._response_column = "Angaus"; // Train on the outcome
parms._ntrees = 10;
parms._max_depth = 10;
parms._min_rows = 1;
parms._nbins = 20;
parms._learn_rate = .2f;
parms._distribution = DistributionFamily.multinomial;
gbm = new GBM(parms);
gbm.trainModel(); // async launch; intentionally not waited on here
// Give the build a moment to start and take the frame lock.
try { Thread.sleep(100); } catch( Exception ignore ) { }
try {
Log.info("Trying illegal frame delete.");
fr.delete(); // Attempted delete while model-build is active
Assert.fail("Should toss IAE instead of reaching here");
} catch( IllegalArgumentException ignore ) {
} catch( RuntimeException re ) {
assertTrue( re.getCause() instanceof IllegalArgumentException);
}
Log.info("Getting model");
GBMModel model = gbm.get();
Assert.assertTrue(gbm.isStopped()); //HEX-1817
if( model != null ) model.delete();
} finally {
if( fr != null ) fr .remove();
Scope.exit();
}
}
// MSE generated by GBM with/without validation dataset should be same
@Test public void testModelScoreKeeperEqualityOnProstateBernoulli() {
  // Bernoulli GBM on prostate: the scoring history without a validation frame must
  // equal the history with a validation frame that clones the training data.
  final PrepData prostatePrep = new PrepData() {
    @Override int prep(Frame fr) {
      fr.remove("ID").remove();
      return fr.find("CAPSULE");
    }
  };
  final String dataset = "./smalldata/logreg/prostate.csv";
  ScoreKeeper[] trainOnly = basicGBM(dataset, prostatePrep, false, DistributionFamily.bernoulli)._scored_train;
  ScoreKeeper[] withValid = basicGBM(dataset, prostatePrep, true , DistributionFamily.bernoulli)._scored_valid;
  Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", trainOnly, withValid);
}
@Test public void testModelScoreKeeperEqualityOnProstateGaussian() {
  // Gaussian (regression) variant: ~idx marks CAPSULE as a regression response.
  final PrepData prostatePrep = new PrepData() {
    @Override int prep(Frame fr) {
      fr.remove("ID").remove();
      return ~fr.find("CAPSULE");
    }
  };
  final String dataset = "./smalldata/logreg/prostate.csv";
  ScoreKeeper[] trainOnly = basicGBM(dataset, prostatePrep, false, gaussian)._scored_train;
  ScoreKeeper[] withValid = basicGBM(dataset, prostatePrep, true , gaussian)._scored_valid;
  Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", trainOnly, withValid);
}
@Test public void testModelScoreKeeperEqualityOnProstateMultinomial() {
  // Multinomial variant: predict RACE on prostate; histories must match.
  final PrepData prostatePrep = new PrepData() {
    @Override int prep(Frame fr) {
      fr.remove("ID").remove();
      return fr.find("RACE");
    }
  };
  final String dataset = "./smalldata/logreg/prostate.csv";
  ScoreKeeper[] trainOnly = basicGBM(dataset, prostatePrep, false, DistributionFamily.multinomial)._scored_train;
  ScoreKeeper[] withValid = basicGBM(dataset, prostatePrep, true , DistributionFamily.multinomial)._scored_valid;
  Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", trainOnly, withValid);
}
@Test public void testModelScoreKeeperEqualityOnTitanicGaussian() {
  // Gaussian regression on titanic "age"; histories with/without validation must match.
  final PrepData titanicPrep = new PrepData() {
    @Override int prep(Frame fr) { return fr.find("age"); }
  };
  final String dataset = "./smalldata/junit/titanic_alt.csv";
  ScoreKeeper[] trainOnly = basicGBM(dataset, titanicPrep, false, gaussian)._scored_train;
  ScoreKeeper[] withValid = basicGBM(dataset, titanicPrep, true , gaussian)._scored_valid;
  Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", trainOnly, withValid);
}
@Test public void testModelScoreKeeperEqualityOnTitanicBernoulli() {
  // Bernoulli classification on titanic "survived"; histories must match.
  final PrepData titanicPrep = new PrepData() {
    @Override int prep(Frame fr) { return fr.find("survived"); }
  };
  final String dataset = "./smalldata/junit/titanic_alt.csv";
  ScoreKeeper[] trainOnly = basicGBM(dataset, titanicPrep, false, DistributionFamily.bernoulli)._scored_train;
  ScoreKeeper[] withValid = basicGBM(dataset, titanicPrep, true , DistributionFamily.bernoulli)._scored_valid;
  Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", trainOnly, withValid);
}
@Test public void testModelScoreKeeperEqualityOnTitanicMultinomial() {
  // Multinomial classification on titanic "survived"; histories must match.
  final PrepData titanicPrep = new PrepData() {
    @Override int prep(Frame fr) { return fr.find("survived"); }
  };
  final String dataset = "./smalldata/junit/titanic_alt.csv";
  ScoreKeeper[] trainOnly = basicGBM(dataset, titanicPrep, false, DistributionFamily.multinomial)._scored_train;
  ScoreKeeper[] withValid = basicGBM(dataset, titanicPrep, true , DistributionFamily.multinomial)._scored_valid;
  Assert.assertArrayEquals("GBM has to report same list of MSEs for run without/with validation dataset (which is equal to training data)", trainOnly, withValid);
}
@Test public void testBigCat() {
  // Bernoulli GBM over several wide-categorical datasets; response column is "y".
  final PrepData findY = new PrepData() {
    @Override int prep(Frame fr) { return fr.find("y"); }
  };
  for (String fname : new String[] {
      "./smalldata/gbm_test/50_cattest_test.csv",
      "./smalldata/gbm_test/50_cattest_train.csv",
      "./smalldata/gbm_test/swpreds_1000x3.csv" }) {
    basicGBM(fname, findY, false, DistributionFamily.bernoulli);
  }
}
// Test uses big data and is too slow for a pre-push
// Reproducibility check on KDD cup data: repeated builds with identical
// parameters must yield identical validation scoring histories.
@Test @Ignore public void testKDDTrees() {
  Frame tfr=null, vfr=null;
  String[] cols = new String[] {"DOB", "LASTGIFT", "TARGET_D"};
  try {
    // Load data, hack frames
    Frame inF1 = parse_test_file("bigdata/laptop/usecases/cup98LRN_z.csv");
    Frame inF2 = parse_test_file("bigdata/laptop/usecases/cup98VAL_z.csv");
    tfr = inF1.subframe(cols); // Just the columns to train on
    vfr = inF2.subframe(cols);
    inF1.remove(cols).remove(); // Toss all the rest away
    inF2.remove(cols).remove();
    tfr.replace(0, tfr.vec("DOB").toCategoricalVec()); // Convert 'DOB' to categorical
    vfr.replace(0, vfr.vec("DOB").toCategoricalVec());
    DKV.put(tfr);
    DKV.put(vfr);
    // Same parms for all
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = tfr._key;
    parms._valid = vfr._key;
    parms._response_column = "TARGET_D";
    parms._ntrees = 3;
    parms._distribution = gaussian;
    // Build a first model; all remaining models should be equal
    GBM job1 = new GBM(parms);
    GBMModel gbm1 = job1.trainModel().get();
    // Validation MSE should be equal
    ScoreKeeper[] firstScored = gbm1._output._scored_valid;
    // Build 10 more models, checking for equality
    for( int i=0; i<10; i++ ) {
      GBM job2 = new GBM(parms);
      GBMModel gbm2 = job2.trainModel().get();
      ScoreKeeper[] secondScored = gbm2._output._scored_valid;
      // Check that scoring histories from both models are equal.
      // NOTE: must use equals(), not ==; the two builds always produce distinct
      // ScoreKeeper objects, so a reference comparison would always report a
      // mismatch and dump the trees on every iteration.
      int j;
      for( j=0; j<firstScored.length; j++ )
        if( !firstScored[j].equals(secondScored[j]) )
          break; // Not Equals Enough
      // Report on unequal
      if( j < firstScored.length ) {
        System.out.println("=== =============== ===");
        System.out.println("=== ORIGINAL MODEL ===");
        for( int t=0; t<parms._ntrees; t++ )
          System.out.println(gbm1._output.toStringTree(t,0));
        System.out.println("=== DIFFERENT MODEL ===");
        for( int t=0; t<parms._ntrees; t++ )
          System.out.println(gbm2._output.toStringTree(t,0));
        System.out.println("=== =============== ===");
        Assert.assertArrayEquals("GBM should have the exact same MSEs for identical parameters", firstScored, secondScored);
      }
      gbm2.delete();
    }
    gbm1.delete();
  } finally {
    if (tfr != null) tfr.remove();
    if (vfr != null) vfr.remove();
  }
}
// Test uses big data and is too slow for a pre-push
// Test uses big data and is too slow for a pre-push.
// Multinomial GBM on MNIST; checks the squared error of predictions on the
// test set against a fixed reference value.
@Test @Ignore public void testMNIST() {
Frame tfr=null, vfr=null;
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("bigdata/laptop/mnist/train.csv.gz");
Scope.track(tfr.replace(784, tfr.vecs()[784].toCategoricalVec())); // Convert response 'C785' to categorical
DKV.put(tfr);
vfr = parse_test_file("bigdata/laptop/mnist/test.csv.gz");
Scope.track(vfr.replace(784, vfr.vecs()[784].toCategoricalVec())); // Convert response 'C785' to categorical
DKV.put(vfr);
// Same parms for all
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._valid = vfr._key;
parms._response_column = "C785";
parms._ntrees = 2;
parms._max_depth = 4;
parms._distribution = DistributionFamily.multinomial;
// Build a first model; all remaining models should be equal
GBMModel gbm = new GBM(parms).trainModel().get();
Frame pred = gbm.score(vfr);
double sq_err = new MathUtils.SquareError().doAll(vfr.lastVec(),pred.vecs()[0])._sum;
double mse = sq_err/pred.numRows();
assertEquals(3.0199, mse, 1e-15); //same results
gbm.delete();
} finally {
if (tfr != null) tfr.remove();
if (vfr != null) vfr.remove();
Scope.exit();
}
}
// HEXDEV-194: Check reproducibility for the same # of chunks (i.e., same # of nodes) and same parameters
// HEXDEV-194: Check reproducibility for the same # of chunks (i.e., same # of
// nodes) and same parameters: N gaussian builds on a 256-chunk rebalanced
// covtype subset must all report the same final training MSE.
@Test public void testReprodubility() {
Frame tfr=null;
final int N = 5;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
// rebalance to 256 chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
//    Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
//    DKV.put(tfr);
for (int i=0; i<N; ++i) {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "C55";
parms._nbins = 1000;
parms._ntrees = 5;
parms._max_depth = 8;
parms._learn_rate = 0.1f;
parms._min_rows = 10;
//      parms._distribution = Family.multinomial;
parms._distribution = gaussian;
// Build a first model; all remaining models should be equal
GBMModel gbm = new GBM(parms).trainModel().get();
assertEquals(gbm._output._ntrees, parms._ntrees);
mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
gbm.delete();
}
} finally{
if (tfr != null) tfr.remove();
}
Scope.exit();
for( double mse : mses )
System.out.println(mse);
// All runs must agree exactly with the first run's MSE.
for( double mse : mses )
assertEquals(mse, mses[0], 1e-15);
}
// PUBDEV-557: Test dependency on # nodes (for small number of bins, but fixed number of chunks)
// PUBDEV-557: Test dependency on # nodes (for small number of bins, but fixed
// number of chunks): every bernoulli build on the rebalanced airline data must
// reproduce a fixed reference MSE.
@Test public void testReprodubilityAirline() {
Frame tfr=null;
final int N = 5;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
// rebalance to fixed number of chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
//    Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
//    DKV.put(tfr);
// Drop leakage/post-departure columns before training on IsDepDelayed.
for (String s : new String[]{
"DepTime", "ArrTime", "ActualElapsedTime",
"AirTime", "ArrDelay", "DepDelay", "Cancelled",
"CancellationCode", "CarrierDelay", "WeatherDelay",
"NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
}) {
tfr.remove(s).remove();
}
DKV.put(tfr);
for (int i=0; i<N; ++i) {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "IsDepDelayed";
parms._nbins = 10;
parms._nbins_cats = 500;
parms._ntrees = 7;
parms._max_depth = 5;
parms._min_rows = 10;
parms._distribution = DistributionFamily.bernoulli;
parms._balance_classes = true;
parms._seed = 0;
// Build a first model; all remaining models should be equal
GBMModel gbm = new GBM(parms).trainModel().get();
assertEquals(gbm._output._ntrees, parms._ntrees);
mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
gbm.delete();
}
} finally {
if (tfr != null) tfr.remove();
}
Scope.exit();
System.out.println("MSEs start");
for(double d:mses)
System.out.println(d);
System.out.println("MSEs End");
System.out.flush();
for( double mse : mses )
assertEquals(0.21694215729861027, mse, 1e-8); //check for the same result on 1 node and 5 nodes (will only work with enough chunks)
}
// Single-node variant of testReprodubilityAirline: same data and parameters
// but with _build_tree_one_node = true; must reproduce the same reference MSE.
@Test public void testReprodubilityAirlineSingleNode() {
Frame tfr=null;
final int N = 10;
double[] mses = new double[N];
Scope.enter();
try {
// Load data, hack frames
tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
// rebalance to fixed number of chunks
Key dest = Key.make("df.rebalanced.hex");
RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, 256);
H2O.submitTask(rb);
rb.join();
tfr.delete();
tfr = DKV.get(dest).get();
//    Scope.track(tfr.replace(54, tfr.vecs()[54].toCategoricalVec())._key);
//    DKV.put(tfr);
// Drop leakage/post-departure columns before training on IsDepDelayed.
for (String s : new String[]{
"DepTime", "ArrTime", "ActualElapsedTime",
"AirTime", "ArrDelay", "DepDelay", "Cancelled",
"CancellationCode", "CarrierDelay", "WeatherDelay",
"NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
}) {
tfr.remove(s).remove();
}
DKV.put(tfr);
for (int i=0; i<N; ++i) {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "IsDepDelayed";
parms._nbins = 10;
parms._nbins_cats = 500;
parms._ntrees = 7;
parms._max_depth = 5;
parms._min_rows = 10;
parms._distribution = DistributionFamily.bernoulli;
parms._balance_classes = true;
parms._seed = 0;
parms._build_tree_one_node = true;
// Build a first model; all remaining models should be equal
GBMModel gbm = new GBM(parms).trainModel().get();
assertEquals(gbm._output._ntrees, parms._ntrees);
mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
gbm.delete();
}
} finally {
if (tfr != null) tfr.remove();
}
Scope.exit();
System.out.println("MSE");
for(double d:mses)
System.out.println(d);
for( double mse : mses )
assertEquals(0.21694215729861027, mse, 1e-8); //check for the same result on 1 nodes and 5 nodes (will only work with enough chunks)
}
// HEXDEV-223
// HEXDEV-223: a single depth-1 tree on a trivially separable categorical
// dataset must reach AUC == 1 and a fixed training MSE.
@Test public void testCategorical() {
Frame tfr=null;
final int N = 1;
double[] mses = new double[N];
Scope.enter();
try {
tfr = parse_test_file("smalldata/gbm_test/alphabet_cattest.csv");
Scope.track(tfr.replace(1, tfr.vecs()[1].toCategoricalVec()));
DKV.put(tfr);
for (int i=0; i<N; ++i) {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "y";
parms._ntrees = 1;
parms._max_depth = 1;
parms._learn_rate = 1;
parms._distribution = DistributionFamily.bernoulli;
// Build a first model; all remaining models should be equal
GBMModel gbm = new GBM(parms).trainModel().get();
assertEquals(gbm._output._ntrees, parms._ntrees);
hex.ModelMetricsBinomial mm = hex.ModelMetricsBinomial.getFromDKV(gbm,parms.train());
double auc = mm._auc._auc;
Assert.assertTrue(1 == auc); // perfect separation expected
mses[i] = gbm._output._scored_train[gbm._output._scored_train.length-1]._mse;
gbm.delete();
}
} finally{
if (tfr != null) tfr.remove();
}
Scope.exit();
for( double mse : mses ) assertEquals(0.0142093, mse, 1e-6);
}
// Test uses big data and is too slow for a pre-push
// Test uses big data and is too slow for a pre-push.
// Trains a multinomial GBM on covtype-like data and cross-checks three AUC
// sources: training-time metrics, metrics recomputed by score(), and the
// "perfect" AUC computed directly from the prediction column.
@Test @Ignore public void testCUST_A() {
  Frame tfr=null, vfr=null, t_pred=null, v_pred=null;
  GBMModel gbm=null;
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./bigdata/covktr.csv");
    vfr = parse_test_file("./bigdata/covkts.csv");
    int idx = tfr.find("V55");
    Scope.track(tfr.replace(idx, tfr.vecs()[idx].toCategoricalVec()));
    Scope.track(vfr.replace(idx, vfr.vecs()[idx].toCategoricalVec()));
    DKV.put(tfr);
    DKV.put(vfr);
    // Build model
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = tfr._key;
    parms._valid = vfr._key;
    parms._response_column = "V55";
    parms._ntrees = 10;
    parms._max_depth = 1;
    parms._nbins = 20;
    parms._min_rows = 10;
    parms._learn_rate = 0.01f;
    parms._distribution = DistributionFamily.multinomial;
    gbm = new GBM(parms).trainModel().get();
    // Report AUC from training
    hex.ModelMetricsBinomial tmm = hex.ModelMetricsBinomial.getFromDKV(gbm,tfr);
    hex.ModelMetricsBinomial vmm = hex.ModelMetricsBinomial.getFromDKV(gbm,vfr);
    double t_auc = tmm._auc._auc;
    double v_auc = vmm._auc._auc;
    System.out.println("train_AUC= "+t_auc+" , validation_AUC= "+v_auc);
    // Report AUC from scoring
    t_pred = gbm.score(tfr);
    v_pred = gbm.score(vfr);
    hex.ModelMetricsBinomial tmm2 = hex.ModelMetricsBinomial.getFromDKV(gbm,tfr);
    hex.ModelMetricsBinomial vmm2 = hex.ModelMetricsBinomial.getFromDKV(gbm,vfr);
    assert tmm != tmm2;
    assert vmm != vmm2;
    // FIX: read the AUC from the freshly computed metrics (tmm2/vmm2); the
    // original read tmm/vmm again, leaving tmm2/vmm2 unused.
    double t_auc2 = tmm2._auc._auc;
    double v_auc2 = vmm2._auc._auc;
    System.out.println("train_AUC2= "+t_auc2+" , validation_AUC2= "+v_auc2);
    // Compute the perfect AUC.
    // FIX: the prediction frames must NOT be removed before this point; the
    // original called t_pred.remove()/v_pred.remove() here and then read
    // t_pred.vecs()[2] from the removed frames. Cleanup happens in finally.
    double t_auc3 = AUC2.perfectAUC(t_pred.vecs()[2], tfr.vec("V55"));
    double v_auc3 = AUC2.perfectAUC(v_pred.vecs()[2], vfr.vec("V55"));
    System.out.println("train_AUC3= "+t_auc3+" , validation_AUC3= "+v_auc3);
    Assert.assertEquals(t_auc3, t_auc , 1e-6);
    Assert.assertEquals(t_auc3, t_auc2, 1e-6);
    Assert.assertEquals(v_auc3, v_auc , 1e-6);
    Assert.assertEquals(v_auc3, v_auc2, 1e-6);
  } finally {
    if (tfr != null) tfr.remove();
    if (vfr != null) vfr.remove();
    if( t_pred != null ) t_pred.remove();
    if( v_pred != null ) v_pred.remove();
    if (gbm != null) gbm.delete();
    Scope.exit();
  }
}
// Expected training metrics shared by the row-weights tests below: the
// no-weights, all-ones-weights and all-twos-weights runs must all hit these.
static double _AUC = 1;
static double _MSE = 0.24850374695598948;
static double _LogLoss = 0.690155;
// Baseline for the row-weights tests: train without a weights column and check
// AUC/MSE/logloss from both training metrics and re-scored metrics.
@Test
public void testNoRowWeights() {
Frame tfr = null, vfr = null;
GBMModel gbm = null;
Scope.enter();
try {
tfr = parse_test_file("smalldata/junit/no_weights.csv");
DKV.put(tfr);
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "response";
parms._seed = 0xdecaf;
parms._min_rows = 1;
parms._ntrees = 3;
parms._learn_rate = 1e-3f;
// Build a first model; all remaining models should be equal
gbm = new GBM(parms).trainModel().get();
ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics;
assertEquals(_AUC, mm.auc_obj()._auc, 1e-8);
assertEquals(_MSE, mm.mse(), 1e-8);
assertEquals(_LogLoss, mm.logloss(), 1e-6);
// Re-scoring the training frame must reproduce the same metrics.
Frame pred = gbm.score(parms.train());
hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train());
assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8);
assertEquals(_MSE, mm2.mse(), 1e-8);
assertEquals(_LogLoss, mm2.logloss(), 1e-6);
pred.remove();
} finally {
if (tfr != null) tfr.remove();
if (vfr != null) vfr.remove();
if (gbm != null) gbm.delete();
Scope.exit();
}
}
// All-ones weights column must produce the same metrics as no weights at all
// (same _AUC/_MSE/_LogLoss reference values as testNoRowWeights).
@Test
public void testRowWeightsOne() {
Frame tfr = null, vfr = null;
Scope.enter();
GBMModel gbm = null;
try {
tfr = parse_test_file("smalldata/junit/weights_all_ones.csv");
DKV.put(tfr);
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "response";
parms._weights_column = "weight";
parms._seed = 0xdecaf;
parms._min_rows = 1;
parms._max_depth = 2;
parms._ntrees = 3;
parms._learn_rate = 1e-3f;
// Build a first model; all remaining models should be equal
gbm = new GBM(parms).trainModel().get();
ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics;
assertEquals(_AUC, mm.auc_obj()._auc, 1e-8);
assertEquals(_MSE, mm.mse(), 1e-8);
assertEquals(_LogLoss, mm.logloss(), 1e-6);
// Re-scoring the training frame must reproduce the same metrics.
Frame pred = gbm.score(parms.train());
hex.ModelMetricsBinomial mm2 = hex.ModelMetricsBinomial.getFromDKV(gbm, parms.train());
assertEquals(_AUC, mm2.auc_obj()._auc, 1e-8);
assertEquals(_MSE, mm2.mse(), 1e-8);
assertEquals(_LogLoss, mm2.logloss(), 1e-6);
pred.remove();
} finally {
if (tfr != null) tfr.remove();
if (vfr != null) vfr.remove();
if (gbm != null) gbm.delete();
Scope.exit();
}
}
@Test
public void testRowWeightsTwo() {
  // All weights equal to two: with min_rows scaled accordingly, the model must
  // reproduce the same reference metrics (_AUC/_MSE/_LogLoss).
  Frame train = null;
  GBMModel model = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/weights_all_twos.csv");
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "response";
    params._weights_column = "weight";
    params._seed = 0xdecaf;
    params._min_rows = 2; // must be adapted to the weights (all rows weigh 2)
    params._max_depth = 2;
    params._ntrees = 3;
    params._learn_rate = 1e-3f;
    model = new GBM(params).trainModel().get();

    ModelMetricsBinomial trained = (ModelMetricsBinomial) model._output._training_metrics;
    assertEquals(_AUC, trained.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, trained.mse(), 1e-8);
    assertEquals(_LogLoss, trained.logloss(), 1e-6);

    // Re-score the training frame; the stored metrics must agree.
    Frame scored = model.score(params.train());
    ModelMetricsBinomial rescored = hex.ModelMetricsBinomial.getFromDKV(model, params.train());
    assertEquals(_AUC, rescored.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, rescored.mse(), 1e-8);
    assertEquals(_LogLoss, rescored.logloss(), 1e-6);
    scored.remove();
  } finally {
    if (train != null) train.remove();
    if (model != null) model.delete();
    Scope.exit();
  }
}
@Test
public void testRowWeightsTiny() {
  // Tiny uniform weights: with min_rows scaled down to match, the model must
  // still reproduce the reference metrics (_AUC/_MSE/_LogLoss).
  Frame train = null;
  GBMModel model = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/weights_all_tiny.csv");
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "response";
    params._weights_column = "weight";
    params._seed = 0xdecaf;
    params._min_rows = 0.01242; // must be adapted to the (tiny) weights
    params._max_depth = 2;
    params._ntrees = 3;
    params._learn_rate = 1e-3f;
    model = new GBM(params).trainModel().get();

    ModelMetricsBinomial trained = (ModelMetricsBinomial) model._output._training_metrics;
    assertEquals(_AUC, trained.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, trained.mse(), 1e-8);
    assertEquals(_LogLoss, trained.logloss(), 1e-6);

    // Re-score the training frame; the stored metrics must agree.
    Frame scored = model.score(params.train());
    ModelMetricsBinomial rescored = hex.ModelMetricsBinomial.getFromDKV(model, params.train());
    assertEquals(_AUC, rescored.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, rescored.mse(), 1e-8);
    assertEquals(_LogLoss, rescored.logloss(), 1e-6);
    scored.remove();
  } finally {
    if (train != null) train.remove();
    if (model != null) model.delete();
    Scope.exit();
  }
}
@Test
public void testNoRowWeightsShuffled() {
  // Row order must not matter: the shuffled no-weights data must reproduce the
  // same reference metrics (_AUC/_MSE/_LogLoss).
  Frame train = null;
  GBMModel model = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/no_weights_shuffled.csv");
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "response";
    params._seed = 0xdecaf;
    params._min_rows = 1;
    params._max_depth = 2;
    params._ntrees = 3;
    params._learn_rate = 1e-3f;
    model = new GBM(params).trainModel().get();

    ModelMetricsBinomial trained = (ModelMetricsBinomial) model._output._training_metrics;
    assertEquals(_AUC, trained.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, trained.mse(), 1e-8);
    assertEquals(_LogLoss, trained.logloss(), 1e-6);

    // Re-score the training frame; the stored metrics must agree.
    Frame scored = model.score(params.train());
    ModelMetricsBinomial rescored = hex.ModelMetricsBinomial.getFromDKV(model, params.train());
    assertEquals(_AUC, rescored.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, rescored.mse(), 1e-8);
    assertEquals(_LogLoss, rescored.logloss(), 1e-6);
    scored.remove();
  } finally {
    if (train != null) train.remove();
    if (model != null) model.delete();
    Scope.exit();
  }
}
@Test
public void testRowWeights() {
  // Non-trivial weight column on the weights.csv data: must reproduce the
  // shared reference metrics (_AUC/_MSE/_LogLoss).
  Frame train = null;
  GBMModel model = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/weights.csv");
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "response";
    params._weights_column = "weight";
    params._seed = 0xdecaf;
    params._min_rows = 1;
    params._max_depth = 2;
    params._ntrees = 3;
    params._learn_rate = 1e-3f;
    model = new GBM(params).trainModel().get();

    ModelMetricsBinomial trained = (ModelMetricsBinomial) model._output._training_metrics;
    assertEquals(_AUC, trained.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, trained.mse(), 1e-8);
    assertEquals(_LogLoss, trained.logloss(), 1e-6);

    // Re-score the training frame; the stored metrics must agree.
    Frame scored = model.score(params.train());
    ModelMetricsBinomial rescored = hex.ModelMetricsBinomial.getFromDKV(model, params.train());
    assertEquals(_AUC, rescored.auc_obj()._auc, 1e-8);
    assertEquals(_MSE, rescored.mse(), 1e-8);
    assertEquals(_LogLoss, rescored.logloss(), 1e-6);
    scored.remove();
  } finally {
    if (train != null) train.remove();
    if (model != null) model.delete();
    Scope.exit();
  }
}
@Test
public void testNFold() {
  // 2-fold cross-validation on the weighted data must reproduce fixed
  // reference CV metrics.
  //
  // Fix: the original finally block deleted the model BEFORE removing its CV
  // prediction keys, and dereferenced _cross_validation_predictions and the
  // holdout frame id without null guards — a failed training run would NPE in
  // cleanup and mask the real error.
  Frame tfr = null;
  GBMModel gbm = null;
  Scope.enter();
  try {
    tfr = parse_test_file("smalldata/junit/weights.csv");
    DKV.put(tfr);
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = tfr._key;
    parms._response_column = "response";
    parms._weights_column = "weight";
    parms._seed = 123;
    parms._min_rows = 1;
    parms._max_depth = 2;
    parms._nfolds = 2;
    parms._ntrees = 3;
    parms._learn_rate = 1e-3f;
    parms._keep_cross_validation_predictions = true;
    gbm = new GBM(parms).trainModel().get();
    ModelMetricsBinomial mm = (ModelMetricsBinomial) gbm._output._cross_validation_metrics;
    assertEquals(0.6296296296296297, mm.auc_obj()._auc, 1e-8);
    assertEquals(0.28640022521234304, mm.mse(), 1e-8);
    assertEquals(0.7674117059335286, mm.logloss(), 1e-6);
  } finally {
    if (tfr != null) tfr.remove();
    if (gbm != null) {
      gbm.deleteCrossValidationModels();
      // Remove dependent CV artifacts before deleting the model itself;
      // guard against nulls in case training failed before producing them.
      if (gbm._output._cross_validation_predictions != null)
        for (Key k : gbm._output._cross_validation_predictions) k.remove();
      if (gbm._output._cross_validation_holdout_predictions_frame_id != null)
        gbm._output._cross_validation_holdout_predictions_frame_id.remove();
      gbm.delete();
    }
    Scope.exit();
  }
}
@Test
public void testNfoldsOneVsRest() {
  // Leave-one-out CV (nfolds == nrows, modulo fold assignment) must be
  // reproducible: two identical runs yield identical CV metrics.
  Frame train = null;
  GBMModel first = null;
  GBMModel second = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/weights.csv");
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "response";
    params._min_rows = 1;
    params._max_depth = 2;
    params._nfolds = (int) train.numRows();
    params._fold_assignment = Model.Parameters.FoldAssignmentScheme.Modulo;
    params._ntrees = 3;
    params._seed = 12345;
    params._learn_rate = 1e-3f;
    first = new GBM(params).trainModel().get();
    // (nfolds > nrows is rejected by the builder, so only equal runs are compared)
    second = new GBM(params).trainModel().get();
    ModelMetricsBinomial m1 = (ModelMetricsBinomial) first._output._cross_validation_metrics;
    ModelMetricsBinomial m2 = (ModelMetricsBinomial) second._output._cross_validation_metrics;
    assertEquals(m1.auc_obj()._auc, m2.auc_obj()._auc, 1e-12);
    assertEquals(m1.mse(), m2.mse(), 1e-12);
    assertEquals(m1.logloss(), m2.logloss(), 1e-12);
    //TODO: add check: the correct number of individual models were built. PUBDEV-1690
  } finally {
    if (train != null) train.remove();
    if (first != null) {
      first.deleteCrossValidationModels();
      first.delete();
    }
    if (second != null) {
      second.deleteCrossValidationModels();
      second.delete();
    }
    Scope.exit();
  }
}
@Test
public void testNfoldsInvalidValues() {
  // nfolds==0 disables CV and trains normally; nfolds==1 and negative values
  // must be rejected with H2OModelBuilderIllegalArgumentException.
  Frame train = null;
  GBMModel noCv = null;
  GBMModel oneFold = null;
  GBMModel negFolds = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/weights.csv");
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "response";
    params._min_rows = 1;
    params._seed = 12345;
    params._max_depth = 2;
    params._ntrees = 3;
    params._learn_rate = 1e-3f;

    params._nfolds = 0; // no cross-validation: plain training must succeed
    noCv = new GBM(params).trainModel().get();

    params._nfolds = 1;
    try {
      Log.info("Trying nfolds==1.");
      oneFold = new GBM(params).trainModel().get();
      Assert.fail("Should toss H2OModelBuilderIllegalArgumentException instead of reaching here");
    } catch (H2OModelBuilderIllegalArgumentException expected) {
      // expected: a single fold is not a valid cross-validation setup
    }

    params._nfolds = -99;
    try {
      Log.info("Trying nfolds==-99.");
      negFolds = new GBM(params).trainModel().get();
      Assert.fail("Should toss H2OModelBuilderIllegalArgumentException instead of reaching here");
    } catch (H2OModelBuilderIllegalArgumentException expected) {
      // expected: negative fold counts are invalid
    }
  } finally {
    if (train != null) train.remove();
    if (noCv != null) noCv.delete();
    if (oneFold != null) oneFold.delete();
    if (negFolds != null) negFolds.delete();
    Scope.exit();
  }
}
@Test
public void testNfoldsCVAndValidation() {
  // Supplying both a validation frame and nfolds>1 is legal and must not throw.
  Frame train = null;
  Frame valid = null;
  GBMModel model = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/weights.csv");
    valid = parse_test_file("smalldata/junit/weights.csv");
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._valid = valid._key;
    params._response_column = "response";
    params._seed = 12345;
    params._min_rows = 1;
    params._max_depth = 2;
    params._nfolds = 3;
    params._ntrees = 3;
    params._learn_rate = 1e-3f;
    try {
      Log.info("Trying N-fold cross-validation AND Validation dataset provided.");
      model = new GBM(params).trainModel().get();
    } catch (H2OModelBuilderIllegalArgumentException e) {
      Assert.fail("Should not toss H2OModelBuilderIllegalArgumentException.");
    }
  } finally {
    if (train != null) train.remove();
    if (valid != null) valid.remove();
    if (model != null) {
      model.deleteCrossValidationModels();
      model.delete();
    }
    Scope.exit();
  }
}
@Test
public void testNfoldsConsecutiveModelsSame() {
  // Two consecutive CV runs with identical parameters and seed must yield
  // identical cross-validation metrics.
  Frame train = null;
  Vec response = null;
  GBMModel first = null;
  GBMModel second = null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/junit/cars_20mpg.csv");
    train.remove("name").remove(); // drop unique id column
    train.remove("economy").remove();
    response = train.remove("economy_20mpg");
    train.add("economy_20mpg", response.toCategoricalVec()); // response moved to last column
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "economy_20mpg";
    params._min_rows = 1;
    params._seed = 12345;
    params._max_depth = 2;
    params._nfolds = 3;
    params._ntrees = 3;
    params._learn_rate = 1e-3f;
    first = new GBM(params).trainModel().get();
    second = new GBM(params).trainModel().get();
    ModelMetricsBinomial m1 = (ModelMetricsBinomial) first._output._cross_validation_metrics;
    ModelMetricsBinomial m2 = (ModelMetricsBinomial) second._output._cross_validation_metrics;
    assertEquals(m1.auc_obj()._auc, m2.auc_obj()._auc, 1e-12);
    assertEquals(m1.mse(), m2.mse(), 1e-12);
    assertEquals(m1.logloss(), m2.logloss(), 1e-12);
  } finally {
    if (train != null) train.remove();
    if (response != null) response.remove();
    if (first != null) {
      first.deleteCrossValidationModels();
      first.delete();
    }
    if (second != null) {
      second.deleteCrossValidationModels();
      second.delete();
    }
    Scope.exit();
  }
}
@Test
public void testNfoldsColumn() {
  // Using a 5-level fold column ("cylinders") must produce 5 CV models.
  //
  // Fixes: the temporary Vec `old` was removed only on the success path inside
  // try (it leaked if training threw), and the fold-assignment frame was
  // removed AFTER gbm1.delete() with no null guard.
  Frame tfr = null;
  Vec old = null;
  GBMModel gbm1 = null;
  try {
    tfr = parse_test_file("smalldata/junit/cars_20mpg.csv");
    tfr.remove("name").remove(); // Remove unique id
    DKV.put(tfr);
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = tfr._key;
    parms._response_column = "economy_20mpg";
    parms._fold_column = "cylinders";
    old = tfr.remove("cylinders");
    tfr.add("cylinders", old.toCategoricalVec());
    DKV.put(tfr);
    parms._ntrees = 10;
    parms._keep_cross_validation_fold_assignment = true;
    gbm1 = new GBM(parms).trainModel().get();
    Assert.assertTrue(gbm1._output._cross_validation_models.length == 5);
  } finally {
    if (old != null) old.remove(); // cleaned in finally so it is removed on failure too
    if (tfr != null) tfr.remove();
    if (gbm1 != null) {
      gbm1.deleteCrossValidationModels();
      // remove the dependent fold-assignment frame before deleting the model
      if (gbm1._output._cross_validation_fold_assignment_frame_id != null)
        gbm1._output._cross_validation_fold_assignment_frame_id.remove();
      gbm1.delete();
    }
  }
}
@Test
public void testNfoldsColumnNumbersFrom0() {
  // A numeric fold column whose values are the contiguous ids 0..4 must
  // produce 5 CV models.
  //
  // Fix: removed the dead local `old` — it was declared, never assigned, and
  // null-checked in finally for no effect.
  Frame tfr = null;
  GBMModel gbm1 = null;
  try {
    tfr = parse_test_file("smalldata/junit/cars_20mpg.csv");
    tfr.remove("name").remove(); // Remove unique id
    // Remap cylinder counts {3,4,5,6,8} onto contiguous fold ids {0..4}.
    new MRTask() {
      @Override
      public void map(Chunk c) {
        for (int i = 0; i < c.len(); ++i) {
          if (c.at8(i) == 3) c.set(i, 0);
          if (c.at8(i) == 4) c.set(i, 1);
          if (c.at8(i) == 5) c.set(i, 2);
          if (c.at8(i) == 6) c.set(i, 3);
          if (c.at8(i) == 8) c.set(i, 4);
        }
      }
    }.doAll(tfr.vec("cylinders"));
    DKV.put(tfr);
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = tfr._key;
    parms._response_column = "economy_20mpg";
    parms._fold_column = "cylinders";
    parms._ntrees = 10;
    gbm1 = new GBM(parms).trainModel().get();
    Assert.assertTrue(gbm1._output._cross_validation_models.length == 5);
  } finally {
    if (tfr != null) tfr.remove();
    if (gbm1 != null) {
      gbm1.deleteCrossValidationModels();
      gbm1.delete();
    }
  }
}
@Test
public void testNfoldsColumnCategorical() {
  // A categorical fold column with 5 levels must produce 5 CV models.
  Frame train = null;
  Vec cylinders = null;
  GBMModel model = null;
  try {
    train = parse_test_file("smalldata/junit/cars_20mpg.csv");
    train.remove("name").remove(); // drop unique id column
    cylinders = train.remove("cylinders");
    train.add("folds", cylinders.toCategoricalVec());
    cylinders.remove();
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "economy_20mpg";
    params._fold_column = "folds";
    params._ntrees = 10;
    model = new GBM(params).trainModel().get();
    Assert.assertTrue(model._output._cross_validation_models.length == 5);
  } finally {
    if (train != null) train.remove();
    if (cylinders != null) cylinders.remove();
    if (model != null) {
      model.deleteCrossValidationModels();
      model.delete();
    }
  }
}
@Test
public void testNFoldAirline() {
  // 3-fold CV on the airlines data must reproduce fixed reference metrics.
  Frame train = null;
  GBMModel model = null;
  Scope.enter();
  try {
    train = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // Drop columns that are unusable for this target.
    String[] dropped = {
        "DepTime", "ArrTime", "ActualElapsedTime",
        "AirTime", "ArrDelay", "DepDelay", "Cancelled",
        "CancellationCode", "CarrierDelay", "WeatherDelay",
        "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"};
    for (String col : dropped)
      train.remove(col).remove();
    DKV.put(train);
    GBMModel.GBMParameters params = new GBMModel.GBMParameters();
    params._train = train._key;
    params._response_column = "IsDepDelayed";
    params._seed = 234;
    params._min_rows = 2;
    params._nfolds = 3;
    params._max_depth = 5;
    params._ntrees = 5;
    model = new GBM(params).trainModel().get();
    ModelMetricsBinomial metrics = (ModelMetricsBinomial) model._output._cross_validation_metrics;
    assertEquals(0.7309795467719639, metrics.auc_obj()._auc, 1e-4); // 1 node
    assertEquals(0.22511756378273942, metrics.mse(), 1e-4);
    assertEquals(0.6425515048581261, metrics.logloss(), 1e-4);
  } finally {
    if (train != null) train.remove();
    if (model != null) {
      model.deleteCrossValidationModels();
      model.delete();
    }
    Scope.exit();
  }
}
// just a simple sanity check - not a golden test
@Test
public void testDistributions() {
  // For each supported regression distribution, train a small GBM and verify
  // that POJO scoring agrees with in-H2O scoring to 1e-15.
  //
  // Fix: removed the unused ModelMetricsRegression local that was assigned
  // after scoring but never read.
  Frame tfr = null, vfr = null, res = null;
  GBMModel gbm = null;
  for (DistributionFamily dist : new DistributionFamily[]{
      DistributionFamily.AUTO,
      gaussian,
      DistributionFamily.poisson,
      DistributionFamily.gamma,
      DistributionFamily.tweedie
  }) {
    Scope.enter();
    try {
      tfr = parse_test_file("smalldata/glm_test/cancar_logIn.csv");
      vfr = parse_test_file("smalldata/glm_test/cancar_logIn.csv");
      // Convert the factor-like numeric predictors to categoricals.
      for (String s : new String[]{"Merit", "Class"}) {
        Scope.track(tfr.replace(tfr.find(s), tfr.vec(s).toCategoricalVec()));
        Scope.track(vfr.replace(vfr.find(s), vfr.vec(s).toCategoricalVec()));
      }
      DKV.put(tfr);
      DKV.put(vfr);
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "Cost";
      parms._seed = 0xdecaf;
      parms._distribution = dist;
      parms._min_rows = 1;
      parms._ntrees = 30;
      // parms._offset_column = "logInsured"; //POJO scoring not supported for offsets
      parms._learn_rate = 1e-3f;
      gbm = new GBM(parms).trainModel().get();
      res = gbm.score(vfr);
      Assert.assertTrue(gbm.testJavaScoring(vfr, res, 1e-15));
      res.remove();
    } finally {
      if (tfr != null) tfr.remove();
      if (vfr != null) vfr.remove();
      if (res != null) res.remove();
      if (gbm != null) gbm.delete();
      Scope.exit();
    }
  }
}
@Test
public void testStochasticGBM() {
  // Sweeps row/column sample rates and checks that any sub-1.0 sampling makes
  // the TRAINING MSE worse than the unsampled run, with the worst training MSE
  // coming from the least-sampled configuration.
  Frame tfr = null, vfr = null;
  GBMModel gbm = null;
  float[] sample_rates = new float[]{0.2f, 0.4f, 0.6f, 0.8f, 1.0f};
  float[] col_sample_rates = new float[]{0.2f, 0.4f, 0.6f, 0.8f, 1.0f};
  // training-MSE -> (row sample rate, col sample rate); TreeMap keeps entries sorted by MSE
  Map<Double, Pair<Float,Float>> hm = new TreeMap<>();
  for (float sample_rate : sample_rates) {
    for (float col_sample_rate : col_sample_rates) {
      Scope.enter();
      try {
        tfr = parse_test_file("./smalldata/gbm_test/ecology_model.csv");
        DKV.put(tfr);
        GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
        parms._train = tfr._key;
        parms._response_column = "Angaus"; //regression
        parms._seed = 123;
        parms._min_rows = 2;
        parms._max_depth = 10;
        parms._ntrees = 2;
        parms._col_sample_rate = col_sample_rate;
        parms._sample_rate = sample_rate;
        // Build a first model; all remaining models should be equal
        gbm = new GBM(parms).trainModel().get();
        ModelMetricsRegression mm = (ModelMetricsRegression)gbm._output._training_metrics;
        hm.put(mm.mse(), new Pair<>(sample_rate, col_sample_rate));
      } finally {
        if (tfr != null) tfr.remove();
        if (vfr != null) vfr.remove();
        if (gbm != null) gbm.delete();
        Scope.exit();
      }
    }
  }
  // First (smallest) MSE — presumably from the fully sampled 1.0/1.0 run; TODO confirm
  double fullDataMSE = hm.entrySet().iterator().next().getKey();
  Iterator<Map.Entry<Double, Pair<Float, Float>>> it;
  int i=0;
  Pair<Float, Float> last = null;
  // iterator over results (min to max MSE) - best to worst
  for (it=hm.entrySet().iterator(); it.hasNext(); ++i) {
    Map.Entry<Double, Pair<Float,Float>> n = it.next();
    if (i>0) Assert.assertTrue(n.getKey() > fullDataMSE); //any sampling should make training set MSE worse
    Log.info( "MSE: " + n.getKey() + ", "
        + ", row sample: " + ((Pair)n.getValue())._1()
        + ", col sample: " + ((Pair)n.getValue())._2());
    last=n.getValue();
  }
  // worst training MSE should belong to the LEAST sampled case (rates 0.2/0.2 at index 0)
  Assert.assertTrue(last._1()==sample_rates[0]);
  Assert.assertTrue(last._2()==col_sample_rates[0]);
}
@Test
public void testStochasticGBMHoldout() {
  // Sweeps row/column/per-tree sample rates on a 50/50 train/test split and
  // logs validation MSE for each combination, sorted best to worst.
  Frame tfr = null;
  // empty (not null) so the cleanup loop in finally is always safe to run
  Key[] ksplits = new Key[0];
  try{
    tfr=parse_test_file("./smalldata/gbm_test/ecology_model.csv");
    SplitFrame sf = new SplitFrame(tfr,new double[] { 0.5, 0.5 },new Key[] { Key.make("train.hex"), Key.make("test.hex")});
    // Invoke the job
    sf.exec().get();
    ksplits = sf._destination_frames;
    GBMModel gbm = null;
    float[] sample_rates = new float[]{0.2f, 0.4f, 0.8f, 1.0f};
    float[] col_sample_rates = new float[]{0.4f, 0.8f, 1.0f};
    float[] col_sample_rates_per_tree = new float[]{0.4f, 0.6f, 1.0f};
    // validation-MSE -> (row rate, col rate, col rate per tree); TreeMap sorts by MSE
    Map<Double, Triple<Float>> hm = new TreeMap<>();
    for (float sample_rate : sample_rates) {
      for (float col_sample_rate : col_sample_rates) {
        for (float col_sample_rate_per_tree : col_sample_rates_per_tree) {
          Scope.enter();
          try {
            GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
            parms._train = ksplits[0];
            parms._valid = ksplits[1];
            parms._response_column = "Angaus"; //regression
            parms._seed = 42;
            parms._min_rows = 2;
            parms._max_depth = 12;
            parms._ntrees = 6;
            parms._col_sample_rate = col_sample_rate;
            parms._col_sample_rate_per_tree = col_sample_rate_per_tree;
            parms._sample_rate = sample_rate;
            // Build a first model; all remaining models should be equal
            gbm = new GBM(parms).trainModel().get();
            // too slow, but passes (now)
            // // Build a POJO, validate same results
            // Frame pred = gbm.score(tfr);
            // Assert.assertTrue(gbm.testJavaScoring(tfr,pred,1e-15));
            // pred.remove();
            ModelMetricsRegression mm = (ModelMetricsRegression)gbm._output._validation_metrics;
            hm.put(mm.mse(), new Triple<>(sample_rate, col_sample_rate, col_sample_rate_per_tree));
          } finally {
            if (gbm != null) gbm.delete();
            Scope.exit();
          }
        }
      }
    }
    Iterator<Map.Entry<Double, Triple<Float>>> it;
    Triple<Float> last = null;
    // iterator over results (min to max MSE) - best to worst
    for (it=hm.entrySet().iterator(); it.hasNext();) {
      Map.Entry<Double, Triple<Float>> n = it.next();
      Log.info( "MSE: " + n.getKey()
          + ", row sample: " + n.getValue().v1
          + ", col sample: " + n.getValue().v2
          + ", col sample per tree: " + n.getValue().v3);
      last=n.getValue();
    }
    // worst validation MSE should belong to the most overfit case (1.0, 1.0, 1.0)
    // NOTE(review): the assertions below are disabled — presumably flaky; confirm before re-enabling
    // Assert.assertTrue(last.v1==sample_rates[sample_rates.length-1]);
    // Assert.assertTrue(last.v2==col_sample_rates[col_sample_rates.length-1]);
    // Assert.assertTrue(last.v3==col_sample_rates_per_tree[col_sample_rates_per_tree.length-1]);
  } finally {
    if (tfr != null) tfr.remove();
    for (Key k : ksplits)
      if (k!=null) k.remove();
  }
}
// PUBDEV-2476 Check reproducibility for the same # of chunks (i.e., same # of nodes) and same parameters
@Test public void testChunks() {
  // Trains the same model on the same data rebalanced to various chunk counts;
  // the final training MSE must be identical regardless of the data layout.
  //
  // Fix: JUnit's assertEquals takes (expected, actual[, delta]) — the original
  // passed the arguments reversed in all three assertions, producing
  // misleading failure messages.
  Frame tfr;
  int[] chunks = new int[]{1, 2, 2, 39, 39, 500};
  final int N = chunks.length;
  double[] mses = new double[N];
  for (int i = 0; i < N; ++i) {
    Scope.enter();
    // Load data, then rebalance to the requested number of chunks.
    tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
    Key dest = Key.make("df.rebalanced.hex");
    RebalanceDataSet rb = new RebalanceDataSet(tfr, dest, chunks[i]);
    H2O.submitTask(rb);
    rb.join();
    tfr.delete();
    tfr = DKV.get(dest).get();
    assertEquals(chunks[i], tfr.vec(0).nChunks());
    DKV.put(tfr);
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = tfr._key;
    parms._response_column = "C55";
    parms._seed = 1234;
    parms._auto_rebalance = false; // keep the hand-crafted chunking
    parms._col_sample_rate_per_tree = 0.5f;
    parms._col_sample_rate = 0.3f;
    parms._ntrees = 5;
    parms._max_depth = 5;
    GBM job = new GBM(parms);
    GBMModel drf = job.trainModel().get();
    assertEquals(parms._ntrees, drf._output._ntrees);
    mses[i] = drf._output._scored_train[drf._output._scored_train.length - 1]._mse;
    drf.delete();
    if (tfr != null) tfr.remove();
    Scope.exit();
  }
  for (int i = 0; i < mses.length; ++i) {
    Log.info("trial: " + i + " -> MSE: " + mses[i]);
  }
  for (double mse : mses)
    assertEquals(mses[0], mse, 1e-10); // every layout must match the first run
}
@Test public void testLaplace2() {
  // Laplace-distribution GBM on the ecology data: POJO scoring must agree with
  // in-H2O scoring, and the mean residual deviance must match the reference.
  GBMModel model = null;
  GBMModel.GBMParameters params = new GBMModel.GBMParameters();
  Frame evalFrame = null;
  Frame scored = null;
  Scope.enter();
  try {
    Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv");
    train.remove("Site").remove();   // drop unique ID column
    train.remove("Method").remove(); // drop categorical column
    DKV.put(train);                  // re-publish the frame after hacking it
    params._train = train._key;
    params._response_column = "DSDist"; // regression target
    params._distribution = laplace;
    params._sample_rate = 0.6f;
    params._col_sample_rate = 0.8f;
    params._col_sample_rate_per_tree = 0.8f;
    params._seed = 1234;
    model = new GBM(params).trainModel().get();
    evalFrame = parse_test_file("smalldata/gbm_test/ecology_eval.csv");
    scored = model.score(evalFrame);
    // Build a POJO and validate it produces the same predictions.
    Assert.assertTrue(model.testJavaScoring(evalFrame, scored, 1e-15));
    double deviance = ((ModelMetricsRegression) model._output._training_metrics)._mean_residual_deviance;
    Assert.assertTrue(Math.abs(deviance - 23.05805) < 1e-4);
  } finally {
    params._train.remove();
    if (model != null) model.delete();
    if (evalFrame != null) evalFrame.remove();
    if (scored != null) scored.remove();
    Scope.exit();
  }
}
@Test public void testQuantileRegression() {
  // Quantile regression (alpha=0.4) on the ecology data: POJO scoring must
  // agree, and the mean residual deviance must match the reference value.
  GBMModel model = null;
  GBMModel.GBMParameters params = new GBMModel.GBMParameters();
  Frame evalFrame = null;
  Frame scored = null;
  Scope.enter();
  try {
    Frame train = parse_test_file("smalldata/gbm_test/ecology_model.csv");
    train.remove("Site").remove();   // drop unique ID column
    train.remove("Method").remove(); // drop categorical column
    DKV.put(train);                  // re-publish the frame after hacking it
    params._train = train._key;
    params._response_column = "DSDist"; // regression target
    params._distribution = DistributionFamily.quantile;
    params._quantile_alpha = 0.4;
    params._sample_rate = 0.6f;
    params._col_sample_rate = 0.8f;
    params._col_sample_rate_per_tree = 0.8f;
    params._seed = 1234;
    model = new GBM(params).trainModel().get();
    evalFrame = parse_test_file("smalldata/gbm_test/ecology_eval.csv");
    scored = model.score(evalFrame);
    // Build a POJO and validate it produces the same predictions.
    Assert.assertTrue(model.testJavaScoring(evalFrame, scored, 1e-15));
    double deviance = ((ModelMetricsRegression) model._output._training_metrics)._mean_residual_deviance;
    Assert.assertTrue(Math.abs(deviance - 10.69611) < 1e-4);
  } finally {
    params._train.remove();
    if (model != null) model.delete();
    if (evalFrame != null) evalFrame.remove();
    if (scored != null) scored.remove();
    Scope.exit();
  }
}
@Test public void missingAndUnseenValues() {
  // Trains a multinomial GBM on synthetic, all-categorical data with 20%
  // missing values, then scores a second frame generated with a different data
  // seed but the SAME column-type seed — so it has the same schema but may
  // contain unseen factor levels. POJO scoring must agree on both frames.
  GBMModel gbm = null;
  GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
  Frame train=null, test=null, train_preds=null, test_preds=null;
  Scope.enter();
  try {
    { // training frame: data seed 1235, column-type seed 1234
      CreateFrame cf = new CreateFrame();
      cf.rows = 100;
      cf.cols = 10;
      cf.integer_range = 1000;
      cf.categorical_fraction = 1.0;
      cf.integer_fraction = 0.0;
      cf.binary_fraction = 0.0;
      cf.time_fraction = 0.0;
      cf.string_fraction = 0.0;
      cf.binary_ones_fraction = 0.0;
      cf.missing_fraction = 0.2;
      cf.factors = 3;
      cf.response_factors = 2;
      cf.positive_response = false;
      cf.has_response = true;
      cf.seed = 1235;
      cf.seed_for_column_types = 1234;
      train = cf.execImpl().get();
    }
    { // test frame: different data seed (4321), identical column-type seed
      CreateFrame cf = new CreateFrame();
      cf.rows = 100;
      cf.cols = 10;
      cf.integer_range = 1000;
      cf.categorical_fraction = 1.0;
      cf.integer_fraction = 0.0;
      cf.binary_fraction = 0.0;
      cf.time_fraction = 0.0;
      cf.string_fraction = 0.0;
      cf.binary_ones_fraction = 0.0;
      cf.missing_fraction = 0.2;
      cf.factors = 3;
      cf.response_factors = 2;
      cf.positive_response = false;
      cf.has_response = true;
      cf.seed = 4321; //different test set
      cf.seed_for_column_types = 1234;
      test = cf.execImpl().get();
    }
    parms._train = train._key;
    parms._response_column = "response"; // Train on the outcome
    parms._distribution = DistributionFamily.multinomial;
    parms._max_depth = 20;
    parms._min_rows = 1;
    parms._ntrees = 5;
    parms._seed = 1;
    GBM job = new GBM(parms);
    gbm = job.trainModel().get();
    train_preds = gbm.score(train);
    test_preds = gbm.score(test);
    // Build a POJO, validate same results
    Assert.assertTrue(gbm.testJavaScoring(train, train_preds, 1e-15));
    // NOTE(review): the model is re-keyed before the second POJO check —
    // presumably to force a fresh scoring artifact under a new key; confirm intent.
    Key old = gbm._key;
    gbm._key = Key.make(gbm._key + "ha");
    Assert.assertTrue(gbm.testJavaScoring(test, test_preds, 1e-15));
    DKV.remove(old);
  } finally {
    if( gbm != null ) gbm .delete();
    if( train != null ) train.remove();
    if( test != null ) test.remove();
    if( train_preds != null ) train_preds .remove();
    if( test_preds != null ) test_preds .remove();
    Scope.exit();
  }
}
@Test public void minSplitImprovement() {
  // min_split_improvement=0 should give the best (lowest) validation logloss
  // compared to a large value on this dataset.
  //
  // Fix: `ksplits` is initialized to null, but the original finally block
  // dereferenced ksplits[0]/ksplits[1] unconditionally — a failure before the
  // split completed would NPE in cleanup and mask the real error.
  Frame tfr = null;
  Key[] ksplits = null;
  GBMModel gbm = null;
  try {
    Scope.enter();
    tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
    int resp = 54;
    Scope.track(tfr.replace(resp, tfr.vecs()[resp].toCategoricalVec()));
    DKV.put(tfr);
    SplitFrame sf = new SplitFrame(tfr, new double[]{0.5, 0.5}, new Key[]{Key.make("train.hex"), Key.make("valid.hex")});
    sf.exec().get(); // Invoke the split job
    ksplits = sf._destination_frames;
    double[] msi = new double[]{0, 1e-1};
    final int N = msi.length;
    double[] loglosses = new double[N];
    for (int i = 0; i < N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = ksplits[0];
      parms._valid = ksplits[1];
      parms._response_column = tfr.names()[resp];
      parms._learn_rate = 0.05f;
      parms._min_split_improvement = msi[i];
      parms._ntrees = 10;
      parms._score_tree_interval = parms._ntrees;
      parms._max_depth = 5;
      gbm = new GBM(parms).trainModel().get();
      loglosses[i] = gbm._output._scored_valid[gbm._output._scored_valid.length - 1]._logloss;
      gbm.delete();
      gbm = null; // already cleaned up; avoids the double delete in finally
    }
    for (int i = 0; i < msi.length; ++i) {
      Log.info("min_split_improvement: " + msi[i] + " -> validation logloss: " + loglosses[i]);
    }
    int idx = ArrayUtils.minIndex(loglosses);
    Log.info("Optimal min_split_improvement: " + msi[idx]);
    assertTrue(0 == idx);
  } finally {
    if (gbm != null) gbm.delete();
    if (tfr != null) tfr.delete();
    if (ksplits != null) { // null if parsing/splitting failed — don't mask the real error
      if (ksplits[0] != null) ksplits[0].remove();
      if (ksplits[1] != null) ksplits[1].remove();
    }
    Scope.exit();
  }
}
@Test public void histoTypes() {
  // Compares validation logloss across all histogram types; for this seed and
  // dataset, the type at enum index 4 is expected to win.
  //
  // Fixes: (1) `ksplits` starts null but the original finally dereferenced
  // ksplits[0]/ksplits[1] unconditionally — a failure before the split
  // completed would NPE in cleanup and mask the real error; (2) a model built
  // mid-loop leaked if an iteration threw before its delete — the finally now
  // cleans it up.
  Frame tfr = null;
  Key[] ksplits = null;
  GBMModel gbm = null;
  try {
    Scope.enter();
    tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
    int resp = 54;
    Scope.track(tfr.replace(resp, tfr.vecs()[resp].toCategoricalVec()));
    DKV.put(tfr);
    SplitFrame sf = new SplitFrame(tfr, new double[]{0.5, 0.5}, new Key[]{Key.make("train.hex"), Key.make("valid.hex")});
    sf.exec().get(); // Invoke the split job
    ksplits = sf._destination_frames;
    SharedTreeModel.SharedTreeParameters.HistogramType[] histoType = SharedTreeModel.SharedTreeParameters.HistogramType.values();
    final int N = histoType.length;
    double[] loglosses = new double[N];
    for (int i = 0; i < N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = ksplits[0];
      parms._valid = ksplits[1];
      parms._response_column = tfr.names()[resp];
      parms._learn_rate = 0.05f;
      parms._histogram_type = histoType[i];
      parms._ntrees = 10;
      parms._score_tree_interval = parms._ntrees;
      parms._max_depth = 5;
      parms._seed = 0xDECAFFEE;
      gbm = new GBM(parms).trainModel().get();
      loglosses[i] = gbm._output._scored_valid[gbm._output._scored_valid.length - 1]._logloss;
      gbm.delete();
      gbm = null; // already cleaned up; avoids a double delete in finally
    }
    for (int i = 0; i < histoType.length; ++i) {
      Log.info("histoType: " + histoType[i] + " -> validation logloss: " + loglosses[i]);
    }
    int idx = ArrayUtils.minIndex(loglosses);
    Log.info("Optimal randomization: " + histoType[idx]);
    // NOTE(review): pins the winner by enum ordinal — revisit if HistogramType changes
    assertTrue(4 == idx);
  } finally {
    if (gbm != null) gbm.delete();
    if (tfr != null) tfr.delete();
    if (ksplits != null) { // null if parsing/splitting failed — don't mask the real error
      if (ksplits[0] != null) ksplits[0].remove();
      if (ksplits[1] != null) ksplits[1].remove();
    }
    Scope.exit();
  }
}
@Test public void sampleRatePerClass() {
  // Smoke test: training with per-class sample rates must complete without error.
  //
  // Fixes: (1) `ksplits` starts null but the original finally dereferenced
  // ksplits[0]/ksplits[1] unconditionally — a failure before the split
  // completed would NPE in cleanup and mask the real error; (2) the model was
  // deleted twice (once in try, once in finally) — a single delete in finally
  // suffices.
  Frame tfr = null;
  Key[] ksplits = null;
  GBMModel gbm = null;
  try {
    Scope.enter();
    tfr = parse_test_file("smalldata/covtype/covtype.20k.data");
    int resp = 54;
    Scope.track(tfr.replace(resp, tfr.vecs()[resp].toCategoricalVec()));
    DKV.put(tfr);
    SplitFrame sf = new SplitFrame(tfr, new double[]{0.5, 0.5}, new Key[]{Key.make("train.hex"), Key.make("valid.hex")});
    sf.exec().get(); // Invoke the split job
    ksplits = sf._destination_frames;
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = ksplits[0];
    parms._valid = ksplits[1];
    parms._response_column = tfr.names()[resp];
    parms._learn_rate = 0.05f;
    parms._min_split_improvement = 1e-5;
    parms._ntrees = 10;
    parms._score_tree_interval = parms._ntrees;
    parms._max_depth = 5;
    // one sampling rate per response class (7 covtype classes)
    parms._sample_rate_per_class = new double[]{0.1f, 0.1f, 0.2f, 0.4f, 1f, 0.3f, 0.2f};
    gbm = new GBM(parms).trainModel().get();
  } finally {
    if (gbm != null) gbm.delete();
    if (tfr != null) tfr.delete();
    if (ksplits != null) { // null if parsing/splitting failed — don't mask the real error
      if (ksplits[0] != null) ksplits[0].remove();
      if (ksplits[1] != null) ksplits[1].remove();
    }
    Scope.exit();
  }
}
// PUBDEV-2822
@Test public void testNA() {
  // Single tree, learn_rate=1: predictions must reproduce the target values,
  // including for the rows whose predictor is NA (asserted to predict 0 here).
  //
  // Fix: the original did all cleanup after the assertions with no try/finally,
  // so a failing assertion leaked the frames and model into the DKV.
  String xy = ",0\n1,0\n2,0\n3,0\n4,-10\n,0";
  Key tr = Key.make("train");
  Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
  GBMModel gbm = null;
  Frame preds = null;
  try {
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    parms._train = tr;
    parms._response_column = "C2";
    parms._min_rows = 1;
    parms._learn_rate = 1;
    parms._ntrees = 1;
    gbm = new GBM(parms).trainModel().get();
    preds = gbm.score(df);
    Log.info(df);
    Log.info(preds);
    Assert.assertTrue(gbm.testJavaScoring(df, preds, 1e-15));
    Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(4) - -10) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6);
  } finally {
    // clean up even when an assertion fails so later tests see no leaked keys
    if (preds != null) preds.remove();
    if (gbm != null) gbm.remove();
    df.remove();
  }
}
// PUBDEV-2822
// NAs must be routed to the *right* (high-response) side of the split: the NA rows
// share response 10 with row "4", so a single tree should predict 10 for them.
@Test public void testNARight() {
String xy = ",10\n1,0\n2,0\n3,0\n4,10\n,10";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Frame preds = gbm.score(df);
Log.info(df);
Log.info(preds);
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(preds.vec(0).at(0) == 10);
Assert.assertTrue(preds.vec(0).at(1) == 0);
Assert.assertTrue(preds.vec(0).at(2) == 0);
Assert.assertTrue(preds.vec(0).at(3) == 0);
Assert.assertTrue(preds.vec(0).at(4) == 10);
Assert.assertTrue(preds.vec(0).at(5) == 10);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// NAs must be routed to the *left* (low-response) side of the split: NA rows share
// response 0 with rows 1-3, only row "4" carries 10.
@Test public void testNALeft() {
String xy = ",0\n1,0\n2,0\n3,0\n4,10\n,0";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Frame preds = gbm.score(df);
Log.info(df);
Log.info(preds);
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 10) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// NA-vs-rest split: only the NA rows carry non-zero responses (5 and 3), so both
// must land in one leaf and predict their mean, 4.
@Test public void testNAvsRest() {
String xy = ",5\n1,0\n2,0\n3,0\n4,0\n,3";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Frame preds = gbm.score(df);
Log.info(df);
Log.info(preds);
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 4) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 4) < 1e-6);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// One-value-vs-rest split (no NAs): the two "-9" rows carry responses 5 and 3,
// so they must share a leaf predicting their mean, 4; everything else is 0.
@Test public void testOnevsRest() {
String xy = "-9,5\n1,0\n2,0\n3,0\n4,0\n-9,3";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Frame preds = gbm.score(df);
Log.info(df);
Log.info(preds);
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 4) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 4) < 1e-6);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// Categorical analogue of testNA: NA levels share response 0 with A/B; only
// level "D" carries -10.
@Test public void testNACategorical() {
String xy = ",0\nA,0\nB,0\nA,0\nD,-10\n,0";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Log.info(df.toTwoDimTable());
Frame preds = gbm.score(df);
Log.info(preds.toTwoDimTable());
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(4) - -10) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// Categorical analogue of testNARight: NA rows share response 10 with level "4",
// so NAs must be grouped with the high-response side.
@Test public void testNARightCategorical() {
String xy = ",10\nA,0\nB,0\nA,0\n4,10\n,10";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Frame preds = gbm.score(df);
Log.info(df);
Log.info(preds);
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(preds.vec(0).at(0) == 10);
Assert.assertTrue(preds.vec(0).at(1) == 0);
Assert.assertTrue(preds.vec(0).at(2) == 0);
Assert.assertTrue(preds.vec(0).at(3) == 0);
Assert.assertTrue(preds.vec(0).at(4) == 10);
Assert.assertTrue(preds.vec(0).at(5) == 10);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// Categorical analogue of testNALeft: NA rows share response 0 with A/B; only
// level "D" carries 10.
@Test public void testNALeftCategorical() {
String xy = ",0\nA,0\nB,0\nA,0\nD,10\n,0";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Frame preds = gbm.score(df);
Log.info(df);
Log.info(preds);
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 10) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 0) < 1e-6);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// Categorical analogue of testNAvsRest: only NA rows carry non-zero responses
// (5 and 3), so they must share a leaf predicting their mean, 4.
@Test public void testNAvsRestCategorical() {
String xy = ",5\nA,0\nB,0\nA,0\nD,0\n,3";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1; // learn rate 1 + one tree => predictions are exactly the leaf means
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Frame preds = gbm.score(df);
Log.info(df);
Log.info(preds);
// POJO scoring must match in-H2O scoring
Assert.assertTrue(gbm.testJavaScoring(df,preds,1e-15));
Assert.assertTrue(Math.abs(preds.vec(0).at(0) - 4) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(2) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6);
Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 4) < 1e-6);
preds.remove();
gbm.remove();
df.remove();
}
// PUBDEV-2822
// Categorical levels unseen at training time ("E") and NAs in the test set must
// score consistently in-H2O and in the generated POJO. Expected predictions are
// the training leaf means: B -> (-5+0)/2 = -2.5, A -> (0+0+3)/3 = 1, D -> 0.
@Test public void testUnseenNACategorical() {
  String xy = "B,-5\nA,0\nB,0\nA,0\nD,0\nA,3";
  Key tr = Key.make("train");
  Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
  String test = ",5\n,0\nB,0\n,0\nE,0\n,3";
  Key te = Key.make("test");
  Frame df2 = ParseDataset.parse(te, makeByteVec(Key.make("te"), test));
  GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
  parms._train = tr;
  parms._response_column = "C2";
  parms._min_rows = 1;
  parms._learn_rate = 1;
  parms._ntrees = 1;
  GBM job = new GBM(parms);
  GBMModel gbm = job.trainModel().get();
  Scope.enter(); //AdaptTestTrain leaks when it does inplace Vec adaptation, need a Scope to catch that stuff
  Frame preds = null, preds2 = null;
  try {
    preds = gbm.score(df);
    preds2 = gbm.score(df2);
    Log.info(df);
    Log.info(preds);
    Log.info(df2);
    Log.info(preds2);
    Assert.assertTrue(gbm.testJavaScoring(df, preds, 1e-15));
    Assert.assertTrue(gbm.testJavaScoring(df2, preds2, 1e-15));
    Assert.assertTrue(Math.abs(preds.vec(0).at(0) - -2.5) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 1) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(2) - -2.5) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 1) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6);
    Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 1) < 1e-6);
  } finally {
    // Cleanup used to run only on success: a failed assertion leaked both frames,
    // both prediction frames, the model, and skipped Scope.exit() entirely.
    if (preds != null) preds.remove();
    if (preds2 != null) preds2.remove();
    gbm.remove();
    df.remove();
    df2.remove();
    Scope.exit();
  }
}
// Train on a fully-observed all-categorical frame, then score a test frame (different
// seed) with 80% missing values: POJO scoring must agree bit-wise on both frames.
@Test public void unseenMissing() {
GBMModel gbm = null;
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
Frame train=null, test=null, train_preds=null, test_preds=null;
Scope.enter();
try {
// Training frame: 100 rows x 10 categorical cols (3 levels), no missing values
{
CreateFrame cf = new CreateFrame();
cf.rows = 100;
cf.cols = 10;
cf.integer_range = 1000;
cf.categorical_fraction = 1.0;
cf.integer_fraction = 0.0;
cf.binary_fraction = 0.0;
cf.time_fraction = 0.0;
cf.string_fraction = 0.0;
cf.binary_ones_fraction = 0.0;
cf.missing_fraction = 0.0;
cf.factors = 3;
cf.response_factors = 2;
cf.positive_response = false;
cf.has_response = true;
cf.seed = 1235;
cf.seed_for_column_types = 1234; // same column-type seed as test => same schema
train = cf.execImpl().get();
}
// Test frame: same schema (shared seed_for_column_types) but 80% missing values
{
CreateFrame cf = new CreateFrame();
cf.rows = 100;
cf.cols = 10;
cf.integer_range = 1000;
cf.categorical_fraction = 1.0;
cf.integer_fraction = 0.0;
cf.binary_fraction = 0.0;
cf.time_fraction = 0.0;
cf.string_fraction = 0.0;
cf.binary_ones_fraction = 0.0;
cf.missing_fraction = 0.8;
cf.factors = 3;
cf.response_factors = 2;
cf.positive_response = false;
cf.has_response = true;
cf.seed = 4321; //different test set
cf.seed_for_column_types = 1234;
test = cf.execImpl().get();
}
parms._train = train._key;
parms._response_column = "response"; // Train on the outcome
parms._distribution = DistributionFamily.multinomial;
parms._max_depth = 20;
parms._min_rows = 1;
parms._ntrees = 5;
parms._seed = 1;
GBM job = new GBM(parms);
gbm = job.trainModel().get();
train_preds = gbm.score(train);
test_preds = gbm.score(test);
// Build a POJO, validate same results
Assert.assertTrue(gbm.testJavaScoring(train, train_preds, 1e-15));
// NOTE(review): the key rename below appears intended to give the second POJO
// run a distinct class name; the old key's DKV entry is removed afterwards.
Key old = gbm._key;
gbm._key = Key.make(gbm._key + "ha");
Assert.assertTrue(gbm.testJavaScoring(test, test_preds, 1e-15));
DKV.remove(old);
} finally {
if( gbm != null ) gbm .delete();
if( train != null ) train.remove();
if( test != null ) test.remove();
if( train_preds != null ) train_preds .remove();
if( test_preds != null ) test_preds .remove();
Scope.exit();
}
}
//PUBDEV-3066
// With learn_rate_annealing = 0.5 the effective learning rate shrinks geometrically
// per tree, so training should stop early: the number of trees actually built must
// differ from the requested _ntrees (100).
@Test public void testAnnealingStop() {
  Frame tfr=null;
  final int N = 1;
  Scope.enter();
  try {
    // Load data, hack frames
    tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
    // Drop post-departure columns and the alternate response so they can't leak
    // into the predictors
    for (String s : new String[]{
        "DepTime", "ArrTime", "ActualElapsedTime",
        "AirTime", "ArrDelay", "DepDelay", "Cancelled",
        "CancellationCode", "CarrierDelay", "WeatherDelay",
        "NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
    }) {
      tfr.remove(s).remove();
    }
    DKV.put(tfr);
    for (int i=0; i<N; ++i) {
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      parms._response_column = "IsDepDelayed";
      parms._nbins = 10;
      parms._nbins_cats = 500;
      parms._ntrees = 100;
      parms._learn_rate_annealing = 0.5;
      parms._max_depth = 5;
      parms._min_rows = 10;
      parms._distribution = DistributionFamily.bernoulli;
      parms._balance_classes = true;
      parms._seed = 0;
      // Build a first model; all remaining models should be equal
      GBMModel gbm = new GBM(parms).trainModel().get();
      Assert.assertNotEquals(gbm._output._ntrees, parms._ntrees);
      gbm.delete();
    }
  } finally {
    if (tfr != null) tfr.remove();
    // Moved inside finally: it previously sat after the try/finally and was
    // skipped whenever the test threw, leaking the Scope.
    Scope.exit();
  }
}
// Disabled smoke test for the modified_huber binomial distribution on the airlines
// data; golden metric values below are commented out (single-node numbers).
@Ignore
public void testModifiedHuber() {
Frame tfr = null, vfr = null;
GBMModel gbm = null;
Scope.enter();
try {
tfr = parse_test_file("./smalldata/airlines/allyears2k_headers.zip");
// Drop post-departure columns and the alternate response
for (String s : new String[]{
"DepTime", "ArrTime", "ActualElapsedTime",
"AirTime", "ArrDelay", "DepDelay", "Cancelled",
"CancellationCode", "CarrierDelay", "WeatherDelay",
"NASDelay", "SecurityDelay", "LateAircraftDelay", "IsArrDelayed"
}) {
tfr.remove(s).remove();
}
DKV.put(tfr);
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = "IsDepDelayed";
parms._seed = 1234;
parms._distribution = DistributionFamily.modified_huber;
parms._min_rows = 1;
parms._learn_rate = .1;
parms._max_depth = 5;
parms._ntrees = 10;
// Build a first model; all remaining models should be equal
gbm = new GBM(parms).trainModel().get();
Frame train_preds = gbm.score(tfr);
// Build a POJO, validate same results
Assert.assertTrue(gbm.testJavaScoring(tfr, train_preds, 1e-15));
train_preds.remove();
ModelMetricsBinomial mm = (ModelMetricsBinomial)gbm._output._training_metrics;
// assertEquals(0.59998, mm.auc_obj()._auc, 1e-4); // 1 node
// assertEquals(0.31692, mm.mse(), 1e-4);
// assertEquals(0.79069, mm.logloss(), 1e-4);
} finally {
if (tfr != null) tfr.remove();
if (vfr != null) vfr.remove();
if (gbm != null) {
gbm.deleteCrossValidationModels();
gbm.delete();
}
Scope.exit();
}
}
// Disabled stability check for modified_huber on a tiny categorical dataset:
// only verifies POJO/in-H2O scoring agreement (value assertions commented out).
@Ignore
public void testModifiedHuberStability() {
String xy = "A,Y\nB,N\nA,N\nB,N\nA,Y\nA,Y";
Key tr = Key.make("train");
Frame df = ParseDataset.parse(tr, makeByteVec(Key.make("xy"), xy));
// Test set is identical to the training set here
String test = "A,Y\nB,N\nA,N\nB,N\nA,Y\nA,Y";
Key te = Key.make("test");
Frame df2 = ParseDataset.parse(te, makeByteVec(Key.make("te"), test));
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tr;
parms._response_column = "C2";
parms._min_rows = 1;
parms._learn_rate = 1;
parms._distribution = DistributionFamily.modified_huber;
parms._ntrees = 1;
GBM job = new GBM(parms);
GBMModel gbm = job.trainModel().get();
Scope.enter(); //AdaptTestTrain leaks when it does inplace Vec adaptation, need a Scope to catch that stuff
Frame preds = gbm.score(df);
Frame preds2 = gbm.score(df2);
Log.info(df);
Log.info(preds);
Log.info(df2);
Log.info(preds2);
Assert.assertTrue(gbm.testJavaScoring(df, preds, 1e-15));
Assert.assertTrue(gbm.testJavaScoring(df2, preds2, 1e-15));
// Assert.assertTrue(Math.abs(preds.vec(0).at(0) - -2.5) < 1e-6);
// Assert.assertTrue(Math.abs(preds.vec(0).at(1) - 1) < 1e-6);
// Assert.assertTrue(Math.abs(preds.vec(0).at(2) - -2.5) < 1e-6);
// Assert.assertTrue(Math.abs(preds.vec(0).at(3) - 1) < 1e-6);
// Assert.assertTrue(Math.abs(preds.vec(0).at(4) - 0) < 1e-6);
// Assert.assertTrue(Math.abs(preds.vec(0).at(5) - 1) < 1e-6);
preds.remove();
preds2.remove();
gbm.remove();
df.remove();
df2.remove();
Scope.exit();
}
// Huber regression (alpha=0.5) with row/column sampling on the ecology data;
// pins training MSE and mean huber deviance to golden values and checks POJO
// scoring consistency on a separate eval file.
@Test public void testHuber2() {
  GBMModel gbm = null;
  GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
  Frame train=null, pred=null, res=null;
  Scope.enter();
  try {
    train = parse_test_file("smalldata/gbm_test/ecology_model.csv");
    train.remove("Site").remove(); // Remove unique ID
    train.remove("Method").remove(); // Remove categorical
    DKV.put(train); // Update frame after hacking it
    parms._train = train._key;
    parms._response_column = "DSDist"; // Train on the outcome
    parms._distribution = huber;
    parms._huber_alpha = 0.5;
    parms._sample_rate = 0.6f;
    parms._col_sample_rate = 0.8f;
    parms._col_sample_rate_per_tree = 0.8f;
    parms._seed = 1234;
    GBM job = new GBM(parms);
    gbm = job.trainModel().get();
    pred = parse_test_file("smalldata/gbm_test/ecology_eval.csv" );
    res = gbm.score(pred);
    // Build a POJO, validate same results
    Assert.assertTrue(gbm.testJavaScoring(pred, res, 1e-15));
    Assert.assertTrue(Math.abs(((ModelMetricsRegression)gbm._output._training_metrics)._MSE - 1485) < 1);
    Assert.assertTrue(Math.abs(((ModelMetricsRegression)gbm._output._training_metrics)._mean_residual_deviance - 256.88) < 1);
  } finally {
    // Was `parms._train.remove()`: that NPE'd (masking the original failure)
    // whenever parsing threw before _train was assigned.
    if( train != null ) train.remove();
    if( gbm != null ) gbm .delete();
    if( pred != null ) pred.remove();
    if( res != null ) res .remove();
    Scope.exit();
  }
}
// Laplace loss on BostonHousing: pins training MSE and mean residual deviance
// (which for laplace equals the MAE) to golden values.
@Test
public void testLaplace() {
Frame housing = null;
GBMModel model = null;
try {
housing = parse_test_file("./smalldata/gbm_test/BostonHousing.csv");
GBMModel.GBMParameters p = new GBMModel.GBMParameters();
p._train = housing._key;
p._response_column = housing.lastVecName(); // last column is the response
p._seed = 0xdecaf;
p._distribution = laplace;
model = new GBM(p).trainModel().get();
ModelMetricsRegression metrics = (ModelMetricsRegression) model._output._training_metrics;
Assert.assertEquals(8.05716257, metrics._MSE, 1e-5);
Assert.assertEquals(1.42298/*MAE*/, metrics._mean_residual_deviance, 1e-5);
} finally {
if (housing != null) housing.delete();
if (model != null) {
model.deleteCrossValidationModels();
model.delete();
}
}
}
// Gaussian loss on BostonHousing: for gaussian the mean residual deviance equals
// the MSE, so both metrics are pinned to the same golden value.
@Test
public void testGaussian() {
Frame housing = null;
GBMModel model = null;
try {
housing = parse_test_file("./smalldata/gbm_test/BostonHousing.csv");
GBMModel.GBMParameters p = new GBMModel.GBMParameters();
p._train = housing._key;
p._response_column = housing.lastVecName(); // last column is the response
p._seed = 0xdecaf;
p._distribution = gaussian;
model = new GBM(p).trainModel().get();
ModelMetricsRegression metrics = (ModelMetricsRegression) model._output._training_metrics;
Assert.assertEquals(2.9423857564, metrics._MSE, 1e-5);
Assert.assertEquals(2.9423857564, metrics._mean_residual_deviance, 1e-5);
} finally {
if (housing != null) housing.delete();
if (model != null) {
model.deleteCrossValidationModels();
model.delete();
}
}
}
// Huber with alpha=1 treats nothing as an outlier, so the metrics should
// approach the gaussian results (same golden values, looser 1e-2 tolerance).
@Test
public void testHuberDeltaLarge() {
Frame tfr = null;
GBMModel gbm = null;
try {
tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv");
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = tfr.lastVecName();
parms._seed = 0xdecaf;
parms._distribution = huber;
parms._huber_alpha = 1; // nothing is an outlier - same as gaussian
gbm = new GBM(parms).trainModel().get();
Assert.assertEquals(2.9423857564,((ModelMetricsRegression) gbm._output._training_metrics)._MSE,1e-2);
// huber loss with delta -> max(error) goes to MSE
Assert.assertEquals(2.9423857564,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-2);
} finally {
if (tfr != null) tfr.delete();
if (gbm != null) gbm.deleteCrossValidationModels();
if (gbm != null) gbm.delete();
}
}
// Huber with a tiny alpha treats everything as an outlier, so the loss should
// approach laplace; the deviance is checked against a closed form in MAE/delta.
@Test
public void testHuberDeltaTiny() {
Frame tfr = null;
GBMModel gbm = null;
try {
tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv");
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = tfr.lastVecName();
parms._seed = 0xdecaf;
parms._distribution = huber;
parms._huber_alpha = 1e-2; //everything is an outlier and we should get laplace loss
gbm = new GBM(parms).trainModel().get();
Assert.assertEquals(8.05716257,((ModelMetricsRegression)gbm._output._training_metrics)._MSE,0.3);
// Huber loss can be derived from MAE since no obs weights
double delta = 0.0047234; //hardcoded from output
double MAE = 1.42298; //see laplace above
Assert.assertEquals((2*MAE-delta)*delta,((ModelMetricsRegression)gbm._output._training_metrics)._mean_residual_deviance,2e-4);
} finally {
if (tfr != null) tfr.delete();
if (gbm != null) gbm.deleteCrossValidationModels();
if (gbm != null) gbm.delete();
}
}
// Huber with the default alpha (0.9) on BostonHousing: pins MSE and mean huber
// deviance to golden values.
@Test
public void testHuber() {
Frame tfr = null;
GBMModel gbm = null;
try {
tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv");
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = tfr.lastVecName();
parms._seed = 0xdecaf;
parms._distribution = huber;
parms._huber_alpha = 0.9; //that's the default
gbm = new GBM(parms).trainModel().get();
Assert.assertEquals(4.447062185,((ModelMetricsRegression)gbm._output._training_metrics)._MSE,1e-5);
Assert.assertEquals(1.962926332,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-4);
} finally {
if (tfr != null) tfr.delete();
if (gbm != null) gbm.deleteCrossValidationModels();
if (gbm != null) gbm.delete();
}
}
// Same as testHuber but with prediction noise bandwidth 0.2 enabled; the golden
// metric values differ accordingly.
@Test
public void testHuberNoise() {
Frame tfr = null;
GBMModel gbm = null;
try {
tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv");
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = tfr.lastVecName();
parms._seed = 0xdecaf;
parms._distribution = huber;
parms._huber_alpha = 0.9; //that's the default
parms._pred_noise_bandwidth = 0.2;
gbm = new GBM(parms).trainModel().get();
Assert.assertEquals(4.8056900203,((ModelMetricsRegression)gbm._output._training_metrics)._MSE,1e-5);
Assert.assertEquals(2.0080997,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-4);
} finally {
if (tfr != null) tfr.delete();
if (gbm != null) gbm.deleteCrossValidationModels();
if (gbm != null) gbm.delete();
}
}
// For every supported distribution, the per-row deviances from computeDeviances()
// must average to the matching training metric: logloss for binomial/multinomial,
// mean residual deviance for regression.
@Test
public void testDeviances() {
  for (DistributionFamily dist : DistributionFamily.values()) {
    if (dist == modified_huber) continue; // excluded from this test
    Frame tfr = null;
    Frame res = null;
    Frame preds = null;
    GBMModel gbm = null;
    try {
      tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv");
      GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
      parms._train = tfr._key;
      String resp = tfr.lastVecName();
      // Classification families need a categorical response; "rad" has >2 levels
      // for multinomial, "chas" is binary for bernoulli. (The former dead
      // `dist==modified_huber` check was removed: it is skipped at the loop top.)
      if (dist==bernoulli || dist==multinomial) {
        resp = dist==multinomial?"rad":"chas";
        Vec v = tfr.remove(resp);
        tfr.add(resp, v.toCategoricalVec());
        v.remove();
        DKV.put(tfr);
      }
      parms._response_column = resp;
      parms._distribution = dist;
      gbm = new GBM(parms).trainModel().get();
      preds = gbm.score(tfr);
      res = gbm.computeDeviances(tfr,preds,"myDeviances");
      double meanDeviance = res.anyVec().mean();
      if (gbm._output.nclasses()==2)
        Assert.assertEquals(meanDeviance,((ModelMetricsBinomial) gbm._output._training_metrics)._logloss,1e-6*Math.abs(meanDeviance));
      else if (gbm._output.nclasses()>2)
        Assert.assertEquals(meanDeviance,((ModelMetricsMultinomial) gbm._output._training_metrics)._logloss,1e-6*Math.abs(meanDeviance));
      else
        Assert.assertEquals(meanDeviance,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-6*Math.abs(meanDeviance));
    } finally {
      if (tfr != null) tfr.delete();
      if (res != null) res.delete();
      if (preds != null) preds.delete();
      if (gbm != null) gbm.delete();
    }
  }
}
// For each categorical encoding scheme (except OneHotInternal), train a small GBM
// on the weather data and verify POJO scoring matches in-H2O scoring.
@Test
public void testCatEncoding() {
for (Model.Parameters.CategoricalEncodingScheme c : Model.Parameters.CategoricalEncodingScheme.values()) {
if (c == Model.Parameters.CategoricalEncodingScheme.OneHotInternal) continue;
Frame tfr = null;
GBMModel gbm = null;
Frame fr2 = null;
try {
tfr = parse_test_file("./smalldata/junit/weather.csv");
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = tfr.lastVecName();
parms._ntrees = 5;
parms._categorical_encoding = c;
gbm = new GBM(parms).trainModel().get();
// Done building model; produce a score column with predictions
fr2 = gbm.score(tfr);
// Build a POJO, validate same results
Assert.assertTrue(gbm.testJavaScoring(tfr,fr2,1e-15));
} finally {
if (tfr != null) tfr.delete();
if (fr2 != null) fr2.delete();
if (gbm != null) gbm.deleteCrossValidationModels();
if (gbm != null) gbm.delete();
}
}
}
// Same sweep as testCatEncoding but with 3-fold cross-validation; only checks
// that training completes and cleans up (no scoring assertions).
@Test
public void testCatEncodingCV() {
for (Model.Parameters.CategoricalEncodingScheme c : Model.Parameters.CategoricalEncodingScheme.values()) {
if (c == Model.Parameters.CategoricalEncodingScheme.OneHotInternal) continue;
Frame tfr = null;
GBMModel gbm = null;
try {
tfr = parse_test_file("./smalldata/junit/weather.csv");
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
parms._train = tfr._key;
parms._response_column = tfr.lastVecName();
parms._ntrees = 5;
parms._categorical_encoding = c;
parms._nfolds = 3;
gbm = new GBM(parms).trainModel().get();
} finally {
if (tfr != null) tfr.delete();
if (gbm != null) gbm.deleteCrossValidationModels();
if (gbm != null) gbm.delete();
}
}
}
// A test of the validity of categorical splits
// Trains a single deep multinomial tree on the ecology data with most numeric
// columns removed, leaving mainly categorical predictors.
@Test public void testCategoricalSplits() throws FileNotFoundException {
Frame fr=null;
GBMModel model = null;
Scope.enter();
try {
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
fr = parse_test_file("smalldata/gbm_test/ecology_model.csv");
// Strip columns down to the predictors of interest
fr.remove("Site").remove();
fr.remove("SegSumT").remove();
fr.remove("SegTSeas").remove();
fr.remove("SegLowFlow").remove();
fr.remove("DSDist").remove();
fr.remove("DSMaxSlope").remove();
fr.remove("USAvgT").remove();
fr.remove("USRainDays").remove();
fr.remove("USSlope").remove();
// fr.remove("USNative").remove();
fr.remove("DSDam").remove();
// fr.remove("LocSed").remove();
fr.remove("Method").remove();
int ci = fr.find("Angaus");
Scope.track(fr.replace(ci, fr.vecs()[ci].toCategoricalVec())); // Convert response 'Angaus' to categorical
DKV.put(fr);
parms._train = fr._key;
parms._response_column = "Angaus";
parms._ntrees = 1;
parms._min_rows = 10;
parms._max_depth = 13;
parms._distribution = DistributionFamily.multinomial;
model = new GBM(parms).trainModel().get();
// StreamingSchema ss = new StreamingSchema(model.getMojo(), "model.zip");
// FileOutputStream fos = new FileOutputStream("model.zip");
// ss.getStreamWriter().writeTo(fos);
} finally {
if( model != null ) model.delete();
if( fr != null ) fr.remove();
Scope.exit();
}
}
// A test of the validity of categorical splits
// Trains a single 4-deep bernoulli tree on the high-cardinality "Origin" column,
// exercising categorical split generation on a wide domain.
@Test public void testCategoricalSplits2() throws FileNotFoundException {
  Frame fr=null;
  Frame fr2=null;
  GBMModel model = null;
  Scope.enter();
  try {
    GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
    fr = parse_test_file("smalldata/airlines/allyears2k_headers.zip");
    // fr2 shares fr's Vecs, so only its Frame key needs separate removal
    fr2 = new Frame(Key.<Frame>make(), new String[]{"C","R"}, new Vec[]{fr.vec("Origin"),fr.vec("IsDepDelayed")});
    int ci = fr2.find("R");
    Scope.track(fr2.replace(ci, fr2.vecs()[ci].toCategoricalVec())); // Convert response 'R' to categorical
    DKV.put(fr2);
    parms._train = fr2._key;
    parms._response_column = "R";
    parms._ntrees = 1;
    parms._min_rows = 1000;
    parms._max_depth = 4;
    parms._distribution = DistributionFamily.bernoulli;
    model = new GBM(parms).trainModel().get();
    // StreamingSchema ss = new StreamingSchema(model.getMojo(), "model.zip");
    // FileOutputStream fos = new FileOutputStream("model.zip");
    // ss.getStreamWriter().writeTo(fos);
  } finally {
    if( model != null ) model.delete();
    // Moved from the try body: DKV.remove only ran on success before, leaking
    // fr2's key whenever training threw.
    if( fr2 != null ) DKV.remove(fr2._key);
    if( fr != null ) fr.remove();
    Scope.exit();
  }
}
// Cardinality stress test with nbins_cats (2000) below the 3000-level training domain.
@Test public void highCardinalityLowNbinsCats() { highCardinality(2000); }
// Cardinality stress test with nbins_cats (6000) above the 3000-level training domain.
@Test public void highCardinalityHighNbinsCats() { highCardinality(6000); }
// Shared body for the highCardinality* tests: train on 3000-level categoricals with
// 20% missing values, then score a test frame with 5000 levels (so many levels are
// unseen at training time). Verifies POJO scoring agrees on both frames.
public void highCardinality(int nbins_cats) {
GBMModel gbm = null;
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
Frame train=null, test=null, train_preds=null, test_preds=null;
Scope.enter();
try {
// Training frame: 10k rows x 10 categorical cols, 3000 levels, 20% missing
{
CreateFrame cf = new CreateFrame();
cf.rows = 10000;
cf.cols = 10;
cf.integer_range = 1000;
cf.categorical_fraction = 1.0;
cf.integer_fraction = 0.0;
cf.binary_fraction = 0.0;
cf.time_fraction = 0.0;
cf.string_fraction = 0.0;
cf.binary_ones_fraction = 0.0;
cf.missing_fraction = 0.2;
cf.factors = 3000;
cf.response_factors = 2;
cf.positive_response = false;
cf.has_response = true;
cf.seed = 1235;
cf.seed_for_column_types = 1234; // same column-type seed as test => same schema
train = cf.execImpl().get();
}
// Test frame: same schema but 5000 levels => unseen categorical levels at scoring
{
CreateFrame cf = new CreateFrame();
cf.rows = 10000;
cf.cols = 10;
cf.integer_range = 1000;
cf.categorical_fraction = 1.0;
cf.integer_fraction = 0.0;
cf.binary_fraction = 0.0;
cf.time_fraction = 0.0;
cf.string_fraction = 0.0;
cf.binary_ones_fraction = 0.0;
cf.missing_fraction = 0.2;
cf.factors = 5000;
cf.response_factors = 2;
cf.positive_response = false;
cf.has_response = true;
cf.seed = 5321;
cf.seed_for_column_types = 1234;
test = cf.execImpl().get();
}
parms._train = train._key;
parms._response_column = "response"; // Train on the outcome
parms._max_depth = 20; //allow it to overfit
parms._min_rows = 1;
parms._ntrees = 1;
parms._nbins_cats = nbins_cats;
parms._seed = 0x2834234;
GBM job = new GBM(parms);
gbm = job.trainModel().get();
train_preds = gbm.score(train);
test_preds = gbm.score(test);
// Fill missing responses with 0.5 so AUC can be computed below
new MRTask() {
public void map(Chunk c) {
for (int i=0;i<c._len;++i)
if (c.isNA(i))
c.set(i, 0.5);
}
}.doAll(train.vec("response"));
new MRTask() {
public void map(Chunk c) {
for (int i=0;i<c._len;++i)
if (c.isNA(i))
c.set(i, 0.5);
}
}.doAll(test.vec("response"));
Log.info("Train AUC: " + ModelMetricsBinomial.make(train_preds.vec(2), train.vec("response")).auc());
Log.info("Test AUC: " + ModelMetricsBinomial.make(test_preds.vec(2), test.vec("response")).auc());
// Build a POJO, validate same results
Assert.assertTrue(gbm.testJavaScoring(train, train_preds, 1e-15));
// NOTE(review): key rename appears intended to give the second POJO run a
// distinct class name; the old key's DKV entry is removed afterwards.
Key old = gbm._key;
gbm._key = Key.make(gbm._key + "ha");
Assert.assertTrue(gbm.testJavaScoring(test, test_preds, 1e-15));
DKV.remove(old);
} finally {
if( gbm != null ) gbm .delete();
if( train != null ) train.remove();
if( test != null ) test.remove();
if( train_preds != null ) train_preds .remove();
if( test_preds != null ) test_preds .remove();
Scope.exit();
}
}
// Sweeps nbins_cats over a 26-level categorical (alphabet_cattest): with enough
// categorical bins (>= 25) or SortByResponse encoding, a single 2-deep tree with
// learn_rate 1 must fit the data exactly (MAE 0); otherwise MAE must stay > 0.
// Also round-trips the MOJO through "model.zip" on disk.
@Test public void lowCardinality() throws IOException {
for (boolean sort_cats : new boolean[]{true, false}) {
int[] vals = new int[]{2,10,20,25,26,27,100};
double[] maes = new double[vals.length];
int i=0;
for (int nbins_cats : vals) {
GBMModel model = null;
GBMModel.GBMParameters parms = new GBMModel.GBMParameters();
Frame train, train_preds=null;
Scope.enter();
train = parse_test_file("smalldata/gbm_test/alphabet_cattest.csv");
try {
parms._train = train._key;
parms._response_column = "y"; // Train on the outcome
parms._max_depth = 2;
parms._min_rows = 1;
parms._ntrees = 1;
parms._learn_rate = 1;
parms._nbins_cats = nbins_cats;
if (sort_cats)
parms._categorical_encoding = Model.Parameters.CategoricalEncodingScheme.SortByResponse;
GBM job = new GBM(parms);
model = job.trainModel().get();
// Write the MOJO to disk (deleted in the finally block)
StreamingSchema ss = new StreamingSchema(model.getMojo(), "model.zip");
FileOutputStream fos = new FileOutputStream("model.zip");
ss.getStreamWriter().writeTo(fos);
train_preds = model.score(train);
Assert.assertTrue(model.testJavaScoring(train, train_preds, 1e-15));
double mae = ModelMetricsRegression.make(train_preds.vec(0), train.vec("y"), gaussian).mae();
Log.info("Train MAE: " + mae);
maes[i++] = mae;
if (nbins_cats >= 25 || sort_cats)
Assert.assertEquals(0, mae, 1e-8); // sorting of categoricals is enough
else
Assert.assertTrue(mae > 0);
} finally {
if( model != null ) model.delete();
if( train != null ) train.remove();
if( train_preds != null ) train_preds .remove();
new File("model.zip").delete();
Scope.exit();
}
}
Log.info(Arrays.toString(vals));
Log.info(Arrays.toString(maes));
}
}
}