package hex.glm; import hex.*; import hex.DataInfo.TransformType; import hex.deeplearning.DeepLearningModel.DeepLearningParameters.MissingValuesHandling; import hex.glm.GLMModel.GLMParameters.Link; import hex.glm.GLMModel.GLMParameters.Solver; import hex.glm.GLMModel.GLMWeightsFun; import hex.glm.GLMTask.*; import org.junit.*; import hex.glm.GLMModel.GLMParameters; import hex.glm.GLMModel.GLMParameters.Family; import water.*; import water.H2O.H2OCountedCompleter; import water.fvec.*; import water.parser.BufferedString; import water.parser.ParseDataset; import water.util.ArrayUtils; import java.util.*; import java.util.concurrent.ExecutionException; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; public class GLMTest extends TestUtil { @BeforeClass public static void setup() { stall_till_cloudsize(1); } public static void testScoring(GLMModel m, Frame fr) { Scope.enter(); // standard predictions Frame fr2 = new Frame(fr); Frame preds = Scope.track(m.score(fr2)); m.adaptTestForTrain(fr2,true,false); fr2.remove(fr2.numCols()-1); // remove response int p = m._output._dinfo._cats + m._output._dinfo._nums; int p2 = fr2.numCols() - (m._output._dinfo._weights?1:0)- (m._output._dinfo._offset?1:0); assert p == p2: p + " != " + p2; fr2.add(preds.names(),preds.vecs()); // test score0 new TestScore0(m,m._output._dinfo._weights,m._output._dinfo._offset).doAll(fr2); // test pojo if((!m._output._dinfo._weights && !m._output._dinfo._offset)) Assert.assertTrue(m.testJavaScoring(fr,preds,1e-15)); Scope.exit(); } // class to test score0 since score0 is now not being called by the standard bulk scoring public static class TestScore0 extends MRTask { final GLMModel _m; final boolean _weights; final boolean _offset; public TestScore0(GLMModel m, boolean w, boolean o) {_m = m; _weights = w; _offset = o;} private void checkScore(long rid, double [] predictions, double [] outputs){ int start = 0; if(_m._parms._family == Family.binomial && Math.abs(predictions[2] - _m.defaultThreshold()) < 1e-10) start = 1; if(_m._parms._family == Family.multinomial) { double [] maxs = new double[2]; for(int j = 1; j < predictions.length; ++j) { if(predictions[j] > maxs[0]) { if(predictions[j] > maxs[1]) { maxs[0] = maxs[1]; maxs[1] = predictions[j]; } else maxs[0] = predictions[j]; } } if((maxs[1] - maxs[0]) < 1e-10) start = 1; } for (int j = start; j < predictions.length; ++j) assertEquals("mismatch at row " + (rid) + ", p = " + j + ": " + outputs[j] + " != " + predictions[j] + ", predictions = " + Arrays.toString(predictions) + ", output = " + Arrays.toString(outputs), outputs[j], predictions[j], 1e-6); } @Override public void map(Chunk [] chks) { int nout = _m._parms._family == Family.multinomial ? _m._output.nclasses() + 1 : _m._parms._family == Family.binomial ? 
3 : 1; Chunk[] outputChks = Arrays.copyOfRange(chks, chks.length - nout, chks.length); chks = Arrays.copyOf(chks, chks.length - nout); Chunk off = new C0DChunk(0, chks[0]._len); Chunk w = new C0DChunk(1, chks[0]._len); double[] tmp = new double[_m._output._dinfo._cats + _m._output._dinfo._nums]; double[] predictions = new double[nout]; double[] outputs = new double[nout]; if (_offset) { off = chks[chks.length - 1]; chks = Arrays.copyOf(chks, chks.length - 1); } if (_weights) { w = chks[chks.length - 1]; chks = Arrays.copyOf(chks, chks.length - 1); } for (int i = 0; i < chks[0]._len; ++i) { if (_weights || _offset) _m.score0(chks, w.atd(i), off.atd(i), i, tmp, predictions); else _m.score0(chks, i, tmp, predictions); for (int j = 0; j < predictions.length; ++j) outputs[j] = outputChks[j].atd(i); checkScore(i + chks[0].start(), predictions, outputs); } } } //------------------- simple tests on synthetic data------------------------------------ @Test public void testGaussianRegression() throws InterruptedException, ExecutionException { Key raw = Key.make("gaussian_test_data_raw"); Key parsed = Key.make("gaussian_test_data_parsed"); GLMModel model = null; Frame fr = null, res = null; try { // make data so that the expected coefficients are icept = 0 and col[0] = 0.1 (y = 0.1 * x) FVecTest.makeByteVec(raw, "x,y\n0,0\n1,0.1\n2,0.2\n3,0.3\n4,0.4\n5,0.5\n6,0.6\n7,0.7\n8,0.8\n9,0.9"); fr = ParseDataset.parse(parsed, raw); GLMParameters params = new GLMParameters(Family.gaussian); params._train = fr._key; // params._response = 1; params._response_column = fr._names[1]; params._lambda = new double[]{0}; // params._standardize= false; model = new GLM(params).trainModel().get(); HashMap<String, Double> coefs = model.coefficients(); assertEquals(0.0, coefs.get("Intercept"), 1e-4); assertEquals(0.1, coefs.get("x"), 1e-4); testScoring(model,fr); } finally { if (fr != null) fr.remove(); if (res != null) res.remove(); if (model != null) model.remove(); } } /** * Test Poisson regression on simple and small synthetic dataset.
* Equation is: y = 2^(x+1), i.e. log(y) = (x+1) * log(2); */ @Test public void testPoissonRegression() throws InterruptedException, ExecutionException { Key raw = Key.make("poisson_test_data_raw"); Key parsed = Key.make("poisson_test_data_parsed"); GLMModel model = null; Frame fr = null, res = null; try { // make data so that the expected coefficients are icept = col[0] = log(2) FVecTest.makeByteVec(raw, "x,y\n0,2\n1,4\n2,8\n3,16\n4,32\n5,64\n6,128\n7,256"); fr = ParseDataset.parse(parsed, raw); Vec v = fr.vec(0); System.out.println(v.min() + ", " + v.max() + ", mean = " + v.mean()); GLMParameters params = new GLMParameters(Family.poisson); params._train = fr._key; // params._response = 1; params._response_column = fr._names[1]; params._lambda = new double[]{0}; params._standardize = false; model = new GLM(params).trainModel().get(); for (double c : model.beta()) assertEquals(Math.log(2), c, 1e-2); // only 1e-2 precision because the perfect solution is too perfect -> will trigger grid search testScoring(model,fr); model.delete(); fr.delete(); // Test 2, example from http://www.biostat.umn.edu/~dipankar/bmtry711.11/lecture_13.pdf FVecTest.makeByteVec(raw, "x,y\n1,0\n2,1\n3,2\n4,3\n5,1\n6,4\n7,9\n8,18\n9,23\n10,31\n11,20\n12,25\n13,37\n14,45\n150,7.193936e+16\n"); fr = ParseDataset.parse(parsed, raw); GLMParameters params2 = new GLMParameters(Family.poisson); params2._train = fr._key; // params2._response = 1; params2._response_column = fr._names[1]; params2._lambda = new double[]{0}; params2._standardize = true; params2._beta_epsilon = 1e-5; model = new GLM(params2).trainModel().get(); assertEquals(0.3396, model.beta()[1], 1e-1); assertEquals(0.2565, model.beta()[0], 1e-1); // test scoring testScoring(model,fr); } finally { if (fr != null) fr.delete(); if (res != null) res.delete(); if (model != null) model.delete(); } } /** * Test Gamma regression on simple and small synthetic dataset.
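* (Assuming the gamma family's default inverse link, the mean satisfies 1/mu = b0 + b1*x, so b0 = b1 = 1 reproduces this data exactly; that is what the coefficient assertions below check.)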
* Equation is: y = 1/(x+1); * * @throws ExecutionException * @throws InterruptedException */ @Test public void testGammaRegression() throws InterruptedException, ExecutionException { GLMModel model = null; Frame fr = null, res = null; try { // make data so that the expected coefficients is icept = col[0] = 1.0 Key raw = Key.make("gamma_test_data_raw"); Key parsed = Key.make("gamma_test_data_parsed"); FVecTest.makeByteVec(raw, "x,y\n0,1\n1,0.5\n2,0.3333333\n3,0.25\n4,0.2\n5,0.1666667\n6,0.1428571\n7,0.125"); fr = ParseDataset.parse(parsed, raw); // /public GLM2(String desc, Key dest, Frame src, Family family, Link link, double alpha, double lambda) { // double [] vals = new double[] {1.0,1.0}; //public GLM2(String desc, Key dest, Frame src, Family family, Link link, double alpha, double lambda) { GLMParameters params = new GLMParameters(Family.gamma); // params._response = 1; params._response_column = fr._names[1]; params._train = parsed; params._lambda = new double[]{0}; model = new GLM(params).trainModel().get(); for (double c : model.beta()) assertEquals(1.0, c, 1e-4); // test scoring testScoring(model,fr); } finally { if (fr != null) fr.delete(); if (res != null) res.delete(); if (model != null) model.delete(); } } //// //simple tweedie test // @Test public void testTweedieRegression() throws InterruptedException, ExecutionException{ // Key raw = Key.make("gaussian_test_data_raw"); // Key parsed = Key.make("gaussian_test_data_parsed"); // Key<GLMModel> modelKey = Key.make("gaussian_test"); // Frame fr = null; // GLMModel model = null; // try { // // make data so that the expected coefficients is icept = col[0] = 1.0 // FVecTest.makeByteVec(raw, "x,y\n0,0\n1,0.1\n2,0.2\n3,0.3\n4,0.4\n5,0.5\n6,0.6\n7,0.7\n8,0.8\n9,0.9\n0,0\n1,0\n2,0\n3,0\n4,0\n5,0\n6,0\n7,0\n8,0\n9,0"); // fr = ParseDataset.parse(parsed, new Key[]{raw}); // double [] powers = new double [] {1.5,1.1,1.9}; // double [] intercepts = new double []{3.643,1.318,9.154}; // double [] xs = new double []{-0.260,-0.0284,-0.853}; // for(int i = 0; i < powers.length; ++i){ // DataInfo dinfo = new DataInfo(fr, 1, false, DataInfo.TransformType.NONE); // GLMParameters glm = new GLMParameters(Family.tweedie); // // new GLM2("GLM test of gaussian(linear) regression.",Key.make(),modelKey,dinfo,glm,new double[]{0},0).fork().get(); // model = DKV.get(modelKey).get(); // testHTML(model); // HashMap<String, Double> coefs = model.coefficients(); // assertEquals(intercepts[i],coefs.get("Intercept"),1e-3); // assertEquals(xs[i],coefs.get("x"),1e-3); // } // }finally{ // if( fr != null ) fr.delete(); // if(model != null)model.delete(); // } // } @Test public void testAllNAs() { Key raw = Key.make("gamma_test_data_raw"); Key parsed = Key.make("gamma_test_data_parsed"); FVecTest.makeByteVec(raw, "x,y,z\n1,0,NA\n2,NA,1\nNA,3,2\n4,3,NA\n5,NA,1\nNA,6,4\n7,NA,9\n8,NA,18\nNA,9,23\n10,31,NA\nNA,11,20\n12,NA,25\nNA,13,37\n14,45,NA\n"); Frame fr = ParseDataset.parse(parsed, raw); GLM job = null; try { GLMParameters params = new GLMParameters(Family.poisson); // params._response = 1; params._response_column = fr._names[1]; params._train = parsed; params._lambda = new double[]{0}; params._missing_values_handling = MissingValuesHandling.Skip; GLM glm = new GLM( params); glm.trainModel().get(); assertFalse("should've thrown IAE", true); } catch (IllegalArgumentException e) { assertTrue(e.getMessage(), e.getMessage().contains("No rows left in the dataset")); } finally { fr.delete(); } } // Make sure all three implementations of ginfo computation in GLM get the 
same results @Test public void testGradientTask() { Key parsed = Key.make("cars_parsed"); Frame fr = null; DataInfo dinfo = null; try { fr = parse_test_file(parsed, "smalldata/junit/mixcat_train.csv"); GLMParameters params = new GLMParameters(Family.binomial, Family.binomial.defaultLink, new double[]{0}, new double[]{0}, 0, 0); // params._response = fr.find(params._response_column); params._train = parsed; params._lambda = new double[]{0}; params._use_all_factor_levels = true; fr.add("Useless", fr.remove("Useless")); dinfo = new DataInfo(fr, null, 1, params._use_all_factor_levels || params._lambda_search, params._standardize ? DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, true, false, false, false, false, false); DKV.put(dinfo._key,dinfo); double [] beta = MemoryManager.malloc8d(dinfo.fullN()+1); Random rnd = new Random(987654321); for (int i = 0; i < beta.length; ++i) beta[i] = 1 - 2 * rnd.nextDouble(); GLMGradientTask grtSpc = new GLMBinomialGradientTask(null,dinfo, params, params._lambda[0], beta).doAll(dinfo._adaptedFrame); GLMGradientTask grtGen = new GLMGenericGradientTask(null,dinfo, params, params._lambda[0], beta).doAll(dinfo._adaptedFrame); for (int i = 0; i < beta.length; ++i) assertEquals("gradients differ", grtSpc._gradient[i], grtGen._gradient[i], 1e-4); params = new GLMParameters(Family.gaussian, Family.gaussian.defaultLink, new double[]{0}, new double[]{0}, 0, 0); params._use_all_factor_levels = false; dinfo.remove(); dinfo = new DataInfo(fr, null, 1, params._use_all_factor_levels || params._lambda_search, params._standardize ? DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, true, false, false, false, false, false); DKV.put(dinfo._key,dinfo); beta = MemoryManager.malloc8d(dinfo.fullN()+1); rnd = new Random(1987654321); for (int i = 0; i < beta.length; ++i) beta[i] = 1 - 2 * rnd.nextDouble(); grtSpc = new GLMGaussianGradientTask(null,dinfo, params, params._lambda[0], beta).doAll(dinfo._adaptedFrame); grtGen = new GLMGenericGradientTask(null,dinfo, params, params._lambda[0], beta).doAll(dinfo._adaptedFrame); for (int i = 0; i < beta.length; ++i) assertEquals("gradients differ: " + Arrays.toString(grtSpc._gradient) + " != " + Arrays.toString(grtGen._gradient), grtSpc._gradient[i], grtGen._gradient[i], 1e-4); dinfo.remove(); } finally { if (fr != null) fr.delete(); if (dinfo != null) dinfo.remove(); } } @Test public void testMultinomialGradient(){ Key parsed = Key.make("covtype"); Frame fr = null; double [][] beta = new double[][]{ { 5.886754459, -0.270479620, -0.075466082, -0.157524534, -0.225843747, -0.975387326, -0.018808013, -0.597839451, 0.931896624, 1.060006010, 1.513888539, 0.588802780, 0.157815155, -2.158268564, -0.504962385, -1.218970183, -0.840958642, -0.425931637, -0.355548831, -0.845035489, -0.065364107, 0.215897656, 0.213009374, 0.006831714, 1.212368946, 0.006106444, -0.350643486, -0.268207009, -0.252099054, -1.374010836, 0.257935860, 0.397459631, 0.411530391, 0.728368253, 0.292076224, 0.170774269, -0.059574793, 0.273670163, 0.180844505, -0.186483071, 0.369186813, 0.161909512, 0.249411716, -0.094481604, 0.413354360, -0.419043967, 0.044517794, -0.252596992, -0.371926422, 0.253835004, 0.588162090, 0.123330837, 2.856812217 }, { 1.89790254, -0.29776886, 0.15613197, 0.37602123, -0.36464436, -0.30240244, -0.57284370, 0.62408956, -0.22369305, 0.33644602, 0.79886400, 0.65351945, -0.53682819, -0.58319898, -1.07762513, -0.28527470, 0.46563482, -0.76956081, -0.72513805, 0.29857876, 
0.03993456, 0.15835864, -0.24797599, -0.02483503, 0.93822490, -0.12406087, -0.75837978, -0.23516944, -0.48520212, 0.73571466, 0.19652011, 0.21602846, -0.32743154, 0.49421903, -0.02262943, 0.08093216, 0.11524497, 0.21657128, 0.18072853, 0.30872666, 0.17947687, 0.20156151, 0.16812179, -0.12286908, 0.29630502, 0.09992565, -0.00603293, 0.20700058, -0.49706211, -0.14534034, -0.18819217, 0.03642680, 7.31828340 }, { -6.098728943, 0.284144173, 0.114373474, 0.328977319, 0.417830082, 0.285696150, -0.652674822, 0.319136906, -0.942440279, -1.619235397, -1.272568201, -0.079855555, 1.191263550, 0.205102353, 0.991773314, 0.930363203, 1.014021007, 0.651243292, 0.646532457, 0.914336030, 0.012171754, -0.053042102, 0.777710362, 0.527369151, -0.019496049, 0.186290583, 0.554926655, 0.476911685, 0.529207520, -0.133243060, -0.198957274, -0.561552913, -0.069239959, -0.236600870, -0.969503908, -0.848089244, 0.001498592, -0.241007311, -0.129271912, -0.259961677, -0.895676033, -0.865827509, -0.972629899, 0.307756211, -1.809423763, -0.199557594, 0.024221965, -0.024834485, 0.047044475, 0.028951561, -0.157701002, 0.007940593, -2.073329675, }, { -8.36044440, 0.10541672, -0.01628680, -0.43787017, 0.42383466, 2.45802808, 0.59818831, 0.61971728, -0.62598983, 0.20261555, -0.21909545, 0.35125447, -3.29155913, 3.74668257, 0.18126128, -0.13948924, 0.20465077, -0.39930635, 0.15704570, -0.01036891, 0.02822546, -0.02349234, -0.93922249, -0.20025910, 0.25184125, 0.06415974, 0.35271290, 0.04609060, 0.03018497, -0.10641540, 0.00354805, -0.12194129, 0.05115876, 0.23981864, -0.10007012, 0.04773226, 0.01217421, 0.02367464, 0.05552397, 0.05343606, -0.05818705, -0.30055029, -0.03898723, 0.02322906, -0.04908215, 0.04274038, 0.25045428, 0.08561191, 0.15228160, 0.67005377, 0.59311621, 0.58814959, -4.83776046 }, { -0.39251919, 0.07053038, 0.09397355, 0.19394977, -0.02030732, -0.87489691, 0.21295049, 0.31800509, -0.05347208, -1.03491602, 2.20106706, -1.20895873, 1.06158893, -3.29214054, -0.69334082, 0.62309414, -1.64753442, 0.10189669, -0.44746013, -1.04084383, -0.01997483, -0.23356180, 0.34384724, 0.37566329, -1.79316510, 0.46183758, -0.58814389, 0.12072985, 0.48349078, 1.18956325, 0.41962148, 0.18767160, -0.25252495, -1.13671540, 0.71488183, 0.27405258, -0.03527945, 0.43124949, -0.28740586, 0.35165348, 1.17594079, 1.13893507, 0.49423372, 0.30525649, 0.70809680, 0.16660330, -0.37726163, -0.14687217, -0.17079711, -1.01897715, -1.17494223, -0.72698683, 1.64022531 }, { -5.892381502, 0.295534637, -0.112763568, 0.080283203, 0.197113227, 0.525435203, 0.727252262, -1.190672917, 1.137103389, -0.648526151, -2.581362158, -0.268338673, 2.010179009, 0.902074450, 0.816138328, 0.557071470, 0.389932578, 0.009422297, 0.542270816, 0.550653667, 0.005211720, -0.071954379, 0.320008238, 0.155814784, -0.264213966, 0.320538295, 0.569730803, 0.444518874, 0.247279544, -0.319484330, -0.372129988, 0.340944707, -0.158424299, -0.479426774, 0.026966661, 0.273389077, -0.004744599, -0.339321329, -0.119323949, -0.210123558, -1.218998166, -0.740525896, 0.134778587, 0.252701229, 0.527468284, 0.214164427, -0.080104361, -0.021448994, 0.004509104, -0.189729053, -0.335041198, -0.080698796, -1.192518082 }, { 12.9594170391, -0.1873774300, -0.1599625360, -0.3838368119, -0.4279825390, -1.1164727575, -0.2940645257, -0.0924364781, -0.2234047720, 1.7036099945, -0.4407937881, -0.0364237384, -0.5924593214, 1.1797487023, 0.2867554171, -0.4667946900, 0.4142538835, 0.8322365174, 0.1822980332, 0.1326797653, -0.0002045542, 0.0077943238, -0.4673767424, -0.8405848140, -0.3255599769, 
-0.9148717663, 0.2197967986, -0.5848745645, -0.5528616430, 0.0078757154, -0.3065382365, -0.4586101971, 0.3449315968, 0.3903371200, 0.0582787537, 0.0012089013, -0.0293189213, -0.3648369414, 0.1189047254, -0.0572478953, 0.4482567793, 0.4044976082, -0.0349286763, -0.6715923088, -0.0867185553, 0.0951677966, 0.1442048837, 0.1531401571, 0.8359504674, 0.4012062075, 0.6745982951, 0.0518378060, -3.7117127004 } }; double [] exp_grad = new double[]{ -8.955455e-05, 6.429112e-04, 4.384381e-04, 1.363695e-03, 4.714468e-04, -2.264769e-03, 4.412849e-04, 1.461760e-03, -2.957754e-05, -2.244325e-03, -2.744438e-03, 9.109376e-04, 1.920764e-03, 7.562221e-04, 1.840414e-04, 2.455081e-04, 3.077885e-04, 2.833261e-04, 1.248686e-04, 2.509248e-04, 9.681260e-06, -1.097335e-04, 1.005934e-03, 5.623159e-04, -2.568397e-03, 1.113900e-03, 1.263858e-04, 9.075801e-05, 8.056571e-05, 1.848318e-04, -1.291357e-04, -3.710570e-04, 5.693621e-05, 1.328082e-04, 3.244018e-04, 4.130594e-04, 9.681066e-06, 5.215260e-04, 4.054695e-04, 2.904901e-05, -3.074865e-03, -1.247025e-04, 1.044981e-03, 8.612937e-04, 1.376526e-03, 4.543256e-05, -4.596319e-06, 3.062111e-05, 5.649646e-05, 5.392599e-04, 9.681357e-04, 2.298219e-04, -1.369109e-03, -6.884926e-04, -9.921529e-04, -5.369346e-04, -1.732447e-03, 5.677645e-04, 1.655432e-03, -4.786890e-04, -8.688757e-04, 2.922016e-04, 3.601210e-03, 4.050781e-03, -6.409806e-04, -2.788663e-03, -1.426483e-03, -1.946904e-04, -8.279536e-04, -3.148338e-04, 2.263577e-06, -1.320917e-04, 3.635088e-04, -1.024655e-05, 1.079612e-04, -1.607591e-03, -1.801967e-04, 2.548311e-03, -1.007139e-03, -1.336990e-04, 2.538803e-04, -4.851292e-04, -9.168206e-04, 1.027708e-04, 1.061545e-03, -4.098038e-05, 1.070448e-04, 3.220238e-04, -7.011285e-04, -1.024153e-05, -7.967380e-04, -2.708138e-04, -2.698165e-04, 3.088978e-03, 4.260939e-04, -5.868815e-04, -1.562233e-03, -1.007565e-03, -2.034456e-04, -6.198011e-04, -3.277194e-05, -5.976557e-05, -1.143198e-03, -1.025416e-03, 3.671158e-04, 1.448332e-03, 1.940231e-03, -6.130695e-04, -2.086460e-03, -2.969848e-04, 1.455597e-04, 1.745515e-03, 2.123991e-03, 9.036201e-04, -5.270206e-04, 1.053891e-03, 1.358911e-03, 2.528711e-04, 1.326987e-04, -1.825879e-03, -6.085616e-04, -1.347628e-04, 3.499544e-04, 3.616313e-04, -7.008672e-04, -1.211077e-03, 1.117824e-05, 3.535679e-05, -2.668903e-03, -2.399884e-04, 3.979678e-04, 2.519517e-04, 1.113206e-04, 6.029871e-04, 3.512828e-04, 2.134159e-04, 7.590052e-05, 1.729959e-04, 4.472972e-05, 2.094373e-04, 3.136961e-04, 1.835530e-04, 1.117824e-05, 8.225263e-05, 4.330828e-05, 3.354142e-05, 7.452883e-04, 4.631413e-04, 2.054077e-04, -5.520636e-05, 2.818063e-04, 5.246077e-05, 1.131811e-04, 3.535664e-05, 6.523360e-05, 3.072416e-04, 2.913399e-04, 2.422760e-04, -1.580841e-03, -1.117356e-04, 2.573351e-04, 8.117137e-04, 1.168873e-04, -4.216143e-04, -5.847717e-05, 3.501109e-04, 2.344622e-04, -1.330097e-04, -5.948309e-04, -2.349808e-04, -4.495448e-05, -1.916493e-04, 5.017336e-04, -8.440468e-05, 4.767465e-04, 2.485018e-04, 2.060573e-04, -1.527142e-04, -9.268231e-06, -1.985972e-06, -6.285478e-06, -2.214673e-05, 5.822250e-04, -7.069316e-05, -4.387924e-05, -2.774128e-04, -5.455282e-04, 3.186328e-04, -3.793242e-05, -1.349306e-05, -3.070112e-05, -7.951882e-06, -3.723186e-05, -5.571437e-05, -3.260780e-05, -1.987225e-06, -1.462245e-05, -7.699184e-06, -5.962867e-06, -1.316053e-04, -8.108570e-05, -3.651228e-05, -5.312255e-05, -5.009791e-05, -9.325808e-06, -2.012086e-05, -6.285571e-06, -1.159698e-05, -5.462022e-05, -5.179310e-05, -4.307092e-05, 2.810360e-04, 3.869942e-04, -3.450936e-05, 
-7.805675e-05, 6.405561e-04, -2.284402e-04, -1.866295e-04, -4.858359e-04, 3.496890e-04, 7.352780e-04, 5.767877e-04, -8.477014e-04, -5.512698e-05, 1.091158e-03, -1.900036e-04, -4.632766e-05, 1.086153e-05, -7.743051e-05, -7.545391e-04, -3.143243e-05, -6.316374e-05, -2.435782e-06, -7.707894e-06, 4.451785e-04, 2.043479e-04, -8.673378e-05, -3.314975e-05, -3.181369e-05, -5.422704e-04, -9.020739e-05, 6.747588e-04, 5.997742e-06, -9.729086e-04, -9.751490e-06, -4.565744e-05, -4.181943e-04, 7.522183e-04, -2.436958e-06, 2.531532e-04, -9.441600e-06, 2.317743e-04, 4.254207e-04, -3.224488e-04, 3.979052e-04, 2.066697e-04, 2.486194e-05, 1.189306e-04, -2.465884e-05, -7.708071e-06, -1.422152e-05, -6.697064e-05, -6.351172e-05, -5.281060e-05, 3.446379e-04, -1.212986e-03, 9.206612e-04, 6.469824e-04, -6.605882e-04, -1.646537e-05, -6.854543e-04, -2.079925e-03, -1.031449e-03, 3.926585e-04, -1.556234e-03, -1.129748e-03, -2.113480e-04, -4.922559e-04, 1.938461e-03, 6.900824e-04, 1.497533e-04, -6.140808e-04, -3.365137e-04, 8.516225e-04, 5.874586e-04, -9.342693e-06, -2.955083e-05, 2.692614e-03, -9.928211e-04, -3.326157e-04, -3.572773e-04, 1.641113e-04, 7.442831e-05, -2.543959e-04, -1.783712e-04, -6.343638e-05, 9.077554e-05, -3.738480e-05, -1.750387e-04, -6.568480e-04, -2.035799e-04, -9.342694e-06, -6.874421e-05, -3.619677e-05, -2.803369e-05, -6.228932e-04, -3.870861e-04, -1.103792e-03, 9.585360e-04, -7.037269e-05, 2.736606e-04, -9.459508e-05, -2.955084e-05, -5.452180e-05, -2.567899e-04, -2.434930e-04, -2.024919e-04, 1.321256e-03, -2.244563e-04, -1.811758e-04, 8.043173e-04, 5.688820e-04, -5.182511e-04, -2.056167e-04, 1.290635e-04, -1.049207e-03, -7.305304e-04, -8.364983e-04, -4.528248e-04, -2.113987e-04, 3.279472e-04, 2.459491e-04, 5.986061e-05, 7.984705e-05, 1.001005e-04, 2.377746e-04, 4.061439e-05, 8.161668e-05, 3.151497e-06, 9.959707e-06, 1.549140e-04, 6.411739e-05, 1.121613e-04, 7.559378e-05, 4.110778e-05, 6.574476e-05, 7.925128e-05, 6.011770e-05, 2.139605e-05, 4.934971e-05, -5.597385e-06, -1.913622e-04, 1.706349e-04, -4.115145e-04, 3.149101e-06, 2.317293e-05, -1.246264e-04, 9.448371e-06, -4.303234e-04, 2.608783e-05, 7.889196e-05, -3.559375e-04, -5.551586e-04, -2.777131e-04, 6.505911e-04, 1.033867e-05, 1.837583e-05, 6.750772e-04, 1.247379e-04, -5.408403e-04, -4.453114e-04, }; Vec origRes = null; try { fr = parse_test_file(parsed, "smalldata/covtype/covtype.20k.data"); fr.remove("C21").remove(); fr.remove("C29").remove(); GLMParameters params = new GLMParameters(Family.multinomial); params._response_column = "C55"; // params._response = fr.find(params._response_column); params._ignored_columns = new String[]{}; params._train = parsed; params._lambda = new double[]{0}; params._alpha = new double[]{0}; origRes = fr.remove("C55"); Vec res = fr.add("C55",origRes.toCategoricalVec()); double [] means = new double [res.domain().length]; long [] bins = res.bins(); double sumInv = 1.0/ArrayUtils.sum(bins); for(int i = 0; i < bins.length; ++i) means[i] = bins[i]*sumInv; DataInfo dinfo = new DataInfo(fr, null, 1, true, TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true, false, false, false, false, false); GLMTask.GLMMultinomialGradientTask gmt = new GLMTask.GLMMultinomialGradientTask(null,dinfo,0,beta,1.0/fr.numRows()).doAll(dinfo._adaptedFrame); assertEquals(0.6421113,gmt._likelihood/fr.numRows(),1e-8); System.out.println("likelihood = " + gmt._likelihood/fr.numRows()); double [] g = gmt.gradient(); for(int i = 0; i < g.length; ++i) assertEquals("Mismatch at coefficient '" + "' (" + i + ")",exp_grad[i], g[i], 1e-8); 
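// Note: the 1.0/fr.numRows() argument passed to the gradient task above presumably scales the objective per observation, which is why the likelihood is also divided by fr.numRows() before comparing against the 0.6421113 reference value.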
} finally { if(origRes != null)origRes.remove(); if (fr != null) fr.delete(); } } //------------ TEST on selected files form small data and compare to R results ------------------------------------ /** * Simple test for poisson, gamma and gaussian families (no regularization, test both lsm solvers). * Basically tries to predict horse power based on other parameters of the cars in the dataset. * Compare against the results from standard R glm implementation. * * @throws ExecutionException * @throws InterruptedException */ @Test public void testCars() throws InterruptedException, ExecutionException { Scope.enter(); Key parsed = Key.make("cars_parsed"); Frame fr = null; GLMModel model = null; Frame score = null; try { fr = parse_test_file(parsed, "smalldata/junit/cars.csv"); GLMParameters params = new GLMParameters(Family.poisson, Family.poisson.defaultLink, new double[]{0}, new double[]{0},0,0); params._response_column = "power (hp)"; // params._response = fr.find(params._response_column); params._ignored_columns = new String[]{"name"}; params._train = parsed; params._lambda = new double[]{0}; params._alpha = new double[]{0}; params._missing_values_handling = MissingValuesHandling.Skip; model = new GLM( params).trainModel().get(); HashMap<String, Double> coefs = model.coefficients(); String[] cfs1 = new String[]{"Intercept", "economy (mpg)", "cylinders", "displacement (cc)", "weight (lb)", "0-60 mph (s)", "year"}; double[] vls1 = new double[]{4.9504805, -0.0095859, -0.0063046, 0.0004392, 0.0001762, -0.0469810, 0.0002891}; for (int i = 0; i < cfs1.length; ++i) assertEquals(vls1[i], coefs.get(cfs1[i]), 1e-4); // test gamma double[] vls2 = new double[]{8.992e-03, 1.818e-04, -1.125e-04, 1.505e-06, -1.284e-06, 4.510e-04, -7.254e-05}; testScoring(model,fr); model.delete(); params = new GLMParameters(Family.gamma, Family.gamma.defaultLink, new double[]{0}, new double[]{0},0,0); params._response_column = "power (hp)"; // params._response = fr.find(params._response_column); params._ignored_columns = new String[]{"name"}; params._train = parsed; params._lambda = new double[]{0}; params._beta_epsilon = 1e-5; params._missing_values_handling = MissingValuesHandling.Skip; model = new GLM( params).trainModel().get(); coefs = model.coefficients(); for (int i = 0; i < cfs1.length; ++i) assertEquals(vls2[i], coefs.get(cfs1[i]), 1e-4); testScoring(model,fr); model.delete(); // test gaussian double[] vls3 = new double[]{166.95862, -0.00531, -2.46690, 0.12635, 0.02159, -4.66995, -0.85724}; params = new GLMParameters(Family.gaussian); params._response_column = "power (hp)"; // params._response = fr.find(params._response_column); params._ignored_columns = new String[]{"name"}; params._train = parsed; params._lambda = new double[]{0}; params._missing_values_handling = MissingValuesHandling.Skip; model = new GLM( params).trainModel().get(); coefs = model.coefficients(); for (int i = 0; i < cfs1.length; ++i) assertEquals(vls3[i], coefs.get(cfs1[i]), 1e-4); // test scoring } finally { if (fr != null) fr.delete(); if (score != null) score.delete(); if (model != null) model.delete(); Scope.exit(); } } // Leask xval keys // @Test public void testXval() { // GLMModel model = null; // Frame fr = parse_test_file("smalldata/glm_test/prostate_cat_replaced.csv"); // Frame score = null; // try{ // Scope.enter(); // // R results //// Coefficients: //// (Intercept) ID AGE RACER2 RACER3 DPROS DCAPS PSA VOL GLEASON //// -8.894088 0.001588 -0.009589 0.231777 -0.459937 0.556231 0.556395 0.027854 -0.011355 1.010179 // String [] cfs1 = 
new String [] {"Intercept","AGE", "RACE.R2","RACE.R3", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"}; // double [] vals = new double [] {-8.14867, -0.01368, 0.32337, -0.38028, 0.55964, 0.49548, 0.02794, -0.01104, 0.97704}; // GLMParameters params = new GLMParameters(Family.binomial); // params._n_folds = 10; // params._response_column = "CAPSULE"; // params._ignored_columns = new String[]{"ID"}; // params._train = fr._key; // params._lambda = new double[]{0}; // model = new GLM(params,Key.make("prostate_model")).trainModel().get(); // HashMap<String, Double> coefs = model.coefficients(); // for(int i = 0; i < cfs1.length; ++i) // assertEquals(vals[i], coefs.get(cfs1[i]),1e-4); // GLMValidation val = model.trainVal(); //// assertEquals(512.3, val.nullDeviance(),1e-1); //// assertEquals(378.3, val.residualDeviance(),1e-1); //// assertEquals(396.3, val.AIC(),1e-1); //// score = model.score(fr); //// //// hex.ModelMetrics mm = hex.ModelMetrics.getFromDKV(model,fr); //// //// AUCData adata = mm._aucdata; //// assertEquals(val.auc(),adata.AUC(),1e-2); //// GLMValidation val2 = new GLMValidationTsk(params,model._ymu,rank(model.beta())).doAll(new Vec[]{fr.vec("CAPSULE"),score.vec("1")})._val; //// assertEquals(val.residualDeviance(),val2.residualDeviance(),1e-6); //// assertEquals(val.nullDeviance(),val2.nullDeviance(),1e-6); // } finally { // fr.delete(); // if(model != null)model.delete(); // if(score != null)score.delete(); // Scope.exit(); // } // } /** * Test bounds on prostate dataset, 2 cases : * 1) test against known result in glmnet (with elastic net regularization) with elastic net penalty * 2) test with no regularization, check the ginfo in the end. */ @Test public void testBounds() { // glmnet's result: // res2 <- glmnet(x=M,y=D$CAPSULE,lower.limits=-.5,upper.limits=.5,family='binomial') // res2$beta[,58] // AGE RACE DPROS PSA VOL GLEASON // -0.00616326 -0.50000000 0.50000000 0.03628192 -0.01249324 0.50000000 // res2$a0[100] // res2$a0[58] // s57 // -4.155864 // lambda = 0.001108, null dev = 512.2888, res dev = 379.7597 GLMModel model = null; Key parsed = Key.make("prostate_parsed"); Key modelKey = Key.make("prostate_model"); Frame fr = parse_test_file(parsed, "smalldata/logreg/prostate.csv"); Key betaConsKey = Key.make("beta_constraints"); String[] cfs1 = new String[]{"AGE", "RACE", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON", "Intercept"}; double[] vals = new double[]{-0.006502588, -0.500000000, 0.500000000, 0.400000000, 0.034826559, -0.011661747, 0.500000000, -4.564024}; // [AGE, RACE, DPROS, DCAPS, PSA, VOL, GLEASON, Intercept] FVecTest.makeByteVec(betaConsKey, "names, lower_bounds, upper_bounds\n AGE, -.5, .5\n RACE, -.5, .5\n DCAPS, -.4, .4\n DPROS, -.5, .5 \nPSA, -.5, .5\n VOL, -.5, .5\nGLEASON, -.5, .5"); Frame betaConstraints = ParseDataset.parse(Key.make("beta_constraints.hex"), betaConsKey); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.binomial; params._beta_constraints = betaConstraints._key; params._response_column = "CAPSULE"; params._ignored_columns = new String[]{"ID"}; params._train = fr._key; params._objective_epsilon = 0; params._alpha = new double[]{1}; params._lambda = new double[]{0.001607}; params._obj_reg = 1.0/380; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); // Map<String, Double> coefs = model.coefficients(); // for (int i = 0; i < cfs1.length; ++i) // assertEquals(vals[i], 
coefs.get(cfs1[i]), 1e-1); ModelMetricsBinomialGLM val = (ModelMetricsBinomialGLM) model._output._training_metrics; assertEquals(512.2888, val._nullDev, 1e-1); // 388.4952716196743 assertTrue(val._resDev <= 388.5); model.delete(); params._lambda = new double[]{0}; params._alpha = new double[]{0}; FVecTest.makeByteVec(betaConsKey, "names, lower_bounds, upper_bounds\n RACE, -.5, .5\n DCAPS, -.4, .4\n DPROS, -.5, .5 \nPSA, -.5, .5\n VOL, -.5, .5"); betaConstraints = ParseDataset.parse(Key.make("beta_constraints.hex"), betaConsKey); glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); double[] beta = model.beta(); System.out.println("beta = " + Arrays.toString(beta)); fr.add("CAPSULE", fr.remove("CAPSULE")); fr.remove("ID").remove(); DKV.put(fr._key, fr); // now check the ginfo DataInfo dinfo = new DataInfo(fr, null, 1, true, TransformType.NONE, DataInfo.TransformType.NONE, true, false, false, false, false, false); GLMGradientTask lt = new GLMBinomialGradientTask(null,dinfo,params,0,beta).doAll(dinfo._adaptedFrame); double [] grad = lt._gradient; String [] names = model.dinfo().coefNames(); BufferedString tmpStr = new BufferedString(); outer: for (int i = 0; i < names.length; ++i) { for (int j = 0; j < betaConstraints.numRows(); ++j) { if (betaConstraints.vec("names").atStr(tmpStr, j).toString().equals(names[i])) { if (Math.abs(beta[i] - betaConstraints.vec("lower_bounds").at(j)) < 1e-4 || Math.abs(beta[i] - betaConstraints.vec("upper_bounds").at(j)) < 1e-4) { continue outer; } } } assertEquals(0, grad[i], 1e-2); } } finally { fr.delete(); betaConstraints.delete(); if (model != null) model.delete(); } } @Test public void testCoordinateDescent_airlines() { GLMModel model = null; Key parsed = Key.make("airlines_parsed"); Key<GLMModel> modelKey = Key.make("airlines_model"); Frame fr = parse_test_file(parsed, "smalldata/airlines/AirlinesTrain.csv.zip"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.binomial; params._solver = Solver.COORDINATE_DESCENT_NAIVE; params._response_column = "IsDepDelayed"; params._ignored_columns = new String[]{"IsDepDelayed_REC"}; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } } @Test public void testCoordinateDescent_airlines_CovUpdates() { GLMModel model = null; Key parsed = Key.make("airlines_parsed"); Key<GLMModel> modelKey = Key.make("airlines_model"); Frame fr = parse_test_file(parsed, "smalldata/airlines/AirlinesTrain.csv.zip"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.binomial; params._solver = Solver.COORDINATE_DESCENT; params._response_column = "IsDepDelayed"; params._ignored_columns = new String[]{"IsDepDelayed_REC"}; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } } @Test public void testCoordinateDescent_anomaly() { GLMModel model = null; Key parsed = Key.make("anomaly_parsed"); Key<GLMModel> modelKey = Key.make("anomaly_model"); Frame fr = parse_test_file(parsed, 
"smalldata/anomaly/ecg_discord_train.csv"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.gaussian; params._solver = Solver.COORDINATE_DESCENT_NAIVE; params._response_column = "C1"; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } } @Test public void testCoordinateDescent_anomaly_CovUpdates() { GLMModel model = null; Key parsed = Key.make("anomaly_parsed"); Key<GLMModel> modelKey = Key.make("anomaly_model"); Frame fr = parse_test_file(parsed, "smalldata/anomaly/ecg_discord_train.csv"); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = true; params._family = Family.gaussian; params._solver = Solver.COORDINATE_DESCENT; params._response_column = "C1"; params._train = fr._key; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); assertTrue(glm.isStopped()); System.out.println(model._output._training_metrics); } finally { fr.delete(); if (model != null) model.delete(); } } @Test public void testProximal() { // glmnet's result: // res2 <- glmnet(x=M,y=D$CAPSULE,lower.limits=-.5,upper.limits=.5,family='binomial') // res2$beta[,58] // AGE RACE DPROS PSA VOL GLEASON // -0.00616326 -0.50000000 0.50000000 0.03628192 -0.01249324 0.50000000 // res2$a0[100] // res2$a0[58] // s57 // -4.155864 // lambda = 0.001108, null dev = 512.2888, res dev = 379.7597 Key parsed = Key.make("prostate_parsed"); Key<GLMModel> modelKey = Key.make("prostate_model"); GLMModel model = null; Frame fr = parse_test_file(parsed, "smalldata/logreg/prostate.csv"); fr.remove("ID").remove(); DKV.put(fr._key, fr); Key betaConsKey = Key.make("beta_constraints"); FVecTest.makeByteVec(betaConsKey, "names, beta_given, rho\n AGE, 0.1, 1\n RACE, -0.1, 1 \n DPROS, 10, 1 \n DCAPS, -10, 1 \n PSA, 0, 1\n VOL, 0, 1\nGLEASON, 0, 1\n Intercept, 0, 0 \n"); Frame betaConstraints = ParseDataset.parse(Key.make("beta_constraints.hex"), betaConsKey); try { // H2O differs on intercept and race, same residual deviance though GLMParameters params = new GLMParameters(); params._standardize = false; params._family = Family.binomial; params._beta_constraints = betaConstraints._key; params._response_column = "CAPSULE"; params._ignored_columns = new String[]{"ID"}; params._train = fr._key; params._alpha = new double[]{0}; params._lambda = new double[]{0}; params._obj_reg = 1.0/380; params._objective_epsilon = 0; GLM glm = new GLM( params, modelKey); model = glm.trainModel().get(); double[] beta_1 = model.beta(); params._solver = Solver.L_BFGS; params._max_iterations = 1000; glm = new GLM( params, modelKey); model = glm.trainModel().get(); fr.add("CAPSULE", fr.remove("CAPSULE")); // now check the ginfo DataInfo dinfo = new DataInfo(fr, null, 1, true, TransformType.NONE, DataInfo.TransformType.NONE, true, false, false, false, false, false); GLMGradientTask lt = new GLMBinomialGradientTask(null,dinfo, params, 0, beta_1).doAll(dinfo._adaptedFrame); double[] grad = lt._gradient; for (int i = 0; i < beta_1.length; ++i) assertEquals(0, grad[i] + betaConstraints.vec("rho").at(i) * (beta_1[i] - betaConstraints.vec("beta_given").at(i)), 1e-4); } finally { betaConstraints.delete(); fr.delete(); if (model != null) model.delete(); } } // // test categorical 
autoexpansions, run on airlines which has several categorical columns, // // once on explicitly expanded data, once on h2o autoexpanded and compare the results // @Test public void testSparseCategoricals() { // GLMModel model1 = null, model2 = null, model3 = null, model4 = null; // // Frame frMM = parse_test_file("smalldata/glm_tets/train-2.csv"); // //// Vec xy = frG.remove("xy"); // frMM.remove("").remove(); // frMM.add("IsDepDelayed", frMM.remove("IsDepDelayed")); // DKV.put(frMM._key,frMM); // Frame fr = parse_test_file("smalldata/airlines/AirlinesTrain.csv.zip"), res = null; // // Distance + Origin + Dest + UniqueCarrier // String [] ignoredCols = new String[]{"fYear", "fMonth", "fDayofMonth", "fDayOfWeek", "DepTime","ArrTime","IsDepDelayed_REC"}; // try{ // Scope.enter(); // GLMParameters params = new GLMParameters(Family.gaussian); // params._response_column = "IsDepDelayed"; // params._ignored_columns = ignoredCols; // params._train = fr._key; // params._l2pen = new double[]{1e-5}; // params._standardize = false; // model1 = new GLM(params,glmkey("airlines_cat_nostd")).trainModel().get(); // Frame score1 = model1.score(fr); // ModelMetricsRegressionGLM mm = (ModelMetricsRegressionGLM) ModelMetrics.getFromDKV(model1, fr); // Assert.assertEquals(model1.validation().residual_deviance, mm._resDev, 1e-4); // System.out.println("NDOF = " + model1.validation().nullDOF() + ", numRows = " + score1.numRows()); // Assert.assertEquals(model1.validation().residual_deviance, mm._MSE * score1.numRows(), 1e-4); // mm.remove(); // res = model1.score(fr); // // Build a POJO, validate same results // Assert.assertTrue(model1.testJavaScoring(fr, res, 1e-15)); // // params._train = frMM._key; // params._ignored_columns = new String[]{"X"}; // model2 = new GLM(params,glmkey("airlines_mm")).trainModel().get(); // params._standardize = true; // params._train = frMM._key; // params._use_all_factor_levels = true; // // test the gram // DataInfo dinfo = new DataInfo(Key.make(),frMM, null, 1, true, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true); // GLMIterationTask glmt = new GLMIterationTask(null,dinfo,1e-5,params,false,null,0,null, null).doAll(dinfo._adaptedFrame); // for(int i = 0; i < glmt._xy.length; ++i) { // for(int j = 0; j <= i; ++j ) { // assertEquals(frG.vec(j).at(i), glmt._gram.get(i, j), 1e-5); // } // assertEquals(xy.at(i), glmt._xy[i], 1e-5); // } // frG.delete(); // xy.remove(); // params._standardize = true; // params._family = Family.binomial; // params._link = Link.logit; // model3 = new GLM(params,glmkey("airlines_mm")).trainModel().get(); // params._train = fr._key; // params._ignored_columns = ignoredCols; // model4 = new GLM(params,glmkey("airlines_mm")).trainModel().get(); // assertEquals(model3.validation().null_deviance,model4.validation().nullDeviance(),1e-4); // assertEquals(model4.validation().residual_deviance, model3.validation().residualDeviance(), model3.validation().null_deviance * 1e-3); // HashMap<String, Double> coefs1 = model1.coefficients(); // HashMap<String, Double> coefs2 = model2.coefficients(); // GLMValidation val1 = model1.validation(); // GLMValidation val2 = model2.validation(); // // compare against each other // for(String s:coefs2.keySet()) { // String s1 = s; // if(s.startsWith("Origin")) // s1 = "Origin." + s.substring(6); // if(s.startsWith("Dest")) // s1 = "Dest." + s.substring(4); // if(s.startsWith("UniqueCarrier")) // s1 = "UniqueCarrier." 
+ s.substring(13); // assertEquals("coeff " + s1 + " differs, " + coefs1.get(s1) + " != " + coefs2.get(s), coefs1.get(s1), coefs2.get(s),1e-4); // DKV.put(frMM._key,frMM); // update the frame in the KV after removing the vec! // } // assertEquals(val1.nullDeviance(), val2.nullDeviance(),1e-4); // assertEquals(val1.residualDeviance(), val2.residualDeviance(),1e-4); // assertEquals(val1._aic, val2._aic,1e-2); // // compare result against glmnet // assertEquals(5336.918,val1.residualDeviance(),1); // assertEquals(6051.613,val1.nullDeviance(),1); // // // // lbfgs //// params._solver = Solver.L_BFGS; //// params._train = fr._key; //// params._lambda = new double[]{.3}; //// model3 = new GLM(params,glmkey("lbfgs_cat")).trainModel().get(); //// params._train = frMM._key; //// model4 = new GLM(params,glmkey("lbfgs_mm")).trainModel().get(); //// HashMap<String, Double> coefs3 = model3.coefficients(); //// HashMap<String, Double> coefs4 = model4.coefficients(); //// // compare against each other //// for(String s:coefs4.keySet()) { //// String s1 = s; //// if(s.startsWith("Origin")) //// s1 = "Origin." + s.substring(6); //// if(s.startsWith("Dest")) //// s1 = "Dest." + s.substring(4); //// if(s.startsWith("UniqueCarrier")) //// s1 = "UniqueCarrier." + s.substring(13); //// assertEquals("coeff " + s1 + " differs, " + coefs3.get(s1) + " != " + coefs4.get(s), coefs3.get(s1), coefs4.get(s),1e-4); //// } // // } finally { // fr.delete(); // frMM.delete(); // if(res != null)res.delete(); // if(model1 != null)model1.delete(); // if(model2 != null)model2.delete(); // if(model3 != null)model3.delete(); // if(model4 != null)model4.delete(); //// if(score != null)score.delete(); // Scope.exit(); // } // } /** * Test we get correct gram on dataset which contains categoricals and sparse and dense numbers */ @Test public void testSparseGramComputation() { Random rnd = new Random(123456789l); double[] d0 = MemoryManager.malloc8d(1000); double[] d1 = MemoryManager.malloc8d(1000); double[] d2 = MemoryManager.malloc8d(1000); double[] d3 = MemoryManager.malloc8d(1000); double[] d4 = MemoryManager.malloc8d(1000); double[] d5 = MemoryManager.malloc8d(1000); double[] d6 = MemoryManager.malloc8d(1000); double[] d7 = MemoryManager.malloc8d(1000); double[] d8 = MemoryManager.malloc8d(1000); double[] d9 = MemoryManager.malloc8d(1000); long[] c1 = MemoryManager.malloc8(1000); long[] c2 = MemoryManager.malloc8(1000); String[] dom = new String[]{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"}; for (int i = 0; i < d1.length; ++i) { c1[i] = rnd.nextInt(dom.length); c2[i] = rnd.nextInt(dom.length); d0[i] = rnd.nextDouble(); d1[i] = rnd.nextDouble(); } for (int i = 0; i < 30; ++i) { d2[rnd.nextInt(d2.length)] = rnd.nextDouble(); d3[rnd.nextInt(d2.length)] = rnd.nextDouble(); d4[rnd.nextInt(d2.length)] = rnd.nextDouble(); d5[rnd.nextInt(d2.length)] = rnd.nextDouble(); d6[rnd.nextInt(d2.length)] = rnd.nextDouble(); d7[rnd.nextInt(d2.length)] = rnd.nextDouble(); d8[rnd.nextInt(d2.length)] = rnd.nextDouble(); d9[rnd.nextInt(d2.length)] = 1; } Vec.VectorGroup vg_1 = Vec.VectorGroup.VG_LEN1; Vec v01 = Vec.makeVec(c1, dom, vg_1.addVec()); Vec v02 = Vec.makeVec(c2, dom,vg_1.addVec()); Vec v03 = Vec.makeVec(d0, vg_1.addVec()); Vec v04 = Vec.makeVec(d1, vg_1.addVec()); Vec v05 = Vec.makeVec(d2, vg_1.addVec()); Vec v06 = Vec.makeVec(d3, vg_1.addVec()); Vec v07 = Vec.makeVec(d4, vg_1.addVec()); Vec v08 = Vec.makeVec(d5, vg_1.addVec()); Vec v09 = Vec.makeVec(d6, 
vg_1.addVec()); Vec v10 = Vec.makeVec(d7, vg_1.addVec()); Vec v11 = Vec.makeVec(d8, vg_1.addVec()); Vec v12 = Vec.makeVec(d9, vg_1.addVec()); Frame f = new Frame(Key.<Frame>make("TestData"), null, new Vec[]{v01, v02, v03, v04, v05, v05, v06, v07, v08, v09, v10, v11, v12}); DKV.put(f); DataInfo dinfo = new DataInfo(f, null, 1, true, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true, false, false, false, false, false); GLMParameters params = new GLMParameters(Family.gaussian); // public GLMIterationTask(Key jobKey, DataInfo dinfo, GLMWeightsFun glmw,double [] beta, double lambda) { final GLMIterationTask glmtSparse = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), null).setSparse(true).doAll(dinfo._adaptedFrame); final GLMIterationTask glmtDense = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), null).setSparse(false).doAll(dinfo._adaptedFrame); for (int i = 0; i < glmtDense._xy.length; ++i) { for (int j = 0; j <= i; ++j) { assertEquals(glmtDense._gram.get(i, j), glmtSparse._gram.get(i, j), 1e-8); } assertEquals(glmtDense._xy[i], glmtSparse._xy[i], 1e-8); } final double[] beta = MemoryManager.malloc8d(dinfo.fullN() + 1); // now do the same but weighted, use LSM solution as beta to generate meaningfull weights H2O.submitTask(new H2OCountedCompleter() { @Override public void compute2() { new GLM.GramSolver(glmtDense._gram, glmtDense._xy, true, 1e-5, 0, null, null, null, null).solve(null, beta); tryComplete(); } }).join(); final GLMIterationTask glmtSparse2 = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), beta).setSparse(true).doAll(dinfo._adaptedFrame); final GLMIterationTask glmtDense2 = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), beta).setSparse(false).doAll(dinfo._adaptedFrame); for (int i = 0; i < glmtDense2._xy.length; ++i) { for (int j = 0; j <= i; ++j) { assertEquals(glmtDense2._gram.get(i, j), glmtSparse2._gram.get(i, j), 1e-8); } assertEquals(glmtDense2._xy[i], glmtSparse2._xy[i], 1e-8); } dinfo.remove(); f.delete(); } // test categorical autoexpansions, run on airlines which has several categorical columns, // once on explicitly expanded data, once on h2o autoexpanded and compare the results @Test public void testAirlines() { GLMModel model1 = null, model2 = null, model3 = null, model4 = null; Frame frMM = parse_test_file(Key.make("AirlinesMM"), "smalldata/airlines/AirlinesTrainMM.csv.zip"); Frame frG = parse_test_file(Key.make("gram"), "smalldata/airlines/gram_std.csv", true); Vec xy = frG.remove("xy"); frMM.remove("C1").remove(); Vec v; frMM.add("IsDepDelayed", (v = frMM.remove("IsDepDelayed")).makeCopy(null)); v.remove(); DKV.put(frMM._key, frMM); Frame fr = parse_test_file(Key.make("Airlines"), "smalldata/airlines/AirlinesTrain.csv.zip"), res = null; fr.add("IsDepDelayed",(v =fr.remove("IsDepDelayed")).makeCopy(null)); v.remove(); DKV.put(fr._key,fr); // Distance + Origin + Dest + UniqueCarrier String[] ignoredCols = new String[]{"fYear", "fMonth", "fDayofMonth", "fDayOfWeek", "DepTime", "ArrTime", "IsDepDelayed_REC"}; try { Scope.enter(); GLMParameters params = new GLMParameters(Family.gaussian); params._response_column = "IsDepDelayed"; params._ignored_columns = ignoredCols; params._train = fr._key; params._lambda = new double[]{0}; params._alpha = new double[]{0}; params._standardize = false; params._use_all_factor_levels = false; model1 = new GLM(params).trainModel().get(); testScoring(model1,fr); Frame score1 = model1.score(fr); ModelMetricsRegressionGLM mm = (ModelMetricsRegressionGLM) 
ModelMetrics.getFromDKV(model1, fr); Assert.assertEquals(((ModelMetricsRegressionGLM) model1._output._training_metrics)._resDev, mm._resDev, 1e-4); Assert.assertEquals(((ModelMetricsRegressionGLM) model1._output._training_metrics)._resDev, mm._MSE * score1.numRows(), 1e-4); score1.delete(); mm.remove(); res = model1.score(fr); // Build a POJO, validate same results params._train = frMM._key; params._ignored_columns = new String[]{"X"}; model2 = new GLM( params).trainModel().get(); HashMap<String, Double> coefs1 = model1.coefficients(); testScoring(model2,frMM); HashMap<String, Double> coefs2 = model2.coefficients(); boolean failed = false; // compare against each other for (String s : coefs2.keySet()) { String s1 = s; if (s.startsWith("Origin")) s1 = "Origin." + s.substring(6); if (s.startsWith("Dest")) s1 = "Dest." + s.substring(4); if (s.startsWith("UniqueCarrier")) s1 = "UniqueCarrier." + s.substring(13); if(Math.abs(coefs1.get(s1) - coefs2.get(s)) > 1e-4) { System.out.println("coeff " + s1 + " differs, " + coefs1.get(s1) + " != " + coefs2.get(s)); failed = true; } // assertEquals("coeff " + s1 + " differs, " + coefs1.get(s1) + " != " + coefs2.get(s), coefs1.get(s1), coefs2.get(s), 1e-4); } assertFalse(failed); params._standardize = true; params._train = frMM._key; params._use_all_factor_levels = true; // test the gram DataInfo dinfo = new DataInfo(frMM, null, 1, true, DataInfo.TransformType.STANDARDIZE, DataInfo.TransformType.NONE, true, false, false, false, false, false); GLMIterationTask glmt = new GLMIterationTask(null, dinfo, new GLMWeightsFun(params), null).doAll(dinfo._adaptedFrame); for(int i = 0; i < glmt._xy.length; ++i) { for(int j = 0; j <= i; ++j ) { assertEquals(frG.vec(j).at(i), glmt._gram.get(i, j), 1e-5); } assertEquals(xy.at(i), glmt._xy[i], 1e-5); } xy.remove(); params = (GLMParameters) params.clone(); params._standardize = false; params._family = Family.binomial; params._link = Link.logit; model3 = new GLM( params).trainModel().get(); testScoring(model3,frMM); params._train = fr._key; params._ignored_columns = ignoredCols; model4 = new GLM( params).trainModel().get(); testScoring(model4,fr); assertEquals(nullDeviance(model3), nullDeviance(model4), 1e-4); assertEquals(residualDeviance(model4), residualDeviance(model3), nullDeviance(model3) * 1e-3); assertEquals(nullDeviance(model1), nullDeviance(model2), 1e-4); assertEquals(residualDeviance(model1), residualDeviance(model2), 1e-4); // assertEquals(val1._aic, val2._aic,1e-2); // compare result against glmnet assertEquals(5336.918, residualDeviance(model1), 1); assertEquals(6051.613, nullDeviance(model2), 1); // lbfgs // params._solver = Solver.L_BFGS; // params._train = fr._key; // params._lambda = new double[]{.3}; // model3 = new GLM(params,glmkey("lbfgs_cat")).trainModel().get(); // params._train = frMM._key; // mdoel4 = new GLM(params,glmkey("lbfgs_mm")).trainModel().get(); // HashMap<String, Double> coefs3 = model3.coefficients(); // HashMap<String, Double> coefs4 = model4.coefficients(); // // compare against each other // for(String s:coefs4.keySet()) { // String s1 = s; // if(s.startsWith("Origin")) // s1 = "Origin." + s.substring(6); // if(s.startsWith("Dest")) // s1 = "Dest." + s.substring(4); // if(s.startsWith("UniqueCarrier")) // s1 = "UniqueCarrier." 
+ s.substring(13); // assertEquals("coeff " + s1 + " differs, " + coefs3.get(s1) + " != " + coefs4.get(s), coefs3.get(s1), coefs4.get(s),1e-4); // } } finally { fr.delete(); frMM.delete(); frG.delete(); if (res != null) res.delete(); if (model1 != null) model1.delete(); if (model2 != null) model2.delete(); if (model3 != null) model3.delete(); if (model4 != null) model4.delete(); // if(score != null)score.delete(); Scope.exit(); } } // test categorical autoexpansions, run on airlines which has several categorical columns, // once on explicitly expanded data, once on h2o autoexpanded and compare the results @Test public void test_COD_Airlines_SingleLambda() { GLMModel model1 = null; Frame fr = parse_test_file(Key.make("Airlines"), "smalldata/airlines/AirlinesTrain.csv.zip"); // Distance + Origin + Dest + UniqueCarrier String[] ignoredCols = new String[]{"IsDepDelayed_REC"}; try { Scope.enter(); GLMParameters params = new GLMParameters(Family.binomial); params._response_column = "IsDepDelayed"; params._ignored_columns = ignoredCols; params._train = fr._key; params._valid = fr._key; params._lambda = new double[] {0.01};//null; //new double[]{0.02934};//{0.02934494}; // null; params._alpha = new double[]{1}; params._standardize = false; params._solver = Solver.COORDINATE_DESCENT_NAIVE; params._lambda_search = true; params._nlambdas = 5; GLM glm = new GLM( params); model1 = glm.trainModel().get(); double [] beta = model1.beta(); double l1pen = ArrayUtils.l1norm(beta,true); double l2pen = ArrayUtils.l2norm2(beta,true); //System.out.println( " lambda min " + params._l2pen[params._l2pen.length-1] ); //System.out.println( " lambda_max " + model1._lambda_max); //System.out.println(" intercept " + beta[beta.length-1]); // double objective = model1._output._training_metrics./model1._nobs + // params._l2pen[params._l2pen.length-1]*params._alpha[0]*l1pen + params._l2pen[params._l2pen.length-1]*(1-params._alpha[0])*l2pen/2 ; // System.out.println( " objective value " + objective); // assertEquals(0.670921, objective,1e-4); } finally { fr.delete(); if (model1 != null) model1.delete(); } } @Test public void test_COD_Airlines_SingleLambda_CovUpdates() { GLMModel model1 = null; Frame fr = parse_test_file(Key.make("Airlines"), "smalldata/airlines/AirlinesTrain.csv.zip"); // Distance + Origin + Dest + UniqueCarrier String[] ignoredCols = new String[]{"IsDepDelayed_REC"}; try { Scope.enter(); GLMParameters params = new GLMParameters(Family.binomial); params._response_column = "IsDepDelayed"; params._ignored_columns = ignoredCols; params._train = fr._key; params._valid = fr._key; params._lambda = new double[] {0.01};//null; //new double[]{0.02934};//{0.02934494}; // null; params._alpha = new double[]{1}; params._standardize = false; params._solver = Solver.COORDINATE_DESCENT; params._lambda_search = true; GLM glm = new GLM( params); model1 = glm.trainModel().get(); double [] beta = model1.beta(); double l1pen = ArrayUtils.l1norm(beta,true); double l2pen = ArrayUtils.l2norm2(beta,true); // double objective = job.likelihood()/model1._nobs + // params._l2pen[params._l2pen.length-1]*params._alpha[0]*l1pen + params._l2pen[params._l2pen.length-1]*(1-params._alpha[0])*l2pen/2 ; // System.out.println( " objective value " + objective); // assertEquals(0.670921, objective,1e-2); } finally { fr.delete(); if (model1 != null) model1.delete(); } } @Test public void test_COD_Airlines_LambdaSearch() { GLMModel model1 = null; Frame fr = parse_test_file(Key.make("Airlines"), "smalldata/airlines/AirlinesTrain.csv.zip"); // Distance 
+ Origin + Dest + UniqueCarrier String[] ignoredCols = new String[]{"IsDepDelayed_REC"}; try { Scope.enter(); GLMParameters params = new GLMParameters(Family.binomial); params._response_column = "IsDepDelayed"; params._ignored_columns = ignoredCols; params._train = fr._key; params._valid = fr._key; params._lambda = null; // new double [] {0.25}; params._alpha = new double[]{1}; params._standardize = false; params._solver = Solver.COORDINATE_DESCENT_NAIVE;//IRLSM params._lambda_search = true; params._nlambdas = 5; GLM glm = new GLM( params); model1 = glm.trainModel().get(); GLMModel.Submodel sm = model1._output._submodels[model1._output._submodels.length-1]; double [] beta = sm.beta; System.out.println("lambda " + sm.lambda_value); double l1pen = ArrayUtils.l1norm(beta,true); double l2pen = ArrayUtils.l2norm2(beta,true); // double objective = job.likelihood()/model1._nobs + // gives likelihood of the last lambda // params._l2pen[params._l2pen.length-1]*params._alpha[0]*l1pen + params._l2pen[params._l2pen.length-1]*(1-params._alpha[0])*l2pen/2 ; // assertEquals(0.65689, objective,1e-4); } finally { fr.delete(); if (model1 != null) model1.delete(); } } @Test public void test_COD_Airlines_LambdaSearch_CovUpdates() { GLMModel model1 = null; Frame fr = parse_test_file(Key.make("Airlines"), "smalldata/airlines/AirlinesTrain.csv.zip"); // Distance + Origin + Dest + UniqueCarrier String[] ignoredCols = new String[]{"IsDepDelayed_REC"}; try { Scope.enter(); GLMParameters params = new GLMParameters(Family.binomial); params._response_column = "IsDepDelayed"; params._ignored_columns = ignoredCols; params._train = fr._key; params._valid = fr._key; params._lambda = null; // new double [] {0.25}; params._alpha = new double[]{1}; params._standardize = false; params._solver = Solver.COORDINATE_DESCENT; params._lambda_search = true; params._nlambdas = 5; GLM glm = new GLM( params); model1 = glm.trainModel().get(); GLMModel.Submodel sm = model1._output._submodels[model1._output._submodels.length-1]; double [] beta = sm.beta; System.out.println("lambda " + sm.lambda_value); double l1pen = ArrayUtils.l1norm(beta,true); double l2pen = ArrayUtils.l2norm2(beta,true); // double objective = job.likelihood()/model1._nobs + // gives likelihood of the last lambda // params._l2pen[params._l2pen.length-1]*params._alpha[0]*l1pen + params._l2pen[params._l2pen.length-1]*(1-params._alpha[0])*l2pen/2 ; // assertEquals(0.65689, objective,1e-4); } finally { fr.delete(); if (model1 != null) model1.delete(); } } public static double residualDeviance(GLMModel m) { if (m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics._resDev; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM) m._output._training_metrics; return metrics._resDev; } } public static double residualDevianceTest(GLMModel m) { if(m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM)m._output._validation_metrics; return metrics._resDev; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM)m._output._validation_metrics; return metrics._resDev; } } public static double nullDevianceTest(GLMModel m) { if(m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM)m._output._validation_metrics; return metrics._nullDev; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM)m._output._validation_metrics; return metrics._nullDev; } } public static 
double aic(GLMModel m) { if (m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics._AIC; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM) m._output._training_metrics; return metrics._AIC; } } public static double nullDOF(GLMModel m) { if (m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics._nullDegressOfFreedom; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM) m._output._training_metrics; return metrics._nullDegressOfFreedom; } } public static double resDOF(GLMModel m) { if (m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics._residualDegressOfFreedom; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM) m._output._training_metrics; return metrics._residualDegressOfFreedom; } } public static double auc(GLMModel m) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics.auc_obj()._auc; } public static double logloss(GLMModel m) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics._logloss; } public static double mse(GLMModel m) { if (m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics._MSE; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM) m._output._training_metrics; return metrics._MSE; } } public static double nullDeviance(GLMModel m) { if (m._parms._family == Family.binomial) { ModelMetricsBinomialGLM metrics = (ModelMetricsBinomialGLM) m._output._training_metrics; return metrics._nullDev; } else { ModelMetricsRegressionGLM metrics = (ModelMetricsRegressionGLM) m._output._training_metrics; return metrics._nullDev; } } // test class private static final class GLMIterationTaskTest extends GLMIterationTask { final GLMModel _m; GLMMetricBuilder _val2; public GLMIterationTaskTest(Key jobKey, DataInfo dinfo, double lambda, GLMParameters glm, boolean validate, double[] beta, double ymu, GLMModel m) { // null, dinfo, new GLMWeightsFun(params), beta, 1e-5 super(jobKey, dinfo, new GLMWeightsFun(glm), beta); _m = m; } public void map(Chunk[] chks) { super.map(chks); _val2 = (GLMMetricBuilder) _m.makeMetricBuilder(chks[chks.length - 1].vec().domain()); double[] ds = new double[3]; float[] actual = new float[1]; for (int i = 0; i < chks[0]._len; ++i) { _m.score0(chks, i, null, ds); actual[0] = (float) chks[chks.length - 1].atd(i); _val2.perRow(ds, actual, _m); } } public void reduce(GLMIterationTask gmt) { super.reduce(gmt); GLMIterationTaskTest g = (GLMIterationTaskTest) gmt; _val2.reduce(g._val2); } } /** * Simple test for binomial family (no regularization, test both lsm solvers). * Runs the classical prostate, using dataset with race replaced by categoricals (probably as it's supposed to be?), in any case, * it gets to test correct processing of categoricals. * * Compare against the results from standard R glm implementation. 
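   *
   * Also verifies (further below) that adding a prior shifts only the intercept, and that refitting with the
   * single lambda picked by lambda search reproduces the training metrics of the lambda-search model.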
   * @throws ExecutionException
   * @throws InterruptedException
   */
  @Test
  public void testProstate() throws InterruptedException, ExecutionException {
    GLMModel model = null, model2 = null, model3 = null, model4 = null;
    Frame fr = parse_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
    try {
      Scope.enter();
      // R results
      // Coefficients:
      // (Intercept)        ID       AGE    RACER2    RACER3     DPROS     DCAPS       PSA       VOL   GLEASON
      //   -8.894088  0.001588 -0.009589  0.231777 -0.459937  0.556231  0.556395  0.027854 -0.011355  1.010179
      String[] cfs1 = new String[]{"Intercept", "AGE", "RACE.R2", "RACE.R3", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"};
      double[] vals = new double[]{-8.14867, -0.01368, 0.32337, -0.38028, 0.55964, 0.49548, 0.02794, -0.01104, 0.97704};
      GLMParameters params = new GLMParameters(Family.binomial);
      params._response_column = "CAPSULE";
      params._ignored_columns = new String[]{"ID"};
      params._train = fr._key;
      params._lambda = new double[]{0};
      params._standardize = false;
      // params._missing_values_handling = MissingValuesHandling.Skip;
      GLM glm = new GLM(params);
      model = glm.trainModel().get();
      assertTrue(model._output.bestSubmodel().iteration == 5);
      model.delete();
      params._max_iterations = 4;
      glm = new GLM(params);
      model = glm.trainModel().get();
      assertTrue(model._output.bestSubmodel().iteration == 4);
      System.out.println(model._output._model_summary);
      HashMap<String, Double> coefs = model.coefficients();
      System.out.println(coefs);
      for (int i = 0; i < cfs1.length; ++i)
        assertEquals(vals[i], coefs.get(cfs1[i]), 1e-4);
      assertEquals(512.3, nullDeviance(model), 1e-1);
      assertEquals(378.3, residualDeviance(model), 1e-1);
      assertEquals(371, resDOF(model), 0);
      assertEquals(396.3, aic(model), 1e-1);
      testScoring(model, fr);
      // test scoring
      model.score(fr).delete();
      hex.ModelMetricsBinomial mm = hex.ModelMetricsBinomial.getFromDKV(model, fr);
      hex.AUC2 adata = mm._auc;
      assertEquals(model._output._training_metrics.auc_obj()._auc, adata._auc, 1e-8);
      assertEquals(0.7588625640559653, adata.pr_auc(), 1e-8);
      assertEquals(model._output._training_metrics._MSE, mm._MSE, 1e-8);
      assertEquals(((ModelMetricsBinomialGLM) model._output._training_metrics)._resDev, ((ModelMetricsBinomialGLM) mm)._resDev, 1e-8);
      model.score(fr).delete();
      mm = hex.ModelMetricsBinomial.getFromDKV(model, fr);
      assertEquals(model._output._training_metrics.auc_obj()._auc, adata._auc, 1e-8);
      assertEquals(model._output._training_metrics._MSE, mm._MSE, 1e-8);
      assertEquals(((ModelMetricsBinomialGLM) model._output._training_metrics)._resDev, ((ModelMetricsBinomialGLM) mm)._resDev, 1e-8);
      double prior = 1e-5;
      params._prior = prior;
      // test the same data and model with prior, should get the same model except for the intercept
      glm = new GLM(params);
      model2 = glm.trainModel().get();
      for (int i = 0; i < model2.beta().length - 1; ++i)
        assertEquals(model.beta()[i], model2.beta()[i], 1e-8);
      assertEquals(model.beta()[model.beta().length - 1] - Math.log(model._ymu[0] * (1 - prior) / (prior * (1 - model._ymu[0]))), model2.beta()[model.beta().length - 1], 1e-10);
      // run with lambda search, check the final submodel
      params._lambda_search = true;
      params._lambda = null;
      params._alpha = new double[]{0};
      params._prior = -1;
      params._obj_reg = -1;
      params._max_iterations = 500;
      params._objective_epsilon = 1e-6;
      // fit with lambda search; the lambda of the best submodel is reused for a fixed-lambda refit below
      glm = new GLM(params);
      model3 = glm.trainModel().get();
      double lambda = model3._output._submodels[model3._output._best_lambda_idx].lambda_value;
      params._lambda_search = false;
      params._lambda = new double[]{lambda};
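      // For reference: with a prior pi on the positive class, only the intercept is expected to move,
      // by the log-odds correction beta0' = beta0 - log( ymu * (1 - pi) / (pi * (1 - ymu)) ),
      // which is what the model2 assertion above verifies. The fixed lambda set here is reused below
      // to refit model4, whose training metrics should reproduce model3's.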
      ModelMetrics mm3 = ModelMetrics.getFromDKV(model3, fr);
      assertEquals("MSE doesn't match, " + model3._output._training_metrics._MSE + " != " + mm3._MSE, model3._output._training_metrics._MSE, mm3._MSE, 1e-8);
      assertEquals("res-devs don't match, " + ((ModelMetricsBinomialGLM) model3._output._training_metrics)._resDev + " != " + ((ModelMetricsBinomialGLM) mm3)._resDev, ((ModelMetricsBinomialGLM) model3._output._training_metrics)._resDev, ((ModelMetricsBinomialGLM) mm3)._resDev, 1e-4);
      fr.add("CAPSULE", fr.remove("CAPSULE"));
      fr.remove("ID").remove();
      DKV.put(fr._key, fr);
      DataInfo dinfo = new DataInfo(fr, null, 1, true, TransformType.NONE, DataInfo.TransformType.NONE, true, false, false, false, false, false);
      model3.score(fr).delete();
      mm3 = ModelMetrics.getFromDKV(model3, fr);
      assertEquals("MSE doesn't match, " + model3._output._training_metrics._MSE + " != " + mm3._MSE, model3._output._training_metrics._MSE, mm3._MSE, 1e-8);
      assertEquals("res-devs don't match, " + ((ModelMetricsBinomialGLM) model3._output._training_metrics)._resDev + " != " + ((ModelMetricsBinomialGLM) mm3)._resDev, ((ModelMetricsBinomialGLM) model3._output._training_metrics)._resDev, ((ModelMetricsBinomialGLM) mm3)._resDev, 1e-4);
      // refit with the single fixed lambda selected above; the result should match the lambda-search model
      glm = new GLM(params);
      model4 = glm.trainModel().get();
      assertEquals("MSE doesn't match, " + model3._output._training_metrics._MSE + " != " + model4._output._training_metrics._MSE, model3._output._training_metrics._MSE, model4._output._training_metrics._MSE, 1e-6);
      assertEquals("res-devs don't match, " + ((ModelMetricsBinomialGLM) model3._output._training_metrics)._resDev + " != " + ((ModelMetricsBinomialGLM) model4._output._training_metrics)._resDev, ((ModelMetricsBinomialGLM) model3._output._training_metrics)._resDev, ((ModelMetricsBinomialGLM) model4._output._training_metrics)._resDev, 1e-4);
      model4.score(fr).delete();
      ModelMetrics mm4 = ModelMetrics.getFromDKV(model4, fr);
      assertEquals("MSE doesn't match, " + mm3._MSE + " != " + mm4._MSE, mm3._MSE, mm4._MSE, 1e-6);
      assertEquals("res-devs don't match, " + ((ModelMetricsBinomialGLM) mm3)._resDev + " != " + ((ModelMetricsBinomialGLM) mm4)._resDev, ((ModelMetricsBinomialGLM) mm3)._resDev, ((ModelMetricsBinomialGLM) mm4)._resDev, 1e-4);
      // GLMValidation val2 = new GLMValidationTsk(params,model._ymu,rank(model.beta())).doAll(new Vec[]{fr.vec("CAPSULE"),score.vec("1")})._val;
      // assertEquals(val.residualDeviance(),val2.residualDeviance(),1e-6);
      // assertEquals(val.nullDeviance(),val2.nullDeviance(),1e-6);
    } finally {
      fr.delete();
      if (model != null) model.delete();
      if (model2 != null) model2.delete();
      if (model3 != null) model3.delete();
      if (model4 != null) model4.delete();
      Scope.exit();
    }
  }

  @Test
  public void testQuasibinomial() {
    GLMParameters params = new GLMParameters(Family.quasibinomial);
    GLM glm = new GLM(params);
    params.validate(glm);
    params._link = Link.log;
    try {
      params.validate(glm);
      Assert.fail("should have thrown IAE");
    } catch (IllegalArgumentException iae) {
      // do nothing
    }
    // test it behaves like binomial on binary data
    GLMModel model = null, model2 = null, model3 = null, model4 = null;
    Frame fr = parse_test_file("smalldata/glm_test/prostate_cat_replaced.csv");
    try {
      Scope.enter();
      // R results
      // Coefficients:
      // (Intercept)        ID       AGE    RACER2    RACER3     DPROS     DCAPS       PSA       VOL   GLEASON
      //   -8.894088  0.001588 -0.009589  0.231777 -0.459937  0.556231  0.556395  0.027854 -0.011355  1.010179
      String[] cfs1 = new String[]{"Intercept", "AGE", "RACE.R2", "RACE.R3", "DPROS", "DCAPS", "PSA", "VOL", "GLEASON"};
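      // Expected values below are the same R-based coefficients as in testProstate: on a strictly 0/1
      // response with the logit link, the quasibinomial fit should coincide with the binomial one.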
double[] vals = new double[]{-8.14867, -0.01368, 0.32337, -0.38028, 0.55964, 0.49548, 0.02794, -0.01104, 0.97704}; params = new GLMParameters(Family.quasibinomial); params._response_column = "CAPSULE"; params._ignored_columns = new String[]{"ID"}; params._train = fr._key; params._lambda = new double[]{0}; params._standardize = false; params._link = Link.logit; // params._missing_values_handling = MissingValuesHandling.Skip; glm = new GLM(params); model = glm.trainModel().get(); HashMap<String, Double> coefs = model.coefficients(); System.out.println(coefs); for (int i = 0; i < cfs1.length; ++i) assertEquals(vals[i], coefs.get(cfs1[i]), 1e-4); assertEquals(512.3, nullDeviance(model), 1e-1); assertEquals(378.3, residualDeviance(model), 1e-1); assertEquals(371, resDOF(model), 0); } finally { fr.delete(); if(model != null)model.delete(); if(model2 != null)model2.delete(); if(model3 != null)model3.delete(); if(model4 != null)model4.delete(); Scope.exit(); } } @Test public void testSynthetic() throws Exception { GLMModel model = null; Frame fr = parse_test_file("smalldata/glm_test/glm_test2.csv"); Frame score = null; try { Scope.enter(); GLMParameters params = new GLMParameters(Family.binomial); params._response_column = "response"; // params._response = fr.find(params._response_column); params._ignored_columns = new String[]{"ID"}; params._train = fr._key; params._lambda = new double[]{0}; params._standardize = false; params._max_iterations = 20; GLM glm = new GLM( params); model = glm.trainModel().get(); double [] beta = model.beta(); System.out.println("beta = " + Arrays.toString(beta)); assertEquals(auc(model), 1, 1e-4); score = model.score(fr); hex.ModelMetricsBinomial mm = hex.ModelMetricsBinomial.getFromDKV(model,fr); hex.AUC2 adata = mm._auc; assertEquals(auc(model), adata._auc, 1e-2); } finally { fr.remove(); if(model != null)model.delete(); if(score != null)score.delete(); Scope.exit(); } } @Test //PUBDEV-1839 public void testCitibikeReproPUBDEV1839() throws Exception { GLMModel model = null; Frame tfr = parse_test_file("smalldata/jira/pubdev_1839_repro_train.csv"); Frame vfr = parse_test_file("smalldata/jira/pubdev_1839_repro_test.csv"); try { Scope.enter(); GLMParameters params = new GLMParameters(Family.poisson); params._response_column = "bikes"; params._train = tfr._key; params._valid = vfr._key; GLM glm = new GLM(params); model = glm.trainModel().get(); testScoring(model,vfr); } finally { tfr.remove(); vfr.remove(); if(model != null)model.delete(); Scope.exit(); } } @Test public void testCitibikeReproPUBDEV1953() throws Exception { GLMModel model = null; Frame tfr = parse_test_file("smalldata/glm_test/citibike_small_train.csv"); Frame vfr = parse_test_file("smalldata/glm_test/citibike_small_test.csv"); try { Scope.enter(); GLMParameters params = new GLMParameters(Family.poisson); params._response_column = "bikes"; params._train = tfr._key; params._valid = vfr._key; params._family = Family.poisson; GLM glm = new GLM( params); model = glm.trainModel().get(); testScoring(model,vfr); } finally { tfr.remove(); vfr.remove(); if(model != null)model.delete(); Scope.exit(); } } @Test public void testXval(){ GLMModel model = null; Frame fr = parse_test_file("smalldata/glm_test/prostate_cat_replaced.csv"); try{ GLMParameters params = new GLMParameters(Family.binomial); params._response_column = "CAPSULE"; params._ignored_columns = new String[]{"ID"}; params._train = fr._key; params._lambda_search = true; params._nfolds = 3; params._standardize = false; GLM glm = new GLM(params); model = 
      glm.trainModel().get();
    } finally {
      fr.delete();
      if (model != null) {
        for (Key k : model._output._cross_validation_models) Keyed.remove(k);
        model.delete();
      }
    }
  }

  /**
   * Test strong rules on the arcene dataset (10k predictors, 100 rows).
   * Should be able to obtain a good model (~100 predictors, ~1 explained deviance) with up to 250 active predictors.
   * Scaled down (higher lambda min, fewer lambdas) to run at reasonable speed (whole test takes 20s on my laptop).
   *
   * The test runs a gaussian GLM on the arcene dataset and verifies it computes all lambdas while limiting the
   * maximum number of active predictors to a reasonably small value.
   * Compares the objective value to the expected one.
   */
  @Test
  public void testArcene() throws InterruptedException, ExecutionException {
    Key parsed = Key.make("arcene_parsed");
    Key<GLMModel> modelKey = Key.make("arcene_model");
    GLMModel model = null;
    Frame fr = parse_test_file(parsed, "smalldata/glm_test/arcene.csv");
    try {
      Scope.enter();
      // test LBFGS with l1 pen
      GLMParameters params = new GLMParameters(Family.gaussian);
      // params._response = 0;
      params._lambda = null;
      params._response_column = fr._names[0];
      params._train = parsed;
      params._lambda_search = true;
      params._nlambdas = 35;
      params._lambda_min_ratio = 0.18;
      params._max_iterations = 100000;
      params._max_active_predictors = 10000;
      params._alpha = new double[]{1};
      for (Solver s : new Solver[]{Solver.IRLSM, Solver.COORDINATE_DESCENT}) {//Solver.COORDINATE_DESCENT,}) { // LBFGS lambda-search is too slow now
        params._solver = s;
        GLM glm = new GLM(params, modelKey);
        glm.trainModel().get();
        model = DKV.get(modelKey).get();
        System.out.println(model._output._model_summary);
        // assert that we got all submodels (if strong rules work, we should be able to get the results with this many active predictors)
        assertEquals(params._nlambdas, model._output._submodels.length);
        System.out.println(model._output._training_metrics);
        // assert on the quality of the result, technically should compare objective value, but this should be good enough for now
      }
      model.delete();
      params._solver = Solver.COORDINATE_DESCENT;
      params._max_active_predictors = 100;
      params._lambda_min_ratio = 1e-2;
      params._nlambdas = 100;
      GLM glm = new GLM(params, modelKey);
      glm.trainModel().get();
      model = DKV.get(modelKey).get();
      assertTrue(model._output.rank() <= params._max_active_predictors);
      // System.out.println("============================================================================================================");
      System.out.println(model._output._model_summary);
      // assert that we got all submodels (if strong rules work, we should be able to get the results with this many active predictors)
      System.out.println(model._output._training_metrics);
      System.out.println("============================================================================================================");
      model.delete();
      params._max_active_predictors = 250;
      params._lambda = null;
      params._lambda_search = false;
      glm = new GLM(params, modelKey);
      glm.trainModel().get();
      model = DKV.get(modelKey).get();
      assertTrue(model._output.rank() <= params._max_active_predictors);
      // System.out.println("============================================================================================================");
      System.out.println(model._output._model_summary);
      // assert that we got all submodels (if strong rules work, we should be able to get the results with this many active predictors)
      System.out.println(model._output._training_metrics);
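      // Note: rank() reports the number of active (non-zero) coefficients of the selected submodel; with the
      // active-predictor cap (and strong rules screening out inactive predictors) it must not exceed
      // _max_active_predictors (100 in the lambda-search run above, 250 in this last run without lambda search).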
System.out.println("============================================================================================================"); model.delete(); } finally { fr.delete(); if(model != null)model.delete(); Scope.exit(); } } /** Test large GLM POJO model generation. * Make a 10K predictor model, emit, javac, and score with it. */ @Test public void testBigPOJO() { GLMModel model = null; Frame fr = parse_test_file(Key.make("arcene_parsed"), "smalldata/glm_test/arcene.csv"), res=null; try{ Scope.enter(); // test LBFGS with l1 pen GLMParameters params = new GLMParameters(Family.gaussian); // params._response = 0; params._lambda = null; params._response_column = fr._names[0]; params._train = fr._key; params._max_active_predictors = 100000; params._alpha = new double[]{0}; params._solver = Solver.L_BFGS; GLM glm = new GLM(params); model = glm.trainModel().get(); res = model.score(fr); model.testJavaScoring(fr,res,0.0); } finally { fr.delete(); if(model != null) model.delete(); if( res != null ) res.delete(); Scope.exit(); } } @Test public void testAbalone() { Scope.enter(); GLMModel model = null; try { Frame fr = parse_test_file("smalldata/glm_test/Abalone.gz"); Scope.track(fr); GLMParameters params = new GLMParameters(Family.gaussian); params._train = fr._key; params._response_column = fr._names[8]; params._alpha = new double[]{1.0}; params._lambda_search = true; GLM glm = new GLM(params); model = glm.trainModel().get(); testScoring(model,fr); } finally { if( model != null ) model.delete(); Scope.exit(); } } @Test public void testZeroedColumn(){ Vec x = Vec.makeCon(Vec.newKey(),1,2,3,4,5); Vec y = Vec.makeCon(x.group().addVec(),0,1,0,1,0); Vec z = Vec.makeCon(Vec.newKey(),1,2,3,4,5); Vec w = Vec.makeCon(x.group().addVec(),1,0,1,0,1); Frame fr = new Frame(Key.<Frame>make("test"),new String[]{"x","y","z","w"},new Vec[]{x,y,z,w}); DKV.put(fr); GLMParameters parms = new GLMParameters(Family.gaussian); parms._train = fr._key; parms._lambda = new double[]{0}; parms._alpha = new double[]{0}; parms._compute_p_values = true; parms._response_column = "z"; parms._weights_column = "w"; GLMModel m = new GLM(parms).trainModel().get(); System.out.println(m.coefficients()); m.delete(); fr.delete(); } @Test public void testDeviances() { for (Family fam : Family.values()) { if(fam == Family.quasibinomial) continue; Frame tfr = null; Frame res = null; Frame preds = null; GLMModel gbm = null; try { tfr = parse_test_file("./smalldata/gbm_test/BostonHousing.csv"); GLMModel.GLMParameters parms = new GLMModel.GLMParameters(); parms._train = tfr._key; String resp = tfr.lastVecName(); if (fam==Family.binomial || fam==Family.multinomial) { resp = fam==Family.multinomial?"rad":"chas"; Vec v = tfr.remove(resp); tfr.add(resp, v.toCategoricalVec()); v.remove(); DKV.put(tfr); } parms._response_column = resp; parms._family = fam; gbm = new GLM(parms).trainModel().get(); preds = gbm.score(tfr); res = gbm.computeDeviances(tfr,preds,"myDeviances"); double meanDeviances = res.anyVec().mean(); if (gbm._output.nclasses()==2) Assert.assertEquals(meanDeviances,((ModelMetricsBinomial) gbm._output._training_metrics)._logloss,1e-6*Math.abs(meanDeviances)); else if (gbm._output.nclasses()>2) Assert.assertEquals(meanDeviances,((ModelMetricsMultinomial) gbm._output._training_metrics)._logloss,1e-6*Math.abs(meanDeviances)); else Assert.assertEquals(meanDeviances,((ModelMetricsRegression) gbm._output._training_metrics)._mean_residual_deviance,1e-6*Math.abs(meanDeviances)); } finally { if (tfr != null) tfr.delete(); if (res != null) 
res.delete(); if (preds != null) preds.delete(); if (gbm != null) gbm.delete(); } } } /** * train = data.frame(c('red', 'blue','blue'),c('x','x','y'),c(1,'0','0')) names(train)= c('color', 'letter', 'label') test = data.frame(c('red', 'blue','blue','yellow'),c('x','x','y','y'),c(1,'0','0','0')) names(test)= c('color', 'letter', 'label') htrain = as.h2o(train) htest = as.h2o(test) hh = h2o.glm(x = 1:2,y = 3,training_frame = htrain,family = "binomial",max_iterations = 15,alpha = 1,missing_values_handling = 'Skip') h2o.predict(hh,htest) */ @Test public void testUnseenLevels(){ Scope.enter(); try { Vec v0 = Vec.makeCon(Vec.newKey(), 1, 0, 0, 1,1); v0.setDomain(new String[]{"blue", "red"}); Frame trn = new Frame(Key.<Frame>make("train"), new String[]{"color", "label"}, new Vec[]{v0, v0.makeCopy(null)}); DKV.put(trn); Vec v3 = Vec.makeCon(Vec.newKey(), 1, 0, 0, 2); v3.setDomain(new String[]{"blue", "red", "yellow"}); Vec v5 = Vec.makeCon(v3.group().addVec(), 1, 0, 0, 0); Frame tst = new Frame(Key.<Frame>make("test"), new String[]{"color", "label"}, new Vec[]{v3, v5}); DKV.put(tst); GLMParameters parms = new GLMParameters(Family.gaussian); parms._train = trn._key; parms._response_column = "label"; parms._missing_values_handling = MissingValuesHandling.Skip; GLMModel m = new GLM(parms).trainModel().get(); System.out.println("coefficients = " + m.coefficients()); double icpt = m.coefficients().get("Intercept"); Frame preds = m.score(tst); Assert.assertEquals(icpt+m.coefficients().get("color.red"), preds.vec(0).at(0), 0); Assert.assertEquals(icpt+m.coefficients().get("color.blue"), preds.vec(0).at(1), 0); Assert.assertEquals(icpt+m.coefficients().get("color.blue"), preds.vec(0).at(2), 0); Assert.assertEquals(icpt, preds.vec(0).at(3), 0); parms._missing_values_handling = MissingValuesHandling.MeanImputation; GLMModel m2 = new GLM(parms).trainModel().get(); Frame preds2 = m2.score(tst); icpt = m2.coefficients().get("Intercept"); System.out.println("coefficients = " + m2.coefficients()); Assert.assertEquals(icpt+m2.coefficients().get("color.red"), preds2.vec(0).at(0), 0); Assert.assertEquals(icpt+m2.coefficients().get("color.blue"), preds2.vec(0).at(1), 0); Assert.assertEquals(icpt+m2.coefficients().get("color.blue"), preds2.vec(0).at(2), 0); Assert.assertEquals(icpt+m2.coefficients().get("color.red"), preds2.vec(0).at(3), 0); trn.delete(); tst.delete(); m.delete(); preds.delete(); preds2.delete(); m2.delete(); }finally { Scope.exit(); } } }