package hex.glm; import hex.*; import hex.DataInfo.Row; import hex.DataInfo.TransformType; import hex.api.MakeGLMModelHandler; import hex.deeplearning.DeepLearningModel.DeepLearningParameters.MissingValuesHandling; import hex.glm.GLMModel.GLMParameters.Family; import hex.glm.GLMModel.GLMParameters.Link; import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.distribution.TDistribution; import water.*; import water.codegen.CodeGenerator; import water.codegen.CodeGeneratorPipeline; import water.exceptions.JCodeSB; import water.fvec.Frame; import water.fvec.InteractionWrappedVec; import water.fvec.Vec; import water.util.*; import water.util.ArrayUtils; import java.util.Arrays; import java.util.HashMap; import java.util.NoSuchElementException;
/**
 * Created by tomasnykodym on 8/27/14.
 */
public class GLMModel extends Model<GLMModel,GLMModel.GLMParameters,GLMModel.GLMOutput> {
  /**
   * Builds the GLM model shell around the given output.
   * @param ymu response mean(s) of the training data
   * @param ySigma response standard deviation
   * @param lambda_max smallest lambda driving all penalized coefficients to zero
   * @param nobs number of training observations
   */
  public GLMModel(Key selfKey, GLMParameters parms, GLM job, double [] ymu, double ySigma, double lambda_max, long nobs) {
    super(selfKey, parms, job == null?new GLMOutput():new GLMOutput(job)); // modelKey, parms, null, Double.NaN, Double.NaN, Double.NaN, -1
    _ymu = ymu;
    _ySigma = ySigma;
    _lambda_max = lambda_max;
    _nobs = nobs;
    // null-model degrees of freedom: nobs minus one df for the intercept (if fitted)
    _nullDOF = nobs - (parms._intercept?1:0);
  }
  // Stores the variance-covariance matrix (only available when p-values were computed).
  public void setVcov(double[][] inv) {_output._vcov = inv;}
  // Per-lambda snapshot of the regularization path: coefficients + explained deviance.
  public static class RegularizationPath extends Iced {
    public double [] _lambdas;
    public double [] _explained_deviance_train;
    public double [] _explained_deviance_valid;
    public double [][] _coefficients;
    public double [][] _coefficients_std;
    public String [] _coefficient_names;
  }
  public RegularizationPath getRegularizationPath() {
    RegularizationPath rp = new RegularizationPath();
    rp._coefficient_names = _output._coefficient_names;
    int N = _output._submodels.length;
    int P = _output._dinfo.fullN() + 1; // number of coefficients incl. intercept
    rp._lambdas = new double[N];
    rp._coefficients = new double[N][];
    rp._explained_deviance_train = new double[N];
    if (_parms._valid != null) rp._explained_deviance_valid = new double[N];
    if (_parms._standardize) rp._coefficients_std = new double[N][];
    for (int i = 0; i < N; ++i) {
      Submodel sm = _output._submodels[i];
      rp._lambdas[i] = sm.lambda_value;
      rp._coefficients[i] = sm.getBeta(MemoryManager.malloc8d(P));
      if (_parms._standardize) {
        // keep the standardized coefficients and de-normalize a copy for reporting
        rp._coefficients_std[i] = rp._coefficients[i];
        rp._coefficients[i] = _output._dinfo.denormalizeBeta(rp._coefficients_std[i]);
      }
      // fraction of null deviance explained by this submodel
      rp._explained_deviance_train[i] = 1 - (_output._training_metrics._nobs*sm.devianceTrain)/((GLMMetrics)_output._training_metrics).null_deviance();
      if (rp._explained_deviance_valid != null)
        rp._explained_deviance_valid[i] = 1 - _output._validation_metrics._nobs*sm.devianceTest/((GLMMetrics)_output._validation_metrics).null_deviance();
    }
    return rp;
  }
  @Override protected boolean toJavaCheckTooBig() {
    // POJO rendering is skipped for very wide models (coefficient vector > 10000)
    if(beta() != null && beta().length > 10000) {
      Log.warn("toJavaCheckTooBig must be overridden for this model type to render it in the browser");
      return true;
    }
    return false;
  }
  public DataInfo dinfo() { return _output._dinfo; }
  // number of non-zero entries in the given vector
  private int rank(double [] ds) {
    int res = 0;
    for(double d:ds)
      if(d != 0) ++res;
    return res;
  }
  @Override public ModelMetrics.MetricBuilder makeMetricBuilder(String[] domain) {
    if(domain == null && _parms._family == Family.binomial)
      domain = binomialClassNames;
    return new GLMMetricBuilder(domain, _ymu, new GLMWeightsFun(_parms), _output.bestSubmodel().rank(), true, _parms._intercept);
  }
  // Flat coefficient vector; multinomial coefficients are flattened class-by-class.
  protected double [] beta_internal(){
    if(_parms._family == Family.multinomial)
      return ArrayUtils.flat(_output._global_beta_multinomial);
    return _output._global_beta;
  }
  public double [] beta() { return _output._global_beta;}
  // De-normalized coefficients of the submodel with exactly the given lambda.
  public double [] beta(double lambda) {
    for(int i = 0 ; i < _output._submodels.length; ++i)
      if(_output._submodels[i].lambda_value == lambda)
        return _output._dinfo.denormalizeBeta(_output._submodels[i].getBeta(MemoryManager.malloc8d(_output._dinfo.fullN()+1)));
    throw new RuntimeException("no such lambda value, lambda = " + lambda);
  }
  // Standardized (normalized-space) coefficients of the submodel with exactly the given lambda.
  public double [] beta_std(double lambda) {
    for(int i = 0 ; i < _output._submodels.length; ++i)
      if(_output._submodels[i].lambda_value == lambda)
        return _output._submodels[i].getBeta(MemoryManager.malloc8d(_output._dinfo.fullN()+1));
    throw new RuntimeException("no such lambda value, lambda = " + lambda);
  }
  public String [] names(){ return _output._names;}
  @Override
  public double deviance(double w, double y, double f) {
    if (w == 0) {
      return 0;
    } else if (w == 1) {
      return _parms.deviance(y, f);
    } else {
      return Double.NaN; //TODO: add deviance(w, y, f)
    }
  }
  // Appends a submodel to the path and makes it the selected one.
  public GLMModel addSubmodel(Submodel sm) {
    _output._submodels = ArrayUtils.append(_output._submodels,sm);
    _output.setSubmodelIdx(_output._submodels.length-1);
    return this;
  }
  // Replaces the last submodel; must correspond to the same lambda.
  public GLMModel updateSubmodel(Submodel sm) {
    assert sm.lambda_value == _output._submodels[_output._submodels.length-1].lambda_value;
    _output._submodels[_output._submodels.length-1] = sm;
    return this;
  }
  // Replaces the last submodel with fresh coefficients/deviances and re-selects it.
  public void update(double [] beta, double devianceTrain, double devianceTest,int iter){
    int id = _output._submodels.length-1;
    _output._submodels[id] = new Submodel(_output._submodels[id].lambda_value,beta,iter,devianceTrain,devianceTest);
    _output.setSubmodelIdx(id);
  }
  // Shallow clone with a cloned output so the output can be mutated independently.
  public GLMModel clone2(){
    GLMModel res = clone();
    res._output = (GLMOutput)res._output.clone();
    return res;
  }

  public static class GLMParameters extends Model.Parameters {
    public String algoName() { return "GLM"; }
    public String fullName() { return "Generalized Linear Modeling"; }
    public String javaName() { return GLMModel.class.getName(); }
    @Override public long progressUnits() { return GLM.WORK_TOTAL; }
    // public int _response; // TODO: the standard is now _response_column in SupervisedModel.SupervisedParameters
    public boolean _standardize = true;
    public Family _family;
    public Link _link = Link.family_default;
    public Solver _solver = Solver.AUTO;
    public double _tweedie_variance_power;
    public double
_tweedie_link_power; public double [] _alpha = null; public double [] _lambda = null; public MissingValuesHandling _missing_values_handling = MissingValuesHandling.MeanImputation; public double _prior = -1; public boolean _lambda_search = false; public int _nlambdas = -1; public boolean _non_negative = false; public boolean _exactLambdas = false; public double _lambda_min_ratio = -1; // special public boolean _use_all_factor_levels = false; public int _max_iterations = -1; public boolean _intercept = true; public double _beta_epsilon = 1e-4; public double _objective_epsilon = -1; public double _gradient_epsilon = -1; public double _obj_reg = -1; public boolean _compute_p_values = false; public boolean _remove_collinear_columns = false; public String[] _interactions=null; public boolean _early_stopping = true; public Key<Frame> _beta_constraints = null; // internal parameter, handle with care. GLM will stop when there is more than this number of active predictors (after strong rule screening) public int _max_active_predictors = -1; public boolean _stdOverride; // standardization override by beta constraints public void validate(GLM glm) { if(_alpha != null && (1 < _alpha[0] || _alpha[0] < 0)) glm.error("_alpha","alpha parameter must from (inclusive) [0,1] range"); if(_compute_p_values && _solver != Solver.AUTO && _solver != Solver.IRLSM) glm.error("_compute_p_values","P values can only be computed with IRLSM solver, go solver = " + _solver); if(_compute_p_values && (_lambda == null || _lambda[0] > 0)) glm.error("_compute_p_values","P values can only be computed with NO REGULARIZATION (lambda = 0)"); if(_compute_p_values && _family == Family.multinomial) glm.error("_compute_p_values","P values are currently not supported for family=multinomial"); if(_compute_p_values && _non_negative) glm.error("_compute_p_values","P values are currently not supported for family=multinomial"); if(_weights_column != null && _offset_column != null && 
_weights_column.equals(_offset_column)) glm.error("_offset_column", "Offset must be different from weights"); if(_alpha != null && (_alpha[0] < 0 || _alpha[0] > 1)) glm.error("_alpha", "Alpha value must be between 0 and 1"); if(_lambda != null && _lambda[0] < 0) glm.error("_lambda", "Lambda value must be >= 0"); if(_obj_reg != -1 && _obj_reg <= 0) glm.error("obj_reg","Must be positive or -1 for default"); if(_prior != -1 && _prior <= 0 || _prior >= 1) glm.error("_prior","Prior must be in (exclusive) range (0,1)"); if(_prior != -1 && _family != Family.binomial) glm.error("_prior","Prior is only allowed with family = binomial."); if(_family != Family.tweedie) { glm.hide("_tweedie_variance_power","Only applicable with Tweedie family"); glm.hide("_tweedie_link_power","Only applicable with Tweedie family"); } if(_remove_collinear_columns && !_intercept) glm.error("_intercept","Remove colinear columns option is currently not supported without intercept"); if(_beta_constraints != null) { if(_family == Family.multinomial) glm.error("beta_constraints","beta constraints are not supported for family = multionomial"); Frame f = _beta_constraints.get(); if(f == null) glm.error("beta_constraints","Missing frame for beta constraints"); Vec v = f.vec("names"); if(v == null)glm.error("beta_constraints","Beta constraints parameter must have names column with valid coefficient names"); // todo: check the coefficient names v = f.vec("upper_bounds"); if(v != null && !v.isNumeric()) glm.error("beta_constraints","upper_bounds must be numeric if present");v = f.vec("upper_bounds"); v = f.vec("lower_bounds"); if(v != null && !v.isNumeric()) glm.error("beta_constraints","lower_bounds must be numeric if present"); v = f.vec("beta_given"); if(v != null && !v.isNumeric()) glm.error("beta_constraints","beta_given must be numeric if present");v = f.vec("upper_bounds"); v = f.vec("beta_start"); if(v != null && !v.isNumeric()) glm.error("beta_constraints","beta_start must be numeric if present"); 
      }
      if(!_lambda_search) {
        glm.hide("_lambda_min_ratio", "only applies if lambda search is on.");
        glm.hide("_nlambdas", "only applies if lambda search is on.");
        glm.hide("_early_stopping","only applies if lambda search is on.");
      }
      if(_link != Link.family_default) { // check we have compatible link
        switch (_family) {
          case gaussian:
            if (_link != Link.identity && _link != Link.log && _link != Link.inverse)
              throw new IllegalArgumentException("Incompatible link function for selected family. Only identity, log and inverse links are allowed for family=gaussian.");
            break;
          case quasibinomial:
          case binomial:
            if (_link != Link.logit) // fixme: R also allows log, but it's not clear when can be applied and what should we do in case the predictions are outside of 0/1.
              throw new IllegalArgumentException("Incompatible link function for selected family. Only logit is allowed for family=" + _family + ". Got " + _link);
            break;
          case poisson:
            if (_link != Link.log && _link != Link.identity)
              throw new IllegalArgumentException("Incompatible link function for selected family. Only log and identity links are allowed for family=poisson.");
            break;
          case gamma:
            if (_link != Link.inverse && _link != Link.log && _link != Link.identity)
              throw new IllegalArgumentException("Incompatible link function for selected family. Only inverse, log and identity links are allowed for family=gamma.");
            break;
          case tweedie:
            if (_link != Link.tweedie)
              throw new IllegalArgumentException("Incompatible link function for selected family. Only tweedie link allowed for family=tweedie.");
            break;
          case multinomial:
            if(_link != Link.multinomial)
              throw new IllegalArgumentException("Incompatible link function for selected family. Only multinomial link allowed for family=multinomial.");
            break;
          default:
            H2O.fail();
        }
      }
    }
    public GLMParameters(){
      this(Family.gaussian, Link.family_default);
      assert _link == Link.family_default;
      _stopping_rounds = 3;
      _stopping_metric = ScoreKeeper.StoppingMetric.deviance;
      _stopping_tolerance = 1e-4;
    }
    public GLMParameters(Family f){this(f,f.defaultLink);}
    public GLMParameters(Family f, Link l){this(f,l, null, null, 0, 1);}
    public GLMParameters(Family f, Link l, double[] lambda, double[] alpha, double twVar, double twLnk) {
      this(f,l,lambda,alpha,twVar,twLnk,null);
    }
    public GLMParameters(Family f, Link l, double [] lambda, double [] alpha, double twVar, double twLnk, String[] interactions){
      this._lambda = lambda;
      this._alpha = alpha;
      this._tweedie_variance_power = twVar;
      this._tweedie_link_power = twLnk;
      _interactions=interactions;
      _family = f;
      _link = l;
    }
    // GLM variance function V(mu) for the configured family
    public final double variance(double mu){
      switch(_family) {
        case gaussian:
          return 1;
        case binomial:
        case multinomial:
        case quasibinomial:
          return mu * (1 - mu);
        case poisson:
          return mu;
        case gamma:
          return mu * mu;
        case tweedie:
          return Math.pow(mu, _tweedie_variance_power);
        default:
          throw new RuntimeException("unknown family Id " + this._family);
      }
    }
    // true iff the configured link is the canonical link of the family
    public final boolean canonical(){
      switch(_family){
        case gaussian:
          return _link == Link.identity;
        case binomial:
          return _link == Link.logit;
        case poisson:
          return _link == Link.log;
        case gamma:
          return _link == Link.inverse;
//        case tweedie:
//          return false;
        default:
          throw H2O.unimpl();
      }
    }
    // unit deviance of one observation (yr = actual, ym = predicted mean)
    public final double deviance(double yr, double ym){
      double y1 = yr == 0?.1:yr; // avoid log(0) in the tweedie branch below
      switch(_family){
        case gaussian:
          return (yr - ym) * (yr - ym);
        case quasibinomial:
        case binomial:
          return 2 * ((y_log_y(yr, ym)) + y_log_y(1 - yr, 1 - ym));
        case poisson:
          if( yr == 0 ) return 2 * ym;
          return 2 * ((yr * Math.log(yr / ym)) - (yr - ym));
        case gamma:
          if( yr == 0 ) return -2;
          return -2 * (Math.log(yr / ym) - (yr - ym) / ym);
        case tweedie:
          double theta = _tweedie_variance_power == 1
            ?Math.log(y1/ym)
            :(Math.pow(y1,1.-_tweedie_variance_power) - Math.pow(ym,1 - _tweedie_variance_power))/(1-_tweedie_variance_power);
          double kappa = _tweedie_variance_power == 2
            ?Math.log(y1/ym)
            :(Math.pow(yr,2-_tweedie_variance_power) - Math.pow(ym,2-_tweedie_variance_power))/(2 - _tweedie_variance_power);
          return 2 * (yr * theta - kappa);
        default:
          throw new RuntimeException("unknown family " + _family);
      }
    }
    public final double deviance(float yr, float ym){
      return deviance((double)yr,(double)ym);
    }
    public final double likelihood(double yr, double ym){ return .5 * deviance(yr,ym);}
    // derivative of the link function evaluated at the mean
    public final double linkDeriv(double x) { // note: compute an inverse of what R does
      switch(_link) {
        case logit:
//        case multinomial:
          double div = (x * (1 - x));
          if(div < 1e-6) return 1e6; // avoid numerical instability
          return 1.0 / div;
        case identity:
          return 1;
        case log:
          return 1.0 / x;
        case inverse:
          return -1.0 / (x * x);
        case tweedie:
//          double res = _tweedie_link_power == 0
//            ?Math.max(2e-16,Math.exp(x))
//            // (1/lambda) * eta^(1/lambda - 1)
//            :(1.0/_tweedie_link_power) * Math.pow(link(x), 1.0/_tweedie_link_power - 1.0);
          return _tweedie_link_power == 0
            ?1.0/Math.max(2e-16,x)
            :_tweedie_link_power * Math.pow(x,_tweedie_link_power-1);
        default:
          throw H2O.unimpl();
      }
    }
    // inverse link: maps linear predictor eta to mean mu
    public final double linkInv(double x) {
      switch(_link) {
//        case multinomial: // should not be used
        case identity:
          return x;
        case logit:
          return 1.0 / (Math.exp(-x) + 1.0);
        case log:
          return Math.exp(x);
        case inverse:
          double xx = (x < 0) ?
            Math.min(-1e-5, x) : Math.max(1e-5, x); // clamp away from zero to avoid division blow-up
          return 1.0 / xx;
        case tweedie:
          return _tweedie_link_power == 0
            ?Math.max(2e-16,Math.exp(x))
            :Math.pow(x, 1/ _tweedie_link_power);
        default:
          throw new RuntimeException("unexpected link function id " + this);
      }
    }
    // derivative of the inverse link w.r.t. the linear predictor
    public final double linkInvDeriv(double x) {
      switch(_link) {
        case identity:
          return 1;
        case logit:
          double g = Math.exp(-x);
          double gg = (g + 1) * (g + 1);
          return g / gg;
        case log:
          //return (x == 0)?MAX_SQRT:1/x;
          return Math.max(Math.exp(x), Double.MIN_NORMAL);
        case inverse:
          double xx = (x < 0) ? Math.min(-1e-5, x) : Math.max(1e-5, x);
          return -1 / (xx * xx);
//        case tweedie:
//          double vp = (1. - _tweedie_link_power) / _tweedie_link_power;
//          return (1/ _tweedie_link_power) * Math.pow(x, vp);
        default:
          throw new RuntimeException("unexpected link function id " + this);
      }
    }
    // supported families, each with its default link
    public enum Family {
      gaussian(Link.identity), binomial(Link.logit), quasibinomial(Link.logit),poisson(Link.log),
      gamma(Link.inverse), multinomial(Link.multinomial), tweedie(Link.tweedie);
      public final Link defaultLink;
      Family(Link link){defaultLink = link;}
    }
    public static enum Link {family_default, identity, logit, log, inverse, tweedie, multinomial}
    public static enum Solver {AUTO, IRLSM, L_BFGS, COORDINATE_DESCENT_NAIVE, COORDINATE_DESCENT}
    // helper function
    static final double y_log_y(double y, double mu) {
      if(y == 0)return 0;
      if(mu < Double.MIN_NORMAL) mu = Double.MIN_NORMAL; // guard log(0)
      return y * Math.log(y / mu);
    }
  } // GLMParameters

  // scratch holder for per-row IRLSM working values (mean, weight, working response, likelihood, deviance)
  public static class GLMWeights {
    public double mu = 0;
    public double w = 1;
    public double z = 0;
    public double l = 0;
    public double dev = Double.NaN;
  }
  // family/link math bundled into a standalone (serializable) helper
  public static class GLMWeightsFun extends Iced {
    final Family _family;
    final Link _link;
    final double _var_power;  // tweedie variance power
    final double _link_power; // tweedie link power
    public GLMWeightsFun(GLMParameters parms) {this(parms._family,parms._link, parms._tweedie_variance_power, parms._tweedie_link_power);}
    public GLMWeightsFun(Family fam, Link link, double var_power, double link_power) {
      _family = fam;
      _link = link;
      _var_power = var_power;
      _link_power = link_power;
    }
    // link function: maps mean mu to linear predictor eta
    public final double link(double x) {
      switch(_link) {
        case identity:
          return x;
        case logit:
          assert 0 <= x && x <= 1:"x out of bounds, expected <0,1> range, got " + x;
          return Math.log(x / (1 - x));
        case multinomial:
        case log:
          return Math.log(x);
        case inverse:
          double xx = (x < 0) ? Math.min(-1e-5, x) : Math.max(1e-5, x); // clamp away from zero
          return 1.0 / xx;
        case tweedie:
          return _link_power == 0?Math.log(x):Math.pow(x, _link_power);
        default:
          throw new RuntimeException("unknown link function " + this);
      }
    }
    // derivative of the link function evaluated at the mean
    public final double linkDeriv(double x) { // note: compute an inverse of what R does
      switch(_link) {
        case logit:
//        case multinomial:
          double div = (x * (1 - x));
          if(div < 1e-6) return 1e6; // avoid numerical instability
          return 1.0 / div;
        case identity:
          return 1;
        case log:
          return 1.0 / x;
        case inverse:
          return -1.0 / (x * x);
        case tweedie:
          return _link_power == 0
            ?1.0/Math.max(2e-16,x)
            :_link_power * Math.pow(x,_link_power-1);
        default:
          throw H2O.unimpl();
      }
    }
    // inverse link: maps linear predictor eta to mean mu
    public final double linkInv(double x) {
      switch(_link) {
//        case multinomial: // should not be used
        case identity:
          return x;
        case logit:
          return 1.0 / (Math.exp(-x) + 1.0);
        case log:
          return Math.exp(x);
        case inverse:
          double xx = (x < 0) ?
            Math.min(-1e-5, x) : Math.max(1e-5, x); // clamp away from zero
          return 1.0 / xx;
        case tweedie:
          return _link_power == 0
            ?Math.max(2e-16,Math.exp(x))
            :Math.pow(x, 1/ _link_power);
        default:
          throw new RuntimeException("unexpected link function id " + _link);
      }
    }
    // variance function V(mu); binomial-type variance is floored at 1e-6 for numerical stability
    public final double variance(double mu){
      switch(_family) {
        case gaussian:
          return 1;
        case quasibinomial:
        case binomial:
          double res = mu * (1 - mu);
          return res < 1e-6?1e-6:res;
        case poisson:
          return mu;
        case gamma:
          return mu * mu;
        case tweedie:
          return Math.pow(mu,_var_power);
        default:
          throw new RuntimeException("unknown family Id " + this._family);
      }
    }
    // unit deviance of one observation (yr = actual, ym = predicted mean)
    public final double deviance(double yr, double ym){
      double y1 = yr == 0?.1:yr; // avoid log(0) in the tweedie branch below
      switch(_family){
        case gaussian:
          return (yr - ym) * (yr - ym);
        case quasibinomial:
          if(yr == ym) return 0;
          if(ym > 1) return -2 * (yr*Math.log(ym)); // quasibinomial mean may exceed 1
          double res = -2 * (yr*Math.log(ym) + (1-yr)*Math.log(1-ym));
          return res;
        case binomial:
          return 2 * ((MathUtils.y_log_y(yr, ym)) + MathUtils.y_log_y(1 - yr, 1 - ym));
        case poisson:
          if( yr == 0 ) return 2 * ym;
          return 2 * ((yr * Math.log(yr / ym)) - (yr - ym));
        case gamma:
          if( yr == 0 ) return -2;
          return -2 * (Math.log(yr / ym) - (yr - ym) / ym);
        case tweedie:
          double theta = _var_power == 1
            ?Math.log(y1/ym)
            :(Math.pow(y1,1.-_var_power) - Math.pow(ym,1 - _var_power))/(1-_var_power);
          double kappa = _var_power == 2
            ?Math.log(y1/ym)
            :(Math.pow(yr,2-_var_power) - Math.pow(ym,2-_var_power))/(2 - _var_power);
          return 2 * (yr * theta - kappa);
        default:
          throw new RuntimeException("unknown family " + _family);
      }
    }
    public final double deviance(float yr, float ym){
      return deviance((double)yr,(double)ym);
    }
    // fills in likelihood (x.l) and deviance (x.dev) for one weighted observation
    public final void likelihoodAndDeviance(double yr, GLMWeights x, double w) {
      double ym = x.mu;
      switch (_family) {
        case gaussian:
          x.dev = w * (yr - ym) * (yr - ym);
          x.l =  .5 * x.dev;
          break;
        case quasibinomial:
          // NOTE(review): unlike the gaussian/binomial branches this one does not scale by w —
          // confirm whether quasibinomial should honor observation weights here
          if(yr == ym) x.l = 0;
          else if (ym > 1) x.l = -(yr*Math.log(ym));
          else x.l = - (yr*Math.log(ym) + (1-yr)*Math.log(1-ym));
          x.dev = 2*x.l;
          break;
        case binomial:
          x.l = ym ==
            yr?0:w*((MathUtils.y_log_y(yr, ym)) + MathUtils.y_log_y(1 - yr, 1 - ym));
          x.dev = 2*x.l;
          break;
        case poisson:
        case gamma:
        case tweedie:
          x.dev = w*deviance(yr,ym);
          x.l = x.dev;
          break;
        default:
          throw new RuntimeException("unknown family " + _family);
      }
    }
    // per-observation objective contribution (unweighted)
    public final double likelihood(double yr, double ym) {
      switch (_family) {
        case gaussian:
          return .5 * (yr - ym) * (yr - ym);
        case binomial:
        case quasibinomial:
          if (yr == ym) return 0;
          return .5 * deviance(yr, ym);
        case poisson:
          if (yr == 0) return 2 * ym;
          return 2 * ((yr * Math.log(yr / ym)) - (yr - ym));
        case gamma:
          if (yr == 0) return -2;
          return -2 * (Math.log(yr / ym) - (yr - ym) / ym);
        case tweedie:
          return deviance(yr, ym); //fixme: not really correct, not sure what the likelihood is right now
        default:
          throw new RuntimeException("unknown family " + _family);
      }
    }
    // computes the IRLSM working weight (x.w) and working response (x.z) for one row
    public GLMWeights computeWeights(double y, double eta, double off, double w, GLMWeights x) {
      double etaOff = eta + off;
      x.mu = linkInv(etaOff);
      double var = variance(x.mu);//Math.max(1e-5, variance(x.mu)); // avoid numerical problems with 0 variance
      double d = linkDeriv(x.mu);
      x.w = w / (var * d * d);
      x.z = eta + (y - x.mu) * d;
      likelihoodAndDeviance(y,x,w);
      return x;
    }
  }

  // one point on the regularization path: (possibly sparse) coefficient vector + bookkeeping
  public static class Submodel extends Iced {
    public final double lambda_value;
    public final int iteration;
    public final double devianceTrain;
    public final double devianceTest;
    public final int [] idxs;    // indices of non-zero coefficients; null when stored dense
    public final double [] beta; // non-zero values (paired with idxs), or the full dense vector
    // scatters this submodel's coefficients into the supplied dense array and returns it
    public double [] getBeta(double [] beta) {
      if(idxs != null){
        for(int i = 0; i < idxs.length; ++i)
          beta[idxs[i]] = this.beta[i];
//        beta[beta.length-1] = this.beta[this.beta.length-1];
      } else
        System.arraycopy(this.beta,0,beta,0,beta.length);
      return beta;
    }
    // number of non-zero coefficients
    public int rank(){
      return idxs != null?idxs.length:(ArrayUtils.countNonzeros(beta));
    }
    public Submodel(double lambda , double [] beta, int iteration, double devTrain, double devTest){
      this.lambda_value = lambda;
      this.iteration = iteration;
      this.devianceTrain = devTrain;
      this.devianceTest = devTest;
      int r = 0;
      if(beta != null){
        // grab the indeces of non-zero coefficients
        for(int i = 0; i < beta.length; ++i)if(beta[i] != 0)++r;
        if(r < beta.length) {
          // store sparse: index array + compacted values
          idxs = MemoryManager.malloc4(r);
          int j = 0;
          for (int i = 0; i < beta.length; ++i)
            if (beta[i] != 0) idxs[j++] = i;
          this.beta = ArrayUtils.select(beta,idxs);
        } else {
          // fully dense vector; keep a defensive copy
          this.beta = beta.clone();
          idxs = null;
        }
      } else {
        this.beta = null;
        idxs = null;
      }
    }
  }

  public final double _lambda_max;
  public final double [] _ymu;
  public final long _nullDOF;
  public final double _ySigma;
  public final long _nobs;
  private static String[] binomialClassNames = new String[]{"0", "1"};
  @Override protected String[][] scoringDomains(){
    String [][] domains = _output._domains;
    // binomial responses without an explicit domain get the default {"0","1"} labels
    if(_parms._family == Family.binomial && _output._domains[_output._dinfo.responseChunkId(0)] == null) {
      domains = domains.clone();
      domains[_output._dinfo.responseChunkId(0)] = binomialClassNames;
    }
    return domains;
  }
  public void setZValues(double [] zValues, double dispersion, boolean dispersionEstimated) {
    _output._zvalues = zValues;
    _output._dispersion = dispersion;
    _output._dispersionEstimated = dispersionEstimated;
  }
  public static class GLMOutput extends Model.Output {
    Submodel[] _submodels = new Submodel[0];
    DataInfo _dinfo;
    String[] _coefficient_names;
    public int _best_lambda_idx; // lambda which minimizes deviance on validation (if provided) or train (if not)
    public int _lambda_1se = -1; // lambda_best + sd(lambda); only applicable if running lambda search with nfold
    public int _selected_lambda_idx; // lambda which minimizes deviance on validation (if provided) or train (if not)
    public double lambda_best(){return _submodels.length == 0 ? -1 : _submodels[_best_lambda_idx].lambda_value;}
    public double lambda_1se(){return _lambda_1se == -1 || _lambda_1se >= _submodels.length?-1:_submodels.length == 0 ?
        -1 : _submodels[_lambda_1se].lambda_value;}
    public double lambda_selected(){return _submodels[_selected_lambda_idx].lambda_value;}
    double[] _global_beta;
    private double[] _zvalues;
    double [][] _vcov;
    private double _dispersion;
    private boolean _dispersionEstimated;
    public boolean hasPValues(){return _zvalues != null;}
    // standard errors, recovered as beta / z-value
    public double [] stdErr(){
      double [] res = _zvalues.clone();
      for(int i = 0; i < res.length; ++i)
        res[i] = _global_beta[i]/_zvalues[i];
      return res;
    }
    @Override protected long checksum_impl() {
      long d = _global_beta == null?1:Arrays.hashCode(_global_beta);
      return d*super.checksum_impl();
    }
    public double [] zValues(){return _zvalues.clone();}
    // two-sided p-values; t-distribution when dispersion was estimated, standard normal otherwise
    public double [] pValues(){
      double [] res = zValues();
      RealDistribution rd = _dispersionEstimated?new TDistribution(_training_metrics.residual_degrees_of_freedom()):new NormalDistribution();
      for(int i = 0; i < res.length; ++i)
        res[i] = 2*rd.cumulativeProbability(-Math.abs(res[i]));
      return res;
    }
    double[][] _global_beta_multinomial;
    final int _nclasses;
    public boolean _binomial;
    public boolean _multinomial;
    public int rank() { return _submodels[_selected_lambda_idx].rank();}
    public boolean isStandardized() { return _dinfo._predictor_transform == TransformType.STANDARDIZE; }
    public String[] coefficientNames() { return _coefficient_names; }
    // GLM is always supervised
    public boolean isSupervised() { return true; }
    @Override public String[] interactions() { return _dinfo._interactionColumns; }
    // one-hot expands a frame the way GLM's data prep would
    public static Frame expand(Frame fr, String[] interactions, boolean useAll, boolean standardize, boolean skipMissing) {
      return MakeGLMModelHandler.oneHot(fr,interactions,useAll,standardize,false,skipMissing);
    }
    public GLMOutput(DataInfo dinfo, String[] column_names, String[][] domains, String[] coefficient_names, boolean binomial) {
      super(dinfo._weights, dinfo._offset, dinfo._fold);
      _dinfo = dinfo.clone();
      setNames(column_names);
      _domains = domains;
      _coefficient_names = coefficient_names;
      _binomial = binomial;
      _nclasses =
          binomial?2:1;
      if(_binomial && domains[domains.length-1] != null) {
        assert domains[domains.length - 1].length == 2:"Unexpected domains " + Arrays.toString(domains);
        binomialClassNames = domains[domains.length - 1];
      }
    }
    public GLMOutput(DataInfo dinfo, String[] column_names, String[][] domains, String[] coefficient_names, boolean binomial, double[] beta) {
      this(dinfo,column_names,domains,coefficient_names,binomial);
      assert !ArrayUtils.hasNaNsOrInfs(beta);
      _global_beta=beta;
      // single submodel path (no regularization search)
      _submodels = new Submodel[]{new Submodel(0,beta,-1,Double.NaN,Double.NaN)};
    }
    public GLMOutput() {_isSupervised = true; _nclasses = -1;}
    public GLMOutput(GLM glm) {
      super(glm);
      _dinfo = glm._dinfo.clone();
      _dinfo._adaptedFrame = null;
      String[] cnames = glm._dinfo.coefNames();
      String [] names = glm._dinfo._adaptedFrame._names;
      String [][] domains = glm._dinfo._adaptedFrame.domains();
      // if GLM generated an internal weights column, drop it from the output names/domains
      int id = glm._generatedWeights == null?-1:ArrayUtils.find(names, glm._generatedWeights);
      if(id >= 0) {
        _dinfo._weights = false;
        String [] ns = new String[names.length-1];
        String[][] ds = new String[domains.length-1][];
        System.arraycopy(names,0,ns,0,id);
        System.arraycopy(domains,0,ds,0,id);
        System.arraycopy(names,id+1,ns,id,ns.length-id);
        System.arraycopy(domains,id+1,ds,id,ds.length-id);
        names = ns;
        domains = ds;
      }
      setNames(names);
      _domains = domains;
      _coefficient_names = Arrays.copyOf(cnames, cnames.length + 1);
      _coefficient_names[_coefficient_names.length-1] = "Intercept";
      _binomial = glm._parms._family == Family.binomial;
      _nclasses = glm.nclasses();
      _multinomial = _nclasses > 2;
    }
    /**
     * Variance Covariance matrix accessor. Available only if model has been built with p-values.
     * @return the variance-covariance matrix, or null when p-values were not computed
     */
    public double [][] vcov(){return _vcov;}
    @Override public int nclasses() { return _nclasses; }
    @Override public String[] classNames() {
      String [] res = super.classNames();
      if(res == null && _binomial) return binomialClassNames;
      return res;
    }
    // picks the submodel with best test deviance (train deviance as tie-breaker) and selects it
    public Submodel pickBestModel() {
      int bestId = 0;
      Submodel best = _submodels[0];
      for(int i = 1; i < _submodels.length; ++i) {
        Submodel sm = _submodels[i];
        // "!(>)" rather than "<=" keeps NaN test deviances from being rejected outright
        if(!(sm.devianceTest > best.devianceTest) && sm.devianceTrain < best.devianceTrain){
          bestId = i;
          best = sm;
        }
      }
      setSubmodelIdx(_best_lambda_idx = bestId);
      return best;
    }
    public double[] getNormBeta() {return _submodels[_selected_lambda_idx].getBeta(MemoryManager.malloc8d(_dinfo.fullN()+1));}
    public double[][] getNormBetaMultinomial() {
      return getNormBetaMultinomial(_selected_lambda_idx);
    }
    // normalized multinomial coefficients, one row per class
    public double[][] getNormBetaMultinomial(int idx) {
      if(_submodels == null || _submodels.length == 0) // no model yet
        return null;
      double [][] res = new double[nclasses()][];
      Submodel sm = _submodels[idx];
      int N = _dinfo.fullN()+1;
      double [] beta = sm.beta;
      if(sm.idxs != null) // sparse submodel -> expand to the full dense vector first
        beta = ArrayUtils.expandAndScatter(beta,nclasses()*(_dinfo.fullN()+1),sm.idxs);
      // slice the flat coefficient vector into per-class rows
      for(int i = 0; i < res.length; ++i)
        res[i] = Arrays.copyOfRange(beta,i*N,(i+1)*N);
      return res;
    }
    public double[][] get_global_beta_multinomial(){return _global_beta_multinomial;}
    // makes submodel l the selected one and refreshes the de-normalized global coefficients
    public void setSubmodelIdx(int l){
      _selected_lambda_idx = l;
      if(_multinomial) {
        _global_beta_multinomial = getNormBetaMultinomial(l);
        for(int i = 0; i < _global_beta_multinomial.length; ++i)
          _global_beta_multinomial[i] = _dinfo.denormalizeBeta(_global_beta_multinomial[i]);
      } else {
        if (_global_beta == null)
          _global_beta = MemoryManager.malloc8d(_coefficient_names.length);
        else
          Arrays.fill(_global_beta, 0);
        _submodels[l].getBeta(_global_beta);
        _global_beta = _dinfo.denormalizeBeta(_global_beta);
      }
    }
    public double [] beta() { return _global_beta;}
    public Submodel bestSubmodel(){ return _submodels[_best_lambda_idx];}
    public void setSubmodel(double
        lambdaCVEstimate) {
      // selects the submodel matching the given lambda, or throws if none exists
      for(int i = 0; i < _submodels.length; ++i)
        if(_submodels[i] != null && _submodels[i].lambda_value == lambdaCVEstimate) {
          setSubmodelIdx(i);
          return;
        }
      throw new NoSuchElementException("has no model for lambda = " + lambdaCVEstimate);
    }
    // returns the submodel matching the given lambda, or null if none exists
    public Submodel getSubmodel(double lambdaCVEstimate) {
      for(int i = 0; i < _submodels.length; ++i)
        if(_submodels[i] != null && _submodels[i].lambda_value == lambdaCVEstimate) {
          return _submodels[i];
        }
      return null;
    }
  }

  /**
   * get beta coefficients in a map indexed by name
   * @return the estimated coefficients
   */
  public HashMap<String,Double> coefficients(){
    HashMap<String, Double> res = new HashMap<>();
    final double [] b = beta();
    if(b != null)
      for(int i = 0; i < b.length; ++i)res.put(_output._coefficient_names[i],b[i]);
    return res;
  }

  // TODO: Shouldn't this be in schema? have it here for now to be consistent with others...
  /**
   * Re-do the TwoDim table generation with updated model.
   */
  public TwoDimTable generateSummary(Key train, int iter){
    String[] names = new String[]{"Family", "Link", "Regularization", "Number of Predictors Total", "Number of Active Predictors", "Number of Iterations", "Training Frame"};
    String[] types = new String[]{"string", "string", "string", "int", "int", "int", "string"};
    String[] formats = new String[]{"%s", "%s", "%s", "%d", "%d", "%d", "%s"};
    if (_parms._lambda_search) {
      // lambda search adds one extra "Lambda Search" column to the summary
      names = new String[]{"Family", "Link", "Regularization", "Lambda Search", "Number of Predictors Total", "Number of Active Predictors", "Number of Iterations", "Training Frame"};
      types = new String[]{"string", "string", "string", "string", "int", "int", "int", "string"};
      formats = new String[]{"%s", "%s", "%s", "%s", "%d", "%d", "%d", "%s"};
    }
    _output._model_summary = new TwoDimTable("GLM Model", "summary", new String[]{""}, names, types, formats, "");
    _output._model_summary.set(0, 0, _parms._family.toString());
    _output._model_summary.set(0, 1, _parms._link.toString());
    String regularization = "None";
    if (_parms._lambda !=
        null && !(_parms._lambda.length == 1 && _parms._lambda[0] == 0)) { // have regularization
      if (_parms._alpha[0] == 0)
        regularization = "Ridge ( lambda = ";
      else if (_parms._alpha[0] == 1)
        regularization = "Lasso (lambda = ";
      else
        regularization = "Elastic Net (alpha = " + MathUtils.roundToNDigits(_parms._alpha[0], 4) + ", lambda = ";
      regularization = regularization + MathUtils.roundToNDigits(_parms._lambda[_output._selected_lambda_idx], 4) + " )";
    }
    _output._model_summary.set(0, 2, regularization);
    int lambdaSearch = 0;
    if (_parms._lambda_search) {
      lambdaSearch = 1; // shifts the remaining column indices by one
      _output._model_summary.set(0, 3, "nlambda = " + _parms._nlambdas + ", lambda.max = " + MathUtils.roundToNDigits(_lambda_max, 4) + ", lambda.min = " + MathUtils.roundToNDigits(_output.lambda_best(), 4) + ", lambda.1se = " + MathUtils.roundToNDigits(_output.lambda_1se(), 4));
    }
    int intercept = _parms._intercept ? 1 : 0;
    if(_output.nclasses() > 2) {
      _output._model_summary.set(0, 3 + lambdaSearch,_output.nclasses()*_output._coefficient_names.length);
      _output._model_summary.set(0, 4 + lambdaSearch, Integer.toString(_output.rank() - _output.nclasses()*intercept));
    } else {
      _output._model_summary.set(0, 3 + lambdaSearch, beta().length - 1);
      _output._model_summary.set(0, 4 + lambdaSearch, Integer.toString(_output.rank() - intercept));
    }
    _output._model_summary.set(0, 5 + lambdaSearch, Integer.valueOf(iter));
    _output._model_summary.set(0, 6 + lambdaSearch, train.toString());
    return _output._model_summary;
  }

  @Override public long checksum_impl(){
    if(_parms._train == null) return 0;
    return super.checksum_impl();
  }

  // per-thread scratch buffer for multinomial class scores
  private static ThreadLocal<double[]> _eta = new ThreadLocal<>();

  @Override protected double[] score0(double[] data, double[] preds){return score0(data,preds,1,0);}
  // scores one row: data = adapted row (cats first, then nums), w = weight, o = offset
  @Override protected double[] score0(double[] data, double[] preds, double w, double o) {
    if(_parms._family == Family.multinomial) {
      if(o != 0) throw H2O.unimpl("Offset is not implemented for multinomial.");
      double[] eta = _eta.get();
      if(eta ==
null || eta.length < _output.nclasses()) _eta.set(eta = MemoryManager.malloc8d(_output.nclasses()));
    final double[][] bm = _output._global_beta_multinomial;
    double sumExp = 0;
    double maxRow = 0;
    // Per-class linear predictor: intercept + categorical betas + numeric betas.
    for (int c = 0; c < bm.length; ++c) {
      double e = bm[c][bm[c].length-1]; // intercept is the last coefficient
      double [] b = bm[c];
      for(int i = 0; i < _output._dinfo._cats; ++i) {
        int l = _output._dinfo.getCategoricalId(i, data[i]);
        // l < 0 means this level contributes nothing (e.g. a skipped base level).
        if (l >= 0) e += b[l];
      }
      int coff = _output._dinfo._cats;
      int boff = _output._dinfo.numStart();
      for(int i = 0; i < _output._dinfo._nums; ++i) {
        double d = data[coff+i];
        // Mean-impute missing numerics unless NA rows are being skipped.
        if(!_output._dinfo._skipMissing && Double.isNaN(d)) d = _output._dinfo._numMeans[i];
        e += d*b[boff+i];
      }
      if(e > maxRow) maxRow = e;
      eta[c] = e;
    }
    // Softmax with max subtraction for numerical stability.
    for (int c = 0; c < bm.length; ++c)
      sumExp += eta[c] = Math.exp(eta[c]-maxRow); // intercept
    sumExp = 1.0 / sumExp;
    for (int c = 0; c < bm.length; ++c) preds[c + 1] = eta[c] * sumExp;
    // NOTE(review): the ThreadLocal buffer may be longer than nclasses (it is
    // only grown, never shrunk, and is shared across models on this thread);
    // maxIndex scans the whole array, so stale tail entries could in principle
    // win — verify eta is sized exactly or that the tail is benign.
    preds[0] = ArrayUtils.maxIndex(eta);
  } else {
    double[] b = beta();
    double eta = b[b.length - 1] + o; // intercept + offset
    // The !isNaN(eta) guards short-circuit the loops once eta has been
    // poisoned by a missing value.
    for (int i = 0; i < _output._dinfo._cats && !Double.isNaN(eta); ++i) {
      int l = _output._dinfo.getCategoricalId(i, data[i]);
      if (l >= 0) eta += b[l];
    }
    int numStart = _output._dinfo.numStart();
    int ncats = _output._dinfo._cats;
    for (int i = 0; i < _output._dinfo._nums && !Double.isNaN(eta); ++i) {
      double d = data[ncats + i];
      if (!_output._dinfo._skipMissing && Double.isNaN(d)) d = _output._dinfo._numMeans[i];
      eta += b[numStart + i] * d;
    }
    double mu = _parms.linkInv(eta);
    if (_parms._family == Family.binomial) { // threshold for prediction
      preds[0] = mu >= defaultThreshold()?1:0;
      preds[1] = 1.0 - mu; // class 0
      preds[2] = mu; // class 1
    } else
      preds[0] = mu;
  }
  return preds;
}

/**
 * Emit the body of the generated POJO's predict method. The generated code
 * mirrors score0 above: optional NA mean-imputation, categorical and numeric
 * beta contributions, link inverse, and — for multinomial — a stable softmax.
 */
@Override protected void toJavaPredictBody(SBPrintStream body, CodeGeneratorPipeline classCtx, CodeGeneratorPipeline fileCtx, final boolean verboseCode) {
  // Generate static fields
  classCtx.add(new CodeGenerator() {
    @Override public void generate(JCodeSB out) {
      JCodeGen.toClassWithArray(out, "public static", "BETA", beta_internal()); // "The Coefficients"
      JCodeGen.toClassWithArray(out, "static", "NUM_MEANS", _output._dinfo._numMeans,"Imputed numeric values");
      JCodeGen.toClassWithArray(out, "static", "CAT_MODES", _output._dinfo.catNAFill(),"Imputed categorical values.");
      JCodeGen.toStaticVar(out, "CATOFFS", dinfo()._catOffsets, "Categorical Offsets");
    }
  });
  body.ip("final double [] b = BETA.VALUES;").nl();
  if(_parms._missing_values_handling == MissingValuesHandling.MeanImputation){
    // Emit an NA-imputation preamble: categorical modes, then numeric means.
    body.ip("for(int i = 0; i < " + _output._dinfo._cats + "; ++i) if(Double.isNaN(data[i])) data[i] = CAT_MODES.VALUES[i];").nl();
    body.ip("for(int i = 0; i < " + _output._dinfo._nums + "; ++i) if(Double.isNaN(data[i + " + _output._dinfo._cats + "])) data[i+" + _output._dinfo._cats + "] = NUM_MEANS.VALUES[i];").nl();
  }
  if(_parms._family != Family.multinomial) {
    body.ip("double eta = 0.0;").nl();
    if (!_parms._use_all_factor_levels) { // skip level 0 of all factors
      body.ip("for(int i = 0; i < CATOFFS.length-1; ++i) if(data[i] != 0) {").nl();
      body.ip(" int ival = (int)data[i] - 1;").nl();
      body.ip(" if(ival != data[i] - 1) throw new IllegalArgumentException(\"categorical value out of range\");").nl();
      body.ip(" ival += CATOFFS[i];").nl();
      body.ip(" if(ival < CATOFFS[i + 1])").nl();
      body.ip(" eta += b[ival];").nl();
    } else { // do not skip any levels
      body.ip("for(int i = 0; i < CATOFFS.length-1; ++i) {").nl();
      body.ip(" int ival = (int)data[i];").nl();
      body.ip(" if(ival != data[i]) throw new IllegalArgumentException(\"categorical value out of range\");").nl();
      body.ip(" ival += CATOFFS[i];").nl();
      body.ip(" if(ival < CATOFFS[i + 1])").nl();
      body.ip(" eta += b[ival];").nl();
    }
    body.ip("}").nl();
    // Numeric part: beta index = noff + data index; the loop bound skips the
    // trailing intercept slot (b.length-1).
    final int noff = dinfo().numStart() - dinfo()._cats;
    body.ip("for(int i = ").p(dinfo()._cats).p("; i < b.length-1-").p(noff).p("; ++i)").nl();
    body.ip("eta += b[").p(noff).p("+i]*data[i];").nl();
    body.ip("eta += b[b.length-1]; // reduce intercept").nl();
    if(_parms._family != Family.tweedie)
body.ip("double mu = hex.genmodel.GenModel.GLM_").p(_parms._link.toString()).p("Inv(eta");
    else
      // Tweedie's inverse link needs the link power as an extra argument.
      body.ip("double mu = hex.genmodel.GenModel.GLM_tweedieInv(eta," + _parms._tweedie_link_power);
    body.p(");").nl();
    if (_parms._family == Family.binomial) {
      body.ip("preds[0] = (mu >= ").p(defaultThreshold()).p(") ? 1 : 0").p("; // threshold given by ROC").nl();
      body.ip("preds[1] = 1.0 - mu; // class 0").nl();
      body.ip("preds[2] = mu; // class 1").nl();
    } else {
      body.ip("preds[0] = mu;").nl();
    }
  } else {
    // Multinomial: emit one linear predictor per class, then a stable softmax.
    int P = _output._global_beta_multinomial[0].length; // coefficients per class
    body.ip("preds[0] = 0;").nl();
    body.ip("for(int c = 0; c < " + _output._nclasses + "; ++c){").nl();
    body.ip(" preds[c+1] = 0;").nl();
    if(dinfo()._cats > 0) {
      if (!_parms._use_all_factor_levels) { // skip level 0 of all factors
        body.ip(" for(int i = 0; i < CATOFFS.length-1; ++i) if(data[i] != 0) {").nl();
        body.ip(" int ival = (int)data[i] - 1;").nl();
        body.ip(" if(ival != data[i] - 1) throw new IllegalArgumentException(\"categorical value out of range\");").nl();
        body.ip(" ival += CATOFFS[i];").nl();
        body.ip(" if(ival < CATOFFS[i + 1])").nl();
        body.ip(" preds[c+1] += b[ival+c*" + P + "];").nl();
      } else { // do not skip any levels
        body.ip(" for(int i = 0; i < CATOFFS.length-1; ++i) {").nl();
        body.ip(" int ival = (int)data[i];").nl();
        body.ip(" if(ival != data[i]) throw new IllegalArgumentException(\"categorical value out of range\");").nl();
        body.ip(" ival += CATOFFS[i];").nl();
        body.ip(" if(ival < CATOFFS[i + 1])").nl();
        body.ip(" preds[c+1] += b[ival+c*" + P + "];").nl();
      }
      body.ip(" }").nl();
    }
    // Class c's betas live at offset c*P inside the flattened BETA array.
    final int noff = dinfo().numStart();
    body.ip(" for(int i = 0; i < " + dinfo()._nums + "; ++i)").nl();
    body.ip(" preds[c+1] += b[" + noff + "+i + c*" + P + "]*data[i];").nl();
    body.ip(" preds[c+1] += b[" + (P-1) +" + c*" + P + "]; // reduce intercept").nl();
    body.ip("}").nl();
    // Stable softmax over the per-class linear predictors; preds[0] tracks argmax.
    body.ip("double max_row = 0;").nl();
    body.ip("for(int c = 1; c < preds.length; ++c) if(preds[c] > max_row) max_row = preds[c];").nl();
    body.ip("double sum_exp = 0;").nl();
    body.ip("for(int c = 1; c < preds.length; ++c) { sum_exp += (preds[c] = Math.exp(preds[c]-max_row));}").nl();
    body.ip("sum_exp = 1/sum_exp;").nl();
    body.ip("double max_p = 0;").nl();
    body.ip("for(int c = 1; c < preds.length; ++c) if((preds[c] *= sum_exp) > max_p){ max_p = preds[c]; preds[0] = c-1;};").nl();
  }
}

/** Emit POJO class-level helpers: supervised flag, feature and class counts. */
@Override protected SBPrintStream toJavaInit(SBPrintStream sb, CodeGeneratorPipeline fileCtx) {
  sb.nl();
  sb.ip("public boolean isSupervised() { return true; }").nl();
  sb.ip("public int nfeatures() { return "+_output.nfeatures()+"; }").nl();
  sb.ip("public int nclasses() { return "+_output.nclasses()+"; }").nl();
  return sb;
}

/**
 * Build the scoring MRTask over an already-adapted frame.
 * Metrics are computed only when the response column is present and not all-bad.
 * @param j may be null (no job to report progress against)
 */
private GLMScore makeScoringTask(Frame adaptFrm, boolean generatePredictions, Job j){
  // Build up the names & domains.
  final boolean computeMetrics = adaptFrm.vec(_output.responseName()) != null && !adaptFrm.vec(_output.responseName()).isBad();
  String [] domain = _output.nclasses()<=1 ? null : !computeMetrics ? _output._domains[_output._domains.length-1] : adaptFrm.lastVec().domain();
  // Score the dataset, building the class distribution & predictions
  return new GLMScore(j, this, _output._dinfo.scoringInfo(_output._names,adaptFrm),domain,computeMetrics, generatePredictions);
}

/** Score an already adapted frame. Returns a new Frame with new result
 * vectors, all in the DKV. Caller responsible for deleting. Input is
 * already adapted to the Model's domain, so the output is also. Also
 * computes the metrics for this frame.
*
 * @param fr original (un-adapted) frame, used when building model metrics
 * @param adaptFrm Already adapted frame
 * @param destination_key key under which the output frame is installed
 * @param j job to report progress against; may be null
 * @param computeMetrics whether model metrics should be computed while scoring
 * @return A Frame containing the prediction column, and class distribution
 */
@Override protected Frame predictScoreImpl(Frame fr, Frame adaptFrm, String destination_key, Job j, boolean computeMetrics) {
  String [] names = makeScoringNames();
  String [][] domains = new String[names.length][];
  GLMScore gs = makeScoringTask(adaptFrm,true,j);// doAll(names.length,Vec.T_NUM,adaptFrm);
  assert gs._dinfo._valid:"_valid flag should be set on data info when doing scoring";
  gs.doAll(names.length,Vec.T_NUM,gs._dinfo._adaptedFrame);
  if (gs._computeMetrics) gs._mb.makeModelMetrics(this, fr, adaptFrm, gs.outputFrame());
  // Only the label column gets a domain (class labels); the rest stay numeric.
  domains[0] = gs._domain;
  return gs.outputFrame(Key.<Frame>make(destination_key),names, domains);
}

/** Prediction column names; appends "StdErr" when a variance-covariance matrix is available. */
@Override public String [] makeScoringNames(){
  String [] res = super.makeScoringNames();
  if(_output._vcov != null) res = ArrayUtils.append(res,"StdErr");
  return res;
}

/** Score an already adapted frame. Returns a MetricBuilder that can be used to make a model metrics.
 * @param adaptFrm Already adapted frame
 * @return MetricBuilder
 */
@Override protected ModelMetrics.MetricBuilder scoreMetrics(Frame adaptFrm) {
  // generatePredictions == false: metrics only, no prediction frame is built.
  GLMScore gs = makeScoringTask(adaptFrm,false,null);// doAll(names.length,Vec.T_NUM,adaptFrm);
  assert gs._dinfo._valid:"_valid flag should be set on data info when doing scoring";
  return gs.doAll(gs._dinfo._adaptedFrame)._mb;
}

/** MOJO export support for this GLM model. */
@Override public GLMMojoWriter getMojo() { return new GLMMojoWriter(this); }
}