package hex.glm; import hex.*; import hex.deeplearning.DeepLearningModel.DeepLearningParameters.MissingValuesHandling; import hex.glm.GLMModel.*; import hex.optimization.ADMM.L1Solver; import hex.optimization.L_BFGS; import hex.glm.GLMModel.GLMParameters.Family; import hex.glm.GLMModel.GLMParameters.Link; import hex.glm.GLMModel.GLMParameters.Solver; import hex.glm.GLMTask.*; import hex.gram.Gram; import hex.gram.Gram.Cholesky; import hex.gram.Gram.NonSPDMatrixException; import hex.optimization.ADMM; import hex.optimization.ADMM.ProximalSolver; import hex.optimization.L_BFGS.*; import hex.optimization.OptimizationUtils.*; import jsr166y.CountedCompleter; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import water.*; import water.exceptions.H2OModelBuilderIllegalArgumentException; import water.fvec.*; import water.parser.BufferedString; import water.util.*; import water.util.ArrayUtils; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; /** * Created by tomasnykodym on 8/27/14. * * Generalized linear model implementation. */ public class GLM extends ModelBuilder<GLMModel,GLMParameters,GLMOutput> { protected boolean _cv; // flag signalling this is MB for one of the fold-models during cross-validation
static NumberFormat lambdaFormatter = new DecimalFormat(".##E0"); static NumberFormat devFormatter = new DecimalFormat(".##"); public static final int SCORING_INTERVAL_MSEC = 15000; // scoreAndUpdateModel every 15 seconds unless score_each_iteration is set
public String _generatedWeights = null; public GLM(boolean startup_once){super(new GLMParameters(),startup_once);} public GLM(GLMModel.GLMParameters parms) { super(parms); init(false); } public GLM(GLMModel.GLMParameters parms,Key dest) { super(parms,dest); init(false); } private transient GLMDriver _driver; public boolean isSupervised() { return true; } @Override public ModelCategory[] can_build() { return new ModelCategory[]{ ModelCategory.Regression, ModelCategory.Binomial, }; } @Override public boolean havePojo() { return true; } @Override public boolean haveMojo() { return true; } private double _lambdaCVEstimate = Double.NaN; // lambda cross-validation estimate
private boolean _doInit = true; // flag setting whether or not to run init
private double [] _xval_test_deviances; private double [] _xval_test_sd; /** * GLM implementation of N-fold cross-validation. * We need to compute the sequence of lambdas for the main model so the folds share the same lambdas. * We also want to set the _cv flag so that the dependent jobs know they're being run within CV (so e.g. they do not unlock the models at the end). * @return Cross-validation Job * (builds N+1 models, all have train+validation metrics, the main model has N-fold cross-validated validation metrics) */ @Override public void computeCrossValidation() { // init computes global list of lambdas
init(true); _cv = true; if (error_count() > 0) throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(GLM.this); super.computeCrossValidation(); } /** * If run with lambda search, we need to take extra action after the cross-validation models are built. * Each of the folds has been computed with its own private validation dataset and performed early stopping based on it. * => We need to: * 1. compute cross-validated lambda estimate * 2. set the lambda estimate to all n-fold models (might require extra model fitting if the particular model stopped too early!)
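*    (a fold model that stopped its lambda search early is extended on demand via computeSubmodel for each lambda it is missing;
*    for a given lambda, the cross-validated deviance is the mean of the per-fold test deviances, and its standard error is
*    computed below as sqrt(sum_i (devAvg - dev_i)^2 / (n*(n-1))) for n folds, i.e. the standard error of the mean)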
* 3. compute cross-validated scoring history (cross-validated deviance standard error per lambda) * 4. unlock the n-folds models (they are changed here, so the unlocking happens here) */ @Override public void cv_computeAndSetOptimalParameters(ModelBuilder[] cvModelBuilders) { if(_parms._lambda_search) { _xval_test_deviances = new double[_parms._lambda.length]; _xval_test_sd = new double [_parms._lambda.length]; double bestTestDev = Double.POSITIVE_INFINITY; int lmin_max = 0; for (int i = 0; i < cvModelBuilders.length; ++i) { GLM g = (GLM) cvModelBuilders[i]; lmin_max = Math.max(lmin_max,g._model._output._best_lambda_idx); } int lidx = 0; int bestId = 0; int cnt = 0; for (; lidx < lmin_max; ++lidx) { double testDev = 0; for (int i = 0; i < cvModelBuilders.length; ++i) { GLM g = (GLM) cvModelBuilders[i]; double x = _parms._lambda[lidx]; if (g._model._output.getSubmodel(x) == null) g._driver.computeSubmodel(lidx, x); testDev += g._model._output.getSubmodel(x).devianceTest; } double testDevAvg = testDev / cvModelBuilders.length; double testDevSE = 0; // compute deviance standard error for (int i = 0; i < cvModelBuilders.length; ++i) { GLM g = (GLM) cvModelBuilders[i]; double x = _parms._lambda[lidx]; if (g._model._output.getSubmodel(x) == null) g._driver.computeSubmodel(lidx, x); // stopped too early, need to FIT extra model double diff = testDevAvg - (g._model._output.getSubmodel(x).devianceTest); testDevSE += diff*diff; } _xval_test_sd[lidx] = Math.sqrt(testDevSE/((cvModelBuilders.length-1)*cvModelBuilders.length)); _xval_test_deviances[lidx] = testDevAvg; if(testDevAvg < bestTestDev) { bestTestDev = testDevAvg; bestId = lidx; } // early stopping - no reason to move further if we're overfitting if(testDevAvg > bestTestDev && ++cnt == 3) { lmin_max = lidx; break; } } for (int i = 0; i < cvModelBuilders.length; ++i) { GLM g = (GLM) cvModelBuilders[i]; if(g._toRemove != null) for(Key k:g._toRemove) Keyed.remove(k); } _parms._lambda = Arrays.copyOf(_parms._lambda,lmin_max+1); _xval_test_deviances = Arrays.copyOf(_xval_test_deviances, lmin_max+1); _xval_test_sd = Arrays.copyOf(_xval_test_sd, lmin_max+1); for (int i = 0; i < cvModelBuilders.length; ++i) { GLM g = (GLM) cvModelBuilders[i]; g._model._output.setSubmodelIdx(bestId); g._model.update(_job); } double bestDev = _xval_test_deviances[bestId]; double bestDev1se = bestDev + _xval_test_sd[bestId]; int bestId1se = bestId; while(bestId1se > 0 && _xval_test_deviances[bestId1se-1] <= bestDev1se) --bestId1se; _lambdaCVEstimate = _parms._lambda[bestId]; _model._output._lambda_1se = bestId1se; _model._output._best_lambda_idx = bestId; } for (int i = 0; i < cvModelBuilders.length; ++i) { GLM g = (GLM) cvModelBuilders[i]; g._model.unlock(_job); } _doInit = false; _cv = false; } protected void checkMemoryFootPrint(DataInfo activeData) { if (_parms._solver == Solver.IRLSM || _parms._solver == Solver.COORDINATE_DESCENT) { int p = activeData.fullN(); HeartBeat hb = H2O.SELF._heartbeat; long mem_usage = (long) (hb._cpus_allowed * (p * p + activeData.largestCat()) * 8/*doubles*/ * (1 + .5 * Math.log((double) _train.lastVec().nChunks()) / Math.log(2.))); //one gram per core long max_mem = hb.get_free_mem(); if (mem_usage > max_mem) { String msg = "Gram matrices (one per thread) won't fit in the driver node's memory (" + PrettyPrint.bytes(mem_usage) + " > " + PrettyPrint.bytes(max_mem) + ") - try reducing the number of columns and/or the number of categorical factors (or switch to the L-BFGS solver)."; error("_train", msg); } } } static class 
TooManyPredictorsException extends RuntimeException {} DataInfo _dinfo; private transient DataInfo _validDinfo; // time per iteration in ms
private static class ScoringHistory { private ArrayList<Integer> _scoringIters = new ArrayList<>(); private ArrayList<Long> _scoringTimes = new ArrayList<>(); private ArrayList<Double> _likelihoods = new ArrayList<>(); private ArrayList<Double> _objectives = new ArrayList<>(); public synchronized void addIterationScore(int iter, double likelihood, double obj) { if (_scoringIters.size() > 0 && _scoringIters.get(_scoringIters.size() - 1) == iter) return; // do not record twice; happens for the last iteration - scoring history needs to be recorded in checkKKTs because of the gaussian family
_scoringIters.add(iter); _scoringTimes.add(System.currentTimeMillis()); _likelihoods.add(likelihood); _objectives.add(obj); } public synchronized TwoDimTable to2dTable() { String[] cnames = new String[]{"timestamp", "duration", "iteration", "negative_log_likelihood", "objective"}; String[] ctypes = new String[]{"string", "string", "int", "double", "double"}; String[] cformats = new String[]{"%s", "%s", "%d", "%.5f", "%.5f"}; TwoDimTable res = new TwoDimTable("Scoring History", "", new String[_scoringIters.size()], cnames, ctypes, cformats, ""); int j = 0; DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); for (int i = 0; i < _scoringIters.size(); ++i) { int col = 0; res.set(i, col++, fmt.print(_scoringTimes.get(i))); res.set(i, col++, PrettyPrint.msecs(_scoringTimes.get(i) - _scoringTimes.get(0), true)); res.set(i, col++, _scoringIters.get(i)); res.set(i, col++, _likelihoods.get(i)); res.set(i, col++, _objectives.get(i)); } return res; } } private static class LambdaSearchScoringHistory { ArrayList<Long> _scoringTimes = new ArrayList<>(); private ArrayList<Double> _lambdas = new ArrayList<>(); private ArrayList<Integer> _lambdaIters = new ArrayList<>(); private ArrayList<Integer> _lambdaPredictors = new ArrayList<>(); private ArrayList<Double> _lambdaDevTrain = new ArrayList<>(); private ArrayList<Double> _lambdaDevTest; private ArrayList<Double> _lambdaDevXval; private ArrayList<Double> _lambdaDevXvalSE; public LambdaSearchScoringHistory(boolean hasTest, boolean hasXval) { if(hasTest || true)_lambdaDevTest = new ArrayList<>(); if(hasXval){ _lambdaDevXval = new ArrayList<>(); _lambdaDevXvalSE = new ArrayList<>(); } } public synchronized void addLambdaScore(int iter, int predictors, double lambda, double devRatioTrain, double devRatioTest, double devRatioXval, double devRatioXvalSE) { _scoringTimes.add(System.currentTimeMillis()); _lambdaIters.add(iter); _lambdas.add(lambda); _lambdaPredictors.add(predictors); _lambdaDevTrain.add(devRatioTrain); if(_lambdaDevTest != null)_lambdaDevTest.add(devRatioTest); if(_lambdaDevXval != null)_lambdaDevXval.add(devRatioXval); if(_lambdaDevXvalSE != null)_lambdaDevXvalSE.add(devRatioXvalSE); } public synchronized TwoDimTable to2dTable() { String[] cnames = new String[]{"timestamp", "duration", "iteration", "lambda", "predictors", "deviance_train"}; if(_lambdaDevTest != null) cnames = ArrayUtils.append(cnames,"deviance_test"); if(_lambdaDevXval != null) cnames = ArrayUtils.append(cnames,new String[]{"deviance_xval","deviance_se"}); String[] ctypes = new String[]{"string", "string", "int", "string","int", "double"}; if(_lambdaDevTest != null) ctypes = ArrayUtils.append(ctypes,"double"); if(_lambdaDevXval != null) ctypes = ArrayUtils.append(ctypes, new String[]{"double","double"}); String[] cformats = new String[]{"%s", "%s", "%d","%s", "%d", "%.3f"}; if(_lambdaDevTest != null) cformats = ArrayUtils.append(cformats,"%.3f"); if(_lambdaDevXval != null) cformats = ArrayUtils.append(cformats,new String[]{"%.3f","%.3f"}); TwoDimTable res = new TwoDimTable("Scoring History", "", new String[_lambdaIters.size()], cnames, ctypes, cformats, ""); int j = 0; DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss"); for (int i = 0; i < _lambdaIters.size(); ++i) { int col = 0; res.set(i, col++, fmt.print(_scoringTimes.get(i))); res.set(i, col++, PrettyPrint.msecs(_scoringTimes.get(i) - _scoringTimes.get(0), true)); res.set(i, col++, _lambdaIters.get(i)); res.set(i, col++, lambdaFormatter.format(_lambdas.get(i))); res.set(i, col++, _lambdaPredictors.get(i)); res.set(i, col++, _lambdaDevTrain.get(i)); if(_lambdaDevTest != null && _lambdaDevTest.size() > i) res.set(i, col++, _lambdaDevTest.get(i)); if(_lambdaDevXval != null && _lambdaDevXval.size() > i) { res.set(i, col++, _lambdaDevXval.get(i)); res.set(i, col++, _lambdaDevXvalSE.get(i)); } } return res; } } private transient ScoringHistory _sc; private transient LambdaSearchScoringHistory _lsc; long _t0 = System.currentTimeMillis(); private transient double _iceptAdjust = 0; private double _lmax; private transient long _nobs; private transient GLMModel _model; @Override public int nclasses() { if (_parms._family == Family.multinomial) return _nclass; if (_parms._family == Family.binomial) return 2; return 1; } private transient double[] _nullBeta; private double[] getNullBeta() { if (_nullBeta == null) { if (_parms._family == Family.multinomial) { _nullBeta = MemoryManager.malloc8d((_dinfo.fullN() + 1) * nclasses()); int N = _dinfo.fullN() + 1; if(_parms._intercept) for (int i = 0; i < nclasses(); ++i) _nullBeta[_dinfo.fullN() + i * N] = Math.log(_state._ymu[i]); } else { _nullBeta = MemoryManager.malloc8d(_dinfo.fullN() + 1); if (_parms._intercept && !(_parms._family == Family.quasibinomial)) _nullBeta[_dinfo.fullN()] = new GLMModel.GLMWeightsFun(_parms).link(_state._ymu[0]); else _nullBeta[_dinfo.fullN()] = 0; } } return _nullBeta; } protected boolean computePriorClassDistribution(){return _parms._family == Family.multinomial;} @Override public void init(boolean expensive) { super.init(expensive); hide("_balance_classes", "Not applicable since class balancing is not required for GLM."); hide("_max_after_balance_size", "Not applicable since class balancing is not required for GLM."); hide("_class_sampling_factors", "Not applicable since class balancing is not required for GLM."); _parms.validate(this); if(_response != null) { if(!isClassifier() && _response.isCategorical()) error("_response", H2O.technote(2, "Regression requires numeric response, got categorical.")); switch (_parms._family) { case binomial: if (!_response.isBinary() && _nclass != 2) error("_family", H2O.technote(2, "Binomial requires the response to be a 2-class categorical or a binary column (0/1)")); break; case multinomial: if (_nclass <= 2) error("_family", H2O.technote(2, "Multinomial requires a categorical response with at least 3 levels (for a 2-class problem use family=binomial).")); break; case poisson: if (_nclass != 1) error("_family", "Poisson requires the response to be numeric."); if (_response.min() < 0) error("_family", "Poisson requires response >= 0"); if (!_response.isInt()) warn("_family", "Poisson expects non-negative integer response, got floats."); break; case gamma: if (_nclass != 1) error("_distribution", H2O.technote(2, "Gamma requires the response to be numeric."));
if (_response.min() <= 0) error("_family", "Gamma requires positive response"); break; case tweedie: if (_nclass != 1) error("_family", H2O.technote(2, "Tweedie requires the response to be numeric.")); break; case quasibinomial: if (_nclass != 1) error("_family", H2O.technote(2, "Quasi_binomial requires the response to be numeric.")); break; case gaussian: // if (_nclass != 1) error("_family", H2O.technote(2, "Gaussian requires the response to be numeric."));
break; default: error("_family", "Invalid distribution: " + _parms._distribution); } } if (expensive) { if (error_count() > 0) return; if (_parms._alpha == null) _parms._alpha = new double[]{_parms._solver == Solver.L_BFGS ? 0 : .5}; if (_parms._lambda_search && _parms._nlambdas == -1) _parms._nlambdas = _parms._alpha[0] == 0?30:100; // fewer lambdas needed for ridge
_lsc = new LambdaSearchScoringHistory(_parms._valid != null,_parms._nfolds > 1); _sc = new ScoringHistory(); _train.bulkRollups(); // make sure we have all the rollups computed in parallel
_t0 = System.currentTimeMillis(); if (_parms._lambda_search || !_parms._intercept || _parms._lambda == null || _parms._lambda[0] > 0) _parms._use_all_factor_levels = true; if (_parms._link == Link.family_default) _parms._link = _parms._family.defaultLink; _dinfo = new DataInfo(_train.clone(), _valid, 1, _parms._use_all_factor_levels || _parms._lambda_search, _parms._standardize ? DataInfo.TransformType.STANDARDIZE : DataInfo.TransformType.NONE, DataInfo.TransformType.NONE, _parms._missing_values_handling == MissingValuesHandling.Skip, _parms._missing_values_handling == MissingValuesHandling.MeanImputation, false, hasWeightCol(), hasOffsetCol(), hasFoldCol(), _parms._interactions); if (_parms._max_iterations == -1) { // fill in default max iterations
int numclasses = _parms._family == Family.multinomial?nclasses():1; if (_parms._solver == Solver.L_BFGS) { _parms._max_iterations = _parms._lambda_search ? _parms._nlambdas * 100 * numclasses : numclasses * Math.max(20, _dinfo.fullN() >> 2); if(_parms._alpha[0] > 0) _parms._max_iterations *= 10; } else _parms._max_iterations = _parms._lambda_search ? 10 * _parms._nlambdas : 50; } if (_valid != null) _validDinfo = _dinfo.validDinfo(_valid); _state = new ComputationState(_job, _parms, _dinfo, null, nclasses()); // skipping extra rows? (outside of weights == 0)
boolean skippingRows = (_parms._missing_values_handling == MissingValuesHandling.Skip && _train.hasNAs()); if (hasWeightCol() || skippingRows) { // need to re-compute means and sd
boolean setWeights = skippingRows;// && _parms._lambda_search && _parms._alpha[0] > 0;
if (setWeights) { Vec wc = _weights == null ? _dinfo._adaptedFrame.anyVec().makeCon(1) : _weights.makeCopy(); _dinfo.setWeights(_generatedWeights = "__glm_gen_weights", wc); } YMUTask ymt = new YMUTask(_dinfo, _parms._family == Family.multinomial?nclasses():1, setWeights, skippingRows,true,false).doAll(_dinfo._adaptedFrame); if (ymt.wsum() == 0) throw new IllegalArgumentException("No rows left in the dataset after filtering out rows with missing values. Ignore columns with many NAs or impute your missing values prior to calling glm.");
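// Note on the normalization constant set below: _obj_reg multiplies the likelihood term of the objective and
// defaults to 1/sum(weights) (or 1/nobs when there are no weights), so a given lambda penalizes comparably
// across datasets of different size.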
Log.info(LogMsg("using " + ymt.nobs() + " nobs out of " + _dinfo._adaptedFrame.numRows() + " total")); // if sparse data, need second pass to compute variance
_nobs = ymt.nobs(); if (_parms._obj_reg == -1) _parms._obj_reg = 1.0 / ymt.wsum(); if(!_parms._stdOverride) _dinfo.updateWeightedSigmaAndMean(ymt.predictorSDs(), ymt.predictorMeans()); if (_parms._family == Family.multinomial) { _state._ymu = MemoryManager.malloc8d(_nclass); for (int i = 0; i < _state._ymu.length; ++i) _state._ymu[i] = _priorClassDist[i];//ymt.responseMeans()[i];
} else _state._ymu = _parms._intercept?ymt._yMu:new double[]{_parms.linkInv(0)}; } else { _nobs = _train.numRows(); if (_parms._obj_reg == -1) _parms._obj_reg = 1.0 / _nobs; if (_parms._family == Family.multinomial) { _state._ymu = MemoryManager.malloc8d(_nclass); for (int i = 0; i < _state._ymu.length; ++i) _state._ymu[i] = _priorClassDist[i]; } else _state._ymu = new double[]{_parms._intercept?_train.lastVec().mean():_parms.linkInv(0)}; } BetaConstraint bc = (_parms._beta_constraints != null)?new BetaConstraint(_parms._beta_constraints.get()):new BetaConstraint(); if((bc.hasBounds() || bc.hasProximalPenalty()) && _parms._compute_p_values) error("_compute_p_values","P-values cannot be computed for constrained problems"); _state.setBC(bc); if(hasOffsetCol() && _parms._intercept) { // fit intercept
GLMGradientSolver gslvr = new GLMGradientSolver(_job,_parms, _dinfo.filterExpandedColumns(new int[0]), 0, _state.activeBC()); double [] x = new L_BFGS().solve(gslvr,new double[]{-_offset.mean()}).coefs; Log.info(LogMsg("fitted intercept = " + x[0])); x[0] = _parms.linkInv(x[0]); _state._ymu = x; } if (_parms._prior > 0) _iceptAdjust = -Math.log(_state._ymu[0] * (1 - _parms._prior) / (_parms._prior * (1 - _state._ymu[0]))); ArrayList<Vec> vecs = new ArrayList<>(); if(_weights != null) vecs.add(_weights); if(_offset != null) vecs.add(_offset); vecs.add(_response); double [] beta = getNullBeta(); GLMGradientInfo ginfo = new GLMGradientSolver(_job,_parms, _dinfo, 0, _state.activeBC()).getGradient(beta); _lmax = lmax(ginfo._gradient); _state.setLambdaMax(_lmax); _model = new GLMModel(_result, _parms, GLM.this, _state._ymu, _dinfo._adaptedFrame.lastVec().sigma(), _lmax, _nobs); String[] warns = _model.adaptTestForTrain(_valid, true, true); for (String s : warns) _job.warn(s); if (_parms._lambda_min_ratio == -1) { _parms._lambda_min_ratio = (_nobs >> 4) > _dinfo.fullN() ? 1e-4 : 1e-2; if(_parms._alpha[0] == 0) _parms._lambda_min_ratio *= 1e-2; // smaller lambda min for ridge as we are starting quite high
} _state.updateState(beta,ginfo);
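// Sketch of the lambda sequence built below when running lambda search: K = _nlambdas values decaying
// geometrically from lambda_max down to lambda_max * lambda_min_ratio, i.e.
//   lambda_i = lambda_max * lambda_min_ratio^(i/(K-1)),  i = 0..K-1,
// so e.g. lambda_max = 1, lambda_min_ratio = 1e-4, K = 5 gives 1, 1e-1, 1e-2, 1e-3, 1e-4.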
if (_parms._lambda == null) { // no lambda given, we will base lambda on a fraction of lambda max
if (_parms._lambda_search) { _parms._lambda = new double[_parms._nlambdas]; double dec = Math.pow(_parms._lambda_min_ratio, 1.0/(_parms._nlambdas - 1)); _parms._lambda[0] = _lmax; double l = _lmax; for (int i = 1; i < _parms._nlambdas; ++i) _parms._lambda[i] = (l *= dec); // todo set the null submodel
} else _parms._lambda = new double[]{10 * _parms._lambda_min_ratio * _lmax}; } if(!Double.isNaN(_lambdaCVEstimate)){ for(int i = 0; i < _parms._lambda.length; ++i) if(_parms._lambda[i] < _lambdaCVEstimate){ _parms._lambda = Arrays.copyOf(_parms._lambda,i+1); break; } _parms._lambda[_parms._lambda.length-1] = _lambdaCVEstimate; } if(_parms._objective_epsilon == -1) { if(_parms._lambda_search) _parms._objective_epsilon = 1e-4; else // lower default objective epsilon for non-standardized problems (mostly to match classical tools)
_parms._objective_epsilon = _parms._lambda[0] == 0?1e-6:1e-4; } if(_parms._gradient_epsilon == -1) { _parms._gradient_epsilon = _parms._lambda[0] == 0 ? 1e-6 : 1e-4; if(_parms._lambda_search) _parms._gradient_epsilon *= 1e-2; } // clone2 so that I don't change the instance which is in the DKV directly // (clone2 also shallow clones _output)
_model.clone2().delete_and_lock(_job._key); } } protected static final long WORK_TOTAL = 1000000; transient Key [] _toRemove; private Key[] removeLater(Key ...k){ _toRemove = _toRemove == null?k:ArrayUtils.append(_toRemove,k); return k; } @Override protected GLMDriver trainModelImpl() { return _driver = new GLMDriver(); } private final double lmax(double[] grad) { return Math.max(ArrayUtils.maxValue(grad), -ArrayUtils.minValue(grad)) / Math.max(1e-2, _parms._alpha[0]); } private transient ComputationState _state; /** * Main loop of the glm algo. */ public final class GLMDriver extends Driver implements ProgressMonitor { private long _workPerIteration; private transient double[][] _vcov; private void doCleanup() { try { if(_parms._lambda_search && _parms._is_cv_model) Scope.untrack(removeLater(_dinfo.getWeightsVec()._key)); if(!_cv && _model!=null) _model.unlock(_job); } catch(Throwable t){ // nada
} } private transient Cholesky _chol; private transient L1Solver _lslvr; int [] findZeros(double [] vals){ int [] res = new int[4]; int cnt = 0; for(int i = 0; i < vals.length; ++i){ if(vals[i] == 0){ if(res.length == cnt) res = Arrays.copyOf(res,res.length*2); res[cnt++] = i; } } return Arrays.copyOf(res,cnt); }
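// solveGram below performs one penalized weighted least-squares solve. With W the current IRLSM weights
// and z the working response, the system solved is (roughly)
//   (X'WX * obj_reg + l2pen * I) * beta = X'Wz * obj_reg,
// with the l1 penalty handled inside the ADMM/COD solvers rather than in the matrix itself. Columns that
// never occur in the weighted data are dropped first and the solution is re-expanded with zeros afterwards,
// which is safe because such columns contribute nothing to the normal equations.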
private double [] solveGram(Solver s, GLMIterationTask t) { // look for predictors which never appeared (can happen e.g. with weights or ignored NAs) // never-occurring columns must have gram[i][j] == 0 for all j AND XtY[i] == 0
if(_parms._family != Family.multinomial) { // don't do this for multinomial family - too many problems resizing the gradient
int[] zeros = t._gram.findZeroCols(); int falseZeros = 0; for (int i = 0; i < zeros.length; i++) { if (t._xy[zeros[i]] == 0) zeros[i - falseZeros] = zeros[i]; else falseZeros++; } zeros = Arrays.copyOf(zeros, zeros.length - falseZeros); if (zeros.length > 0) { _state.removeCols(zeros); // no need to solve with zeros; remove them, solve, and extend the result to the original size (filling in zeros)
t._gram.dropCols(zeros); t._xy = ArrayUtils.removeIds(t._xy, zeros); if (t._beta != null) t._beta = ArrayUtils.removeIds(t._beta, zeros); } } t._gram.mul(_parms._obj_reg); ArrayUtils.mult(t._xy, _parms._obj_reg); return (s == Solver.COORDINATE_DESCENT)?COD_solve(t,_state._alpha,_state.lambda()):ADMM_solve(t._gram, t._xy); } private double[] ADMM_solve(Gram gram, double [] xy) { if(_parms._remove_collinear_columns || _parms._compute_p_values) { if(!_parms._intercept) throw H2O.unimpl(); ArrayList<Integer> ignoredCols = new ArrayList<>(); Cholesky chol = ((_state._iter == 0)?gram.qrCholesky(ignoredCols, _parms._standardize):gram.cholesky(null)); if(!ignoredCols.isEmpty() && !_parms._remove_collinear_columns) { int [] collinear_cols = new int[ignoredCols.size()]; for(int i = 0; i < collinear_cols.length; ++i) collinear_cols[i] = ignoredCols.get(i); throw new Gram.CollinearColumnsException("Found collinear columns in the dataset. P-values cannot be computed with collinear columns in the dataset. Set the remove_collinear_columns flag to true to remove collinear columns automatically. Found collinear columns " + Arrays.toString(ArrayUtils.select(_dinfo.coefNames(),collinear_cols))); } if(!chol.isSPD()) throw new NonSPDMatrixException(); _chol = chol; if(!ignoredCols.isEmpty()) { // got some redundant cols
int [] collinear_cols = new int[ignoredCols.size()]; for(int i = 0; i < collinear_cols.length; ++i) collinear_cols[i] = ignoredCols.get(i); String [] collinear_col_names = ArrayUtils.select(_state.activeData().coefNames(),collinear_cols); // need to drop the cols from everywhere
_model.addWarning("Removed collinear columns " + Arrays.toString(collinear_col_names)); Log.warn("Removed collinear columns " + Arrays.toString(collinear_col_names)); _state.removeCols(collinear_cols); gram.dropCols(collinear_cols); xy = ArrayUtils.removeIds(xy,collinear_cols); } xy = xy.clone(); chol.solve(xy); } else { gram = gram.deep_clone(); xy = xy.clone(); GramSolver slvr = new GramSolver(gram.clone(), xy.clone(), _parms._intercept, _state.l2pen(),_state.l1pen(), _state.activeBC()._betaGiven, _state.activeBC()._rho, _state.activeBC()._betaLB, _state.activeBC()._betaUB); _chol = slvr._chol; if(_state.l1pen() == 0 && !_state.activeBC().hasBounds()) { slvr.solve(xy); } else { xy = MemoryManager.malloc8d(xy.length); if(_state._u == null && _parms._family != Family.multinomial) _state._u = MemoryManager.malloc8d(_state.activeData().fullN()+1); (_lslvr = new ADMM.L1Solver(1e-4, 10000, _state._u)).solve(slvr, xy, _state.l1pen(), _parms._intercept, _state.activeBC()._betaLB, _state.activeBC()._betaUB); } } return xy; } private void fitIRLSM_multinomial(Solver s){ assert _dinfo._responses == 3:"IRLSM for multinomial needs extra information encoded in additional responses, expected 3 response vecs, got " + _dinfo._responses; double [] beta = _state.betaMultinomial(); do { beta = beta.clone(); for (int c = 0; c <
_nclass; ++c) { boolean onlyIcpt = _state.activeDataMultinomial(c).fullN() == 0; _state.setActiveClass(c); LineSearchSolver ls = (_state.l1pen() == 0) ? new MoreThuente(_state.gslvrMultinomial(c), _state.betaMultinomial(c,beta), _state.ginfoMultinomial(c)) : new SimpleBacktrackingLS(_state.gslvrMultinomial(c), _state.betaMultinomial(c,beta), _state.l1pen()); GLMWeightsFun glmw = new GLMWeightsFun(_parms); long t1 = System.currentTimeMillis(); new GLMMultinomialUpdate(_state.activeDataMultinomial(), _job._key, beta, c).doAll(_state.activeDataMultinomial()._adaptedFrame); long t2 = System.currentTimeMillis(); GLMIterationTask t = new GLMTask.GLMIterationTask(_job._key, _state.activeDataMultinomial(c), glmw, ls.getX(), c).doAll(_state.activeDataMultinomial(c)._adaptedFrame); long t3 = System.currentTimeMillis(); double[] betaCnd = solveGram(s,t); long t4 = System.currentTimeMillis(); if (!onlyIcpt && !ls.evaluate(ArrayUtils.subtract(betaCnd, ls.getX(), betaCnd))) { Log.info(LogMsg("Ls failed " + ls)); continue; } long t5 = System.currentTimeMillis(); _state.setBetaMultinomial(c, beta,ls.getX()); // update multinomial Log.info(LogMsg("computed in " + (t2 - t1) + "+" + (t3 - t2) + "+" + (t4 - t3) + "+" + (t5 - t4) + "=" + (t5 - t1) + "ms, step = " + ls.step() + ((_lslvr != null) ? ", l1solver " + _lslvr : ""))); } _state.setActiveClass(-1); } while(progress(beta,_state.gslvr().getGradient(beta))); } private void fitLSM(Solver s){ GLMIterationTask t = new GLMTask.GLMIterationTask(_job._key, _state.activeData(), new GLMWeightsFun(_parms), null).doAll(_state.activeData()._adaptedFrame); double [] beta = solveGram(s,t); // compute mse double [] x = t._gram.mul(beta); for(int i = 0; i < x.length; ++i) x[i] = (x[i] - 2*t._xy[i]); double l = .5*(ArrayUtils.innerProduct(x,beta)/_parms._obj_reg + t._yy ); _state.updateState(beta, l); } private void fitIRLSM(Solver s) { GLMWeightsFun glmw = new GLMWeightsFun(_parms); double [] betaCnd = _state.beta(); LineSearchSolver ls = null; boolean firstIter = true; try { while (true) { long t1 = System.currentTimeMillis(); GLMIterationTask t = new GLMTask.GLMIterationTask(_job._key, _state.activeData(), glmw, betaCnd).doAll(_state.activeData()._adaptedFrame); long t2 = System.currentTimeMillis(); if (!_state._lsNeeded && (Double.isNaN(t._likelihood) || _state.objective(t._beta, t._likelihood) > _state.objective() + _parms._objective_epsilon)) { _state._lsNeeded = true; } else { if (!firstIter && !_state._lsNeeded && !progress(t._beta, t._likelihood)) return; betaCnd = solveGram(s,t); } firstIter = false; long t3 = System.currentTimeMillis(); if(_state._lsNeeded) { if(ls == null) ls = (_state.l1pen() == 0 && !_state.activeBC().hasBounds()) ? new MoreThuente(_state.gslvr(),_state.beta(), _state.ginfo()) : new SimpleBacktrackingLS(_state.gslvr(),_state.beta().clone(), _state.l1pen(), _state.ginfo()); if (!ls.evaluate(ArrayUtils.subtract(betaCnd, ls.getX(), betaCnd))) { Log.info(LogMsg("Ls failed " + ls)); return; } betaCnd = ls.getX(); if(!progress(betaCnd,ls.ginfo())) return; long t4 = System.currentTimeMillis(); Log.info(LogMsg("computed in " + (t2 - t1) + "+" + (t3 - t2) + "+" + (t4 - t3) + "=" + (t4 - t1) + "ms, step = " + ls.step() + ((_lslvr != null) ? ", l1solver " + _lslvr : ""))); } else Log.info(LogMsg("computed in " + (t2 - t1) + "+" + (t3 - t2) + "=" + (t3 - t1) + "ms, step = " + 1 + ((_lslvr != null) ? 
", l1solver " + _lslvr : ""))); } } catch(NonSPDMatrixException e) { Log.warn(LogMsg("Got Non SPD matrix, stopped.")); } } private void fitLBFGS() { double [] beta = _state.beta(); final double l1pen = _state.l1pen(); GLMGradientSolver gslvr = _state.gslvr(); GLMWeightsFun glmw = new GLMWeightsFun(_parms); if (beta == null && _parms._family == Family.multinomial) { beta = MemoryManager.malloc8d((_state.activeData().fullN() + 1) * _nclass); int P = _state.activeData().fullN() + 1; if(_parms._intercept) for (int i = 0; i < _nclass; ++i) beta[i * P + P - 1] = glmw.link(_state._ymu[i]); } if (beta == null) { beta = MemoryManager.malloc8d(_state.activeData().fullN() + 1); if (_parms._intercept) beta[beta.length - 1] = glmw.link(_state._ymu[0]); } L_BFGS lbfgs = new L_BFGS().setObjEps(_parms._objective_epsilon).setGradEps(_parms._gradient_epsilon).setMaxIter(_parms._max_iterations); assert beta.length == _state.ginfo()._gradient.length; int P = _dinfo.fullN(); if (l1pen > 0 || _state.activeBC().hasBounds()) { double[] nullBeta = MemoryManager.malloc8d(beta.length); // compute ginfo at null beta to get estimate for rho if (_dinfo._intercept) { if (_parms._family == Family.multinomial) { for (int c = 0; c < _nclass; c++) nullBeta[(c + 1) * (P + 1) - 1] = glmw.link(_state._ymu[c]); } else nullBeta[nullBeta.length - 1] = glmw.link(_state._ymu[0]); } GradientInfo ginfo = gslvr.getGradient(nullBeta); double[] direction = ArrayUtils.mult(ginfo._gradient.clone(), -1); double t = 1; if (l1pen > 0) { MoreThuente mt = new MoreThuente(gslvr,nullBeta); mt.evaluate(direction); t = mt.step(); } double[] rho = MemoryManager.malloc8d(beta.length); double r = _state.activeBC().hasBounds()?1:.1; BetaConstraint bc = _state.activeBC(); // compute rhos for (int i = 0; i < rho.length - 1; ++i) rho[i] = r * ADMM.L1Solver.estimateRho(nullBeta[i] + t * direction[i], l1pen, bc._betaLB == null ? Double.NEGATIVE_INFINITY : bc._betaLB[i], bc._betaUB == null ? Double.POSITIVE_INFINITY : bc._betaUB[i]); for (int ii = P; ii < rho.length; ii += P + 1) rho[ii] = r * ADMM.L1Solver.estimateRho(nullBeta[ii] + t * direction[ii], 0, bc._betaLB == null ? Double.NEGATIVE_INFINITY : bc._betaLB[ii], bc._betaUB == null ? 
Double.POSITIVE_INFINITY : bc._betaUB[ii]); final double[] objvals = new double[2]; objvals[1] = Double.POSITIVE_INFINITY; double reltol = L1Solver.DEFAULT_RELTOL; double abstol = L1Solver.DEFAULT_ABSTOL; double ADMM_gradEps = 1e-3; ProximalGradientSolver innerSolver = new ProximalGradientSolver(gslvr, beta, rho, _parms._objective_epsilon * 1e-1, _parms._gradient_epsilon, _state.ginfo(), this); // new ProgressMonitor() { // @Override // public boolean progress(double[] betaDiff, GradientInfo ginfo) { // return ++_state._iter < _parms._max_iterations; // } // }); ADMM.L1Solver l1Solver = new ADMM.L1Solver(ADMM_gradEps, 250, reltol, abstol, _state._u); l1Solver._pm = this; l1Solver.solve(innerSolver, beta, l1pen, true, _state.activeBC()._betaLB, _state.activeBC()._betaUB); _state._u = l1Solver._u; _state.updateState(beta,gslvr.getGradient(beta)); } else { if(!_parms._lambda_search && _state._iter == 0) updateProgress(false); Result r = lbfgs.solve(gslvr, beta, _state.ginfo(), new ProgressMonitor() { @Override public boolean progress(double[] beta, GradientInfo ginfo) { if(_state._iter < 4 || ((_state._iter & 3) == 0)) Log.info(LogMsg("LBFGS, gradient norm = " + ArrayUtils.linfnorm(ginfo._gradient, false))); return GLMDriver.this.progress(beta,ginfo); } }); Log.info(LogMsg(r.toString())); _state.updateState(r.coefs,(GLMGradientInfo)r.ginfo); } } private void fitCOD() { double [] beta = _state.beta(); int p = _state.activeData().fullN()+ 1; double wsum,wsumu; // intercept denum double [] denums; boolean skipFirstLevel = !_state.activeData()._useAllFactorLevels; double [] betaold = beta.clone(); double objold = _state.objective(); int iter2=0; // total cd iters // get reweighted least squares vectors Vec[] newVecs = _state.activeData()._adaptedFrame.anyVec().makeZeros(3); Vec w = newVecs[0]; // fixed before each CD loop Vec z = newVecs[1]; // fixed before each CD loop Vec zTilda = newVecs[2]; // will be updated at every variable within CD loop long startTimeTotalNaive = System.currentTimeMillis(); // generate new IRLS iteration while (iter2++ < 30) { Frame fr = new Frame(_state.activeData()._adaptedFrame); fr.add("w", w); // fr has all data fr.add("z", z); fr.add("zTilda", zTilda); GLMGenerateWeightsTask gt = new GLMGenerateWeightsTask(_job._key, _state.activeData(), _parms, beta).doAll(fr); double objVal = objVal(gt._likelihood, gt._betaw, _state.lambda()); denums = gt.denums; wsum = gt.wsum; wsumu = gt.wsumu; int iter1 = 0; // coordinate descent loop while (iter1++ < 100) { Frame fr2 = new Frame(); fr2.add("w", w); fr2.add("z", z); fr2.add("zTilda", zTilda); // original x%*%beta if first iteration for(int i=0; i < _state.activeData()._cats; i++) { Frame fr3 = new Frame(fr2); int level_num = _state.activeData()._catOffsets[i+1]-_state.activeData()._catOffsets[i]; int prev_level_num = 0; fr3.add("xj", _state.activeData()._adaptedFrame.vec(i)); boolean intercept = (i == 0); // prev var is intercept if(!intercept) { prev_level_num = _state.activeData()._catOffsets[i]-_state.activeData()._catOffsets[i-1]; fr3.add("xjm1", _state.activeData()._adaptedFrame.vec(i-1)); // add previous categorical variable } int start_old = _state.activeData()._catOffsets[i]; GLMCoordinateDescentTaskSeqNaive stupdate; if(intercept) stupdate = new GLMCoordinateDescentTaskSeqNaive(intercept, false, 4 , Arrays.copyOfRange(betaold, start_old, start_old+level_num), new double [] {beta[p-1]}, _state.activeData()._catLvls[i], null, null, null, null, null, skipFirstLevel).doAll(fr3); else stupdate = new 
GLMCoordinateDescentTaskSeqNaive(intercept, false, 1 , Arrays.copyOfRange(betaold, start_old,start_old+level_num), Arrays.copyOfRange(beta, _state.activeData()._catOffsets[i-1], _state.activeData()._catOffsets[i]) , _state.activeData()._catLvls[i] , _state.activeData()._catLvls[i-1], null, null, null, null, skipFirstLevel ).doAll(fr3); for(int j=0; j < level_num; ++j) beta[_state.activeData()._catOffsets[i]+j] = ADMM.shrinkage(stupdate._temp[j] / wsumu, _state.lambda() * _parms._alpha[0]) / (denums[_state.activeData()._catOffsets[i]+j] / wsumu + _state.lambda() * (1 - _parms._alpha[0])); } int cat_num = 2; // if intercept, or not intercept but not first numeric, then both are numeric
for (int i = 0; i < _state.activeData()._nums; ++i) { GLMCoordinateDescentTaskSeqNaive stupdate; Frame fr3 = new Frame(fr2); fr3.add("xj", _state.activeData()._adaptedFrame.vec(i+_state.activeData()._cats)); // add current variable col
boolean intercept = (i == 0 && _state.activeData().numStart() == 0); // if true then all numeric case and doing beta_1
double [] meannew=null, meanold=null, varnew=null, varold=null; if(i > 0 || intercept) {// previous var is a numeric var
cat_num = 3; if(!intercept) fr3.add("xjm1", _state.activeData()._adaptedFrame.vec(i - 1 + _state.activeData()._cats)); // add the previous one if not doing a beta_1 update, otherwise just pass it the intercept term
if( _state.activeData()._normMul!=null ) { varold = new double[]{_state.activeData()._normMul[i]}; meanold = new double[]{_state.activeData()._normSub[i]}; if (i!= 0){ varnew = new double []{ _state.activeData()._normMul[i-1]}; meannew = new double [] { _state.activeData()._normSub[i-1]}; } } stupdate = new GLMCoordinateDescentTaskSeqNaive(intercept, false, cat_num , new double [] { betaold[_state.activeData().numStart()+ i]}, new double []{ beta[ (_state.activeData().numStart()+i-1+p)%p ]}, null, null, varold, meanold, varnew, meannew, skipFirstLevel ).doAll(fr3); beta[i+_state.activeData().numStart()] = ADMM.shrinkage(stupdate._temp[0] / wsumu, _state.lambda() * _parms._alpha[0]) / (denums[i+_state.activeData().numStart()] / wsumu + _state.lambda() * (1 - _parms._alpha[0])); } else if (i == 0 && !intercept){ // previous one is the last categorical variable
int prev_level_num = _state.activeData().numStart()-_state.activeData()._catOffsets[_state.activeData()._cats-1]; fr3.add("xjm1", _state.activeData()._adaptedFrame.vec(_state.activeData()._cats-1)); // add previous categorical variable
if( _state.activeData()._normMul!=null){ varold = new double []{ _state.activeData()._normMul[i]}; meanold = new double [] { _state.activeData()._normSub[i]}; } stupdate = new GLMCoordinateDescentTaskSeqNaive(intercept, false, cat_num , new double [] {betaold[ _state.activeData().numStart()]}, Arrays.copyOfRange(beta,_state.activeData()._catOffsets[_state.activeData()._cats-1],_state.activeData().numStart() ), null, _state.activeData()._catLvls[_state.activeData()._cats-1], varold, meanold, null, null, skipFirstLevel ).doAll(fr3); beta[_state.activeData().numStart()] = ADMM.shrinkage(stupdate._temp[0] / wsumu, _state.lambda() * _parms._alpha[0]) / (denums[_state.activeData().numStart()] / wsumu + _state.lambda() * (1 - _parms._alpha[0])); } } if(_state.activeData()._nums + _state.activeData()._cats > 0) { // intercept update: preceded by a categorical or numeric variable
Frame fr3 = new Frame(fr2); fr3.add("xjm1", _state.activeData()._adaptedFrame.vec(_state.activeData()._cats + _state.activeData()._nums - 1)); // add last variable updated in cycle to the frame
GLMCoordinateDescentTaskSeqNaive iupdate; if (_state.activeData()._adaptedFrame.vec(_state.activeData()._cats + _state.activeData()._nums - 1).isCategorical()) { // only categorical vars
cat_num = 2; iupdate = new GLMCoordinateDescentTaskSeqNaive(false, true, cat_num, new double[]{betaold[betaold.length - 1]}, Arrays.copyOfRange(beta, _state.activeData()._catOffsets[_state.activeData()._cats - 1], _state.activeData()._catOffsets[_state.activeData()._cats]), null, _state.activeData()._catLvls[_state.activeData()._cats - 1], null, null, null, null, skipFirstLevel).doAll(fr3); } else { // last variable is numeric
cat_num = 3; double[] meannew = null, varnew = null; if (_state.activeData()._normMul != null) { varnew = new double[]{_state.activeData()._normMul[_state.activeData()._normMul.length - 1]}; meannew = new double[]{_state.activeData()._normSub[_state.activeData()._normSub.length - 1]}; } iupdate = new GLMCoordinateDescentTaskSeqNaive(false, true, cat_num, new double[]{betaold[betaold.length - 1]}, new double[]{beta[beta.length - 2]}, null, null, null, null, varnew, meannew, skipFirstLevel).doAll(fr3); } if (_parms._intercept) beta[beta.length - 1] = iupdate._temp[0] / wsum; } double maxdiff = ArrayUtils.linfnorm(ArrayUtils.subtract(beta, betaold), false); // false to keep the intercept
System.arraycopy(beta, 0, betaold, 0, beta.length); if (maxdiff < _parms._beta_epsilon) break; } double percdiff = Math.abs((objold - objVal)/objold); if (percdiff < _parms._objective_epsilon && iter2 > 1) break; objold=objVal; Log.info(LogMsg("fitCOD: iter1 = " + iter1)); } Log.info(LogMsg("fitCOD: iter2 = " + iter2)); long endTimeTotalNaive = System.currentTimeMillis(); long durationTotalNaive = (endTimeTotalNaive - startTimeTotalNaive)/1000; Log.info(LogMsg("Time to run Naive Coordinate Descent: " + durationTotalNaive + "s")); _state._iter = iter2; for (Vec v : newVecs) v.remove(); _state.updateState(beta,objold); }
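// Note on the p-value path in fitModel() below: z-values are computed as
//   z_i = beta_i / sqrt(Var_ii),  with (roughly) Var = se * obj_reg * inv(gram)
// taken from the Cholesky factorization of the gram, where the dispersion se is estimated from the sum of
// squared residuals for families other than binomial and poisson (for those it stays fixed at 1).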
private void fitModel() { Solver solver = (_parms._solver == Solver.AUTO) ? defaultSolver() : _parms._solver; switch (solver) { case COORDINATE_DESCENT: // fall through to IRLSM
case IRLSM: if(_parms._family == Family.multinomial) fitIRLSM_multinomial(solver); else if(_parms._family == Family.gaussian && _parms._link == Link.identity) fitLSM(solver); else fitIRLSM(solver); break; case L_BFGS: fitLBFGS(); break; case COORDINATE_DESCENT_NAIVE: fitCOD(); break; default: throw H2O.unimpl(); } if(_parms._compute_p_values) { // compute p-values
double se = 1; boolean seEst = false; double [] beta = _state.beta(); if(_parms._family != Family.binomial && _parms._family != Family.poisson) { seEst = true; ComputeSETsk ct = new ComputeSETsk(null, _state.activeData(), _job._key, beta, _parms).doAll(_state.activeData()._adaptedFrame); se = ct._sumsqe / (_nobs - 1 - _state.activeData().fullN()); } double [] zvalues = MemoryManager.malloc8d(_state.activeData().fullN()+1); Cholesky chol = _chol; if(_parms._standardize){ // compute non-standardized t(X)%*%W%*%X
DataInfo activeData = _state.activeData(); double [] beta_nostd = activeData.denormalizeBeta(beta); DataInfo.TransformType transform = activeData._predictor_transform; activeData.setPredictorTransform(DataInfo.TransformType.NONE); Gram g = new GLMIterationTask(_job._key,activeData,new GLMWeightsFun(_parms),beta_nostd).doAll(activeData._adaptedFrame)._gram; activeData.setPredictorTransform(transform); // just in case, restore the transform
g.mul(_parms._obj_reg); chol = g.cholesky(null); beta = beta_nostd; } double [][] inv = chol.getInv(); ArrayUtils.mult(inv,_parms._obj_reg*se); _vcov = inv; for(int i = 0; i < zvalues.length; ++i) zvalues[i] = beta[i]/Math.sqrt(inv[i][i]); _model.setZValues(expandVec(zvalues,_state.activeData()._activeCols,_dinfo.fullN()+1,Double.NaN),se, seEst); } } private long _lastScore = System.currentTimeMillis(); private long timeSinceLastScoring(){return System.currentTimeMillis() - _lastScore;} private void scoreAndUpdateModel(){ // compute full validation on train and test
Log.info(LogMsg("Scoring after " + timeSinceLastScoring() + "ms")); long t1 = System.currentTimeMillis(); Frame train = DKV.<Frame>getGet(_parms._train); _model.score(train).delete(); ModelMetrics mtrain = ModelMetrics.getFromDKV(_model, train); // updated by model.scoreAndUpdateModel
_model._output._training_metrics = mtrain; long t2 = System.currentTimeMillis(); Log.info(LogMsg("Training metrics computed in " + (t2-t1) + "ms")); Log.info(LogMsg(mtrain.toString())); if(_valid != null) { Frame valid = DKV.<Frame>getGet(_parms._valid); _model.score(valid).delete(); _model._output._validation_metrics = ModelMetrics.getFromDKV(_model, valid); // updated by model.scoreAndUpdateModel
} _model._output._scoring_history = _parms._lambda_search?_lsc.to2dTable():_sc.to2dTable(); _model.update(_job._key); _model.generateSummary(_parms._train,_state._iter); _lastScore = System.currentTimeMillis(); long scoringTime = System.currentTimeMillis() - t1; _scoringInterval = Math.max(_scoringInterval,20*scoringTime); // at most 5% overhead for scoring
} protected Submodel computeSubmodel(int i,double lambda) { Submodel sm; if(lambda >= _lmax) _model.addSubmodel(sm = new Submodel(lambda,getNullBeta(),_state._iter,_nullDevTrain,_nullDevTest)); else { _model.addSubmodel(sm = new Submodel(lambda, _state.beta(),_state._iter,-1,-1)); _state.setLambda(lambda); checkMemoryFootPrint(_state.activeData()); do { if (_parms._family == Family.multinomial) for (int c = 0; c < _nclass; ++c) Log.info(LogMsg("Class " + c + " got " +
_state.activeDataMultinomial(c).fullN() + " active columns out of " + _state._dinfo.fullN() + " total")); else Log.info(LogMsg("Got " + _state.activeData().fullN() + " active columns out of " + _state._dinfo.fullN() + " total")); fitModel(); } while (!_state.checkKKTs()); Log.info(LogMsg("solution has " + ArrayUtils.countNonzeros(_state.beta()) + " nonzeros")); if (_parms._lambda_search) { // need train and test deviance, only "the best" submodel will be fully scored double trainDev = _state.deviance() / _nobs; double testDev = Double.NaN; if (_validDinfo != null) { testDev = _parms._family == Family.multinomial ? new GLMResDevTaskMultinomial(_job._key, _validDinfo, _dinfo.denormalizeBeta(_state.beta()), _nclass).doAll(_validDinfo._adaptedFrame).avgDev() : new GLMResDevTask(_job._key, _validDinfo, _parms, _dinfo.denormalizeBeta(_state.beta())).doAll(_validDinfo._adaptedFrame).avgDev(); } Log.info(LogMsg("train deviance = " + trainDev + ", test deviance = " + testDev)); double xvalDev = _xval_test_deviances == null ? -1 : _xval_test_deviances[i]; double xvalDevSE = _xval_test_sd == null ? -1 : _xval_test_sd[i]; _lsc.addLambdaScore(_state._iter, ArrayUtils.countNonzeros(_state.beta()), _state.lambda(), trainDev, testDev, xvalDev, xvalDevSE); _model.updateSubmodel(sm = new Submodel(_state.lambda(), _state.beta(), _state._iter, trainDev, testDev)); } else // model is gonna be scored subsequently anyways _model.updateSubmodel(sm = new Submodel(lambda, _state.beta(), _state._iter, -1, -1)); } _model.update(_job); return sm; } private transient double _nullDevTrain = Double.NaN; private transient double _nullDevTest = Double.NaN; @Override public void computeImpl() { if(_doInit) init(true); if (error_count() > 0) throw H2OModelBuilderIllegalArgumentException.makeFromBuilder(GLM.this); if(_parms._lambda_search) { _nullDevTrain = _parms._family == Family.multinomial ?new GLMResDevTaskMultinomial(_job._key,_state._dinfo,getNullBeta(), _nclass).doAll(_state._dinfo._adaptedFrame).avgDev() :new GLMResDevTask(_job._key, _state._dinfo, _parms, getNullBeta()).doAll(_state._dinfo._adaptedFrame).avgDev(); if(_validDinfo != null) _nullDevTest = _parms._family == Family.multinomial ?new GLMResDevTaskMultinomial(_job._key,_validDinfo,getNullBeta(), _nclass).doAll(_validDinfo._adaptedFrame).avgDev() :new GLMResDevTask(_job._key, _validDinfo, _parms, getNullBeta()).doAll(_validDinfo._adaptedFrame).avgDev(); _workPerIteration = WORK_TOTAL/_parms._nlambdas; } else _workPerIteration = 1 + (WORK_TOTAL/_parms._max_iterations); if(_parms._family == Family.multinomial && _parms._solver != Solver.L_BFGS ) { double [] nb = getNullBeta(); double maxRow = ArrayUtils.maxValue(nb); double sumExp = 0; int P = _dinfo.fullN(); int N = _dinfo.fullN()+1; for(int i = 1; i < _nclass; ++i) sumExp += Math.exp(nb[i*N + P] - maxRow); Vec [] vecs = _dinfo._adaptedFrame.anyVec().makeDoubles(2, new double[]{sumExp,maxRow}); if(_parms._lambda_search && _parms._is_cv_model) { Scope.untrack(vecs[0]._key, vecs[1]._key); removeLater(vecs[0]._key,vecs[1]._key); } _dinfo.addResponse(new String[]{"__glm_sumExp", "__glm_maxRow"}, vecs); } double oldDevTrain = _nullDevTrain; double oldDevTest = _nullDevTest; double [] devHistoryTrain = new double[5]; double [] devHistoryTest = new double[5]; if(!_parms._lambda_search) updateProgress(false); // lambda search loop for (int i = 0; i < _parms._lambda.length; ++i) { // lambda search if(_parms._max_iterations != -1 && _state._iter >= _parms._max_iterations) break; Submodel sm = 
computeSubmodel(i,_parms._lambda[i]); double trainDev = sm.devianceTrain; double testDev = sm.devianceTest; devHistoryTest[i % devHistoryTest.length] = (oldDevTest - testDev)/oldDevTest; oldDevTest = testDev; devHistoryTrain[i % devHistoryTrain.length] = (oldDevTrain - trainDev)/oldDevTrain; oldDevTrain = trainDev; if(_parms._lambda[i] < _lmax && Double.isNaN(_lambdaCVEstimate) /* if we have a cv lambda estimate we should use it, we cannot stop before reaching it */) { if (_parms._early_stopping && _state._iter >= devHistoryTrain.length) { double s = ArrayUtils.maxValue(devHistoryTrain); if (s < 1e-4) { Log.info(LogMsg("converged at lambda[" + i + "] = " + _parms._lambda[i] + ", improvement on train = " + s)); break; // started overfitting
} if (_validDinfo != null && _parms._nfolds <= 1) { // check for early stopping on test but only if not doing xval
s = ArrayUtils.maxValue(devHistoryTest); if (s < 0) { Log.info(LogMsg("converged at lambda[" + i + "] = " + _parms._lambda[i] + ", improvement on test = " + s)); break; // started overfitting
} } } } if(_parms._lambda_search && (_parms._score_each_iteration || timeSinceLastScoring() > _scoringInterval)) { _model._output.setSubmodelIdx(_model._output._best_lambda_idx = i); scoreAndUpdateModel(); // update partial results
} _job.update(_workPerIteration,"iter=" + _state._iter + " lmb=" + lambdaFormatter.format(_state.lambda()) + " deviance trn/tst= " + devFormatter.format(trainDev) + "/" + devFormatter.format(testDev) + " P=" + ArrayUtils.countNonzeros(_state.beta())); } if(_state._iter >= _parms._max_iterations) _job.warn("Reached maximum number of iterations " + _parms._max_iterations + "!"); if(_parms._nfolds > 1 && !Double.isNaN(_lambdaCVEstimate)) _model._output.setSubmodel(_lambdaCVEstimate); else _model._output.pickBestModel(); scoreAndUpdateModel(); if(_vcov != null) { _model.setVcov(_vcov); _model.update(_job._key); } if(!_parms._lambda_search && _state._iter < _parms._max_iterations){ _job.update(_workPerIteration*(_parms._max_iterations - _state._iter)); } if(_iceptAdjust != 0) { // apply the intercept adjust according to prior probability
assert _parms._intercept; double [] b = _model._output._global_beta; b[b.length-1] += _iceptAdjust; for(Submodel sm:_model._output._submodels) sm.beta[sm.beta.length-1] += _iceptAdjust; _model.update(_job._key); } doCleanup(); } @Override public boolean onExceptionalCompletion(Throwable t, CountedCompleter caller){ doCleanup(); return true; } @Override public boolean progress(double [] beta, GradientInfo ginfo) { _state._iter++; if(ginfo instanceof ProximalGradientInfo) { ginfo = ((ProximalGradientInfo) ginfo)._origGinfo; GLMGradientInfo gginfo = (GLMGradientInfo) ginfo; _state.updateState(beta, gginfo); if (!_parms._lambda_search) updateProgress(false); return !timeout() && !_job.stop_requested() && _state._iter < _parms._max_iterations; } else { GLMGradientInfo gginfo = (GLMGradientInfo) ginfo; if(gginfo._gradient == null) _state.updateState(beta,gginfo._likelihood); else _state.updateState(beta, gginfo); if (!_parms._lambda_search) updateProgress(true); boolean converged = _state.converged(); if (converged) Log.info(LogMsg(_state.convergenceMsg)); return !timeout() && !_job.stop_requested() && !converged && _state._iter < _parms._max_iterations; } } public boolean progress(double [] beta, double likelihood) { _state._iter++; _state.updateState(beta,likelihood); if(!_parms._lambda_search) updateProgress(true); boolean converged = _state.converged(); if(converged) Log.info(LogMsg(_state.convergenceMsg)); return !_job.stop_requested() && !converged && _state._iter < _parms._max_iterations; } private transient long _scoringInterval = SCORING_INTERVAL_MSEC; // update user visible progress
protected void updateProgress(boolean canScore){ assert !_parms._lambda_search; _sc.addIterationScore(_state._iter, _state.likelihood(), _state.objective()); _job.update(_workPerIteration,_state.toString()); if(canScore && (_parms._score_each_iteration || timeSinceLastScoring() > _scoringInterval)) { _model.update(_state.expandBeta(_state.beta()), -1, -1, _state._iter); scoreAndUpdateModel(); } } } private Solver defaultSolver() { Solver s = Solver.IRLSM; int max_active = 0; if(_parms._family == Family.multinomial ) for(int c = 0; c < _nclass; ++c) max_active = Math.max(_state.activeDataMultinomial(c).fullN(),max_active); else max_active = _state.activeData().fullN(); if(max_active >= 5000) // cutoff has to be somewhere
s = Solver.L_BFGS; else if(_parms._lambda_search) { // lambda search prefers coordinate descent
s = Solver.COORDINATE_DESCENT; } else if(_state.activeBC().hasBounds() && !_state.activeBC().hasProximalPenalty()) { s = Solver.COORDINATE_DESCENT; } else if(_parms._family == Family.multinomial && _parms._alpha[0] == 0) s = Solver.L_BFGS; // multinomial does better with lbfgs
else Log.info(LogMsg("picked solver " + s)); if(s != Solver.L_BFGS && _parms._max_active_predictors == -1) _parms._max_active_predictors = 5000; _parms._solver = s; return s; }
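// The elastic-net objective minimized throughout (see objVal below):
//   obj(beta) = obj_reg * likelihood(beta)
//             + lambda * ( alpha * ||beta||_1 + (1 - alpha)/2 * ||beta||_2^2 )
//             + 1/2 * sum_i rho_i * (beta_i - betaGiven_i)^2    (proximal part, only with beta constraints)
// where the intercept is excluded from the penalty terms when one is fit.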
double objVal(double likelihood, double[] beta, double lambda) { double alpha = _parms._alpha[0]; double proximalPen = 0; BetaConstraint bc = _state.activeBC(); if (_state.activeBC()._betaGiven != null && bc._rho != null) { for (int i = 0; i < bc._betaGiven.length; ++i) { double diff = beta[i] - bc._betaGiven[i]; proximalPen += diff * diff * bc._rho[i]; } } return (likelihood * _parms._obj_reg + .5 * proximalPen + lambda * (alpha * ArrayUtils.l1norm(beta, _parms._intercept) + (1 - alpha) * .5 * ArrayUtils.l2norm2(beta, _parms._intercept))); } private String LogMsg(String msg) {return "GLM[dest=" + dest() + ", " + _state + "] " + msg;} private static final double[] expandVec(double[] beta, final int[] activeCols, int fullN) { return expandVec(beta, activeCols, fullN, 0); } private static final double[] expandVec(double[] beta, final int[] activeCols, int fullN, double filler) { assert beta != null; if (activeCols == null) return beta; double[] res = MemoryManager.malloc8d(fullN); Arrays.fill(res, filler); int i = 0; for (int c : activeCols) res[c] = beta[i++]; res[res.length - 1] = beta[beta.length - 1]; return res; } private static double [] doUpdateCD(double [] grads, double [] ary, double diff , int variable_min, int variable_max) { for (int i = 0; i < variable_min; i++) grads[i] += diff * ary[i]; for (int i = variable_max; i < grads.length; i++) grads[i] += diff * ary[i]; return grads; } private static double [] doSparseUpdateCD(double [] grads, double [] ary, int[] ids, double diff , int variable_min, int variable_max) { for(int i = 0; i < ids.length; ++i) grads[ids[i]] += diff * ary[i]; return grads; } public double [] COD_solve(GLMIterationTask gt, double alpha, double lambda) { double wsumInv = 1.0/(gt.wsum*_parms._obj_reg); double l1pen = lambda * alpha; double l2pen = lambda*(1-alpha); double [][] xx = gt._gram.getXX(); double [] diagInv = MemoryManager.malloc8d(xx.length); for(int i = 0; i < diagInv.length; ++i) diagInv[i] = 1.0/(xx[i][i] +
l2pen); int [][] nzs = new int[_state.activeData().numStart()][]; if(nzs.length > 1000) { final int [] nzs_ary = new int[xx.length]; for (int i = 0; i < nzs.length; ++i) { double[] x = xx[i].clone(); int k = 0; for (int j = 0; j < x.length; ++j) { if (i != j && x[j] != 0) { x[k] = x[j]; nzs_ary[k++] = j; } } if (k < (nzs_ary.length >> 3)) { nzs[i] = Arrays.copyOf(nzs_ary, k); xx[i] = Arrays.copyOf(x,k); } } } double [] grads = new double [gt._xy.length]; double [] beta = _state.beta().clone(); for(int i = 0; i < grads.length; ++i) { double ip = 0; if(i < nzs.length && nzs[i] != null) { int [] ids = nzs[i]; double [] x = xx[i]; for(int j = 0; j < nzs[i].length; ++j) ip += x[j]*beta[ids[j]]; grads[i] = gt._xy[i] - ip; } else { grads[i] = gt._xy[i] - ArrayUtils.innerProduct(xx[i], beta) + xx[i][i] * beta[i]; } } int iter1 = 0; int P = gt._xy.length - 1; final BetaConstraint bc = _state.activeBC(); DataInfo activeData = _state.activeData(); // CD loop while (iter1++ < 1000 /*Math.max(P,500)*/) { double bdiffPos = 0; double bdiffNeg = 0; for (int i = 0; i < activeData._cats; ++i) { for(int j = activeData._catOffsets[i]; j < activeData._catOffsets[i+1]; ++j) { // can do in parallel double b = bc.applyBounds(ADMM.shrinkage(grads[j], l1pen) * diagInv[j],j); double bd = beta[j] - b; bdiffPos = bd > bdiffPos?bd:bdiffPos; bdiffNeg = bd < bdiffNeg?bd:bdiffNeg; if(nzs[j] == null) doUpdateCD(grads, xx[j], bd, activeData._catOffsets[i], activeData._catOffsets[i + 1]); else doSparseUpdateCD(grads, xx[j], nzs[j], bd, activeData._catOffsets[i], activeData._catOffsets[i + 1]); beta[j] = b; } } int numStart = activeData.numStart(); for (int i = numStart; i < P; ++i) { double b = bc.applyBounds(ADMM.shrinkage(grads[i], l1pen) * diagInv[i],i); double bd = beta[i] - b; bdiffPos = bd > bdiffPos?bd:bdiffPos; bdiffNeg = bd < bdiffNeg?bd:bdiffNeg; doUpdateCD(grads, xx[i], bd, i,i+1); beta[i] = b; } // intercept if(_parms._intercept) { double b = bc.applyBounds(grads[P] * wsumInv,P); double bd = beta[P] - b; doUpdateCD(grads, xx[P], bd, P, P + 1); bdiffPos = bd > bdiffPos ? bd : bdiffPos; bdiffNeg = bd < bdiffNeg ? bd : bdiffNeg; beta[P] = b; } if (-1e-4 < bdiffNeg && bdiffPos < 1e-4) break; } return beta; } /** * Created by tomasnykodym on 3/30/15. */ public static final class GramSolver implements ProximalSolver { private final Gram _gram; private Cholesky _chol; private final double[] _xy; final double _lambda; double[] _rho; boolean _addedL2; double _betaEps; private static double boundedX(double x, double lb, double ub) { if (x < lb) x = lb; if (x > ub) x = ub; return x; } public GramSolver(Gram gram, double[] xy, double lmax, double betaEps, boolean intercept) { _gram = gram; _lambda = 0; _betaEps = betaEps; _xy = xy; double[] rhos = MemoryManager.malloc8d(xy.length); computeCholesky(gram, rhos, lmax * 1e-8,intercept); _addedL2 = rhos[0] != 0; _rho = _addedL2 ? 
        // solve non-penalized problem
        public void solve(double[] result) {
          System.arraycopy(_xy, 0, result, 0, _xy.length);
          _chol.solve(result);
          double gerr = Double.POSITIVE_INFINITY;
          if (_addedL2) { // had to add an l2 penalty to make the Gram SPD
            double[] oldRes = MemoryManager.arrayCopyOf(result, result.length);
            for (int i = 0; i < 1000; ++i) {
              solve(oldRes, result);
              double[] g = gradient(result)._gradient;
              gerr = Math.max(-ArrayUtils.minValue(g), ArrayUtils.maxValue(g));
              if (gerr < 1e-4) return;
              System.arraycopy(result, 0, oldRes, 0, result.length);
            }
            Log.warn("Gram solver did not converge, gerr = " + gerr);
          }
        }

        public GramSolver(Gram gram, double[] xy, boolean intercept, double l2pen, double l1pen,
                          double[] beta_given, double[] proxPen, double[] lb, double[] ub) {
          if (ub != null && lb != null)
            for (int i = 0; i < ub.length; ++i) {
              assert ub[i] >= lb[i] : i + ": ub < lb, ub = " + Arrays.toString(ub) + ", lb = " + Arrays.toString(lb);
            }
          _lambda = l2pen;
          _gram = gram;
          // Try to pick an optimal rho constant here, used in the ADMM solver.
          //
          // Rho defines the strength of the proximal penalty and also the strength of the L1 penalty applied in each step.
          // Picking a good rho constant is tricky and greatly influences the speed of convergence and the precision
          // with which we are able to solve the problem.
          //
          // Intuitively, we want the proximal l2-penalty ~ l1 penalty (l1 pen = lambda/rho, where lambda is the l1 penalty applied to the problem).
          // Here we compute rho for each coordinate by taking the formula for the single-coordinate solution and making the two penalties equal.
          //
          int ii = intercept ? 1 : 0;
          int icptCol = gram.fullN() - 1;
          double[] rhos = MemoryManager.malloc8d(xy.length);
          double min = Double.POSITIVE_INFINITY;
          for (int i = 0; i < xy.length - ii; ++i) {
            double d = xy[i];
            d = d >= 0 ? d : -d;
            if (d < min && d != 0) min = d;
          }
          double ybar = xy[icptCol];
          for (int i = 0; i < rhos.length - ii; ++i) {
            double y = xy[i];
            if (y == 0) y = min;
            double xbar = gram.get(icptCol, i);
            double x = ((y - ybar * xbar) / ((gram.get(i, i) - xbar * xbar) + l2pen)); // /gram.get(i,i);
            rhos[i] = ADMM.L1Solver.estimateRho(x, l1pen,
                lb == null ? Double.NEGATIVE_INFINITY : lb[i],
                ub == null ? Double.POSITIVE_INFINITY : ub[i]);
          }
          // do the intercept separately, as the l1 penalty does not apply to it
          if (intercept && (lb != null && !Double.isInfinite(lb[icptCol]) || ub != null && !Double.isInfinite(ub[icptCol]))) {
            int icpt = xy.length - 1;
            rhos[icpt] = 1; // (xy[icpt] >= 0 ? xy[icpt] : -xy[icpt]);
          }
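          // Explanatory note (added): the single-coordinate estimate x above solves the 1-D
          // least-squares problem for coordinate i with the intercept partialled out,
          //   x = (xy_i - ybar*xbar_i) / ((gram_ii - xbar_i^2) + l2pen),
          // and estimateRho is then expected to pick rho_i so that the proximal term shrinks
          // x roughly as much as the l1 penalty would, while respecting the box bounds.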
          if (l2pen > 0) gram.addDiag(l2pen);
          if (proxPen != null && beta_given != null) {
            gram.addDiag(proxPen);
            xy = xy.clone();
            for (int i = 0; i < xy.length; ++i)
              xy[i] += proxPen[i] * beta_given[i];
          }
          _xy = xy;
          _rho = rhos;
          computeCholesky(gram, rhos, 1e-5, intercept);
        }

        private void computeCholesky(Gram gram, double[] rhos, double rhoAdd, boolean intercept) {
          gram.addDiag(rhos);
          if (!intercept) {
            gram.dropIntercept();
            rhos = Arrays.copyOf(rhos, rhos.length - 1);
            _xy[_xy.length - 1] = 0;
          }
          _chol = gram.cholesky(null, true, null);
          if (!_chol.isSPD()) { // make sure rho is big enough
            gram.addDiag(ArrayUtils.mult(rhos, -1));
            gram.addDiag(rhoAdd, !intercept);
            // note: log the local rhos (the _rho field may not be set yet when called from the first constructor)
            Log.info("Got NonSPD matrix with original rho, re-computing with rho = " + (rhos[0] + rhoAdd));
            _chol = gram.cholesky(null, true, null);
            int cnt = 0;
            double rhoAddSum = rhoAdd;
            while (!_chol.isSPD() && cnt++ < 5) {
              gram.addDiag(rhoAdd, !intercept);
              rhoAddSum += rhoAdd;
              Log.warn("Still NonSPD matrix, re-computing with rho = " + (rhos[0] + rhoAddSum));
              _chol = gram.cholesky(null, true, null);
            }
            if (!_chol.isSPD())
              throw new NonSPDMatrixException();
          }
          gram.addDiag(ArrayUtils.mult(rhos, -1));
          ArrayUtils.mult(rhos, -1);
        }

        @Override
        public double[] rho() { return _rho; }

        @Override
        public boolean solve(double[] beta_given, double[] result) {
          if (beta_given != null)
            for (int i = 0; i < _xy.length; ++i)
              result[i] = _xy[i] + _rho[i] * beta_given[i];
          else System.arraycopy(_xy, 0, result, 0, _xy.length);
          _chol.solve(result);
          return true;
        }

        @Override
        public boolean hasGradient() { return false; }

        @Override
        public GradientInfo gradient(double[] beta) {
          double[] grad = _gram.mul(beta);
          for (int i = 0; i < _xy.length; ++i)
            grad[i] -= _xy[i];
          return new GradientInfo(Double.NaN, grad); // TODO: compute the objective
        }

        @Override
        public int iter() { return 0; }
      }

      public static class ProximalGradientInfo extends GradientInfo {
        final GradientInfo _origGinfo;

        public ProximalGradientInfo(GradientInfo origGinfo, double objVal, double[] gradient) {
          super(objVal, gradient);
          _origGinfo = origGinfo;
        }
      }

      /**
       * Simple wrapper around ginfo computation, adding a proximal penalty.
       */
      public static class ProximalGradientSolver implements GradientSolver, ProximalSolver {
        final GradientSolver _solver;
        double[] _betaGiven;
        double[] _beta;
        private ProximalGradientInfo _ginfo;
        private final ProgressMonitor _pm;
        final double[] _rho;
        private final double _objEps;
        private final double _gradEps;

        public ProximalGradientSolver(GradientSolver s, double[] betaStart, double[] rho,
                                      double objEps, double gradEps, GradientInfo ginfo, ProgressMonitor pm) {
          super();
          _solver = s;
          _rho = rho;
          _objEps = objEps;
          _gradEps = gradEps;
          _pm = pm;
          _beta = betaStart;
          _betaGiven = MemoryManager.malloc8d(betaStart.length);
          // _ginfo = new ProximalGradientInfo(ginfo, ginfo._objVal, ginfo._gradient);
        }

        public static double proximal_gradient(double[] grad, double obj, double[] beta, double[] beta_given, double[] rho) {
          for (int i = 0; i < beta.length; ++i) {
            double diff = (beta[i] - beta_given[i]);
            double pen = rho[i] * diff;
            if (grad != null)
              grad[i] += pen;
            obj += .5 * pen * diff;
          }
          return obj;
        }

        private ProximalGradientInfo computeProxGrad(GradientInfo ginfo, double[] beta) {
          assert !(ginfo instanceof ProximalGradientInfo);
          double[] gradient = ginfo._gradient.clone();
          double obj = proximal_gradient(gradient, ginfo._objVal, beta, _betaGiven, _rho);
          return new ProximalGradientInfo(ginfo, obj, gradient);
        }

        @Override
        public ProximalGradientInfo getGradient(double[] beta) {
          return computeProxGrad(_solver.getGradient(beta), beta);
        }
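        // Hedged usage sketch (hypothetical helper, added for illustration; the solver never calls it):
        // proximal_gradient augments an objective and gradient in place with the quadratic proximal term
        //   obj += 0.5 * sum_i rho_i * (beta_i - beta_given_i)^2,  grad_i += rho_i * (beta_i - beta_given_i).
        private static double proximalPenaltyDemo() {
          double[] grad = {1.0, -2.0};      // base gradient
          double[] beta = {0.5, 0.0};       // current coefficients
          double[] betaGiven = {0.0, 1.0};  // proximal center
          double[] rho = {2.0, 2.0};        // per-coordinate proximal strength
          // returns 0 + .5*2*0.5^2 + .5*2*1.0^2 = 1.25; grad becomes {2.0, -4.0}
          return proximal_gradient(grad, 0.0, beta, betaGiven, rho);
        }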
        @Override
        public GradientInfo getObjective(double[] beta) {
          GradientInfo ginfo = _solver.getObjective(beta);
          double obj = proximal_gradient(null, ginfo._objVal, beta, _betaGiven, _rho);
          return new ProximalGradientInfo(ginfo, obj, null);
        }

        @Override
        public double[] rho() { return _rho; }

        private int _iter;

        @Override
        public boolean solve(double[] beta_given, double[] beta) {
          GradientInfo origGinfo = (_ginfo == null || !Arrays.equals(_beta, beta))
              ? _solver.getGradient(beta)
              : _ginfo._origGinfo;
          System.arraycopy(beta_given, 0, _betaGiven, 0, beta_given.length);
          L_BFGS.Result r = new L_BFGS().setObjEps(_objEps).setGradEps(_gradEps)
              .solve(this, beta, _ginfo = computeProxGrad(origGinfo, beta), _pm);
          System.arraycopy(r.coefs, 0, beta, 0, r.coefs.length);
          _beta = r.coefs;
          _iter += r.iter;
          _ginfo = (ProximalGradientInfo) r.ginfo;
          return r.converged;
        }

        @Override
        public boolean hasGradient() { return true; }

        @Override
        public GradientInfo gradient(double[] beta) { return getGradient(beta)._origGinfo; }

        @Override
        public int iter() { return _iter; }
      }

      public static class GLMGradientInfo extends GradientInfo {
        final double _likelihood;

        public GLMGradientInfo(double likelihood, double objVal, double[] grad) {
          super(objVal, grad);
          _likelihood = likelihood;
        }

        public String toString() {
          return "GLM grad info: likelihood = " + _likelihood + super.toString();
        }
      }
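      // Explanatory note (added): the gradient solvers below all work with the penalized objective
      //   obj(beta) = obj_reg * L(beta) + (l2pen / 2) * ||beta||_2^2   (intercept excluded)
      //             [+ 0.5 * sum_i rho_i * (beta_i - beta_given_i)^2 when proximal constraints are set],
      // where L is the negative log-likelihood and _obj_reg is a normalizing constant (typically 1/nobs),
      // which keeps objectives and deviances comparable across datasets and CV folds.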
      /**
       * Gradient and line-search computation for L_BFGS, and an L_BFGS solver wrapper (for ADMM).
       */
      public static final class GLMGradientSolver implements GradientSolver {
        final GLMParameters _parms;
        final DataInfo _dinfo;
        final BetaConstraint _bc;
        final double _l2pen; // l2 penalty
        double[][] _betaMultinomial;
        final Job _job;

        public GLMGradientSolver(Job job, GLMParameters glmp, DataInfo dinfo, double l2pen, BetaConstraint bc) {
          _job = job;
          _bc = bc;
          _parms = glmp;
          _dinfo = dinfo;
          _l2pen = l2pen;
        }

        @Override
        public GLMGradientInfo getGradient(double[] beta) {
          if (_parms._family == Family.multinomial) {
            if (_betaMultinomial == null) {
              int nclasses = beta.length / (_dinfo.fullN() + 1);
              assert beta.length % (_dinfo.fullN() + 1) == 0 : "beta len = " + beta.length + ", fullN + 1 == " + (_dinfo.fullN() + 1);
              _betaMultinomial = new double[nclasses][];
              for (int i = 0; i < nclasses; ++i)
                _betaMultinomial[i] = MemoryManager.malloc8d(_dinfo.fullN() + 1);
            }
            int off = 0;
            for (int i = 0; i < _betaMultinomial.length; ++i) {
              System.arraycopy(beta, off, _betaMultinomial[i], 0, _betaMultinomial[i].length);
              off += _betaMultinomial[i].length;
            }
            GLMMultinomialGradientTask gt = new GLMMultinomialGradientTask(_job, _dinfo, _l2pen, _betaMultinomial, _parms._obj_reg).doAll(_dinfo._adaptedFrame);
            double l2pen = 0;
            for (double[] b : _betaMultinomial)
              l2pen += ArrayUtils.l2norm2(b, _dinfo._intercept);
            double[] grad = gt.gradient();
            if (!_parms._intercept) {
              for (int i = _dinfo.fullN(); i < beta.length; i += _dinfo.fullN() + 1)
                grad[i] = 0;
            }
            return new GLMGradientInfo(gt._likelihood, gt._likelihood * _parms._obj_reg + .5 * _l2pen * l2pen, grad);
          } else {
            assert beta.length == _dinfo.fullN() + 1;
            assert _parms._intercept || (beta[beta.length - 1] == 0);
            // dispatch to a specialized gradient task where a fast path exists for the family/link combination
            GLMGradientTask gt;
            if (_parms._family == Family.binomial && _parms._link == Link.logit)
              gt = new GLMBinomialGradientTask(_job == null ? null : _job._key, _dinfo, _parms, _l2pen, beta).doAll(_dinfo._adaptedFrame);
            else if (_parms._family == Family.gaussian && _parms._link == Link.identity)
              gt = new GLMGaussianGradientTask(_job == null ? null : _job._key, _dinfo, _parms, _l2pen, beta).doAll(_dinfo._adaptedFrame);
            else if (_parms._family == Family.poisson && _parms._link == Link.log)
              gt = new GLMPoissonGradientTask(_job == null ? null : _job._key, _dinfo, _parms, _l2pen, beta).doAll(_dinfo._adaptedFrame);
            else if (_parms._family == Family.quasibinomial)
              gt = new GLMQuasiBinomialGradientTask(_job == null ? null : _job._key, _dinfo, _parms, _l2pen, beta).doAll(_dinfo._adaptedFrame);
            else
              gt = new GLMGenericGradientTask(_job == null ? null : _job._key, _dinfo, _parms, _l2pen, beta).doAll(_dinfo._adaptedFrame);
            double[] gradient = gt._gradient;
            double likelihood = gt._likelihood;
            if (!_parms._intercept) // no intercept, zero out the gradient's intercept entry
              gradient[gradient.length - 1] = 0;
            double obj = likelihood * _parms._obj_reg + .5 * _l2pen * ArrayUtils.l2norm2(beta, true);
            if (_bc != null && _bc._betaGiven != null && _bc._rho != null)
              obj = ProximalGradientSolver.proximal_gradient(gradient, obj, beta, _bc._betaGiven, _bc._rho);
            return new GLMGradientInfo(likelihood, obj, gradient);
          }
        }

        @Override
        public GradientInfo getObjective(double[] beta) {
          double l = new GLMResDevTask(_job._key, _dinfo, _parms, beta).doAll(_dinfo._adaptedFrame)._likelihood;
          return new GLMGradientInfo(l, l * _parms._obj_reg + .5 * _l2pen * ArrayUtils.l2norm2(beta, true), null);
        }
      }

      protected static double sparseOffset(double[] beta, DataInfo dinfo) {
        double etaOffset = 0;
        if (dinfo._normMul != null && dinfo._normSub != null && beta != null) {
          int ns = dinfo.numStart();
          for (int i = 0; i < dinfo._nums; ++i)
            etaOffset -= beta[i + ns] * dinfo._normSub[i] * dinfo._normMul[i];
        }
        return etaOffset;
      }

      public final class BetaConstraint extends Iced {
        double[] _betaStart;
        double[] _betaGiven;
        double[] _rho;
        double[] _betaLB;
        double[] _betaUB;

        public BetaConstraint() {
          if (_parms._non_negative) setNonNegative();
        }

        public void setNonNegative() {
          if (_betaLB == null) {
            _betaLB = MemoryManager.malloc8d(_dinfo.fullN() + 1);
            _betaLB[_dinfo.fullN()] = Double.NEGATIVE_INFINITY; // intercept stays unbounded
          } else for (int i = 0; i < _betaLB.length - 1; ++i)
            _betaLB[i] = Math.max(0, _betaLB[i]);
          if (_betaUB == null) {
            _betaUB = MemoryManager.malloc8d(_dinfo.fullN() + 1);
            Arrays.fill(_betaUB, Double.POSITIVE_INFINITY);
          }
        }

        public double applyBounds(double d, int i) {
          if (_betaLB != null && d < _betaLB[i]) return _betaLB[i];
          if (_betaUB != null && d > _betaUB[i]) return _betaUB[i];
          return d;
        }

        public BetaConstraint(Frame beta_constraints) {
          Vec v = beta_constraints.vec("names");
          String[] dom;
          int[] map;
          if (v.isString()) {
            dom = new String[(int) v.length()];
            map = new int[dom.length];
            BufferedString tmpStr = new BufferedString();
            for (int i = 0; i < dom.length; ++i) {
              dom[i] = v.atStr(tmpStr, i).toString();
              map[i] = i;
            }
            // check for dups
            String[] sortedDom = dom.clone();
            Arrays.sort(sortedDom);
            for (int i = 1; i < sortedDom.length; ++i)
              if (sortedDom[i - 1].equals(sortedDom[i]))
                throw new IllegalArgumentException("Illegal beta constraints file, got duplicate constraint for predictor '" + sortedDom[i - 1] + "'!");
          } else if (v.isCategorical()) {
            dom = v.domain();
            map = FrameUtils.asInts(v);
            // check for dups
            int[] sortedMap = MemoryManager.arrayCopyOf(map, map.length);
            Arrays.sort(sortedMap);
            for (int i = 1; i < sortedMap.length; ++i)
              if (sortedMap[i - 1] == sortedMap[i])
                throw new IllegalArgumentException("Illegal beta constraints file, got duplicate constraint for predictor '" + dom[sortedMap[i - 1]] + "'!");
          } else
            throw new IllegalArgumentException("Illegal beta constraints file, names column expected to contain column names (strings)");
          // for now only categoricals are allowed here
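          // Explanatory note (added; the column values below are made up): a beta_constraints frame
          // is a per-coefficient table keyed by the "names" column, e.g.
          //   names        lower_bounds  upper_bounds  beta_given  rho
          //   "AGE"        0.0           1.0           0.5         0.1
          //   "Intercept"  -Inf          Inf           0.0         0.0
          // The dom/map pair built above links each row to an index in the expanded coefficient vector.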
          String[] names = ArrayUtils.append(_dinfo.coefNames(), "Intercept");
          if (!Arrays.deepEquals(dom, names)) { // need mapping
            HashMap<String, Integer> m = new HashMap<String, Integer>();
            for (int i = 0; i < names.length; ++i)
              m.put(names[i], i);
            int[] newMap = MemoryManager.malloc4(dom.length);
            for (int i = 0; i < map.length; ++i) {
              if (_removedCols.contains(dom[map[i]])) {
                newMap[i] = -1;
                continue;
              }
              Integer I = m.get(dom[map[i]]);
              if (I == null)
                throw new IllegalArgumentException("Unrecognized coefficient name in beta-constraint file, unknown name '" + dom[map[i]] + "'");
              newMap[i] = I;
            }
            map = newMap;
          }
          final int numoff = _dinfo.numStart();
          String[] valid_col_names = new String[]{"names", "beta_given", "beta_start", "lower_bounds", "upper_bounds", "rho", "mean", "std_dev"};
          Arrays.sort(valid_col_names);
          for (String s : beta_constraints.names())
            if (Arrays.binarySearch(valid_col_names, s) < 0)
              error("beta_constraints", "Unknown column name '" + s + "'");
          if ((v = beta_constraints.vec("beta_start")) != null) {
            _betaStart = MemoryManager.malloc8d(_dinfo.fullN() + (_dinfo._intercept ? 1 : 0));
            for (int i = 0; i < (int) v.length(); ++i)
              if (map[i] != -1) _betaStart[map[i]] = v.at(i);
          }
          if ((v = beta_constraints.vec("beta_given")) != null) {
            _betaGiven = MemoryManager.malloc8d(_dinfo.fullN() + (_dinfo._intercept ? 1 : 0));
            for (int i = 0; i < (int) v.length(); ++i)
              if (map[i] != -1) _betaGiven[map[i]] = v.at(i);
          }
          if ((v = beta_constraints.vec("upper_bounds")) != null) {
            _betaUB = MemoryManager.malloc8d(_dinfo.fullN() + (_dinfo._intercept ? 1 : 0));
            Arrays.fill(_betaUB, Double.POSITIVE_INFINITY);
            for (int i = 0; i < (int) v.length(); ++i)
              if (map[i] != -1) _betaUB[map[i]] = v.at(i);
          }
          if ((v = beta_constraints.vec("lower_bounds")) != null) {
            _betaLB = MemoryManager.malloc8d(_dinfo.fullN() + (_dinfo._intercept ? 1 : 0));
            Arrays.fill(_betaLB, Double.NEGATIVE_INFINITY);
            for (int i = 0; i < (int) v.length(); ++i)
              if (map[i] != -1) _betaLB[map[i]] = v.at(i);
          }
          if ((v = beta_constraints.vec("rho")) != null) {
            _rho = MemoryManager.malloc8d(_dinfo.fullN() + (_dinfo._intercept ? 1 : 0));
            for (int i = 0; i < (int) v.length(); ++i)
              if (map[i] != -1) _rho[map[i]] = v.at(i);
          }
          // mean override (for data standardization)
          if ((v = beta_constraints.vec("mean")) != null) {
            _parms._stdOverride = true;
            for (int i = 0; i < v.length(); ++i) {
              if (!v.isNA(i) && map[i] != -1) {
                int idx = map == null ? i : map[i];
                if (idx > _dinfo.numStart() && idx < _dinfo.fullN()) {
                  _dinfo._normSub[idx - _dinfo.numStart()] = v.at(i);
                } else {
                  // categorical or Intercept, will be ignored
                }
              }
            }
          }
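          // Explanatory note (added): the "mean"/"std_dev" columns let the user override the automatic
          // standardization stats, e.g. to reproduce coefficients fit on data standardized elsewhere;
          // _normSub holds the per-column means subtracted from numeric predictors and _normMul the
          // reciprocal standard deviations they are scaled by.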
          // standard deviation override (for data standardization)
          if ((v = beta_constraints.vec("std_dev")) != null) {
            _parms._stdOverride = true;
            for (int i = 0; i < v.length(); ++i) {
              if (!v.isNA(i) && map[i] != -1) {
                int idx = map == null ? i : map[i];
                if (idx > _dinfo.numStart() && idx < _dinfo.fullN()) {
                  _dinfo._normMul[idx - _dinfo.numStart()] = 1.0 / v.at(i);
                } else {
                  // categorical or Intercept, will be ignored
                }
              }
            }
          }
          if (_dinfo._normMul != null) {
            double normG = 0, normS = 0, normLB = 0, normUB = 0;
            for (int i = numoff; i < _dinfo.fullN(); ++i) {
              double s = _dinfo._normSub[i - numoff];
              double d = 1.0 / _dinfo._normMul[i - numoff];
              if (_betaUB != null && !Double.isInfinite(_betaUB[i])) {
                normUB *= s;
                _betaUB[i] *= d;
              }
              if (_betaLB != null && !Double.isInfinite(_betaLB[i])) { // was testing _betaUB[i]; fixed the copy-paste slip
                normLB *= s;
                _betaLB[i] *= d;
              }
              if (_betaGiven != null) {
                normG += _betaGiven[i] * s;
                _betaGiven[i] *= d;
              }
              if (_betaStart != null) {
                normS += _betaStart[i] * s;
                _betaStart[i] *= d;
              }
            }
            if (_dinfo._intercept) {
              int n = _dinfo.fullN();
              if (_betaGiven != null) _betaGiven[n] += normG;
              if (_betaStart != null) _betaStart[n] += normS;
              if (_betaLB != null) _betaLB[n] += normLB;
              if (_betaUB != null) _betaUB[n] += normUB;
            }
          }
          if (_betaStart == null && _betaGiven != null)
            _betaStart = _betaGiven.clone();
          if (_betaStart != null) {
            if (_betaLB != null || _betaUB != null) {
              for (int i = 0; i < _betaStart.length; ++i) {
                if (_betaLB != null && _betaLB[i] > _betaStart[i])
                  _betaStart[i] = _betaLB[i];
                if (_betaUB != null && _betaUB[i] < _betaStart[i])
                  _betaStart[i] = _betaUB[i];
              }
            }
          }
          if (_parms._non_negative) setNonNegative();
          check();
        }

        public String toString() {
          double[][] ary = new double[_betaGiven.length][3];
          for (int i = 0; i < _betaGiven.length; ++i) {
            ary[i][0] = _betaGiven[i];
            ary[i][1] = _betaLB[i];
            ary[i][2] = _betaUB[i];
          }
          return ArrayUtils.pprint(ary);
        }

        public boolean hasBounds() {
          if (_betaLB != null)
            for (double d : _betaLB)
              if (!Double.isInfinite(d)) return true;
          if (_betaUB != null)
            for (double d : _betaUB)
              if (!Double.isInfinite(d)) return true;
          return false;
        }

        public boolean hasProximalPenalty() {
          return _betaGiven != null && _rho != null && ArrayUtils.countNonzeros(_rho) > 0;
        }

        public void adjustGradient(double[] beta, double[] grad) {
          if (_betaGiven != null && _rho != null) {
            for (int i = 0; i < _betaGiven.length; ++i) {
              double diff = beta[i] - _betaGiven[i];
              grad[i] += _rho[i] * diff;
            }
          }
        }

        double proxPen(double[] beta) {
          double res = 0;
          if (_betaGiven != null && _rho != null) {
            for (int i = 0; i < _betaGiven.length; ++i) {
              double diff = beta[i] - _betaGiven[i];
              res += _rho[i] * diff * diff;
            }
            res *= .5;
          }
          return res;
        }

        public void check() {
          if (_betaLB != null && _betaUB != null)
            for (int i = 0; i < _betaLB.length; ++i)
              if (!(_betaLB[i] <= _betaUB[i]))
                throw new IllegalArgumentException("lower bounds must be <= upper bounds, " + _betaLB[i] + " !<= " + _betaUB[i]);
        }

        public BetaConstraint filterExpandedColumns(int[] activeCols) {
          BetaConstraint res = new BetaConstraint();
          if (_betaLB != null) res._betaLB = ArrayUtils.select(_betaLB, activeCols);
          if (_betaUB != null) res._betaUB = ArrayUtils.select(_betaUB, activeCols);
          if (_betaGiven != null) res._betaGiven = ArrayUtils.select(_betaGiven, activeCols);
          if (_rho != null) res._rho = ArrayUtils.select(_rho, activeCols);
          if (_betaStart != null) res._betaStart = ArrayUtils.select(_betaStart, activeCols);
          return res;
        }
      }
    }
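// Hedged usage sketch (added; illustrative only). The frame key "train.hex" and the column name "y"
// are hypothetical, and the exact builder API should be checked against the surrounding codebase:
//   GLMParameters parms = new GLMParameters(Family.binomial);
//   parms._train = Key.make("train.hex");
//   parms._response_column = "y";
//   parms._lambda_search = true;
//   GLMModel model = new GLM(parms).trainModel().get();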