package hex.glm;
import hex.FrameTask.DataInfo;
import hex.VarImp;
import hex.glm.GLMParams.Family;
import hex.glm.GLMValidation.GLMXValidation;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.api.DocGen;
import water.api.Request.API;
import water.fvec.Chunk;
import water.util.Utils;
import java.util.Arrays;
import java.util.HashMap;
public class GLMModel extends Model implements Comparable<GLMModel> {
static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
@API(help="lambda_value max, smallest lambda_value which drives all coefficients to zero")
final double lambda_max;
@API(help="mean of response in the training dataset")
public final double ymu;
@API(help="actual expected mean of the response (given by the user before running the model or ymu)")
final double prior;
@API(help="job key assigned to the job building this model")
final Key job_key;
@API(help = "Model parameters", json = true)
final private GLM2 parameters;
@Override public final GLM2 get_params() { return parameters; }
@Override public final Request2 job() { return get_params(); }
@API(help="Input data info")
DataInfo data_info;
@API(help="Decision threshold.")
double threshold;
@API(help="glm params")
final GLMParams glm;
@API(help="beta epsilon - stop iterating when beta diff is below this threshold.")
final double beta_eps;
@API(help="regularization parameter driving proportion of L1/L2 penalty.")
final double alpha;
@API(help="column names including expanded categorical values")
public String [] coefficients_names;
@API(help="index of lambda_value giving best results")
int best_lambda_idx;
public DataInfo dinfo(){return data_info;}
public Key [] xvalModels() {
if(submodels == null)return null;
for(Submodel sm:submodels)
if(sm.xvalidation instanceof GLMXValidation){
return ((GLMXValidation)sm.xvalidation).xval_models;
}
return null;
}
public double auc(){
if(glm.family == Family.binomial && submodels != null && submodels[best_lambda_idx].validation != null) {
Submodel sm = submodels[best_lambda_idx];
return sm.xvalidation != null?sm.xvalidation.auc:sm.validation.auc;
}
return -1;
}
public double aic(){
if(submodels != null && submodels[best_lambda_idx].validation != null){
Submodel sm = submodels[best_lambda_idx];
return sm.xvalidation != null?sm.xvalidation.aic:sm.validation.aic;
}
return Double.MAX_VALUE;
}
public double devExplained(){
if(submodels == null || submodels[best_lambda_idx].validation == null)
return 0;
Submodel sm = submodels[best_lambda_idx];
GLMValidation val = sm.xvalidation == null?sm.validation:sm.xvalidation;
return 1.0 - val.residual_deviance/null_validation.residual_deviance;
}
public static class UnlockModelTask extends DTask.DKeyTask<UnlockModelTask,GLMModel>{
final Key _jobKey;
public UnlockModelTask(H2OCountedCompleter cmp, Key modelKey, Key jobKey){
super(cmp,modelKey);
_jobKey = jobKey;
}
@Override
public void map(GLMModel m) {
Key [] xvals = m.xvalModels();
if(xvals != null){
addToPendingCount(xvals.length);
for(int i = 0; i < xvals.length; ++i)
new UnlockModelTask(this,xvals[i],_jobKey).forkTask();
}
m.unlock(_jobKey);
}
}
public static class DeleteModelTask extends DTask.DKeyTask<DeleteModelTask,GLMModel>{
final Key _modelKey;
public DeleteModelTask(H2OCountedCompleter cmp, Key modelKey){
super(cmp,modelKey);
_modelKey = modelKey;
}
@Override
public void map(GLMModel m) {
Key[] xvals = m.xvalModels();
if (xvals != null) {
addToPendingCount(xvals.length);
for (int i = 0; i < xvals.length; ++i)
new DeleteModelTask(this, xvals[i]).forkTask();
}
m.delete();
}
}
@Override public GLMModel clone(){
GLMModel res = (GLMModel)super.clone();
res.submodels = submodels.clone();
if(warnings != null)
res.warnings = warnings.clone();
else
res.warnings = new String[0];
return res;
}
@Override
public int compareTo(GLMModel m){
// assert m._dataKey.equals(_dataKey);
assert m.glm.family == glm.family;
assert m.glm.link == glm.link;
switch(glm.family){
case binomial: // compare by AUC, higher is better
return (int)(1e6*(m.auc()-auc()));
case gamma: // compare by percentage of explained deviance, higher is better
return (int)(100*(m.devExplained()-devExplained()));
default: // compare by AICs by default, lower is better
return (int)(100*(aic()- m.aic()));
}
}
@API(help="Overall run time")
long run_time;
@API(help="computation started at")
long start_time;
// fully expanded beta used for scoring
private double [] global_beta;
@API(help="Validation of the null model")
public GLMValidation null_validation;
static class Submodel extends Iced {
static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
@API(help="lambda_value value used for computation of this submodel")
final double lambda_value;
@API(help="number of iterations computed.")
final int iteration;
@API(help="running time of the algo in ms.")
final long run_time;
@API(help="Validation")
GLMValidation validation;
@API(help="X-Validation")
GLMValidation xvalidation;
@API(help="Beta vector containing model coefficients.") double [] beta;
@API(help="Beta vector containing normalized coefficients (coefficients obtained on normalized data).") double [] norm_beta;
final int rank;
@API(help="Indexes to the coefficient_names array containing names (and order) of the non-zero coefficients in this model.")
final int [] idxs;
@API(help="sparseCoefFlag")
final boolean sparseCoef;
public Submodel(double lambda , double [] beta, double [] norm_beta, long run_time, int iteration, boolean sparseCoef){
this.lambda_value = lambda;
this.run_time = run_time;
this.iteration = iteration;
int r = 0;
if(beta != null){
final double [] b = norm_beta != null?norm_beta:beta;
// grab the indeces of non-zero coefficients
for(double d:beta)if(d != 0)++r;
idxs = MemoryManager.malloc4(sparseCoef?r:beta.length);
int j = 0;
for(int i = 0; i < beta.length; ++i)
if(!sparseCoef || beta[i] != 0)idxs[j++] = i;
j = 0;
this.beta = MemoryManager.malloc8d(idxs.length);
for(int i:idxs)
this.beta[j++] = beta[i];
if(norm_beta != null){
j = 0;
this.norm_beta = MemoryManager.malloc8d(idxs.length);
for(int i:idxs) this.norm_beta[j++] = norm_beta[i];
}
} else idxs = null;
rank = r;
this.sparseCoef = sparseCoef;
}
}
@API(help = "models computed for particular lambda_value values")
Submodel [] submodels;
final boolean useAllFactorLevels;
@API(help = "Variable importances", json=true)
VarImp variable_importances;
public GLMModel(GLM2 parameters, Key selfKey, Key dataKey, GLMParams glmp, String [] coefficients_names, double [] beta, DataInfo dinfo, double threshold) {
super(selfKey,dataKey,dinfo._adaptedFrame,null);
this.parameters = (GLM2)parameters.clone();
submodels = new Submodel[]{new Submodel(0,beta,null,-1,-1,false)};
this.coefficients_names = coefficients_names;
alpha = 0;
lambda_max = Double.NaN;
this.threshold = threshold;
useAllFactorLevels = dinfo._useAllFactorLevels;
global_beta = submodels[0].beta.clone();
this.glm = glmp;
this.ymu = Double.NaN;
this.prior = Double.NaN;
this.warnings = new String[]{"Hand made model."};
this.data_info = dinfo;
this.beta_eps = Double.NaN;
this.job_key = null;
best_lambda_idx = 0;
}
public GLMModel(GLM2 job, Key selfKey, DataInfo dinfo, GLMParams glm, GLMValidation nullVal, double beta_eps, double alpha, double lambda_max, double ymu, double prior) {
super(selfKey,job.source._key == null ? dinfo._frameKey : job.source._key,dinfo._adaptedFrame, /* priorClassDistribution */ null);
parameters = Job.hygiene((GLM2) job.clone());
job_key = job.self();
this.ymu = ymu;
this.prior = prior;
this.glm = glm;
threshold = 0.5;
this.data_info = dinfo;
this.warnings = new String[0];
this.alpha = alpha;
this.lambda_max = lambda_max;
this.beta_eps = beta_eps;
submodels = new Submodel[0];
run_time = 0;
start_time = System.currentTimeMillis();
coefficients_names = coefNames();
useAllFactorLevels = dinfo._useAllFactorLevels;
null_validation = nullVal;
null_validation.null_deviance = null_validation.residual_deviance;
}
public void pickBestModel(boolean useAuc){
int bestId = submodels.length-1;
if(submodels.length > 2) {
boolean xval = false;
GLMValidation bestVal = null;
for(Submodel sm:submodels) {
if(sm.xvalidation != null) {
xval = true;
bestVal = sm.xvalidation;
}
}
if(!xval)
bestVal = submodels[0].validation;
for (int i = 1; i < submodels.length; ++i) {
GLMValidation val = xval ? submodels[i].xvalidation : submodels[i].validation;
if (val == null || val == bestVal) continue;
if ((useAuc && val.auc > bestVal.auc)
|| (xval && val.residual_deviance < bestVal.residual_deviance)
|| (((bestVal.residual_deviance - val.residual_deviance) / null_validation.residual_deviance) >= 0.01)) {
bestVal = val;
bestId = i;
}
}
}
best_lambda_idx = bestId;
setSubmodelIdx(bestId);
}
// public static void setSubmodel(H2OCountedCompleter cmp, Key modelKey, final double lambda, double[] beta, double[] norm_beta, int iteration, long runtime, boolean sparseCoef){
public static void setSubmodel(H2OCountedCompleter cmp, Key modelKey, final double lambda, double[] beta, double[] norm_beta, int iteration, long runtime, boolean sparseCoef){
setSubmodel(cmp,modelKey,lambda,beta,norm_beta,iteration,runtime,sparseCoef,null);
}
public static class GetScoringModelTask extends DTask.DKeyTask<GetScoringModelTask,GLMModel> {
final double _lambda;
public GLMModel _res;
public GetScoringModelTask(H2OCountedCompleter cmp, Key modelKey, double lambda){
super(cmp,modelKey);
_lambda = lambda;
}
@Override
public void map(GLMModel m) {
_res = m.clone();
Submodel sm = Double.isNaN(_lambda)?_res.submodels[_res.best_lambda_idx]:_res.submodelForLambda(_lambda);
assert sm != null : "GLM[" + m._key + "]: missing submodel for lambda " + _lambda;
sm = (Submodel) sm.clone();
_res.submodels = new Submodel[]{sm};
_res.setSubmodelIdx(0);
}
}
public static void setXvalidation(H2OCountedCompleter cmp, Key modelKey, final double lambda, final GLMValidation val){
// expected cmp has already set correct pending count
new TAtomic<GLMModel>(cmp){
@Override
public GLMModel atomic(GLMModel old) {
if(old == null)return old; // job could've been cancelled
old.submodels = old.submodels.clone();
int id = old.submodelIdForLambda(lambda);
old.submodels[id] = (Submodel)old.submodels[id].clone();
old.submodels[id].xvalidation = val;
old.pickBestModel(false);
return old;
}
}.fork(modelKey);
}
public static void setSubmodel(H2OCountedCompleter cmp, Key modelKey, final double lambda, double[] beta, double[] norm_beta, final int iteration, long runtime, boolean sparseCoef, final GLMValidation val){
final Submodel sm = new Submodel(lambda,beta, norm_beta, runtime, iteration,sparseCoef);
sm.validation = val;
cmp.addToPendingCount(1);
new TAtomic<GLMModel>(cmp){
@Override
public GLMModel atomic(GLMModel old) {
if(old == null)return old; // job could've been cancelled!
if(old.submodels == null){
old.submodels = new Submodel[]{sm};
} else {
int id = old.submodelIdForLambda(lambda);
if (id < 0) {
id = -id - 1;
old.submodels = Arrays.copyOf(old.submodels, old.submodels.length + 1);
for (int i = old.submodels.length - 1; i > id; --i)
old.submodels[i] = old.submodels[i - 1];
} else if (old.submodels[id].iteration > sm.iteration)
return old;
else
old.submodels = old.submodels.clone();
old.submodels[id] = sm;
old.run_time = Math.max(old.run_time,sm.run_time);
}
old.pickBestModel(false);
return old;
}
}.fork(modelKey);
}
public void addSubmodel(double lambda){
submodels = Arrays.copyOf(submodels,submodels.length+1);
run_time = (System.currentTimeMillis()-start_time);
submodels[submodels.length-1] = new Submodel(lambda,null, null, 0, 0,true);
}
public void dropSubmodel() {
submodels = Arrays.copyOf(submodels,submodels.length-1);
}
public double lambda(){
if(submodels == null)return Double.NaN;
return submodels[best_lambda_idx].lambda_value;
}
public GLMValidation validation(){
return submodels[best_lambda_idx].validation;
}
public int iteration(){
Submodel [] sm = submodels;
for(int i = sm.length-1; i >= 0; --i)
if(sm[i] != null && sm[i].iteration != 0)
return sm[i].iteration;
return 0;
}
public double [] beta(){return global_beta;}
public double [] norm_beta(double lambda){
int i = submodels.length-1;
for(;i>=0;--i)
if(submodels[i].lambda_value == lambda) {
if(submodels[i].norm_beta == null)
return beta(); // not normalized
double [] res = MemoryManager.malloc8d(beta().length);
int k = 0;
for(int j:submodels[i].idxs)
res[j] = submodels[i].norm_beta[k++];
return res;
}
throw new RuntimeException("No submodel for lambda_value = " + lambda);
}
public void addWarning(String w){
final int n = warnings.length;
warnings = Arrays.copyOf(warnings,warnings.length+1);
warnings[n] = w;
}
@Override protected float[] score0(double[] data, float[] preds) {
double eta = 0.0;
final double [] b = beta();
if(!useAllFactorLevels){ // skip level 0 of all factors
for(int i = 0; i < data_info._catOffsets.length-1; ++i) if(data[i] != 0)
eta += b[data_info._catOffsets[i] + (int)(data[i]-1)];
} else { // do not skip any levels!
for(int i = 0; i < data_info._catOffsets.length-1; ++i)
eta += b[data_info._catOffsets[i] + (int)data[i]];
}
final int noff = data_info.numStart() - data_info._cats;
for(int i = data_info._cats; i < data.length; ++i)
eta += b[noff+i]*data[i];
if(data_info._hasIntercept)
eta += b[b.length-1]; // add intercept
double mu = glm.linkInv(eta);
preds[0] = (float)mu;
if( glm.family == Family.binomial ) { // threshold for prediction
if(Double.isNaN(mu)){
preds[0] = Float.NaN;
preds[1] = Float.NaN;
preds[2] = Float.NaN;
} else {
preds[0] = (mu >= threshold ? 1 : 0);
preds[1] = 1.0f - (float)mu; // class 0
preds[2] = (float)mu; // class 1
}
}
return preds;
}
public final int ncoefs() {return beta().length;}
@Override public int nclasses(){ return glm.family == Family.binomial?2:1;}
@Override
public String [] classNames(){
String [] res = super.classNames();
if(glm.getFamily() == Family.binomial && res == null)
res = new String[]{"0","1"};
return res;
}
// public static void setAndTestValidation(final H2OCountedCompleter cmp,final Key modelKey, final double lambda, final GLMValidation val){
// if(cmp != null)cmp.addToPendingCount(1);
// new TAtomic<GLMModel>(cmp){
// @Override
// public GLMModel atomic(GLMModel old) {
// if(old == null)return old;
// old.submodels = old.submodels.clone();
// Submodel sm = old.submodelForLambda(lambda);
// if(sm == null)return old;
// if(val instanceof GLMXValidation)
// sm.xvalidation = (GLMXValidation)val;
// else
// sm.validation = val;
// old.pickBestModel(false);
// return old;
// }
// }.fork(modelKey);
// }
public static class GLMValidationTask<T extends GLMValidationTask<T>> extends MRTask2<T> {
protected final GLMModel _model;
protected GLMValidation _res;
public final double _lambda;
public boolean _improved;
Key _jobKey;
public static Key makeKey(){return Key.make("__GLMValidation_" + Key.make().toString());}
public GLMValidationTask(GLMModel model, double lambda){this(model,lambda,null);}
public GLMValidationTask(GLMModel model, double lambda, H2OCountedCompleter completer){super(completer); _lambda = lambda; _model = model;}
@Override public void map(Chunk [] chunks){
_res = new GLMValidation(null,_model.glm,_model.rank(_lambda));
final int nrows = chunks[0]._len;
double [] row = MemoryManager.malloc8d(_model._names.length);
float [] preds = MemoryManager.malloc4f(_model.glm.family == Family.binomial?3:1);
OUTER:
for(int i = 0; i < nrows; ++i){
if(chunks[chunks.length-1].isNA0(i))continue;
for(int j = 0; j < chunks.length-1; ++j){
if(chunks[j].isNA0(i))continue OUTER;
row[j] = chunks[j].at0(i);
}
_model.score0(row, preds);
double response = chunks[chunks.length-1].at0(i);
_res.add(response, _model.glm.family == Family.binomial?preds[2]:preds[0]);
}
}
@Override public void reduce(GLMValidationTask gval){_res.add(gval._res);}
@Override public void postGlobal(){
_res.computeAIC();
_res.computeAUC();
}
}
// use general score to reduce number of possible different code paths
public static class GLMXValidationTask extends GLMValidationTask<GLMXValidationTask>{
protected final GLMModel [] _xmodels;
protected GLMValidation [] _xvals;
long _nobs;
final float [] _thresholds;
public static Key makeKey(){return Key.make("__GLMValidation_" + Key.make().toString());}
public GLMXValidationTask(GLMModel mainModel,double lambda, GLMModel [] xmodels, float [] thresholds){this(mainModel,lambda,xmodels,thresholds,null);}
public GLMXValidationTask(GLMModel mainModel,double lambda, GLMModel [] xmodels, float [] thresholds, final H2OCountedCompleter completer){
super(mainModel, lambda,completer);
_xmodels = xmodels;
_thresholds = thresholds;
}
@Override public void map(Chunk [] chunks){
_xvals = new GLMValidation[_xmodels.length];
for(int i = 0; i < _xmodels.length; ++i)
_xvals[i] = new GLMValidation(null,_xmodels[i].glm,_xmodels[i].rank(),_thresholds);
final int nrows = chunks[0]._len;
long start = chunks[0]._start;
double [] row = MemoryManager.malloc8d(_xmodels[0]._names.length);
float [] preds = MemoryManager.malloc4f(_xmodels[0].glm.family == Family.binomial?3:1);
OUTER:
for(int i = 0; i < nrows; ++i){
if(chunks[chunks.length-1].isNA0(i))continue;
for(int j = 0; j < chunks.length-1; ++j){
if(chunks[j].isNA0(i))continue OUTER;
row[j] = chunks[j].at0(i);
}
++_nobs;
final int mid = (int)((i + start) % _xmodels.length);
final GLMModel model = _xmodels[mid];
final GLMValidation val = _xvals[mid];
model.score0(row, preds);
double response = chunks[chunks.length-1].at80(i);
val.add(response, model.glm.family == Family.binomial?preds[2]:preds[0]);
}
}
@Override public void reduce(GLMXValidationTask gval){
_nobs += gval._nobs;
for(int i = 0; i < _xvals.length; ++i)
_xvals[i].add(gval._xvals[i]);}
@Override public void postGlobal() {
H2OCountedCompleter cmp = (H2OCountedCompleter)getCompleter();
if(cmp != null)cmp.addToPendingCount(_xvals.length + 1);
for (int i = 0; i < _xvals.length; ++i) {
_xvals[i].computeAIC();
_xvals[i].computeAUC();
_xvals[i].nobs = _nobs - _xvals[i].nobs;
_xvals[i].null_deviance = _xmodels[i].null_validation.residual_deviance;
GLMModel.setXvalidation(cmp, _xmodels[i]._key, _lambda, _xvals[i]);
}
GLMXValidation xval = new GLMXValidation(_model, _xmodels, _xvals, _lambda, _nobs,_thresholds);
xval.null_deviance = _model.null_validation.residual_deviance;
GLMModel.setXvalidation(cmp, _model._key, _lambda, xval);
}
}
public GLMParams getParams() {
return glm;
}
@Override
public String toString(){
return ("GLM Model (key=" + _key + " , trained on " + _dataKey + ", family = " + glm.family + ", link = " + glm.link + ", #iterations = " + iteration() + ")");
}
public int rank() {return rank(submodels[best_lambda_idx].lambda_value);}
public int submodelIdForLambda(double lambda){
if(lambda > lambda_max)lambda = lambda_max;
int i = submodels.length-1;
for(;i >=0; --i)
// first condition to cover lambda == 0 case (0/0 is Inf in java!)
if(lambda == submodels[i].lambda_value || Math.abs(submodels[i].lambda_value - lambda)/lambda < 1e-5)
return i;
else if(submodels[i].lambda_value > lambda)
return -i-2;
return -1;
}
public Submodel submodelForLambda(double lambda){
if(lambda > lambda_max)
return submodels[0];
int i = submodelIdForLambda(lambda);
return i < 0?null:submodels[i];
}
public int rank(double lambda) {
Submodel sm = submodelForLambda(lambda);
if(sm == null)return 0;
return submodelForLambda(lambda).rank;
}
public void setValidation(GLMValidation val ){
submodels[submodels.length-1].validation = val;
}
public void setSubmodelIdx(int l){
best_lambda_idx = l;
threshold = submodels[l].validation == null?0.5:submodels[l].validation.best_threshold;
if(global_beta == null) global_beta = MemoryManager.malloc8d(this.coefficients_names.length);
else Arrays.fill(global_beta,0);
int j = 0;
for(int i:submodels[l].idxs)
global_beta[i] = submodels[l].beta[j++];
}
/**
* get beta coefficients in a map indexed by name
* @return
*/
public HashMap<String,Double> coefficients(){
HashMap<String, Double> res = new HashMap<String, Double>();
final double [] b = beta();
if(b != null) for(int i = 0; i < b.length; ++i)res.put(coefficients_names[i],b[i]);
return res;
}
private String [] coefNames(){
return Utils.append(data_info.coefNames(),new String[]{"Intercept"});
}
public VarImp varimp() {
return this.variable_importances;
}
protected void maybeComputeVariableImportances() {
GLM2 params = get_params();
this.variable_importances = null;
final double[] b = beta();
if (params.variable_importances && null != b) {
// Warn if we may be returning results that might not include an important (base) level. . .
if (! params.use_all_factor_levels)
this.addWarning("Variable Importance may be missing important variables: because use_all_factor_levels is off the importance of base categorical levels will NOT be included.");
float[] coefs_abs_value = new float[b.length - 1]; // Don't include the Intercept
String[] names = new String[b.length - 1];
for (int i = 0; i < b.length - 1; ++i) {
coefs_abs_value[i] = (float)Math.abs(b[i]);
names[i] = coefficients_names[i];
}
this.variable_importances = new VarImp(coefs_abs_value, names);
}
}
}