package hex.glm;

import hex.FrameTask;
import hex.glm.GLMParams.Family;
import hex.glm.GLMParams.Link;
import hex.gram.Gram;

import java.util.Arrays;

import water.H2O.H2OCountedCompleter;
import water.*;
import water.fvec.Chunk;
import water.util.Utils;

/**
 * Contains all GLM related tasks.
 *
 * @author tomasnykodym
 */
public abstract class GLMTask<T extends GLMTask<T>> extends FrameTask<T> {
  final protected GLMParams _glm;

  public GLMTask(Key jobKey, DataInfo dinfo, GLMParams glm){ this(jobKey, dinfo, glm, null); }
  public GLMTask(Key jobKey, DataInfo dinfo, GLMParams glm, H2OCountedCompleter cmp){
    super(jobKey, dinfo, cmp);
    _glm = glm;
  }

  // Helper to compute eta, i.e. beta * row (categorical indicators, numeric columns and intercept).
  protected final double computeEta(final int ncats, final int [] cats, final int nnums, final double [] nums, final double [] beta){
    double res = 0;
    for(int i = 0; i < ncats; ++i) res += beta[cats[i]];
    final int numStart = _dinfo.numStart();
    for(int i = 0; i < nnums; ++i) res += nums[i] * beta[numStart + i];
    if(_dinfo._hasIntercept) res += beta[beta.length-1]; // intercept
    return res;
  }

  /**
   * Helper task to compute the precise mean, min and max of the response and the number of
   * observations, overall and for each cross-validation fold's training rows.
   * (We skip rows with NAs, so we can't use Vec's mean in general.)
   *
   * @author tomasnykodym
   */
  static class YMUTask extends FrameTask<YMUTask>{
    private long [] _nobs;
    protected double [] _ymu;
    public double [] _ymin;
    public double [] _ymax;
    final int _nfolds;

    public YMUTask(Key jobKey, DataInfo dinfo, int nfolds) { this(jobKey, dinfo, nfolds, null); }
    public YMUTask(Key jobKey, DataInfo dinfo, int nfolds, H2OCountedCompleter cmp) {
      super(jobKey, dinfo, cmp);
      _nfolds = nfolds;
    }

    @Override public void chunkInit(){
      super.chunkInit();
      // slot 0 holds the global statistics, slots 1.._nfolds the per-fold training statistics
      _ymu  = new double[_nfolds+1];
      _nobs = new long  [_nfolds+1];
      _ymax = new double[_nfolds+1];
      _ymin = new double[_nfolds+1];
      Arrays.fill(_ymax, Double.NEGATIVE_INFINITY);
      Arrays.fill(_ymin, Double.POSITIVE_INFINITY);
    }

    @Override protected void processRow(long gid, double[] nums, int ncats, int[] cats, double [] responses) {
      double response = responses[0];
      _ymu[0] += response;
      ++_nobs[0];
      if(response < _ymin[0]) _ymin[0] = response;
      if(response > _ymax[0]) _ymax[0] = response;
      for(int i = 1; i < _nfolds+1; ++i) {
        if(gid % _nfolds == (i-1)) continue; // row is held out for this fold, skip it
        _ymu[i] += response;
        ++_nobs[i];
        if(response < _ymin[i]) _ymin[i] = response;
        if(response > _ymax[i]) _ymax[i] = response;
      }
    }

    @Override public void reduce(YMUTask t){
      if(t._nobs[0] != 0){
        if(_nobs[0] == 0){
          _ymu  = t._ymu;
          _nobs = t._nobs;
          _ymin = t._ymin;
          _ymax = t._ymax;
        } else {
          for(int i = 0; i < _nfolds+1; ++i) {
            if(_nobs[i] + t._nobs[i] == 0) continue;
            // weighted average of the two partial means
            _ymu[i] = _ymu[i] * ((double) _nobs[i] / (_nobs[i] + t._nobs[i])) + t._ymu[i] * t._nobs[i] / (_nobs[i] + t._nobs[i]);
            _nobs[i] += t._nobs[i];
            if(t._ymax[i] > _ymax[i]) _ymax[i] = t._ymax[i];
            if(t._ymin[i] < _ymin[i]) _ymin[i] = t._ymin[i];
          }
        }
      }
    }

    @Override protected void chunkDone(long n){
      for(int i = 0; i < _ymu.length; ++i)
        if(_nobs[i] != 0) _ymu[i] /= _nobs[i];
    }

    public double ymu()  { return ymu(-1); }
    public long   nobs() { return nobs(-1); }
    public double ymu(int foldId)  { return _ymu[foldId+1]; }
    public long   nobs(int foldId) { return _nobs[foldId+1]; }
  }
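
  /*
   * A rough sketch of what the line search below evaluates (read off the constructor code,
   * not an authoritative spec of GLM2's line-search policy): 32 candidate coefficient vectors
   * are built on a geometric grid between the previous and the proposed solution,
   *
   *   beta_k = oldBeta + (GLM2.LS_STEP)^k * (newBeta - oldBeta),   k = 1..32,
   *
   * each candidate clamped element-wise to [lowerBounds, upperBounds] when bounds are given,
   * and each candidate scored by its own GLMIterationTask (validation + gradient, no Gram).
   */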
  public static class GLMLineSearchTask extends GLMTask<GLMLineSearchTask> {
    GLMIterationTask [] _glmts;

    public GLMLineSearchTask(int noff, Key jobKey, DataInfo dinfo, GLMParams glm, double[] oldBeta, double[] newBeta, double [] lowerBounds, double [] upperBounds, double ymu, long nobs, H2OCountedCompleter cmp) {
      super(jobKey, dinfo, glm, cmp);
      double [][] betas = new double[32][];
      double step = GLM2.LS_STEP;
      for(int i = 0; i < betas.length; ++i){
        betas[i] = MemoryManager.malloc8d(newBeta.length);
        for(int j = 0; j < oldBeta.length; ++j) {
          betas[i][j] = oldBeta[j] + step * (newBeta[j] - oldBeta[j]);
          if(lowerBounds != null && betas[i][j] < lowerBounds[j]) betas[i][j] = lowerBounds[j];
          if(upperBounds != null && betas[i][j] > upperBounds[j]) betas[i][j] = upperBounds[j];
        }
        step *= GLM2.LS_STEP;
      }
      // GLMIterationTask(Key jobKey, DataInfo dinfo, GLMParams glm, boolean computeGram, boolean validate, boolean computeGradient, double [] beta, double ymu, double reg, float [] thresholds, H2OCountedCompleter cmp)
      _glmts = new GLMIterationTask[betas.length];
      for(int i = 0; i < _glmts.length; ++i)
        _glmts[i] = new GLMIterationTask(noff, jobKey, dinfo, glm, false, true, true, betas[i], ymu, 1.0/nobs, new float[]{0} /* don't really want CMs! */, null);
    }

    @Override public void chunkInit(){
      _glmts = _glmts.clone();
      for(int i = 0; i < _glmts.length; ++i)
        (_glmts[i] = _glmts[i].clone()).chunkInit();
    }
    @Override public void chunkDone(long n){
      for(int i = 0; i < _glmts.length; ++i) _glmts[i].chunkDone(n);
    }
    @Override public void postGlobal(){
      for(int i = 0; i < _glmts.length; ++i) _glmts[i].postGlobal();
    }
    @Override public final void processRow(long gid, final double [] nums, final int ncats, final int [] cats, double [] responses){
      for(int i = 0; i < _glmts.length; ++i) _glmts[i].processRow(gid, nums, ncats, cats, responses);
    }
    @Override public void reduce(GLMLineSearchTask git){
      for(int i = 0; i < _glmts.length; ++i) _glmts[i].reduce(git._glmts[i]);
    }
  }

  public static class GLMInterceptTask extends MRTask2<GLMInterceptTask> {
    double _xy;
    double _xx;
    double _icpt;
    final double _sub;
    final double _mul;
    final GLMParams _glm;

    public GLMInterceptTask(GLMParams glm, double sub, double mul, double icpt){
      _glm  = glm;
      _sub  = sub;
      _mul  = mul;
      _icpt = icpt;
    }

    @Override public void map(Chunk offset, Chunk response){
      for(int i = 0; i < offset._len; ++i){
        if(offset.isNA0(i) || response.isNA0(i)) continue;
        double eta = _icpt;
        double y   = response.at0(i);
        double mu  = _glm.linkInv(eta + (offset.at0(i) - _sub) * _mul);
        double var = _glm.variance(mu);
        if(var < 1e-5) var = 1e-5; // avoid numerical problems with 0 variance
        double d = _glm.linkDeriv(mu);
        double w = 1.0/(var*d*d);
        double z = eta + (y-mu)*d;
        _xy += w*z;
        _xx += w;
      }
    }
    @Override public void reduce(GLMInterceptTask other){
      _xx += other._xx;
      _xy += other._xy;
    }
    @Override public void postGlobal(){
      _icpt = _xy/_xx;
    }
  }
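
  /*
   * For reference, the IRLS quantities that both GLMInterceptTask above and GLMIterationTask
   * below accumulate per row (a sketch of the standard bookkeeping, matching what the code
   * computes rather than adding to it):
   *
   *   eta = beta'x (+ offset)         // linear predictor
   *   mu  = linkInv(eta)              // fitted mean
   *   d   = linkDeriv(mu)             // derivative of the link at mu
   *   z   = eta + (y - mu) * d        // working response
   *   w   = 1 / (variance(mu) * d^2)  // working weight
   *
   * The weighted Gram X'WX and the vector X'Wz then form the normal equations for the next beta.
   */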
  /**
   * One iteration of GLM: computes the weighted Gram matrix (X'WX), the vector X'Wz and the
   * scalar z'Wz, where z is the IRLS working response, plus (optionally) the gradient and the
   * validation of the previous model.
   *
   * @author tomasnykodym
   */
  public static class GLMIterationTask extends GLMTask<GLMIterationTask> {
    final double [] _beta;
    protected Gram _gram;
    double [] _xy;
    protected double [] _grad;
    double _yy;
    GLMValidation _val; // validation of previous model
    protected final double _reg;
    long _nobs;
    final boolean _validate;
    final float [] _thresholds;
    float [][] _newThresholds;
    int [] _ti;
    final boolean _computeGradient;
    final boolean _computeGram;
    public static final int N_THRESHOLDS = 50;
    final int _noffsets;
    final double _ymu;

    public GLMIterationTask(int noff, Key jobKey, DataInfo dinfo, GLMParams glm, boolean computeGram, boolean validate, boolean computeGradient, double [] beta, double ymu, double reg, float [] thresholds, H2OCountedCompleter cmp) {
      super(jobKey, dinfo, glm, cmp);
      assert beta != null;
      _ymu = ymu;
      _noffsets = noff;
      assert beta == null || beta.length == (dinfo.fullN() + (_dinfo._hasIntercept?1:0) - _noffsets)
        : "beta.length != dinfo.fullN(), beta = " + beta.length + " dinfo = " + dinfo.fullN() + ", noffsets = " + _noffsets;
      _beta = beta;
      _reg = reg;
      _computeGram = computeGram;
      _validate = validate;
      assert thresholds != null;
      _thresholds = _validate ? thresholds : null;
      _computeGradient = computeGradient;
      assert !_computeGradient || validate;
    }

    // When a per-class threshold buffer fills up, sort it and keep every 4th value.
    private void sampleThresholds(int yi){
      _ti[yi] = (_newThresholds[yi].length >> 2);
      try { Arrays.sort(_newThresholds[yi]); } catch(Throwable t){
        System.out.println("got AIOOB during sort?! ary = " + Arrays.toString(_newThresholds[yi]));
        return;
      } // sort throws AIOOB sometimes!
      for(int i = 0; i < _newThresholds[yi].length; i += 4)
        _newThresholds[yi][i >> 2] = _newThresholds[yi][i];
    }

    @Override public void processRow(long gid, final double [] nums, final int ncats, final int [] cats, double [] responses){
      ++_nobs;
      double off = 0;
      for(int i = 1; i <= _noffsets; ++i)
        off += nums[nums.length - i];
      final double y = responses[0];
      assert ((_glm.family != Family.gamma) || y > 0) : "illegal response column, y must be > 0 for family = Gamma.";
      assert ((_glm.family != Family.binomial) || (0 <= y && y <= 1)) : "illegal response column, y must be in <0,1> for family = Binomial, got " + y;
      double w, eta, mu, var, z;
      final int numStart = _dinfo.numStart();
      double d = 1;
      if( _glm.family == Family.gaussian && _glm.link == Link.identity){
        // gaussian family with identity link: plain least squares, no reweighting needed
        w = 1;
        z = _glm.link(y);
        mu = off + ((_validate || _computeGradient) ? computeEta(ncats, cats, nums.length - _noffsets, nums, _beta) : 0);
      } else {
        eta = computeEta(ncats, cats, nums.length - _noffsets, nums, _beta);
        mu = _glm.linkInv(eta + off);
        var = _glm.variance(mu);
        if(var < 1e-5) var = 1e-5; // avoid numerical problems with 0 variance
        d = _glm.linkDeriv(mu);
        z = eta + (y-mu)*d;   // working response
        w = 1.0/(var*d*d);    // working weight
      }
      if(_validate) {
        _val.add(y, mu);
        if(_glm.family == Family.binomial) {
          int yi = (int) y;
          if(_ti[yi] == _newThresholds[yi].length)
            sampleThresholds(yi);
          _newThresholds[yi][_ti[yi]++] = (float) mu;
        }
      }
      assert w >= 0 || Double.isNaN(w) : "invalid weight " + w; // allow NaNs - can occur if line-search is needed!
      final double wz = w * z;
      _yy += wz * z;
      if(_computeGradient || _computeGram){
        final double grad = _computeGradient ? w * d * (mu - y) : 0;
        for(int i = 0; i < ncats; ++i){
          final int ii = cats[i];
          if(_computeGradient) _grad[ii] += grad;
          _xy[ii] += wz;
        }
        for(int i = 0; i < nums.length - _noffsets; ++i){
          _xy[numStart + i] += wz * nums[i];
          if(_computeGradient) _grad[numStart + i] += grad * nums[i];
        }
        if(_dinfo._hasIntercept){
          _xy[_xy.length - 1] += wz;
          if(_computeGradient) _grad[numStart + _dinfo._nums - _noffsets] += grad;
        }
        if(_computeGram) _gram.addRow(nums, ncats, cats, w);
      }
    }
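
    /*
     * Note on the per-class threshold buffers used for the AUC computation (an informal summary
     * of the sampling scheme implemented here, not a spec): each class keeps at most
     * 4*N_THRESHOLDS predicted values; when a buffer fills up, sampleThresholds() sorts it and
     * keeps every 4th entry; chunkDone() trims and sorts the buffers; reduce() combines the two
     * buffers and, if the result reaches 2*N_THRESHOLDS entries, keeps every other entry and
     * truncates back down to N_THRESHOLDS.
     */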
    @Override protected void chunkInit(){
      if(_computeGram) _gram = new Gram(_dinfo.fullN() - _noffsets, _dinfo.largestCat(), _dinfo._nums - _noffsets, _dinfo._cats, _dinfo._hasIntercept);
      _xy = MemoryManager.malloc8d(_dinfo.fullN() + (_dinfo._hasIntercept?1:0) - _noffsets); // optional +1 is for the intercept
      int rank = 0;
      if(_beta != null) for(double d:_beta) if(d != 0) ++rank;
      if(_validate){
        _val = new GLMValidation(null, _glm, rank, _thresholds);
        if(_glm.family == Family.binomial){
          _ti = new int[2];
          _newThresholds = new float[2][N_THRESHOLDS << 2];
        }
      }
      if(_computeGradient)
        _grad = MemoryManager.malloc8d(_dinfo.fullN() + (_dinfo._hasIntercept?1:0) - _noffsets);
    }

    @Override protected void chunkDone(long n){
      // scale the per-chunk sums by _reg (callers typically pass reg = 1/nobs)
      if(_computeGram) _gram.mul(_reg);
      for(int i = 0; i < _xy.length; ++i)
        _xy[i] *= _reg;
      if(_grad != null)
        for(int i = 0; i < _grad.length; ++i)
          _grad[i] *= _reg;
      _yy *= _reg;
      if(_validate && _glm.family == Family.binomial) {
        _newThresholds[0] = Arrays.copyOf(_newThresholds[0], _ti[0]);
        _newThresholds[1] = Arrays.copyOf(_newThresholds[1], _ti[1]);
        Arrays.sort(_newThresholds[0]);
        Arrays.sort(_newThresholds[1]);
      }
    }

    @Override public void reduce(GLMIterationTask git){
      if(_jobKey == null || Job.isRunning(_jobKey)) {
        Utils.add(_xy, git._xy);
        if(_computeGram) _gram.add(git._gram);
        _yy += git._yy;
        _nobs += git._nobs;
        if(_validate) _val.add(git._val);
        if(_computeGradient) Utils.add(_grad, git._grad);
        if(_validate && _glm.family == Family.binomial) {
          _newThresholds[0] = Utils.join(_newThresholds[0], git._newThresholds[0]);
          _newThresholds[1] = Utils.join(_newThresholds[1], git._newThresholds[1]);
          if(_newThresholds[0].length >= 2 * N_THRESHOLDS)
            for(int i = 0; i < 2 * N_THRESHOLDS; i += 2)
              _newThresholds[0][i >> 1] = _newThresholds[0][i];
          if(_newThresholds[0].length > N_THRESHOLDS)
            _newThresholds[0] = Arrays.copyOf(_newThresholds[0], N_THRESHOLDS);
          if(_newThresholds[1].length >= 2 * N_THRESHOLDS)
            for(int i = 0; i < 2 * N_THRESHOLDS; i += 2)
              _newThresholds[1][i >> 1] = _newThresholds[1][i];
          if(_newThresholds[1].length > N_THRESHOLDS)
            _newThresholds[1] = Arrays.copyOf(_newThresholds[1], N_THRESHOLDS);
        }
        super.reduce(git);
      }
    }

    @Override public void postGlobal(){
      if(_val != null){
        _val.computeAIC();
        _val.computeAUC();
      }
    }

    // Gradient with the ridge (L2) part of the elastic-net penalty added; the last element
    // (the intercept, when present) is left unpenalized.
    public double [] gradient(double alpha, double lambda){
      final double [] res = _grad.clone();
      if(_beta != null)
        for(int i = 0; i < res.length - 1; ++i)
          res[i] += (1 - alpha) * lambda * _beta[i];
      return res;
    }
  }
}