DRealHistogram.java example

Explorer
h2o-2-master
package hex.gbm;

import water.MemoryManager;
import water.TypeMap;
import water.util.Utils;

import java.util.Arrays;
import java.util.Comparator;

/**
   A Histogram, computed in parallel over a Vec.
   <p>
   Sums and sums-of-squares of doubles
   <p>
   @author Cliff Click
*/
public class DRealHistogram extends DHistogram<DRealHistogram> {
  private double _sums[], _ssqs[]; // Sums & square-sums, shared, atomically incremented

  private static int FROZENTYPE = TypeMap.onIce("hex.gbm.DRealHistogram");
  @Override public int frozenType() { assert(FROZENTYPE != 0); return FROZENTYPE; }

  public DRealHistogram( String name, final int nbins, byte isInt, float min, float maxEx, long nelems, int min_rows, boolean doGrpSplit ) {
    super(name,nbins,isInt,min,maxEx,nelems,min_rows,doGrpSplit);
  }
  @Override boolean isBinom() { return false; }

  @Override public double mean(int b) {
    long n = _bins[b];
    return n>0 ? _sums[b]/n : 0;
  }
  @Override public double var (int b) {
    long n = _bins[b];
    if( n<=1 ) return 0;
    return (_ssqs[b] - _sums[b]*_sums[b]/n)/(n-1);
  }

  // Big allocation of arrays
  @Override void init0() {
    _sums = MemoryManager.malloc8d(_nbin);
    _ssqs = MemoryManager.malloc8d(_nbin);
  }

  // Add one row to a bin found via simple linear interpolation.
  // Compute response mean & variance.
  // Done racily instead F/J map calls, so atomic
  @Override void incr0( int b, double y ) {
    Utils.AtomicDoubleArray.add(_sums,b,y);
    Utils.AtomicDoubleArray.add(_ssqs,b,y*y);
  }
  // Same, except square done by caller
  void incr1( int b, double y, double yy ) {
    Utils.AtomicDoubleArray.add(_sums,b,y);
    Utils.AtomicDoubleArray.add(_ssqs,b,yy);
  }

  // Merge two equal histograms together.
  // Done in a F/J reduce, so no synchronization needed.
  @Override void add0( DRealHistogram dsh ) {
    Utils.add(_sums,dsh._sums);
    Utils.add(_ssqs,dsh._ssqs);
  }

  // Compute a "score" for a column; lower score "wins" (is a better split).
  // Score is the sum of the MSEs when the data is split at a single point.
  // mses[1] == MSE for splitting between bins  0  and 1.
  // mses[n] == MSE for splitting between bins n-1 and n.
  @Override public DTree.Split scoreMSE( int col ) {
    final int nbins = nbins();
    assert nbins > 1;

    // Store indices from sort to determine group split later
    Integer idx[] = new Integer[nbins];
    for(int b = 0; b < nbins; b++) idx[b] = b;

    // Sort predictor levels in ascending order of mean response within each bin
    if(_isInt == 2 && _step == 1.0f && nbins >= 4 && _doGrpSplit) {
      final double[] means = new double[nbins];
      System.arraycopy(_sums, 0, means, 0, nbins);
      Arrays.sort(idx, new Comparator<Integer>() {
        @Override
        public int compare(Integer o1, Integer o2) {
          return ((Double)means[o1]).compareTo(means[o2]);
        }
      });
    }

    // Compute mean/var for cumulative bins from 0 to nbins inclusive.
    double sums0[] = MemoryManager.malloc8d(nbins+1);
    double ssqs0[] = MemoryManager.malloc8d(nbins+1);
    long     ns0[] = MemoryManager.malloc8 (nbins+1);
    for( int b=1; b<=nbins; b++ ) {
      double m0 = sums0[b-1],  m1 = _sums[b-1];
      double s0 = ssqs0[b-1],  s1 = _ssqs[b-1];
      long   k0 = ns0  [b-1],  k1 = _bins[b-1];
      if( k0==0 && k1==0 ) continue;
      sums0[b] = m0+m1;
      ssqs0[b] = s0+s1;
      ns0  [b] = k0+k1;
    }
    long tot = ns0[nbins];
    // If we see zero variance, we must have a constant response in this
    // column.  Normally this situation is cut out before we even try to split, but we might
    // have NA's in THIS column...
    if( ssqs0[nbins]*tot - sums0[nbins]*sums0[nbins] == 0 ) {
      assert isConstantResponse(); return null;
    }

    // Compute mean/var for cumulative bins from nbins to 0 inclusive.
    double sums1[] = MemoryManager.malloc8d(nbins+1);
    double ssqs1[] = MemoryManager.malloc8d(nbins+1);
    long     ns1[] = MemoryManager.malloc8 (nbins+1);
    for( int b=nbins-1; b>=0; b-- ) {
      double m0 = sums1[b+1],  m1 = _sums[b];
      double s0 = ssqs1[b+1],  s1 = _ssqs[b];
      long   k0 = ns1  [b+1],  k1 = _bins[b];
      if( k0==0 && k1==0 ) continue;
      sums1[b] = m0+m1;
      ssqs1[b] = s0+s1;
      ns1  [b] = k0+k1;
      assert ns0[b]+ns1[b]==tot;
    }

    // Now roll the split-point across the bins.  There are 2 ways to do this:
    // split left/right based on being less than some value, or being equal/
    // not-equal to some value.  Equal/not-equal makes sense for catagoricals
    // but both splits could work for any integral datatype.  Do the less-than
    // splits first.
    int best=0;                         // The no-split
    double best_se0=Double.MAX_VALUE;   // Best squared error
    double best_se1=Double.MAX_VALUE;   // Best squared error
    byte equal=0;                // Ranged check
    for( int b=1; b<=nbins-1; b++ ) {
      if( _bins[b] == 0 ) continue; // Ignore empty splits
      if( ns0[b] < _min_rows ) continue;
      if( ns1[b] < _min_rows ) break; // ns1 shrinks at the higher bin#s, so if it fails once it fails always
      // We're making an unbiased estimator, so that MSE==Var.
      // Then Squared Error = MSE*N = Var*N
      //                    = (ssqs/N - mean^2)*N
      //                    = ssqs - N*mean^2
      //                    = ssqs - N*(sum/N)(sum/N)
      //                    = ssqs - sum^2/N
      double se0 = ssqs0[b] - sums0[b]*sums0[b]/ns0[b];
      double se1 = ssqs1[b] - sums1[b]*sums1[b]/ns1[b];
      if( (se0+se1 < best_se0+best_se1) || // Strictly less error?
          // Or tied MSE, then pick split towards middle bins
          (se0+se1 == best_se0+best_se1 &&
           Math.abs(b -(nbins>>1)) < Math.abs(best-(nbins>>1))) ) {
        best_se0 = se0;   best_se1 = se1;
        best = b;
      }
    }

    // If the min==max, we can also try an equality-based split
    if( _isInt > 0 && _step == 1.0f &&    // For any integral (not float) column
        _maxEx-_min > 2 ) { // Also need more than 2 (boolean) choices to actually try a new split pattern
      for( int b=1; b<=nbins-1; b++ ) {
        if( _bins[b] < _min_rows ) continue; // Ignore small bin
        long N =        ns0[b] + ns1[b+1];
        double sums = sums0[b]+sums1[b+1];
        double ssqs = ssqs0[b]+ssqs1[b+1];
        if( N < _min_rows ) continue;
        double si =  ssqs    -  sums   * sums   /   N    ; // Left+right, excluding 'b'
        double sx = _ssqs[b] - _sums[b]*_sums[b]/_bins[b]; // Just 'b'
        if( si+sx < best_se0+best_se1 ) { // Strictly less error?
          best_se0 = si;   best_se1 = sx;
          best = b;        equal = 1; // Equality check
        }
      }
    }

    if( best==0 ) return null;  // No place to split
    assert best > 0 : "Must actually pick a split "+best;
    long   n0 = equal == 0 ?   ns0[best] :   ns0[best]+  ns1[best+1];
    long   n1 = equal == 0 ?   ns1[best] : _bins[best]              ;
    double p0 = equal == 0 ? sums0[best] : sums0[best]+sums1[best+1];
    double p1 = equal == 0 ? sums1[best] : _sums[best]              ;

    // For categorical predictors, set bits for levels grouped to right of split
    Utils.IcedBitSet bs = null;
    if(_isInt == 2 && _step == 1.0f && nbins >= 4 && _doGrpSplit) {
      // Small cats: always use 4B to store and prepend offset # of zeros at front
      // Big cats: save offset and store only nbins # of bits that are left after trimming
      int offset = (int)_min;
      if(_maxEx <= 32) {
        equal = 2;
        bs = new Utils.IcedBitSet(32);
        for(int i = best; i < nbins; i++)
          bs.set(idx[i] + offset);
      } else {
        equal = 3;
        bs = new Utils.IcedBitSet(nbins, offset);
        for(int i = best; i < nbins; i++)
          bs.set(idx[i]);
      }
    }

    return new DTree.Split(col,best,bs,equal,best_se0,best_se1,n0,n1,p0/n0,p1/n1);
  }

  @Override public long byteSize0() {
    return 8*2 +                // 2 more internal arrays
      24+_sums.length<<3 +
      24+_ssqs.length<<3 ;
  }
}