DHistogram.java example

Explorer
h2o-2-master
package hex.gbm;

import sun.misc.Unsafe;
import water.*;
import water.fvec.Frame;
import water.fvec.Vec;
import water.nbhm.UtilUnsafe;
import water.util.Utils;

/**
   A Histogram, computed in parallel over a Vec.
   <p>
   A {@code DHistogram} bins every value added to it, and computes the vec
   min and max (for use in the next split), and response mean and variance for each
   bin.  {@code DHistogram}s are initialized with a min, max and number-of-
   elements to be added (all of which are generally available from a Vec).
   Bins run from min to max in uniform sizes.  If the {@code DHistogram} can
   determine that fewer bins are needed (e.g. boolean columns run from 0 to 1,
   but only ever take on 2 values, so only 2 bins are needed), then fewer bins
   are used.
   <p>
   {@code DHistogram}s are shared per-node, and atomically updated.  There's an
   {@code add} call to help cross-node reductions.  The data is stored in
   primitive arrays, so it can be sent over the wire.
   <p>
   If we are successively splitting rows (e.g. in a decision tree), then a
   fresh {@code DHistogram} for each split will dynamically re-bin the data.
   Each successive split will logarithmically divide the data.  At the first
   split, outliers will end up in their own bins - but perhaps some central
   bins may be very full.  At the next split(s), the full bins will get split,
   and again until (with a log number of splits) each bin holds roughly the
   same amount of data.  This dynamic binning resolves a lot of problems with
   picking the proper bin count or limits - generally a few more tree levels
   will equal any fancy but fixed-size binning strategy.
   <p>
   @author Cliff Click
*/
public abstract class DHistogram<TDH extends DHistogram> extends Iced {
  public final transient String _name; // Column name (for debugging)
  public final byte  _isInt;    // 0: float col, 1: int col, 2: enum & int col
  public final char  _nbin;     // Bin count
  public final float _step;     // Linear interpolation step per bin
  public final float _min, _maxEx; // Conservative Min/Max over whole collection.  _maxEx is Exclusive.
  public       int   _bins[];   // Bins, shared, atomically incremented
  public final int   _min_rows;   // Fewest allowed rows in any split
  public final boolean _doGrpSplit;

  // Atomically updated float min/max
  protected    float  _min2, _maxIn; // Min/Max, shared, atomically updated.  _maxIn is Inclusive.
  private static final Unsafe _unsafe = UtilUnsafe.getUnsafe();
  // Raw field offsets of _min2 and _maxIn, used by the CAS loops in setMin/setMax.
  static private final long _min2Offset;
  static private final long _maxInOffset;
  static {
    try {
      _min2Offset  = _unsafe.objectFieldOffset(DHistogram.class.getDeclaredField("_min2"));
      _maxInOffset = _unsafe.objectFieldOffset(DHistogram.class.getDeclaredField("_maxIn"));
    } catch( Exception e ) {
      throw H2O.fail();
    }
  }

  /** Lower the shared observed minimum {@code _min2} to {@code min} if
   *  {@code min} is smaller.  Implemented as a compare-and-swap retry loop on
   *  the raw int bits of the float; the loop exits as soon as the current
   *  value is already {@code <= min} or the swap succeeds.
   *  @param min candidate new minimum */
  public void setMin( float min ) {
    int imin = Float.floatToRawIntBits(min);
    float old = _min2;
    while( min < old && !_unsafe.compareAndSwapInt(this, _min2Offset, Float.floatToRawIntBits(old), imin ) )
      old = _min2;              // Lost the race; reload and re-test
  }
  /** Raise the shared observed Inclusive maximum {@code _maxIn} to
   *  {@code max} if {@code max} is larger.  Same compare-and-swap retry loop
   *  as {@link #setMin}.
   *  @param max candidate new inclusive maximum */
  public void setMax( float max ) {
    int imax = Float.floatToRawIntBits(max);
    float old = _maxIn;
    while( max > old && !_unsafe.compareAndSwapInt(this, _maxInOffset, Float.floatToRawIntBits(old), imax ) )
      old = _maxIn;             // Lost the race; reload and re-test
  }

  private static final int MAX_FACTOR_BINS=1024; // Allow more bins for factors

  /** Set up the bin layout; the {@code _bins} array itself is NOT allocated
   *  here (see {@link #init}).
   *  @param name       column name (for debugging)
   *  @param nbins      requested bin count, at least 1
   *  @param isInt      0: float col, 1: int col, 2: enum &amp; int col
   *  @param min        inclusive lower bound of the column
   *  @param maxEx      exclusive upper bound of the column; must be &gt; min
   *  @param nelems     number of elements to be added; must be &gt; 0
   *  @param min_rows   fewest allowed rows in any split
   *  @param doGrpSplit stored in {@code _doGrpSplit}; not otherwise used in this class */
  public DHistogram( String name, final int nbins, final byte isInt, final float min, final float maxEx, long nelems, int min_rows, boolean doGrpSplit ) {
    assert nelems > 0;
    assert nbins >= 1;
    assert maxEx > min : "Caller ensures "+maxEx+">"+min+", since if max==min== the column "+name+" is all constants";
    _isInt = isInt;
    _name = name;
    _min_rows = min_rows;
    _doGrpSplit = doGrpSplit;
    _min=min;
    _maxEx=maxEx;               // Set Exclusive max
    _min2 =  Float.MAX_VALUE;   // Set min/max to outer bounds
    _maxIn= -Float.MAX_VALUE;
    // See if we can show there are fewer unique elements than nbins.
    // Common for e.g. boolean columns, or near leaves.
    int xbins = nbins;
    if( isInt>0 && maxEx-min <= Math.max(nbins,(isInt==2?MAX_FACTOR_BINS:nbins)) ) {
      assert ((long)min)==min;                // No overflow
      xbins = (char)((long)maxEx-(long)min);  // Shrink bins
      assert xbins > 1;                       // Caller ensures enough range to bother
      _step = 1.0f;                            // Fixed stepsize
    } else {
      _step = nbins/(maxEx-min);
      assert _step > 0 && !Float.isInfinite(_step);
    }
    _nbin = (char)xbins;
    // Do not allocate the big arrays here; wait for scoreCols to pick which cols will be used.
  }
  abstract boolean isBinom();

  /** Interpolate {@code col_data} to find its bin number.  NaN and negative
   *  infinity map to bin 0, positive infinity maps to the last bin; every
   *  other value is mapped by {@code (col_data-_min)*_step}, truncated.
   *  Requires {@code _bins} to be allocated.
   *  @param col_data column value; finite values are asserted to lie in [_min,_maxEx)
   *  @return bin index in {@code [0,_bins.length)} */
  int bin( float col_data ) {
    if( Float.isNaN(col_data) ) return 0; // Always NAs to bin 0
    if( Float.isInfinite(col_data) ) {    // Put infinity to most left/right bin
      return col_data < 0 ? 0 : _bins.length-1;
    }
    // When the model is exposed to new test data, we could have data that is
    // out of range of any bin - however this binning call only happens during
    // model-building.
    assert _min <= col_data && col_data < _maxEx : "Coldata "+col_data+" out of range "+this;
    int idx1  = (int)((col_data-_min)*_step);
    assert 0 <= idx1 && idx1 <= _bins.length;
    if( idx1 == _bins.length) idx1--; // Roundoff error allows idx1 to hit upper bound, so truncate
    return idx1;
  }
  /** Inverse of the interpolation in {@link #bin}: {@code _min + b/_step}. */
  float binAt( int b ) { return _min+b/_step; }

  public int nbins() { return _nbin; }
  public int bins(int b) { return _bins[b]; }
  // Note: the bin argument is ignored by the next two accessors; both return
  // the single histogram-wide observed value.
  public float mins(int b) { return _min2; }
  public float maxsIn(int b) { return _maxIn; } // Always an Inclusive max
  abstract public double mean(int b);
  abstract public double var (int b);

  // Big allocation of arrays
  abstract void init0();
  /** Allocate the {@code _bins} array ({@code _nbin} ints) and then let the
   *  subclass allocate its own arrays via {@link #init0}.  Asserts that the
   *  bins have not been allocated yet. */
  final void init() {
    assert _bins == null;
    _bins = MemoryManager.malloc4(_nbin);
    init0();
  }

  // Add one row to a bin found via simple linear interpolation.
  // Compute bin min/max.
  // Compute response mean & variance.
  abstract void incr0( int b, double y );
  /** Add one row: bump the count of the bin holding {@code col_data}, fold
   *  non-infinite values into the observed min/max, and hand a non-zero
   *  response to the subclass via {@link #incr0}.
   *  @param col_data column value for this row
   *  @param y        response for this row; {@code incr0} is skipped when it is 0 */
  final void incr( float col_data, double y ) {
    assert Float.isNaN(col_data) || Float.isInfinite(col_data) || (_min <= col_data && col_data < _maxEx) : "col_data "+col_data+" out of range "+this;
    int b = bin(col_data);      // Compute bin# via linear interpolation
    Utils.AtomicIntArray.incr(_bins,b); // Bump count in bin
    // Track actual lower/upper bound; infinities are excluded.  NaN falls
    // through but fails both the '<' and '>' tests in setMin/setMax.
    if (!Float.isInfinite(col_data)) {
      setMin(col_data);
      setMax(col_data);
    }
    if( y != 0 ) incr0(b,y);
  }

  // Merge two equal histograms together.  Done in a F/J reduce, so no
  // synchronization needed.
  abstract void add0( TDH dsh );
  /** Merge {@code dsh} into this histogram: sum the bin counts, widen the
   *  observed min/max, then merge subclass state via {@link #add0}.  Both
   *  histograms are asserted to share the same layout, and to be either both
   *  allocated or both unallocated (in which case this is a no-op).
   *  @param dsh histogram to fold into this one */
  void add( TDH dsh ) {
    assert _isInt == dsh._isInt && _nbin == dsh._nbin && _step == dsh._step &&
      _min == dsh._min && _maxEx == dsh._maxEx;
    assert (_bins == null && dsh._bins == null) || (_bins != null && dsh._bins != null);
    if( _bins == null ) return;
    Utils.add(_bins,dsh._bins);
    if( _min2  > dsh._min2  ) _min2  = dsh._min2 ;
    if( _maxIn < dsh._maxIn ) _maxIn = dsh._maxIn;
    add0(dsh);
  }

  // Inclusive min & max
  public float find_min  () { return _min2 ; }
  public float find_maxIn() { return _maxIn; }
  // Exclusive max
  public float find_maxEx() { return find_maxEx(_maxIn,_isInt); }
  /** Turn an inclusive max into an exclusive one by adding one ulp of
   *  {@code maxIn}, or 1 for integer columns when the ulp is below 1.  If the
   *  sum overflows to infinity, {@code maxIn} itself is returned.
   *  @param maxIn inclusive maximum
   *  @param isInt greater than 0 for integer (or enum) columns
   *  @return exclusive maximum, or {@code maxIn} on overflow */
  static public float find_maxEx(float maxIn, int isInt ) {
    float ulp = Math.ulp(maxIn);
    if( isInt > 0 && 1 > ulp ) ulp = 1;
    float res = maxIn+ulp;
    return Float.isInfinite(res) ? maxIn : res;
  }

  // Compute a "score" for a column; lower score "wins" (is a better split).
  // Score is the sum of the MSEs when the data is split at a single point.
  // mses[1] == MSE for splitting between bins  0  and 1.
  // mses[n] == MSE for splitting between bins n-1 and n.
  abstract public DTree.Split scoreMSE( int col );

  /** Fill {@code hs[0..ncols)} with initial histograms set up from the Vec
   *  rollups (min, max, length, NA count).  A column that is entirely NA or
   *  whose min equals its max gets a {@code null} entry.
   *  @return the same {@code hs} array that was passed in */
  static public DHistogram[] initialHist(Frame fr, int ncols, int nbins, DHistogram hs[], int min_rows, boolean doGrpSplit, boolean isBinom) {
    Vec vecs[] = fr.vecs();
    for( int c=0; c<ncols; c++ ) {
      Vec v = vecs[c];
      final float minIn = (float)Math.max(v.min(),-Float.MAX_VALUE); // inclusive vector min
      final float maxIn = (float)Math.min(v.max(), Float.MAX_VALUE); // inclusive vector max
      final float maxEx = find_maxEx(maxIn,v.isInt()?1:0); // smallest exclusive max
      final long vlen = v.length();
      hs[c] = v.naCnt()==vlen || v.min()==v.max() ? null :
        make(fr._names[c],nbins,(byte)(v.isEnum() ? 2 : (v.isInt()?1:0)),minIn,maxEx,vlen,min_rows,doGrpSplit,isBinom);
    }
    return hs;
  }

  /** Factory: a {@code DBinomHistogram} when {@code isBinom} is set, else a
   *  {@code DRealHistogram}; all other arguments are passed to the constructor. */
  static public DHistogram make( String name, final int nbins, byte isInt, float min, float maxEx, long nelems, int min_rows, boolean doGrpSplit, boolean isBinom ) {
    return isBinom
      ? new DBinomHistogram(name,nbins,isInt,min,maxEx,nelems,min_rows,doGrpSplit)
      : new  DRealHistogram(name,nbins,isInt,min,maxEx,nelems,min_rows,doGrpSplit);
  }

  /** Check for a constant response variable: every non-empty bin must have a
   *  variance of at most 1e-14 and a mean within 1e-6 of the first non-empty
   *  bin's mean.  Requires {@code _bins} to be allocated. */
  public boolean isConstantResponse() {
    double m = Double.NaN;      // Mean of the first non-empty bin, once seen
    for( int b=0; b<_bins.length; b++ ) {
      if( _bins[b] == 0 ) continue;
      if( var(b) > 1e-14 ) return false;
      double mean = mean(b);
      if( mean != m ) {
        if( Double.isNaN(m) ) m=mean;
        else if(Math.abs(m - mean) > 1e-6) return false;
      }
    }
    return true;
  }

  // Pretty-print a histogram
  @Override public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(_name).append(":").append(_min).append("-").append(_maxEx).append(" step="+(1/_step)+" nbins="+nbins()+" isInt="+_isInt);
    if( _bins != null ) {
      for( int b=0; b<_bins.length; b++ ) {
        sb.append(String.format("\ncnt=%d, [%f - %f], mean/var=", _bins[b],mins(b),maxsIn(b)));
        sb.append(String.format("%6.2f/%6.2f,", mean(b), var(b)));
      }
      sb.append('\n');
    }
    return sb.toString();
  }

  abstract public long byteSize0();
  /** Estimated size of this histogram in bytes: a fixed charge for the
   *  object itself, plus (only when {@code _bins} is allocated) the bin
   *  array and whatever the subclass reports from {@link #byteSize0}. */
  public long byteSize() {
    long sum = 8+8;             // Self header
    sum += 1+2;                 // _isInt (byte); _nbin (char)
    sum += 4+4+4+4+4;           // step,min,max,min2,max2
    sum += 8*1;                 // 1 internal arrays
    if( _bins == null ) return sum;
    // + 24 (charged for the array header) + len<<2 (int array body).
    // Parenthesized on purpose: '+' binds tighter than '<<', so the
    // unparenthesized form would shift the header charge as well.
    sum += 24 + ((long)_bins.length<<2);
    sum += byteSize0();
    return sum;
  }
}