package hex.gbm; import water.MemoryManager; import water.TypeMap; import water.util.Utils; import java.util.Arrays; import java.util.Comparator; /** A Histogram, computed in parallel over a Vec. <p> Sums and sums-of-squares of doubles <p> @author Cliff Click */ public class DRealHistogram extends DHistogram<DRealHistogram> { private double _sums[], _ssqs[]; // Sums & square-sums, shared, atomically incremented private static int FROZENTYPE = TypeMap.onIce("hex.gbm.DRealHistogram"); @Override public int frozenType() { assert(FROZENTYPE != 0); return FROZENTYPE; } public DRealHistogram( String name, final int nbins, byte isInt, float min, float maxEx, long nelems, int min_rows, boolean doGrpSplit ) { super(name,nbins,isInt,min,maxEx,nelems,min_rows,doGrpSplit); } @Override boolean isBinom() { return false; } @Override public double mean(int b) { long n = _bins[b]; return n>0 ? _sums[b]/n : 0; } @Override public double var (int b) { long n = _bins[b]; if( n<=1 ) return 0; return (_ssqs[b] - _sums[b]*_sums[b]/n)/(n-1); } // Big allocation of arrays @Override void init0() { _sums = MemoryManager.malloc8d(_nbin); _ssqs = MemoryManager.malloc8d(_nbin); } // Add one row to a bin found via simple linear interpolation. // Compute response mean & variance. // Done racily instead F/J map calls, so atomic @Override void incr0( int b, double y ) { Utils.AtomicDoubleArray.add(_sums,b,y); Utils.AtomicDoubleArray.add(_ssqs,b,y*y); } // Same, except square done by caller void incr1( int b, double y, double yy ) { Utils.AtomicDoubleArray.add(_sums,b,y); Utils.AtomicDoubleArray.add(_ssqs,b,yy); } // Merge two equal histograms together. // Done in a F/J reduce, so no synchronization needed. @Override void add0( DRealHistogram dsh ) { Utils.add(_sums,dsh._sums); Utils.add(_ssqs,dsh._ssqs); } // Compute a "score" for a column; lower score "wins" (is a better split). // Score is the sum of the MSEs when the data is split at a single point. // mses[1] == MSE for splitting between bins 0 and 1. // mses[n] == MSE for splitting between bins n-1 and n. @Override public DTree.Split scoreMSE( int col ) { final int nbins = nbins(); assert nbins > 1; // Store indices from sort to determine group split later Integer idx[] = new Integer[nbins]; for(int b = 0; b < nbins; b++) idx[b] = b; // Sort predictor levels in ascending order of mean response within each bin if(_isInt == 2 && _step == 1.0f && nbins >= 4 && _doGrpSplit) { final double[] means = new double[nbins]; System.arraycopy(_sums, 0, means, 0, nbins); Arrays.sort(idx, new Comparator<Integer>() { @Override public int compare(Integer o1, Integer o2) { return ((Double)means[o1]).compareTo(means[o2]); } }); } // Compute mean/var for cumulative bins from 0 to nbins inclusive. double sums0[] = MemoryManager.malloc8d(nbins+1); double ssqs0[] = MemoryManager.malloc8d(nbins+1); long ns0[] = MemoryManager.malloc8 (nbins+1); for( int b=1; b<=nbins; b++ ) { double m0 = sums0[b-1], m1 = _sums[b-1]; double s0 = ssqs0[b-1], s1 = _ssqs[b-1]; long k0 = ns0 [b-1], k1 = _bins[b-1]; if( k0==0 && k1==0 ) continue; sums0[b] = m0+m1; ssqs0[b] = s0+s1; ns0 [b] = k0+k1; } long tot = ns0[nbins]; // If we see zero variance, we must have a constant response in this // column. Normally this situation is cut out before we even try to split, but we might // have NA's in THIS column... if( ssqs0[nbins]*tot - sums0[nbins]*sums0[nbins] == 0 ) { assert isConstantResponse(); return null; } // Compute mean/var for cumulative bins from nbins to 0 inclusive. double sums1[] = MemoryManager.malloc8d(nbins+1); double ssqs1[] = MemoryManager.malloc8d(nbins+1); long ns1[] = MemoryManager.malloc8 (nbins+1); for( int b=nbins-1; b>=0; b-- ) { double m0 = sums1[b+1], m1 = _sums[b]; double s0 = ssqs1[b+1], s1 = _ssqs[b]; long k0 = ns1 [b+1], k1 = _bins[b]; if( k0==0 && k1==0 ) continue; sums1[b] = m0+m1; ssqs1[b] = s0+s1; ns1 [b] = k0+k1; assert ns0[b]+ns1[b]==tot; } // Now roll the split-point across the bins. There are 2 ways to do this: // split left/right based on being less than some value, or being equal/ // not-equal to some value. Equal/not-equal makes sense for catagoricals // but both splits could work for any integral datatype. Do the less-than // splits first. int best=0; // The no-split double best_se0=Double.MAX_VALUE; // Best squared error double best_se1=Double.MAX_VALUE; // Best squared error byte equal=0; // Ranged check for( int b=1; b<=nbins-1; b++ ) { if( _bins[b] == 0 ) continue; // Ignore empty splits if( ns0[b] < _min_rows ) continue; if( ns1[b] < _min_rows ) break; // ns1 shrinks at the higher bin#s, so if it fails once it fails always // We're making an unbiased estimator, so that MSE==Var. // Then Squared Error = MSE*N = Var*N // = (ssqs/N - mean^2)*N // = ssqs - N*mean^2 // = ssqs - N*(sum/N)(sum/N) // = ssqs - sum^2/N double se0 = ssqs0[b] - sums0[b]*sums0[b]/ns0[b]; double se1 = ssqs1[b] - sums1[b]*sums1[b]/ns1[b]; if( (se0+se1 < best_se0+best_se1) || // Strictly less error? // Or tied MSE, then pick split towards middle bins (se0+se1 == best_se0+best_se1 && Math.abs(b -(nbins>>1)) < Math.abs(best-(nbins>>1))) ) { best_se0 = se0; best_se1 = se1; best = b; } } // If the min==max, we can also try an equality-based split if( _isInt > 0 && _step == 1.0f && // For any integral (not float) column _maxEx-_min > 2 ) { // Also need more than 2 (boolean) choices to actually try a new split pattern for( int b=1; b<=nbins-1; b++ ) { if( _bins[b] < _min_rows ) continue; // Ignore small bin long N = ns0[b] + ns1[b+1]; double sums = sums0[b]+sums1[b+1]; double ssqs = ssqs0[b]+ssqs1[b+1]; if( N < _min_rows ) continue; double si = ssqs - sums * sums / N ; // Left+right, excluding 'b' double sx = _ssqs[b] - _sums[b]*_sums[b]/_bins[b]; // Just 'b' if( si+sx < best_se0+best_se1 ) { // Strictly less error? best_se0 = si; best_se1 = sx; best = b; equal = 1; // Equality check } } } if( best==0 ) return null; // No place to split assert best > 0 : "Must actually pick a split "+best; long n0 = equal == 0 ? ns0[best] : ns0[best]+ ns1[best+1]; long n1 = equal == 0 ? ns1[best] : _bins[best] ; double p0 = equal == 0 ? sums0[best] : sums0[best]+sums1[best+1]; double p1 = equal == 0 ? sums1[best] : _sums[best] ; // For categorical predictors, set bits for levels grouped to right of split Utils.IcedBitSet bs = null; if(_isInt == 2 && _step == 1.0f && nbins >= 4 && _doGrpSplit) { // Small cats: always use 4B to store and prepend offset # of zeros at front // Big cats: save offset and store only nbins # of bits that are left after trimming int offset = (int)_min; if(_maxEx <= 32) { equal = 2; bs = new Utils.IcedBitSet(32); for(int i = best; i < nbins; i++) bs.set(idx[i] + offset); } else { equal = 3; bs = new Utils.IcedBitSet(nbins, offset); for(int i = best; i < nbins; i++) bs.set(idx[i]); } } return new DTree.Split(col,best,bs,equal,best_se0,best_se1,n0,n1,p0/n0,p1/n1); } @Override public long byteSize0() { return 8*2 + // 2 more internal arrays 24+_sums.length<<3 + 24+_ssqs.length<<3 ; } }