package hex; import water.*; import water.api.*; import water.api.Request.API; import water.fvec.*; import water.exec.Flow; import water.util.Utils; import water.util.Log; import java.util.Arrays; /** * Summary of a column. */ public class Summary2 extends Iced { static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code. // This Request supports the HTML 'GET' command, and this is the help text // for GET. static final String DOC_GET = "Returns a summary of a fluid-vec frame"; public static final int MAX_HIST_SZ = H2O.DATA_MAX_FACTOR_LEVELS; public static final int NMAX = 5; // updated boundaries to be 0.1% 1%...99%, 99.9% so R code didn't have to change // ideally we extend the array here, and just update the R extraction of 25/50/75 percentiles // note python tests (junit?) may look at result public static final double DEFAULT_PERCENTILES[] = {0.001,0.01,0.10,0.25,0.33,0.50,0.66,0.75,0.90,0.99,0.999}; private static final int T_REAL = 0; private static final int T_INT = 1; private static final int T_ENUM = 2; public BasicStat _stat0; /* Basic Vec stats collected by PrePass. */ public final int _type; // 0 - real; 1 - int; 2 - enum public double[] _mins; public double[] _maxs; long _gprows; // non-empty rows per group final transient String[] _domain; final transient double _start; final transient double _start2; final transient double _binsz; final transient double _binsz2; // 2nd finer grained histogram used for quantile estimates for numerics transient int _len1; /* Size of filled elements in a chunk. */ transient double[] _pctile; static abstract class Stats extends Iced { static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code. static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields @API(help="stats type" ) public String type; Stats(String type) { this.type = type; } } // An internal JSON-output-only class @SuppressWarnings("unused") static class EnumStats extends Stats { static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code. static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields public EnumStats( int card ) { super("Enum"); this.cardinality = card; } @API(help="cardinality" ) public final int cardinality; } static class NumStats extends Stats { static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code. static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields public NumStats( double mean, double sigma, long zeros, double[] mins, double[] maxs, double[] pctile) { super("Numeric"); this.mean = mean; this.sd = sigma; this.zeros = zeros; this.mins = mins; this.maxs = maxs; this.pctile = pctile; this.pct = DEFAULT_PERCENTILES; } @API(help="mean" ) public final double mean; @API(help="sd" ) public final double sd; @API(help="#zeros" ) public final long zeros; @API(help="min elements") public final double[] mins; // min N elements @API(help="max elements") public final double[] maxs; // max N elements @API(help="percentile thresholds" ) public final double[] pct; @API(help="percentiles" ) public final double[] pctile; } // OUTPUTS // Basic info @API(help="name" ) public String colname; @API(help="type" ) public String type; // Basic stats @API(help="NAs" ) public long nacnt; @API(help="Base Stats" ) public Stats stats; @API(help="histogram start") public double hstart; @API(help="histogram bin step") public double hstep; @API(help="histogram headers" ) public String[] hbrk; @API(help="histogram bin values") public long[] hcnt; public long[] hcnt2; // finer histogram. not visible public double[] hcnt2_min; // min actual for each bin public double[] hcnt2_max; // max actual for each bin public static class BasicStat extends Iced { public long _len; /* length of vec */ public long _nas; /* number of NA's */ public long _nans; /* number of NaN's */ public long _pinfs; /* number of positive infinity's */ public long _ninfs; /* number of positive infinity's */ public long _zeros; /* number of zeros */ public double _min1; /* if there's -Inf, then -Inf, o/w min2 */ public double _max1; /* if there's Inf, then Inf, o/w max2 */ public double _min2; /* min of the finite numbers. NaN if there's none. */ public double _max2; /* max of the finite numbers. NaN if there's none. */ public BasicStat( ) { _len = 0; _nas = 0; _nans = 0; _pinfs = 0; _ninfs = 0; _zeros = 0; _min1 = Double.NaN; _max1 = Double.NaN; _min2 = Double.NaN; _max2 = Double.NaN; } public BasicStat add(Chunk chk) { _len = chk._len; for(int i = 0; i < chk._len; i++) { double val; if (chk.isNA0(i)) { _nas++; continue; } if( chk._vec.isUUID() ) continue; if (Double.isNaN(val = chk.at0(i))) { _nans++; continue; } if (val == Double.POSITIVE_INFINITY) _pinfs++; else if (val == Double.NEGATIVE_INFINITY) _ninfs++; else { _min2 = Double.isNaN(_min2)? val : Math.min(_min2,val); _max2 = Double.isNaN(_max2)? val : Math.max(_max2,val); if (val == .0) _zeros++; } } return this; } public BasicStat add(BasicStat other) { _len += other._len; _nas += other._nas; _nans += other._nans; _pinfs += other._pinfs; _ninfs += other._ninfs; _zeros += other._zeros; if (Double.isNaN(_min2)) _min2 = other._min2; else if (!Double.isNaN(other._min2)) _min2 = Math.min(_min2, other._min2); if (Double.isNaN(_max2)) _max2 = other._max2; else if (!Double.isNaN(other._max2)) _max2 = Math.max(_max2, other._max2); return this; } public BasicStat finishUp() { _min1 = _ninfs>0? Double.NEGATIVE_INFINITY /* there's -Inf */ : !Double.isNaN(_min2)? _min2 /* min is finite */ : _pinfs>0? Double.POSITIVE_INFINITY /* Only Infs exist */ : Double.NaN; /* All NaN's or NAs */ _max1 = _pinfs>0? Double.POSITIVE_INFINITY /* there's Inf */ : !Double.isNaN(_max2)? _max2 /* max is finite */ : _ninfs>0? Double.NEGATIVE_INFINITY /* Only -Infs exist */ : Double.NaN; /* All NaN's or NAs */ return this; } } public static class PrePass extends MRTask2<PrePass> { public BasicStat _basicStats[]; @Override public void map(Chunk[] cs) { _basicStats = new BasicStat[cs.length]; for (int c=0; c < cs.length; c++) _basicStats[c] = new BasicStat().add(cs[c]); } @Override public void reduce(PrePass other){ for (int c = 0; c < _basicStats.length; c++) _basicStats[c].add(other._basicStats[c]); } public PrePass finishUp() { for (BasicStat stat : _basicStats) stat.finishUp(); return this; } } public static class SummaryTask2 extends MRTask2<SummaryTask2> { private BasicStat[] _basics; private int _max_qbins; public Summary2 _summaries[]; public SummaryTask2 (BasicStat[] basicStats, int max_qbins) { _basics = basicStats; _max_qbins = max_qbins; } @Override public void map(Chunk[] cs) { _summaries = new Summary2[cs.length]; for (int i = 0; i < cs.length; i++) _summaries[i] = new Summary2(_fr.vecs()[i], _fr.names()[i], _basics[i], _max_qbins).add(cs[i]); } @Override public void reduce(SummaryTask2 other) { for (int i = 0; i < _summaries.length; i++) _summaries[i].add(other._summaries[i]); } } // Entry point for the Flow passes, to allow easy percentiles on filtered GroupBy public static class SummaryPerRow extends Flow.PerRow<SummaryPerRow> { public final Frame _fr; public final Summary2 _summaries[]; public SummaryPerRow( Frame fr ) { this(fr,null); } private SummaryPerRow( Frame fr, Summary2[] sums ) { _fr = fr; _summaries = sums; } @Override public void mapreduce( double ds[] ) { for( int i=0; i<ds.length; i++ ) _summaries[i].add(ds[i]); } @Override public void reduce( SummaryPerRow that ) { for (int i = 0; i < _summaries.length; i++) _summaries[i].add(that._summaries[i]); } @Override public SummaryPerRow make() { Vec[] vecs = _fr.vecs(); Summary2 sums[] = new Summary2[vecs.length]; BasicStat basics[] = new PrePass().doAll(_fr).finishUp()._basicStats; for( int i=0; i<vecs.length; i++ ) sums[i] = new Summary2(vecs[i], _fr._names[i], basics[i]); return new SummaryPerRow(_fr,sums); } @Override public String toString() { String s = ""; for( int i=0; i<_summaries.length; i++ ) s += _fr._names[i]+" "+_summaries[i]+"\n"; return s; } public void finishUp() { Vec[] vecs = _fr.vecs(); for (int i = 0; i < vecs.length; i++) _summaries[i].finishUp(vecs[i]); } } @Override public String toString() { String s = ""; if( stats instanceof NumStats ) { double pct [] = ((NumStats)stats).pct ; double pctile[] = ((NumStats)stats).pctile; for( int i=0; i<pct.length; i++ ) s += ""+(pct[i]*100)+"%="+pctile[i]+", "; } else { s += "cardinality="+((EnumStats)stats).cardinality; } return s; } public void finishUp(Vec vec) { nacnt = _stat0._nas; if (_type == T_ENUM) { // Compute majority items for enum data computeMajorities(); } else { _pctile = new double[DEFAULT_PERCENTILES.length]; approxQuantiles(_pctile, DEFAULT_PERCENTILES, _stat0._min2, _stat0._max2); } // remove the trailing NaNs for (int i = 0; i < _mins.length; i++) { if (Double.isNaN(_mins[i])) { _mins = Arrays.copyOf(_mins, i); break; } } for (int i = 0; i < _maxs.length; i++) { if (Double.isNaN(_maxs[i])) { _maxs = Arrays.copyOf(_maxs, i); break; } } for (int i = 0; i < _maxs.length>>>1; i++) { double t = _maxs[i]; _maxs[i] = _maxs[_maxs.length-1-i]; _maxs[_maxs.length-1-i] = t; } this.stats = _type==T_ENUM ? new EnumStats(vec.domain().length) : new NumStats(vec.mean(), vec.sigma(), _stat0._zeros, _mins, _maxs, _pctile); if (_type == T_ENUM) { this.hstart = 0; this.hstep = 1; this.hbrk = _domain; } else { this.hstart = _start; this.hstep = _binsz; this.hbrk = new String[hcnt.length]; for (int i = 0; i < hbrk.length; i++) hbrk[i] = Utils.p2d(i==0 ? _start : binValue(i)); } } public Summary2(Vec vec, String name, BasicStat stat0, int max_qbins) { colname = name; _stat0 = stat0; _type = vec.isEnum()?T_ENUM:vec.isInt()?T_INT:T_REAL; _domain = vec.isEnum() ? vec.domain() : null; _gprows = 0; double sigma = Double.isNaN(vec.sigma()) ? 0 : vec.sigma(); if ( _type != T_ENUM ) { _mins = MemoryManager.malloc8d((int)Math.min(vec.length(),NMAX)); _maxs = MemoryManager.malloc8d((int)Math.min(vec.length(),NMAX)); Arrays.fill(_mins, Double.NaN); Arrays.fill(_maxs, Double.NaN); } else { _mins = MemoryManager.malloc8d(Math.min(_domain.length,NMAX)); _maxs = MemoryManager.malloc8d(Math.min(_domain.length,NMAX)); } if( vec.isEnum() && _domain.length < MAX_HIST_SZ ) { _start = 0; _start2 = 0; _binsz = 1; _binsz2 = 1; // hack for now. if there are no enum values, keep these length 1, for consistency // in asserts below int dlength = _domain.length==0 ? 1 : _domain.length; hcnt = new long[dlength]; hcnt2 = new long[dlength]; hcnt2_min = new double[dlength]; hcnt2_max = new double[dlength]; } else if ( !(Double.isNaN(stat0._min2) || Double.isNaN(stat0._max2)) ) { // guard against improper parse (date type) or zero c._sigma long N = _stat0._len - stat0._nas - stat0._nans - stat0._pinfs - stat0._ninfs; double b = Math.max(1e-4,3.5 * sigma/ Math.cbrt(N)); double d = Math.pow(10, Math.floor(Math.log10(b))); if (b > 20*d/3) d *= 10; else if (b > 5*d/3) d *= 5; // tweak for integers if (d < 1. && vec.isInt()) d = 1.; // Result from the dynamic bin sizing equations double startSuggest = d * Math.floor(stat0._min2 / d); double binszSuggest = d; int nbinSuggest = (int) Math.ceil((stat0._max2 - startSuggest)/d) + 1; // Protect against massive binning. browser doesn't need int BROWSER_BIN_TARGET = 100; // _binsz/_start is used in the histogramming. // nbin is used in the array declaration. must be big enough. // the resulting nbin, could be really large number. We need to cap it. // should also be obsessive and check that it's not 0 and force to 1. // Since nbin is implied by _binsz, ratio _binsz and recompute nbin int binCase = 0; // keep track in case we assert double start; if ( stat0._max2==stat0._min2) { binszSuggest = 0; // fixed next with other 0 cases. start = stat0._min2; binCase = 1; } // minimum 2 if min/max different else if ( stat0._max2!=stat0._min2 && nbinSuggest<2 ) { binszSuggest = (stat0._max2 - stat0._min2) / 2.0; start = stat0._min2; binCase = 2; } else if (nbinSuggest<1 || nbinSuggest>BROWSER_BIN_TARGET ) { // switch to a static equation with a fixed bin count, and recompute binszSuggest // one more bin than necessary for the range (99 exact. causes one extra binszSuggest = (stat0._max2 - stat0._min2) / (BROWSER_BIN_TARGET - 1.0); start = binszSuggest * Math.floor(stat0._min2 / binszSuggest); binCase = 3; } else { // align to binszSuggest boundary. (this is for reals) start = binszSuggest * Math.floor(stat0._min2 / binszSuggest); binCase = 4; } // _binsz = 0 means min/max are equal for reals?. Just make it a little number // this won't show up in browser display, since bins are labelled by start value // Now that we know the best bin size that will fit..Floor the _binsz if integer so visible // histogram looks good for integers. This is our final best bin size. double binsz = (binszSuggest!=0) ? binszSuggest : (vec.isInt() ? 1 : 1e-13d); _binsz = vec.isInt() ? Math.floor(binsz) : binsz; // make integers start on an integer too! _start = vec.isInt() ? Math.floor(start) : start; // This equation creates possibility of some of the first bins being empty // also: _binsz means many _binsz2 could be empty at the start if we resused _start there // FIX! is this okay if the dynamic range is > 2**32 // align to bin size? int nbin = (int) Math.ceil((stat0._max2 - _start)/_binsz) + 1; double impliedBinEnd = _start + (nbin * _binsz); String assertMsg = _start+" "+_stat0._min2+" "+_stat0._max2+ " "+impliedBinEnd+" "+_binsz+" "+nbin+" "+startSuggest+" "+nbinSuggest+" "+binCase; // Log.debug("Summary2 bin1. "+assertMsg); assert _start <= _stat0._min2 : assertMsg; // just in case, make sure it's big enough assert nbin > 0: assertMsg; // just for double checking we're okay (nothing outside the bin rang) assert impliedBinEnd>=_stat0._max2 : assertMsg; // create a 2nd finer grained historam for quantile estimates. // okay if it is approx. 1000 bins (+-1) // update: we allow api to change max_qbins. default 1000. larger = more accuracy assert max_qbins > 0 && max_qbins <= 10000000 : "max_qbins must be >0 and <= 10000000"; // okay if 1 more than max_qbins gets created double d2 = (stat0._max2 - stat0._min2) / max_qbins; // _binsz2 = 0 means min/max are equal for reals?. Just make it a little number // this won't show up in browser display, since bins are labelled by start value _binsz2 = (d2!=0) ? d2 : (vec.isInt() ? 1 : 1e-13d); _start2 = stat0._min2; int nbin2 = (int) Math.ceil((stat0._max2 - _start2)/_binsz2) + 1; double impliedBinEnd2 = _start2 + (nbin2 * _binsz2); assertMsg = _start2+" "+_stat0._min2+" "+_stat0._max2+ " "+impliedBinEnd2+" "+_binsz2+" "+nbin2; // Log.debug("Summary2 bin2. "+assertMsg); assert _start2 <= stat0._min2 : assertMsg; assert nbin2 > 0 : assertMsg; // can't make any assertion about _start2 vs _start (either can be smaller due to fp issues) assert impliedBinEnd2>=_stat0._max2 : assertMsg; hcnt = new long[nbin]; hcnt2 = new long[nbin2]; hcnt2_min = new double[nbin2]; hcnt2_max = new double[nbin2]; // Log.debug("Finer histogram has "+nbin2+" bins. Visible histogram has "+nbin); // Log.debug("Finer histogram starts at "+_start2+" Visible histogram starts at "+_start); // Log.debug("stat0._min2 "+stat0._min2+" stat0._max2 "+stat0._max2); } else { // vec does not contain finite numbers Log.debug("Summary2: NaN in stat0._min2: "+stat0._min2+" or stat0._max2: "+stat0._max2); // vec.min() wouldn't be any better here. It could be NaN? 4/13/14 // _start = vec.min(); // _start2 = vec.min(); // _binsz = Double.POSITIVE_INFINITY; // _binsz2 = Double.POSITIVE_INFINITY; _start = Double.NaN; _start2 = Double.NaN; _binsz = Double.NaN; _binsz2 = Double.NaN; hcnt = new long[1]; hcnt2 = new long[1]; hcnt2_min = new double[1]; hcnt2_max = new double[1]; } } public Summary2(Vec vec, String name, BasicStat stat0) { this(vec, name, stat0, 1000); } public Summary2 add(Chunk chk) { if( chk._vec.isUUID() ) { // Log.info("Summary2: isUUID() in add"); return this; } for (int i = 0; i < chk._len; i++) add(chk.at0(i)); return this; } public void add(double val) { if( Double.isNaN(val) ) return; // can get infinity due to bad enum parse to real // histogram is sized ok, but the index calc below will be too big // just drop them. not sure if something better to do? if( val==Double.POSITIVE_INFINITY ) return; if( val==Double.NEGATIVE_INFINITY ) return; _len1++; _gprows++; if ( _type != T_ENUM ) { int index; // update min/max if (val < _mins[_mins.length-1] || Double.isNaN(_mins[_mins.length-1])) { index = Arrays.binarySearch(_mins, val); if (index < 0) { index = -(index + 1); for (int j = _mins.length -1; j > index; j--) _mins[j] = _mins[j-1]; _mins[index] = val; } } boolean hasNan = Double.isNaN(_maxs[_maxs.length-1]); if (val > _maxs[0] || hasNan) { index = Arrays.binarySearch(_maxs, val); if (index < 0) { index = -(index + 1); if (hasNan) { for (int j = _maxs.length -1; j > index; j--) _maxs[j] = _maxs[j-1]; _maxs[index] = val; } else { for (int j = 0; j < index-1; j++) _maxs[j] = _maxs[j+1]; _maxs[index-1] = val; } } } // update the finer histogram (used for quantile estimates on numerics) long binIdx2; if (hcnt2.length==1) { binIdx2 = 0; // not used } else { binIdx2 = (int) Math.floor((val - _start2) / _binsz2); } int binIdx2Int = (int) binIdx2; assert (_start2 <= val) : "Why is val < _start2? val:"+val+" _start2:"; assert (binIdx2Int >= 0 && binIdx2Int < hcnt2.length) : "binIdx2Int too big for hcnt2 "+binIdx2Int+" "+hcnt2.length+" "+val+" "+_start2+" "+_binsz2; if (hcnt2[binIdx2Int] == 0) { // Log.debug("New init: "+val+" for index "+binIdx2Int); hcnt2_min[binIdx2Int] = val; hcnt2_max[binIdx2Int] = val; } else { if (val < hcnt2_min[binIdx2Int]) { // Log.debug("New min: "+val+" for index "+binIdx2Int); hcnt2_min[binIdx2Int] = val; } if (val > hcnt2_max[binIdx2Int]) { // if ( binIdx2Int == 500 ) Log.debug("New max: "+val+" for index "+binIdx2Int); hcnt2_max[binIdx2Int] = val; } } ++hcnt2[binIdx2Int]; } // update the histogram the browser/json uses long binIdx; if (hcnt.length == 1) { binIdx = 0; } // interesting. do we really track Infs in the histogram? else if (val == Double.NEGATIVE_INFINITY) { binIdx = 0; } else if (val == Double.POSITIVE_INFINITY) { binIdx = hcnt.length-1; } else { binIdx = (int) Math.floor((val - _start) / _binsz); } int binIdxInt = (int) binIdx; assert (_start <= val) : "Why is val < _start? val:"+val+" _start:"; assert (binIdxInt >= 0 && binIdx < hcnt.length) : "binIdxInt bad for hcnt2. binIdxInt:"+binIdxInt+" hcnt.length:"+hcnt.length+" val:"+val+" _start:"+_start+" _binsz:"+_binsz; ++hcnt[binIdxInt]; } public Summary2 add(Summary2 other) { // merge hcnt and hcnt just by adding if (hcnt != null) Utils.add(hcnt, other.hcnt); _gprows += other._gprows; if (_type == T_ENUM) return this; // merge hcnt2 per-bin mins // other must be same length, but use it's length for safety // could add assert on lengths? for (int k = 0; k < other.hcnt2_min.length; k++) { // for now..die on NaNs assert !Double.isNaN(other.hcnt2_min[k]) : "NaN in other.hcnt2_min merging"; assert !Double.isNaN(other.hcnt2[k]) : "NaN in hcnt2_min merging"; assert !Double.isNaN(hcnt2_min[k]) : "NaN in hcnt2_min merging"; assert !Double.isNaN(hcnt2[k]) : "NaN in hcnt2_min merging"; // cover the initial case (relying on initial min = 0 to work is wrong) // Only take the new max if it's hcnt2 is non-zero. like a valid bit // can hcnt2 ever be null here? if (other.hcnt2[k] > 0) { if ( hcnt2[k]==0 || ( other.hcnt2_min[k] < hcnt2_min[k] )) { hcnt2_min[k] = other.hcnt2_min[k]; } } } // merge hcnt2 per-bin maxs // other must be same length, but use it's length for safety for (int k = 0; k < other.hcnt2_max.length; k++) { // for now..die on NaNs assert !Double.isNaN(other.hcnt2_max[k]) : "NaN in other.hcnt2_max merging"; assert !Double.isNaN(other.hcnt2[k]) : "NaN in hcnt2_min merging"; assert !Double.isNaN(hcnt2_max[k]) : "NaN in hcnt2_max merging"; assert !Double.isNaN(hcnt2[k]) : "NaN in hcnt2_max merging"; // cover the initial case (relying on initial min = 0 to work is wrong) // Only take the new max if it's hcnt2 is non-zero. like a valid bit // can hcnt2 ever be null here? if (other.hcnt2[k] > 0) { if ( hcnt2[k]==0 || ( other.hcnt2_max[k] > hcnt2_max[k] )) { hcnt2_max[k] = other.hcnt2_max[k]; } } } // can hcnt2 ever be null here?. Inc last, so the zero case is detected above // seems like everything would fail if hcnt2 doesn't exist here if (hcnt2 != null) Utils.add(hcnt2, other.hcnt2); // merge hcnt mins double[] ds = MemoryManager.malloc8d(_mins.length); int i = 0, j = 0; for (int k = 0; k < ds.length; k++) if (_mins[i] < other._mins[j]) ds[k] = _mins[i++]; else if (Double.isNaN(other._mins[j])) ds[k] = _mins[i++]; else { // _min[i] >= other._min[j] if (_mins[i] == other._mins[j]) i++; ds[k] = other._mins[j++]; } System.arraycopy(ds,0,_mins,0,ds.length); for (i = _maxs.length - 1; Double.isNaN(_maxs[i]); i--) if (i == 0) {i--; break;} for (j = _maxs.length - 1; Double.isNaN(other._maxs[j]); j--) if (j == 0) {j--; break;} ds = MemoryManager.malloc8d(i + j + 2); // merge hcnt maxs, also deduplicating against mins? int k = 0, ii = 0, jj = 0; while (ii <= i && jj <= j) { if (_maxs[ii] < other._maxs[jj]) ds[k] = _maxs[ii++]; else if (_maxs[ii] > other._maxs[jj]) ds[k] = other._maxs[jj++]; else { // _maxs[ii] == other.maxs[jj] ds[k] = _maxs[ii++]; jj++; } k++; } while (ii <= i) ds[k++] = _maxs[ii++]; while (jj <= j) ds[k++] = other._maxs[jj++]; System.arraycopy(ds,Math.max(0, k - _maxs.length),_maxs,0,Math.min(k,_maxs.length)); for (int t = k; t < _maxs.length; t++) _maxs[t] = Double.NaN; return this; } // _start of each hcnt bin public double binValue(int b) { return _start + b*_binsz; } // can we assert against something here? // assert _gprows==htot2(0, 0) : "_gprows: "+_gprows+" htot2(): "+htot2(0, 0); // need to count >4B rows private long htot2(long low, long high) { long cnt = 0; for (int i = 0; i < hcnt2.length; i++) cnt+=hcnt2[i]; // add the stuff outside the bins, 0,0 for single pass cnt = cnt + low + high; return cnt; } //****************************************************************************** // NOTE: only works on a backfilled hcnt2, unlike Quantiles. eliminates nextK search // The backfill is not done here, so it's only done once (because 10 calls here) private double approxLikeInQuantiles(double threshold, double valStart, double valEnd) { // Code is lifted from Quantiles.java, with only a little jiggering // on the branches around forceBestApprox/interpolation type, and use of globals // that have different names. Need to merge sometime. // the 'intent' is to be the same as the single pass Quantiles approx, interpolation_type==-1 // max_qbins was the goal for sizing. // nbins2 was what was used for size, after various calcs // just assume hcnt2 is the right length! // Don't need at least two bins..since we'll always have 'some' answer // are we being called on constant 0? int maxBinCnt = hcnt2.length; // Find the row count we want to hit, within some bin. long currentCnt = 0; double targetCntFull = threshold * (_gprows-1); // zero based indexing long targetCntInt = (long) Math.floor(targetCntFull); double targetCntFract = targetCntFull - (double) targetCntInt; assert (targetCntFract>=0) && (targetCntFract<=1); // Log.debug("QS_ targetCntInt: "+targetCntInt+" targetCntFract: "+targetCntFract); // walk thru and find out what bin to look inside int k = 0; while(k!=maxBinCnt && ((currentCnt + hcnt2[k]) <= targetCntInt)) { // Log.debug("Q_ Looping for k: "+threshold+" "+k+" "+maxBinCnt+" "+currentCnt+" "+targetCntInt+ // " "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]); currentCnt += hcnt2[k]; ++k; // Note the loop condition covers the breakout condition: // (currentCnt==targetCntInt && (hcnt2[k]!=0) // also: don't go pass array bounds } assert hcnt2[k]!=0; // Log.debug("QS_ Found k (approx): "+threshold+" "+k+" "+currentCnt+" "+targetCntInt+ // " "+_gprows+" "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]); assert (currentCnt + hcnt2[k]) > targetCntInt : targetCntInt+" "+currentCnt+" "+k+" "+" "+maxBinCnt; assert hcnt2[k]!=1 || hcnt2_min[k]==hcnt2_max[k]; boolean done = false; double guess = Double.NaN; boolean interpolated = false; double dDiff; // special cases. If the desired row is the last of equal values in this bin (2 or more) // we will need to intepolate with a nextK out-of-bin value // we can't iterate, since it won't improve things and the bin-size will be zero! // trying to resolve case of binsize=0 for next pass, after this, is flawed thinking. // implies the values are not the same..end of bin interpolate to next boolean atStartOfBin = hcnt2[k]>=1 && (currentCnt == targetCntInt); boolean atEndOfBin = !atStartOfBin && (hcnt2[k]>=2 && ((currentCnt + hcnt2[k] - 1) == targetCntInt)); boolean inMidOfBin = !atStartOfBin && !atEndOfBin && (hcnt2[k]>=3) && (hcnt2_min[k]==hcnt2_max[k]); boolean interpolateEndNeeded = false; if ( atEndOfBin ) { if ( targetCntFract != 0 ) { interpolateEndNeeded = true; } else { guess = hcnt2_max[k]; done = true; // Log.debug("QS_ Guess M "+guess); } } else if ( inMidOfBin ) { // if we know there is something before and after us with same value, // we never need to interpolate (only allowed when min=max guess = hcnt2_min[k]; done = true; // Log.debug("QS_ Guess N "+guess); } if ( !done && atStartOfBin ) { // no interpolation needed if ( hcnt2[k]>2 && (hcnt2_min[k]==hcnt2_max[k]) ) { guess = hcnt2_min[k]; done = true; // Log.debug("QS_ Guess A "+guess); } // min/max can be equal or not equal here else if ( hcnt2[k]==2 ) { // interpolate between min/max for the two value bin // type 7 (linear interpolation) // Unlike mean, which just depends on two adjacent values, this adjustment // adds possible errors related to the arithmetic on the total # of rows. dDiff = hcnt2_max[k] - hcnt2_min[k]; // two adjacent..as if sorted! // targetCntFract is fraction of total rows guess = hcnt2_min[k] + (targetCntFract * dDiff); done = true; interpolated = true; // Log.debug("QS_ Guess B "+guess+" targetCntFract: "+targetCntFract); } // no interpolation needed else if ( (hcnt2[k]==1) && (targetCntFract==0) ) { assert hcnt2_min[k]==hcnt2_max[k]; guess = hcnt2_min[k]; done = true; // Log.debug("QS_ Guess C "+guess); } } // interpolate into a nextK value // all the qualification is so we don't set done when we're not, for multipass // interpolate from single bin, end of two entry bin, or for approx boolean stillCanGetIt = atStartOfBin && hcnt2[k]==1 && targetCntFract!=0; if ( !done ) { if ( hcnt2[k]==1 ) { assert hcnt2_min[k]==hcnt2_max[k]; // Log.debug("QS_ Single value in this bin, but fractional means we need to interpolate to next non-zero"); } if ( interpolateEndNeeded ) { // Log.debug("QS_ Interpolating off the end of a bin!"); } double nextVal; int nextK; // if we're at the end assert k < maxBinCnt : k+" "+maxBinCnt; if ( (k+1)==maxBinCnt) { // Log.debug("QS_ Using valEnd for approx interpolate: "+valEnd); nextVal = valEnd; // just in case the binning didn't max in a bin before the last } else { nextK = k + 1; nextVal = hcnt2_min[nextK]; // Log.debug("QS_ Using nextK for interpolate: "+nextK+" "+hcnt2_min[nextK]); // hcnt2[nextK] may be zero here if we backfilled } // can still get an exact interpolation, when hcnt2[k]=2 if ( stillCanGetIt ) { dDiff = nextVal - hcnt2_max[k]; // two adjacent, as if sorted! // targetCntFract is fraction of total rows guess = hcnt2_max[k] + (targetCntFract * dDiff); interpolated = true; done = true; // has to be one above us when needed. (or we're at end) // Log.debug("QS_ Guess D "+guess+" "+nextVal+" "+hcnt2_min[k]+" "+hcnt2_max[k]+" "+hcnt2[k]+" "+nextVal+ // " targetCntFull: "+targetCntFull+" targetCntFract: "+targetCntFract+ // " _gprows: " + _gprows+" "+stillCanGetIt); } else { // single pass approx..with unresolved bin assert hcnt2[k]!=0 : hcnt2[k]+" "+k; // use max within this bin, to stay within the guaranteed error bounds dDiff = (hcnt2_max[k] - hcnt2_min[k]) / hcnt2[k]; guess = hcnt2_min[k] + (targetCntFull-currentCnt) * dDiff; interpolated = true; done = true; // has to be one above us when needed. (or we're at end) // Log.debug("QS_ Guess E "+guess+" "+nextVal+" "+hcnt2_min[k]+" "+hcnt2_max[k]+" "+hcnt2[k]+" "+nextVal+ // " targetCntFull: "+targetCntFull+" targetCntFract: "+targetCntFract+ // " _gprows: " + _gprows); } } assert !Double.isNaN(guess); // covers positive/negative inf also (if we divide by 0) return guess; } //****************************************************************************** private void approxQuantiles(double[] qtiles, double[] thres, double valStart, double valEnd){ // not called for enums assert _type != T_ENUM; // hcnt2 may have been sized differently than the max_qbins goal int maxBinCnt = hcnt2.length; if ( maxBinCnt==0 ) return; // this would imply we didn't get anything correctly. Maybe real col with all NA? if ( (maxBinCnt==1) && (hcnt2[0]==0) ) return; // Perf hack that is currently different than Quantiles.java // back fill hcnt2_min where it's zero, so we can avoid the nextK search // when we need to interpolate. Keep hcnt2[k]=0 so we know not to use it // other than for getting nextK without searching. This is powerful // because if we're getting 10 quantiles from a histogram, we don't // do searches to the end (potentially) for ever nextK find. This // makes the Quantiles.java algo work well when reused for multiple quantiles // here in Summary2 // The use of nextK, rather than just our bin, improves accuracy for various cases. // (mirroring what Quantiles does for perfect answers) // start at the end. don't need to fill the 0 case ever, but should for consistency double backfill = valEnd; for (int b=(maxBinCnt-1); b>=0; --b) { if ( hcnt2[b] == 0 ) { hcnt2_min[b] = backfill; // Log.debug("QS_ backfilling "+b+" "+backfill); } else { backfill = hcnt2_min[b]; } } for(int j = 0; j < thres.length; ++j) { // 0 okay for threshold? assert 0 <= thres[j] && thres[j] <= 1; qtiles[j] = approxLikeInQuantiles(thres[j], valStart, valEnd); } } //****************************************************************************** // Compute majority categories for enums only public void computeMajorities() { if ( _type != T_ENUM ) return; for (int i = 0; i < _mins.length; i++) _mins[i] = i; for (int i = 0; i < _maxs.length; i++) _maxs[i] = i; int mini = 0, maxi = 0; for( int i = 0; i < hcnt.length; i++ ) { if (hcnt[i] < hcnt[(int)_mins[mini]]) { _mins[mini] = i; for (int j = 0; j < _mins.length; j++) if (hcnt[(int)_mins[j]] > hcnt[(int)_mins[mini]]) mini = j; } if (hcnt[i] > hcnt[(int)_maxs[maxi]]) { _maxs[maxi] = i; for (int j = 0; j < _maxs.length; j++) if (hcnt[(int)_maxs[j]] < hcnt[(int)_maxs[maxi]]) maxi = j; } } for (int i = 0; i < _mins.length - 1; i++) for (int j = 0; j < i; j++) { if (hcnt[(int)_mins[j]] > hcnt[(int)_mins[j+1]]) { double t = _mins[j]; _mins[j] = _mins[j+1]; _mins[j+1] = t; } } for (int i = 0; i < _maxs.length - 1; i++) for (int j = 0; j < i; j++) if (hcnt[(int)_maxs[j]] < hcnt[(int)_maxs[j+1]]) { double t = _maxs[j]; _maxs[j] = _maxs[j+1]; _maxs[j+1] = t; } } public double percentileValue(int idx) { if( _type == T_ENUM ) return Double.NaN; return _pctile[idx]; } public void toHTML( Vec vec, String cname, StringBuilder sb ) { // should be a better way/place to decode this back to string. String typeStr; if ( _type == T_REAL) typeStr = "Real"; else if ( _type == T_INT) typeStr = "Int"; else if ( _type == T_ENUM) typeStr = "Enum"; else typeStr = "Undefined"; sb.append("<div class='table' id='col_" + cname + "' style='width:90%;heigth:90%;border-top-style:solid;'>" + "<div class='alert-success'><h4>Column: " + cname + " (type: " + typeStr + ")</h4></div>\n"); if ( _stat0._len == _stat0._nas ) { sb.append("<div class='alert'>Empty column, no summary!</div></div>\n"); return; } // Base stats if( _type != T_ENUM ) { NumStats stats = (NumStats)this.stats; sb.append("<div style='width:100%;'><table class='table-bordered'>"); sb.append("<tr><th colspan='"+20+"' style='text-align:center;'>Base Stats</th></tr>"); sb.append("<tr>"); sb.append("<th>NAs</th> <td>" + nacnt + "</td>"); sb.append("<th>mean</th><td>" + Utils.p2d(stats.mean)+"</td>"); sb.append("<th>sd</th><td>" + Utils.p2d(stats.sd) + "</td>"); sb.append("<th>zeros</th><td>" + stats.zeros + "</td>"); sb.append("<tr>"); sb.append("<th>min[" + stats.mins.length + "]</th>"); for( double min : stats.mins ) { sb.append("<td>" + Utils.p2d(min) + "</td>"); } sb.append("<tr>"); sb.append("<th>max[" + stats.maxs.length + "]</th>"); for( double max : stats.maxs ) { sb.append("<td>" + Utils.p2d(max) + "</td>"); } // End of base stats sb.append("</tr> </table>"); sb.append("</div>"); } else { // Enums sb.append("<div style='width:100%'><table class='table-bordered'>"); sb.append("<tr><th colspan='" + 4 + "' style='text-align:center;'>Base Stats</th></tr>"); sb.append("<tr><th>NAs</th> <td>" + nacnt + "</td>"); sb.append("<th>cardinality</th> <td>" + vec.domain().length + "</td></tr>"); sb.append("</table></div>"); } // Histogram final int MAX_HISTO_BINS_DISPLAYED = 1000; int len = Math.min(hcnt.length,MAX_HISTO_BINS_DISPLAYED); sb.append("<div style='width:100%;overflow-x:auto;'><table class='table-bordered'>"); sb.append("<tr> <th colspan="+len+" style='text-align:center'>Histogram</th></tr>"); sb.append("<tr>"); if ( _type == T_ENUM ) for( int i=0; i<len; i++ ) sb.append("<th>" + vec.domain(i) + "</th>"); else for( int i=0; i<len; i++ ) sb.append("<th>" + Utils.p2d(i==0?_start:binValue(i)) + "</th>"); sb.append("</tr>"); sb.append("<tr>"); for( int i=0; i<len; i++ ) sb.append("<td>" + hcnt[i] + "</td>"); sb.append("</tr>"); sb.append("<tr>"); for( int i=0; i<len; i++ ) sb.append(String.format("<td>%.1f%%</td>",(100.0*hcnt[i]/_stat0._len))); sb.append("</tr>"); if( hcnt.length >= MAX_HISTO_BINS_DISPLAYED ) sb.append("<div class='alert'>Histogram for this column was too big and was truncated to 1000 values!</div>"); sb.append("</table></div>"); if (_type != T_ENUM) { NumStats stats = (NumStats)this.stats; // Percentiles sb.append("<div style='width:100%;overflow-x:auto;'><table class='table-bordered'>"); sb.append("<tr> <th colspan='" + stats.pct.length + "' " + "style='text-align:center' " + ">Percentiles</th></tr>"); sb.append("<tr><th>Threshold(%)</th>"); for (double pc : stats.pct) sb.append("<td>" + Utils.p2d(pc * 100.0) + "</td>"); // sb.append("<td>" + (int) Math.round(pc * 100) + "</td>"); sb.append("</tr>"); sb.append("<tr><th>Value</th>"); for (double pv : stats.pctile) sb.append("<td>" + pv + "</td>"); sb.append("</tr>"); sb.append("</table>"); sb.append("</div>"); } sb.append("</div>\n"); } }