package hex;

import water.*;
import water.api.*;
import water.api.Request.API;
import water.fvec.*;
import water.util.Utils;
import water.util.Log;

/**
 * Quantile of a column.
 */
// R doesn't like NAs in a column:
//   Error in quantile.default(nah[, 1], c(1)) :
//   missing values and NaN's not allowed if 'na.rm' is FALSE
// We have to tolerate empty columns, all-NA columns, and single-value columns.
public class Quantiles extends Iced {
  static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
  static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.

  // This Request supports the HTML 'GET' command, and this is the help text for GET.
  static final String DOC_GET = "Returns a quantile of a fluid-vec frame";

  public static final int MAX_ENUM_SIZE = H2O.DATA_MAX_FACTOR_LEVELS;

  public long _totalRows; // non-empty rows per group

  // FIX! not sure if I need to save these here from the vec.
  // Why were these 'transient'? That doesn't make sense if the hcnt2 state isn't transient.
  // They're not very big. Are they serialized in the map/reduce?
  final double _max;
  final double _min;
  final boolean _isInt;
  final boolean _isEnum;
  final String[] _domain;

  // Used to feed the next iteration for multipass.
  // Used in exactQuantilesMultiPass only.
  final double _valStart;
  final double _valEnd;
  final long _valMaxBinCnt;

  // Just for info on the current pass.
  public double _valRange;
  public double _valBinSize;

  public double _newValStart;
  public double _newValEnd;
  public double[] _pctile;
  public boolean _interpolated = false; // FIX! do I need this?
  public boolean _done = false; // FIX! do I need this?

  // OUTPUTS
  // Basic info
  @API(help="name")
  public String colname; // FIX! currently not set. Need at least one @API for class loading.

  public long[] hcnt2;          // finer histogram. not visible
  public double[] hcnt2_min;    // min actual value for each bin
  public double[] hcnt2_max;    // max actual value for each bin
  public long hcnt2_low;        // count below current binning
  public long hcnt2_high;       // count above current binning
  public double hcnt2_high_min; // min value above current binning

  public static class BinTask2 extends MRTask2<BinTask2> {
    private final int _max_qbins;
    private final double _valStart;
    private final double _valEnd;

    public Quantiles _qbins[];

    public BinTask2(int max_qbins, double valStart, double valEnd) {
      _max_qbins = max_qbins;
      _valStart = valStart;
      _valEnd = valEnd;
    }

    @Override public void map(Chunk[] cs) {
      _qbins = new Quantiles[cs.length];
      for (int i = 0; i < cs.length; i++)
        _qbins[i] = new Quantiles(_fr.vecs()[i], _max_qbins, _valStart, _valEnd).add(cs[i]);
    }

    @Override public void reduce(BinTask2 other) {
      for (int i = 0; i < _qbins.length; i++)
        _qbins[i].add(other._qbins[i]);
      // Will all the map memory get reclaimed now that the reduce has gathered it?
      // We want to keep the first-iteration object around for lists of thresholds,
      // so hopefully this means just the reduced histogram stays around.
      // FIX! Maybe unnecessary/implied, or there's a better way?
      other = null;
    }
  }
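  // Illustrative only (not part of the original file): BinTask2 is an MRTask2, so a
  // caller would typically fan it out over a Frame and read the reduced histograms
  // back from _qbins. A minimal sketch, assuming a Frame `fr` is in scope:
  //
  //   BinTask2 bt = new BinTask2(1000, fr.vecs()[0].min(), fr.vecs()[0].max()).doAll(fr);
  //   Quantiles q = bt._qbins[0]; // reduced histogram for column 0
  //
  // Each map() builds per-chunk Quantiles; reduce() merges them bin-by-bin via add().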
  // FIX! currently we only take one quantile at a time here; add the ability to do a list.
  public void finishUp(Vec vec, double[] quantiles_to_do, int interpolation_type, boolean multiPass) {
    assert quantiles_to_do.length == 1 : "currently one quantile at a time; caller can reuse qbin for now.";
    // Below, we force it to ignore the length and only do [0].
    // Need to figure out if we need to do a list and how that's returned.
    _pctile = new double[quantiles_to_do.length];
    if ( _isEnum ) {
      _done = false;
    } else {
      if ( multiPass ) {
        _done = exactQuantilesMultiPass(_pctile, quantiles_to_do, interpolation_type);
      } else {
        _done = approxQuantilesOnePass(_pctile, quantiles_to_do, interpolation_type);
      }
    }
  }

  public Quantiles(Vec vec, int max_qbins, double valStart, double valEnd) {
    _isEnum = vec.isEnum();
    _isInt = vec.isInt();
    _domain = vec.isEnum() ? vec.domain() : null;
    _max = vec.max();
    _min = vec.min();
    _totalRows = 0;
    _valStart = valStart;
    _valEnd = valEnd;
    _valRange = valEnd - valStart;

    assert max_qbins > 0 && max_qbins <= 1000000 : "max_qbins must be >0 and <= 1000000";
    int desiredBinCnt = max_qbins;
    int maxBinCnt = desiredBinCnt + 1;
    _valBinSize = _valRange / (desiredBinCnt + 0.0);
    _valMaxBinCnt = maxBinCnt;

    if( vec.isEnum() && _domain.length < MAX_ENUM_SIZE ) {
      hcnt2 = new long[_domain.length];
      hcnt2_min = new double[_domain.length];
      hcnt2_max = new double[_domain.length];
    }
    else if ( !Double.isNaN(_min) ) {
      assert maxBinCnt > 0;
      // Log.debug("Q_ Multiple pass histogram starts at "+_valStart);
      // Log.debug("Q_ _min "+_min+" _max "+_max);
      hcnt2 = new long[maxBinCnt];
      hcnt2_min = new double[maxBinCnt];
      hcnt2_max = new double[maxBinCnt];
    }
    else { // vec does not contain finite numbers
      // A one-entry hcnt2 makes the algorithm die (the min was NaN above),
      // so for now just make it length 2.
      hcnt2 = new long[2];
      hcnt2_min = new double[2];
      hcnt2_max = new double[2];
    }
    hcnt2_low = 0;
    hcnt2_high = 0;
    hcnt2_high_min = 0;
    // hcnt2 implicitly zeroed on new
  }

  public Quantiles(Vec vec) {
    // Default to 1000 bins. Callers still need to invoke the finishUp they want
    // to get a result, and do the multipass iteration/finishUp if desired.
    this(vec, 1000, vec.min(), vec.max());
  }

  public Quantiles add(Chunk chk) {
    for (int i = 0; i < chk._len; i++)
      add(chk.at0(i));
    return this;
  }

  public void add(double val) {
    if ( Double.isNaN(val) ) return;
    // We can get infinity due to a bad enum parse to real.
    // The histogram is sized okay, but the index calc below would overflow,
    // so just drop those values. Not sure if there's something better to do.
    if( val==Double.POSITIVE_INFINITY ) return;
    if( val==Double.NEGATIVE_INFINITY ) return;
    if ( _isEnum ) return;

    _totalRows++;
    long maxBinCnt = _valMaxBinCnt;

    // Multipass exact. Should be able to do this for both, if the valStart param is correct.
    long binIdx2;
    // Need to count the stuff outside the bin-gathering,
    // since the threshold compare is based on a total row compare.
    double valOffset = val - _valStart;

    // FIX! do we really need this special case? Not hurting.
    if (hcnt2.length==1) {
      binIdx2 = 0;
    }
    else {
      // Was cast to int before widening back to long; cast straight to long so a
      // huge offset can't wrap negative before the range checks below.
      binIdx2 = (long) Math.floor(valOffset / _valBinSize);
    }
    int binIdx2Int = (int) binIdx2;
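    // Worked example (illustrative numbers, not from the source): with
    // _valStart=0.0, _valEnd=100.0 and desiredBinCnt=1000, _valBinSize is 0.1 and
    // hcnt2 has 1001 bins (the extra bin catches the end value). A val of 25.37
    // gives valOffset=25.37 and binIdx2 = floor(25.37/0.1) = 253, so it lands in
    // bin 253; val=100.0 gives binIdx2 = 1000, the extra end bin.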
    // We always need the start condition in the bins?
    // Maybe some redundancy in the two compares.
    if ( valOffset < 0 || binIdx2Int<0 ) {
      ++hcnt2_low;
    }
    // We always need the end condition in the bins?
    // Would using valOffset here be less accurate?
    // Maybe some redundancy in the two compares.
    // Can't use maxBinCnt-1, because the extra bin is used for one value (the bounds).
    else if ( val > _valEnd || binIdx2>=maxBinCnt ) {
      if ( (hcnt2_high==0) || (val < hcnt2_high_min) ) hcnt2_high_min = val;
      ++hcnt2_high;
    }
    else {
      assert (binIdx2Int >= 0 && binIdx2Int < hcnt2.length) :
        "binIdx2Int too big for hcnt2 "+binIdx2Int+" "+hcnt2.length;
      // Log.debug("Q_ val: "+val+" valOffset: "+valOffset+" _valBinSize: "+_valBinSize);
      assert (binIdx2Int>=0) && (binIdx2Int<=maxBinCnt) : "binIdx2Int "+binIdx2Int+" out of range";

      if ( hcnt2[binIdx2Int]==0 || (val < hcnt2_min[binIdx2Int]) ) hcnt2_min[binIdx2Int] = val;
      if ( hcnt2[binIdx2Int]==0 || (val > hcnt2_max[binIdx2Int]) ) hcnt2_max[binIdx2Int] = val;
      ++hcnt2[binIdx2Int];

      // For debug/info we can report when a value goes into the extra bin.
      // Is it ever due to fp arithmetic? Or just the max value?
      // Not an error! It should be protected by newValEnd below, and nextK
      // estimates should go into the extra bin if interpolation is needed.
      if ( false && (binIdx2 == (maxBinCnt-1)) ) {
        Log.debug("\nQ_ FP! val went into the extra maxBinCnt bin:"+
          binIdx2+" "+hcnt2_high_min+" "+valOffset+" "+
          val+" "+_valStart+" "+hcnt2_high+" "+val+" "+_valEnd,"\n");
      }
    }
  }

  public Quantiles add(Quantiles other) {
    if ( _isEnum ) return this;
    // Counts are longs and can't be NaN; check non-negativity instead.
    assert other._totalRows >= 0 : "negative other._totalRows merging";
    assert _totalRows >= 0 : "negative _totalRows merging";
    _totalRows += other._totalRows;

    // Merge hcnt2 per-bin mins.
    // other must be the same length, but use its length for safety.
    // Could add an assert on the lengths?
    for (int k = 0; k < other.hcnt2_min.length; k++) {
      // Shouldn't get any NaNs here.
      assert !Double.isNaN(other.hcnt2_min[k]) : "NaN in other.hcnt2_min merging";
      assert !Double.isNaN(hcnt2_min[k]) : "NaN in hcnt2_min merging";
      assert other.hcnt2[k] >= 0 : "negative count in other.hcnt2 merging";
      assert hcnt2[k] >= 0 : "negative count in hcnt2 merging";
      // Cover the initial case: relying on the initial min of 0 would be wrong.
      // Only take the new min if its hcnt2 is non-zero; the count acts like a valid bit.
      // Can hcnt2 ever be null here?
      if (other.hcnt2[k] > 0) {
        if ( hcnt2[k]==0 || ( other.hcnt2_min[k] < hcnt2_min[k] )) {
          hcnt2_min[k] = other.hcnt2_min[k];
        }
      }
    }
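    // Illustrative merge case (numbers invented): if this side never saw bin k
    // (hcnt2[k]==0, hcnt2_min[k] still 0.0) and other has hcnt2[k]==2 with
    // hcnt2_min[k]==7.5, we must take 7.5; comparing against the uninitialized
    // 0.0 would wrongly keep 0.0 as the min. The count is the valid bit.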
    // Merge hcnt2 per-bin maxs.
    // other must be the same length, but use its length for safety.
    for (int k = 0; k < other.hcnt2_max.length; k++) {
      // Shouldn't get any NaNs here either.
      assert !Double.isNaN(other.hcnt2_max[k]) : "NaN in other.hcnt2_max merging";
      assert !Double.isNaN(hcnt2_max[k]) : "NaN in hcnt2_max merging";
      assert other.hcnt2[k] >= 0 : "negative count in other.hcnt2 merging";
      assert hcnt2[k] >= 0 : "negative count in hcnt2 merging";
      // Cover the initial case: relying on the initial max of 0 would be wrong.
      // Only take the new max if its hcnt2 is non-zero; the count acts like a valid bit.
      // Can hcnt2 ever be null here?
      if (other.hcnt2[k] > 0) {
        if ( hcnt2[k]==0 || ( other.hcnt2_max[k] > hcnt2_max[k] )) {
          hcnt2_max[k] = other.hcnt2_max[k];
        }
      }
    }

    // Three more things to merge for multipass histograms:
    // the counts above/below the bins, and the min above the bins.
    assert other.hcnt2_high >= 0 : "negative other.hcnt2_high merging";
    assert other.hcnt2_low >= 0 : "negative other.hcnt2_low merging";
    assert hcnt2_high >= 0 : "negative hcnt2_high merging";
    assert hcnt2_low >= 0 : "negative hcnt2_low merging";
    assert other.hcnt2_high==0 || !Double.isNaN(other.hcnt2_high_min) :
      "0 or NaN in hcnt2_high_min merging";

    // These are count merges.
    hcnt2_low = hcnt2_low + other.hcnt2_low;
    hcnt2_high = hcnt2_high + other.hcnt2_high;

    // hcnt2_high_min is valid only when hcnt2_high!=0 (the count is the valid bit).
    if (other.hcnt2_high > 0) {
      if ( hcnt2_high==0 || ( other.hcnt2_high_min < hcnt2_high_min )) {
        hcnt2_high_min = other.hcnt2_high_min;
      }
    }

    // Can hcnt2 ever be null here? Increment last, so the zero case is detected above.
    // It seems like everything would fail if hcnt2 didn't exist here.
    assert hcnt2 != null;
    Utils.add(hcnt2, other.hcnt2);
    return this;
  }

  // Needs to count >4B rows.
  private long htot2(long low, long high) {
    long cnt = 0;
    for (int i = 0; i < hcnt2.length; i++) cnt += hcnt2[i];
    // Add the stuff outside the bins; 0,0 for single pass.
    cnt = cnt + low + high;
    return cnt;
  }

  private boolean exactQuantilesMultiPass(double[] qtiles, double[] quantiles_to_do, int interpolation_type) {
    // Looked at outside this method. Set up for the all-NA or empty case.
    // done could be the return value; really should make these three results available differently.
    // qtiles is an array just in case we support iterating on quantiles_to_do,
    // but that would only work for approx, since we won't redo bins here.
    boolean done = false;
    boolean interpolated = false;

    qtiles[0] = Double.NaN;
    if( hcnt2.length < 2 ) return false;
    assert !_isEnum;
    if ( _totalRows==0 ) return false;
    assert _totalRows >= 0 : _totalRows;

    double newValStart = Double.NaN;
    double newValEnd = Double.NaN;
    double newValRange = Double.NaN;
    double newValBinSize = Double.NaN;

    boolean forceBestApprox = interpolation_type==-1;

    long newValLowCnt;
    long maxBinCnt = _valMaxBinCnt;
    assert maxBinCnt>1;
    long desiredBinCnt = maxBinCnt - 1;

    double threshold = quantiles_to_do[0];

    // x!=Double.NaN is always true, so use isNaN for a meaningful check.
    assert !Double.isNaN(_valEnd) : _valEnd;
    assert !Double.isNaN(_valStart) : _valStart;
    assert !Double.isNaN(_valBinSize) : _valBinSize;
    if ( _valStart==_valEnd )
      Log.debug("exactQuantilesMultiPass: start/end are equal. "+_valStart+" "+_valEnd);
    else
      assert _valBinSize!=0 && !Double.isNaN(_valBinSize) : _valBinSize;

    // Everything should either be in low, the bins, or high.
    long totalBinnedRows = htot2(hcnt2_low, hcnt2_high);
    Log.debug("Q_ totalRows check: "+_totalRows+" "+totalBinnedRows+" "+
      hcnt2_low+" "+hcnt2_high+" "+_valStart+" "+_valEnd);
    assert _totalRows==totalBinnedRows :
      _totalRows+" "+totalBinnedRows+" "+hcnt2_low+" "+hcnt2_high;

    // Find the row count we want to hit, within some bin.
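    // Worked example (illustrative): for threshold=0.5 with _totalRows=10,
    // targetCntFull = 0.5*(10-1) = 4.5, so targetCntInt=4 and targetCntFract=0.5:
    // the median sits halfway between the 5th and 6th smallest values (zero-based
    // rows 4 and 5). With _totalRows=11, targetCntFull = 5.0 and fract=0: the
    // quantile lands exactly on row 5 and no interpolation is needed.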
    long currentCnt = hcnt2_low;
    double targetCntFull = threshold * (_totalRows-1); // zero-based indexing
    long targetCntInt = (long) Math.floor(targetCntFull);
    double targetCntFract = targetCntFull - (double) targetCntInt;
    assert (targetCntFract>=0) && (targetCntFract<=1);
    Log.debug("Q_ targetCntInt: "+targetCntInt+" targetCntFract: "+targetCntFract);

    // Walk through and find out which bin to look inside.
    int k = 0;
    while( k!=maxBinCnt && ((currentCnt + hcnt2[k]) <= targetCntInt) ) {
      // Log.debug("Q_ Looping for k: "+threshold+" "+k+" "+maxBinCnt+" "+currentCnt+" "+targetCntInt+
      //   " "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]);
      currentCnt += hcnt2[k];
      ++k;
      // Note the loop condition covers the breakout condition
      // (currentCnt==targetCntInt && hcnt2[k]!=0)
      // and also keeps us from going past the array bounds.
    }
    Log.debug("Q_ Found k: "+threshold+" "+k+" "+currentCnt+" "+targetCntInt+
      " "+_totalRows+" "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]);

    assert (currentCnt + hcnt2[k]) > targetCntInt :
      targetCntInt+" "+currentCnt+" "+k+" "+" "+maxBinCnt;
    assert hcnt2[k]!=1 || hcnt2_min[k]==hcnt2_max[k];

    // Do mean or linear interpolation if we don't land on a row.
    // WATCH OUT when comparing results with linear interpolation: it depends on
    // the number of rows in the dataset, not just adjacent values. So if you skipped a row
    // for some reason (header guess?) in a comparison tool, you can get small errors.
    // Both type 2 and type 7 give exact answers that match alternate tools
    // (if they do type 2 and 7). scikit-learn doesn't do type 2 but does do type 7
    // (though not by default in mquantiles()).

    // The linear interpolation for k between row a (vala) and row b (valb) is:
    //   pctDiff = (k-a)/(b-a)
    //   dDiff   = pctDiff * (valb - vala)
    //   result  = vala + dDiff
    double guess = Double.NaN;
    double pctDiff, dDiff;

    // -1 is for the single pass approximation.
    assert (interpolation_type==2) || (interpolation_type==7) || (interpolation_type==-1) :
      "Unsupported type "+interpolation_type;

    // Special cases. If the desired row is the last of equal values in this bin (2 or more),
    // we will need to interpolate with a nextK out-of-bin value.
    // We can't iterate, since it won't improve things and the bin size would be zero!
    // Trying to resolve the binsize=0 case on the next pass, after this, is flawed thinking.
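    // Worked example of the two types (illustrative): suppose the target falls
    // between adjacent sorted values 3.0 and 5.0 with targetCntFract=0.25.
    //   type 2 (mean):   guess = (3.0 + 5.0) / 2.0        = 4.0
    //   type 7 (linear): guess = 3.0 + 0.25 * (5.0 - 3.0) = 3.5
    // These follow R's quantile() type-2 and type-7 conventions.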
    // These cases imply the values are not all the same; at the end of a bin, interpolate to the next.
    boolean atStartOfBin = hcnt2[k]>=1 && (currentCnt == targetCntInt);
    boolean atEndOfBin = !atStartOfBin && (hcnt2[k]>=2 && ((currentCnt + hcnt2[k] - 1) == targetCntInt));
    boolean inMidOfBin = !atStartOfBin && !atEndOfBin && (hcnt2[k]>=3) && (hcnt2_min[k]==hcnt2_max[k]);

    boolean interpolateEndNeeded = false;
    if ( atEndOfBin ) {
      if ( targetCntFract != 0 ) {
        interpolateEndNeeded = true;
      }
      else {
        guess = hcnt2_max[k];
        done = true;
        Log.debug("Q_ Guess M "+guess);
      }
    }
    else if ( inMidOfBin ) {
      // If we know there is something before and after us with the same value,
      // we never need to interpolate (only allowed when min==max).
      guess = hcnt2_min[k];
      done = true;
      Log.debug("Q_ Guess N "+guess);
    }

    if ( !done && atStartOfBin ) {
      // No interpolation needed.
      if ( hcnt2[k]>2 && (hcnt2_min[k]==hcnt2_max[k]) ) {
        guess = hcnt2_min[k];
        done = true;
        Log.debug("Q_ Guess A "+guess);
      }
      // min/max can be equal or not equal here.
      else if ( hcnt2[k]==2 ) {
        // Interpolate between min/max for the two-value bin.
        if ( interpolation_type==2 ) { // type 2 (mean)
          guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0;
        }
        else { // default to type 7 (linear interpolation)
          // Unlike the mean, which just depends on two adjacent values, this adjustment
          // adds possible errors related to the arithmetic on the total # of rows.
          dDiff = hcnt2_max[k] - hcnt2_min[k]; // two adjacent, as if sorted!
          // targetCntFract is the fraction of total rows.
          guess = hcnt2_min[k] + (targetCntFract * dDiff);
        }
        done = true;
        interpolated = true;
        Log.debug("Q_ Guess B "+guess+" with type "+interpolation_type+
          " targetCntFract: "+targetCntFract);
      }
      // No interpolation needed.
      else if ( (hcnt2[k]==1) && (targetCntFract==0) ) {
        assert hcnt2_min[k]==hcnt2_max[k];
        guess = hcnt2_min[k];
        done = true;
        Log.debug("Q_ Guess C "+guess);
      }
    }

    // Interpolate into a nextK value.
    // All the qualification is so we don't set done when we're not, for multipass.
    // Interpolate from a single bin, the end of a two-entry bin, or for approx.
    boolean stillCanGetIt = atStartOfBin && hcnt2[k]==1 && targetCntFract!=0;
    if ( !done && (stillCanGetIt || interpolateEndNeeded || forceBestApprox) ) {
      if ( hcnt2[k]==1 ) {
        assert hcnt2_min[k]==hcnt2_max[k];
        Log.debug("Q_ Single value in this bin, but fractional means we need to interpolate to next non-zero");
      }
      if ( interpolateEndNeeded ) {
        Log.debug("Q_ Interpolating off the end of a bin!");
      }
      int nextK;
      if ( k<maxBinCnt ) nextK = k + 1; // could put it over maxBinCnt
      else nextK = k;
      // We definitely see values going into the extra bin, so search that too!
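      // Illustrative: if bins k+1..k+3 are empty and bin k+4 holds the next value,
      // the loop below skips the zero-count bins so nextVal comes from the first
      // non-empty bin; if none exists before maxBinCnt, we fall back to
      // hcnt2_high_min (or _valEnd for the single-pass approx).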
      while ( (nextK<maxBinCnt) && (hcnt2[nextK]==0) ) ++nextK;

      assert nextK > k : k+" "+nextK; // we have the "extra bin" for this
      double nextVal;
      if ( nextK >= maxBinCnt ) {
        // Assume we didn't set hcnt2_high_min on the first pass, because of tighter start/end bounds.
        if ( forceBestApprox ) {
          Log.debug("Q_ Using _valEnd for approx interpolate: "+_valEnd);
          nextVal = _valEnd;
        }
        else {
          assert hcnt2_high!=0;
          Log.debug("Q_ Using hcnt2_high_min for interpolate: "+hcnt2_high_min);
          nextVal = hcnt2_high_min;
        }
      }
      else {
        Log.debug("Q_ Using nextK for interpolate: "+nextK);
        assert hcnt2[nextK]!=0;
        nextVal = hcnt2_min[nextK];
      }

      Log.debug("Q_ k hcnt2_max[k] nextVal");
      Log.debug("Q_ "+k+" "+hcnt2_max[k]+" "+nextVal);
      Log.debug("Q_ \nInterpolating result using nextK: "+nextK+" nextVal: "+nextVal);

      // type 7 (linear interpolation), or
      // single pass approx with an unresolved bin.
      if ( (forceBestApprox && stillCanGetIt) || interpolation_type==7 ) { // was bitwise &; && is intended
        dDiff = nextVal - hcnt2_max[k]; // two adjacent, as if sorted!
        // targetCntFract is the fraction of total rows.
        guess = hcnt2_max[k] + (targetCntFract * dDiff);
      }
      else if ( forceBestApprox ) {
        // Single pass approx, with an unresolved bin.
        // Best to use hcnt2_max[k] instead of nextVal here, to keep
        // within the guaranteed worst case error bounds.
        dDiff = (hcnt2_max[k] - hcnt2_min[k]) / hcnt2[k];
        guess = hcnt2_min[k] + (targetCntFull-currentCnt) * dDiff;
      }
      else { // type 2 (mean)
        guess = (hcnt2_max[k] + nextVal) / 2.0;
      }

      interpolated = true;
      done = true; // has to be one above us when needed (or we're at the end)
      Log.debug("Q_ Guess D "+guess+" with type "+interpolation_type+
        " targetCntFull: "+targetCntFull+" targetCntFract: "+targetCntFract+
        " _totalRows: " + _totalRows+" "+stillCanGetIt+" "+forceBestApprox);
    }

    if ( !done && !forceBestApprox ) { // not needed for the 1-pass approx
      // Possible bin leakage at start/end edges due to fp arithmetic.
      // The bin index arithmetic may resolve OVER the boundary created by the
      // hcnt2_high compare.
      // I suppose just one value should be in the desiredBinCnt+1 bin: the end value?
      // To cover possible fp issues:
      // See if there's a non-zero bin below (min) or above (max) you, to avoid shrinking wrong.
      // Just need to check the one bin below and above k, if they exist.
      // They might have zero entries, but then it's okay to ignore them.
      // Update: use the closest edge in the next bin: better forward progress for small bin counts.
      // This code may make the practical min bin count around 4 or so (not 2).
      // What has a length-1 hcnt2 that makes this fail? Enums? We shouldn't get here.
      newValStart = hcnt2_min[k];
      if ( k > 0 ) {
        if ( hcnt2[k-1]>0 && (hcnt2_max[k-1]<hcnt2_min[k]) ) {
          newValStart = hcnt2_max[k-1];
        }
      }

      // Subtle: we do sometimes put stuff in the extra end bin (see above),
      // and k might be pointing to one less than that (like k=0 for the 1-bin case).
      newValEnd = hcnt2_max[k];
      if ( k < (maxBinCnt-1) ) {
        assert k+1 < hcnt2.length : k+" "+hcnt2.length+" "+_valMaxBinCnt+" "+_isEnum+" "+_isInt;
        if ( hcnt2[k+1]>0 && (hcnt2_min[k+1]>hcnt2_max[k]) ) {
          newValEnd = hcnt2_min[k+1];
        }
      }

      newValRange = newValEnd - newValStart;
      // maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues?
      newValBinSize = newValRange / (desiredBinCnt + 0.0);
      newValLowCnt = currentCnt - 1; // is this right? not used for anything (debug?)
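      // Worked example of the narrowing (illustrative numbers): pass 1 bins
      // [0,100] into 1000 bins of width 0.1. If the target lands in a bin with
      // hcnt2_min[k]=25.31 and hcnt2_max[k]=25.39 (and the neighbor-edge checks
      // above don't widen it), pass 2 rebins [25.31, 25.39] with
      // newValBinSize = 0.08/1000 = 0.00008, shrinking the range ~1000x per pass.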
      // Since we may always need an interpolation, it seems bad to get here with !done.
      if ( newValBinSize==0 ) {
        Log.debug("Q_ Assuming done because newValBinSize is 0.");
        Log.debug("Q_ newValRange: "+newValRange+
          " hcnt2[k]: "+hcnt2[k]+
          " hcnt2_min[k]: "+hcnt2_min[k]+
          " hcnt2_max[k]: "+hcnt2_max[k]);
        guess = newValStart;
        Log.debug("Q_ Guess G "+guess);
        // Maybe make this assert false, to see?
        assert true : "Should never get newValBinSize==0 in !done branch";
        done = true;
      }
    }

    Log.debug("Q_ guess: "+guess+" done: "+done+" hcnt2[k]: "+hcnt2[k]);
    Log.debug("Q_ currentCnt: "+currentCnt+" targetCntInt: "+targetCntInt+
      " hcnt2_low: "+hcnt2_low+" hcnt2_high: "+hcnt2_high);
    Log.debug("Q_ was "+_valStart+" "+_valEnd+" "+_valRange+" "+_valBinSize);
    Log.debug("Q_ next "+newValStart+" "+newValEnd+" "+newValRange+" "+newValBinSize);

    qtiles[0] = guess;
    // We want to leave the histograms now! We reuse them in exec for multiple thresholds.
    // hcnt2 = null;
    // hcnt2_min = null;
    // hcnt2_max = null;
    _newValStart = newValStart;
    _newValEnd = newValEnd;
    _interpolated = interpolated;
    return done;
  }

  // This won't be used with a multipass iteration of qbins, so it always has to return a best guess.
  // It also needs to interpolate for bins with different values that aren't resolved by min/max,
  // so we give it a special interpolation type (-1) that we decode and use above.
  private boolean approxQuantilesOnePass(double[] qtiles, double[] quantiles_to_do, int interpolation_type) {
    exactQuantilesMultiPass(qtiles, quantiles_to_do, -1);
    return true;
  }
}
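// Illustrative multipass driver (a sketch, not part of this class; `fr` and
// MAX_ITER are assumptions). It shows how _done/_newValStart/_newValEnd are
// meant to feed the next iteration:
//
//   Vec vec = fr.vecs()[0];
//   double start = vec.min(), end = vec.max();
//   Quantiles q = null;
//   for (int iter = 0; iter < MAX_ITER; iter++) {
//     q = new Quantiles.BinTask2(1000, start, end).doAll(fr)._qbins[0];
//     q.finishUp(vec, new double[]{0.5}, 7, true); // type 7, multipass
//     if (q._done) break;
//     start = q._newValStart; // narrow the binning range and go again
//     end = q._newValEnd;
//   }
//   double median = q._pctile[0];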