package hex;

import water.*;
import water.api.*;
import water.api.Request.API;
import water.fvec.*;
import water.util.Utils;
import water.util.Log;

/**
 * Quantile of a column.
 */
// R doesn't like NAs in a column:
//   Error in quantile.default(nah[, 1], c(1)) :
//   missing values and NaN's not allowed if 'na.rm' is FALSE
// We have to tolerate empty columns, all-NA columns, and single-value columns.
public class Quantiles extends Iced {
  static final int API_WEAVER = 1; // This file has auto-gen'd doc & json fields
  static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.

  // This Request supports the HTML 'GET' command, and this is the help text for GET.
  static final String DOC_GET = "Returns a quantile of a fluid-vec frame";

  public static final int MAX_ENUM_SIZE = H2O.DATA_MAX_FACTOR_LEVELS;

  public long _totalRows; // non-empty rows per group

  // FIX! not sure if I need to save these here from the vec.
  // Why were these 'transient'? That doesn't make sense if the hcnt2 state isn't transient.
  // They're not very big. Are they serialized in the map/reduce?
  final double _max;
  final double _min;
  final boolean _isInt;
  final boolean _isEnum;
  final String[] _domain;

  // Used to feed the next iteration for multipass.
  // Used in exactQuantilesMultiPass only.
  final double _valStart;
  final double _valEnd;
  final long _valMaxBinCnt;

  // Just for info on the current pass.
  public double _valRange;
  public double _valBinSize;

  public double _newValStart;
  public double _newValEnd;
  public double[] _pctile;
  public boolean _interpolated = false; // FIX! do I need this?
  public boolean _done = false; // FIX! do I need this?

  // OUTPUTS
  // Basic info
  @API(help="name")
  public String colname; // FIX! currently not set. Need at least one @API for class loading.

  public long[] hcnt2;          // finer histogram. not visible
  public double[] hcnt2_min;    // min actual value for each bin
  public double[] hcnt2_max;    // max actual value for each bin
  public long hcnt2_low;        // count below current binning
  public long hcnt2_high;       // count above current binning
  public double hcnt2_high_min; // min value above current binning

  public static class BinTask2 extends MRTask2<BinTask2> {
    private final int _max_qbins;
    private final double _valStart;
    private final double _valEnd;

    public Quantiles _qbins[];

    public BinTask2(int max_qbins, double valStart, double valEnd) {
      _max_qbins = max_qbins;
      _valStart = valStart;
      _valEnd = valEnd;
    }

    @Override public void map(Chunk[] cs) {
      _qbins = new Quantiles[cs.length];
      for (int i = 0; i < cs.length; i++)
        _qbins[i] = new Quantiles(_fr.vecs()[i], _max_qbins, _valStart, _valEnd).add(cs[i]);
    }

    @Override public void reduce(BinTask2 other) {
      for (int i = 0; i < _qbins.length; i++)
        _qbins[i].add(other._qbins[i]);
      // Will all the map memory get reclaimed now that the reduce has gathered it?
      // We want to keep the first-iteration object around for lists of thresholds,
      // so hopefully this means just the reduced histogram stays around.
      // FIX! Maybe unnecessary/implied, or there's a better way?
      other = null;
    }
  }
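  // Illustrative only (not part of the original file): BinTask2 is an MRTask2, so a
  // caller would typically fan it out over a Frame and read the reduced histograms
  // back from _qbins. A minimal sketch, assuming a Frame `fr` is in scope:
  //
  //   BinTask2 bt = new BinTask2(1000, fr.vecs()[0].min(), fr.vecs()[0].max()).doAll(fr);
  //   Quantiles q = bt._qbins[0]; // reduced histogram for column 0
  //
  // Each map() builds per-chunk Quantiles; reduce() merges them bin-by-bin via add().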
  // FIX! currently we only take one quantile at a time here; add the ability to do a list.
  public void finishUp(Vec vec, double[] quantiles_to_do, int interpolation_type, boolean multiPass) {
    assert quantiles_to_do.length == 1 : "currently one quantile at a time; caller can reuse qbin for now.";
    // Below, we force it to ignore the length and only do [0].
    // Need to figure out if we need to do a list and how that's returned.
    _pctile = new double[quantiles_to_do.length];
    if ( _isEnum ) {
      _done = false;
    } else {
      if ( multiPass ) {
        _done = exactQuantilesMultiPass(_pctile, quantiles_to_do, interpolation_type);
      } else {
        _done = approxQuantilesOnePass(_pctile, quantiles_to_do, interpolation_type);
      }
    }
  }

  public Quantiles(Vec vec, int max_qbins, double valStart, double valEnd) {
    _isEnum = vec.isEnum();
    _isInt = vec.isInt();
    _domain = vec.isEnum() ? vec.domain() : null;
    _max = vec.max();
    _min = vec.min();
    _totalRows = 0;
    _valStart = valStart;
    _valEnd = valEnd;
    _valRange = valEnd - valStart;

    assert max_qbins > 0 && max_qbins <= 1000000 : "max_qbins must be >0 and <= 1000000";
    int desiredBinCnt = max_qbins;
    int maxBinCnt = desiredBinCnt + 1;
    _valBinSize = _valRange / (desiredBinCnt + 0.0);
    _valMaxBinCnt = maxBinCnt;

    if( vec.isEnum() && _domain.length < MAX_ENUM_SIZE ) {
      hcnt2 = new long[_domain.length];
      hcnt2_min = new double[_domain.length];
      hcnt2_max = new double[_domain.length];
    }
    else if ( !Double.isNaN(_min) ) {
      assert maxBinCnt > 0;
      // Log.debug("Q_ Multiple pass histogram starts at "+_valStart);
      // Log.debug("Q_ _min "+_min+" _max "+_max);
      hcnt2 = new long[maxBinCnt];
      hcnt2_min = new double[maxBinCnt];
      hcnt2_max = new double[maxBinCnt];
    }
    else { // vec does not contain finite numbers
      // A one-entry hcnt2 makes the algorithm die (the min was NaN above),
      // so for now just make it length 2.
      hcnt2 = new long[2];
      hcnt2_min = new double[2];
      hcnt2_max = new double[2];
    }
    hcnt2_low = 0;
    hcnt2_high = 0;
    hcnt2_high_min = 0;
    // hcnt2 implicitly zeroed on new
  }

  public Quantiles(Vec vec) {
    // Default to 1000 bins. Callers still need to invoke the finishUp they want
    // to get a result, and do the multipass iteration/finishUp if desired.
    this(vec, 1000, vec.min(), vec.max());
  }

  public Quantiles add(Chunk chk) {
    for (int i = 0; i < chk._len; i++)
      add(chk.at0(i));
    return this;
  }

  public void add(double val) {
    if ( Double.isNaN(val) ) return;
    // We can get infinity due to a bad enum parse to real.
    // The histogram is sized okay, but the index calc below would overflow,
    // so just drop those values. Not sure if there's something better to do.
    if( val==Double.POSITIVE_INFINITY ) return;
    if( val==Double.NEGATIVE_INFINITY ) return;
    if ( _isEnum ) return;

    _totalRows++;
    long maxBinCnt = _valMaxBinCnt;

    // Multipass exact. Should be able to do this for both, if the valStart param is correct.
    long binIdx2;
    // Need to count the stuff outside the bin-gathering,
    // since the threshold compare is based on a total row compare.
    double valOffset = val - _valStart;

    // FIX! do we really need this special case? Not hurting.
    if (hcnt2.length==1) {
      binIdx2 = 0;
    }
    else {
      // Was cast to int before widening back to long; cast straight to long so a
      // huge offset can't wrap negative before the range checks below.
      binIdx2 = (long) Math.floor(valOffset / _valBinSize);
    }
    int binIdx2Int = (int) binIdx2;
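    // Worked example (illustrative numbers, not from the source): with
    // _valStart=0.0, _valEnd=100.0 and desiredBinCnt=1000, _valBinSize is 0.1 and
    // hcnt2 has 1001 bins (the extra bin catches the end value). A val of 25.37
    // gives valOffset=25.37 and binIdx2 = floor(25.37/0.1) = 253, so it lands in
    // bin 253; val=100.0 gives binIdx2 = 1000, the extra end bin.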
    // We always need the start condition in the bins?
    // Maybe some redundancy in the two compares.
    if ( valOffset < 0 || binIdx2Int<0 ) {
      ++hcnt2_low;
    }
    // We always need the end condition in the bins?
    // Would using valOffset here be less accurate?
    // Maybe some redundancy in the two compares.
    // Can't use maxBinCnt-1, because the extra bin is used for one value (the bounds).
    else if ( val > _valEnd || binIdx2>=maxBinCnt ) {
      if ( (hcnt2_high==0) || (val < hcnt2_high_min) ) hcnt2_high_min = val;
      ++hcnt2_high;
    }
    else {
      assert (binIdx2Int >= 0 && binIdx2Int < hcnt2.length) :
        "binIdx2Int too big for hcnt2 "+binIdx2Int+" "+hcnt2.length;
      // Log.debug("Q_ val: "+val+" valOffset: "+valOffset+" _valBinSize: "+_valBinSize);
      assert (binIdx2Int>=0) && (binIdx2Int<=maxBinCnt) : "binIdx2Int "+binIdx2Int+" out of range";

      if ( hcnt2[binIdx2Int]==0 || (val < hcnt2_min[binIdx2Int]) ) hcnt2_min[binIdx2Int] = val;
      if ( hcnt2[binIdx2Int]==0 || (val > hcnt2_max[binIdx2Int]) ) hcnt2_max[binIdx2Int] = val;
      ++hcnt2[binIdx2Int];

      // For debug/info we can report when a value goes into the extra bin.
      // Is it ever due to fp arithmetic? Or just the max value?
      // Not an error! It should be protected by newValEnd below, and nextK
      // estimates should go into the extra bin if interpolation is needed.
      if ( false && (binIdx2 == (maxBinCnt-1)) ) {
        Log.debug("\nQ_ FP! val went into the extra maxBinCnt bin:"+
          binIdx2+" "+hcnt2_high_min+" "+valOffset+" "+
          val+" "+_valStart+" "+hcnt2_high+" "+val+" "+_valEnd,"\n");
      }
    }
  }

  public Quantiles add(Quantiles other) {
    if ( _isEnum ) return this;
    // Counts are longs and can't be NaN; check non-negativity instead.
    assert other._totalRows >= 0 : "negative other._totalRows merging";
    assert _totalRows >= 0 : "negative _totalRows merging";
    _totalRows += other._totalRows;

    // Merge hcnt2 per-bin mins.
    // other must be the same length, but use its length for safety.
    // Could add an assert on the lengths?
    for (int k = 0; k < other.hcnt2_min.length; k++) {
      // Shouldn't get any NaNs here.
      assert !Double.isNaN(other.hcnt2_min[k]) : "NaN in other.hcnt2_min merging";
      assert !Double.isNaN(hcnt2_min[k]) : "NaN in hcnt2_min merging";
      assert other.hcnt2[k] >= 0 : "negative count in other.hcnt2 merging";
      assert hcnt2[k] >= 0 : "negative count in hcnt2 merging";
      // Cover the initial case: relying on the initial min of 0 would be wrong.
      // Only take the new min if its hcnt2 is non-zero; the count acts like a valid bit.
      // Can hcnt2 ever be null here?
      if (other.hcnt2[k] > 0) {
        if ( hcnt2[k]==0 || ( other.hcnt2_min[k] < hcnt2_min[k] )) {
          hcnt2_min[k] = other.hcnt2_min[k];
        }
      }
    }
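    // Illustrative merge case (numbers invented): if this side never saw bin k
    // (hcnt2[k]==0, hcnt2_min[k] still 0.0) and other has hcnt2[k]==2 with
    // hcnt2_min[k]==7.5, we must take 7.5; comparing against the uninitialized
    // 0.0 would wrongly keep 0.0 as the min. The count is the valid bit.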
    // Merge hcnt2 per-bin maxs.
    // other must be the same length, but use its length for safety.
    for (int k = 0; k < other.hcnt2_max.length; k++) {
      // Shouldn't get any NaNs here either.
      assert !Double.isNaN(other.hcnt2_max[k]) : "NaN in other.hcnt2_max merging";
      assert !Double.isNaN(hcnt2_max[k]) : "NaN in hcnt2_max merging";
      assert other.hcnt2[k] >= 0 : "negative count in other.hcnt2 merging";
      assert hcnt2[k] >= 0 : "negative count in hcnt2 merging";
      // Cover the initial case: relying on the initial max of 0 would be wrong.
      // Only take the new max if its hcnt2 is non-zero; the count acts like a valid bit.
      // Can hcnt2 ever be null here?
      if (other.hcnt2[k] > 0) {
        if ( hcnt2[k]==0 || ( other.hcnt2_max[k] > hcnt2_max[k] )) {
          hcnt2_max[k] = other.hcnt2_max[k];
        }
      }
    }

    // Three more things to merge for multipass histograms:
    // the counts above/below the bins, and the min above the bins.
    assert other.hcnt2_high >= 0 : "negative other.hcnt2_high merging";
    assert other.hcnt2_low >= 0 : "negative other.hcnt2_low merging";
    assert hcnt2_high >= 0 : "negative hcnt2_high merging";
    assert hcnt2_low >= 0 : "negative hcnt2_low merging";
    assert other.hcnt2_high==0 || !Double.isNaN(other.hcnt2_high_min) :
      "0 or NaN in hcnt2_high_min merging";

    // These are count merges.
    hcnt2_low = hcnt2_low + other.hcnt2_low;
    hcnt2_high = hcnt2_high + other.hcnt2_high;

    // hcnt2_high_min is valid only when hcnt2_high!=0 (the count is the valid bit).
    if (other.hcnt2_high > 0) {
      if ( hcnt2_high==0 || ( other.hcnt2_high_min < hcnt2_high_min )) {
        hcnt2_high_min = other.hcnt2_high_min;
      }
    }

    // Can hcnt2 ever be null here? Increment last, so the zero case is detected above.
    // It seems like everything would fail if hcnt2 didn't exist here.
    assert hcnt2 != null;
    Utils.add(hcnt2, other.hcnt2);
    return this;
  }

  // Needs to count >4B rows.
  private long htot2(long low, long high) {
    long cnt = 0;
    for (int i = 0; i < hcnt2.length; i++) cnt += hcnt2[i];
    // Add the stuff outside the bins; 0,0 for single pass.
    cnt = cnt + low + high;
    return cnt;
  }

  private boolean exactQuantilesMultiPass(double[] qtiles, double[] quantiles_to_do, int interpolation_type) {
    // Looked at outside this method. Set up for the all-NA or empty case.
    // done could be the return value; really should make these three results available differently.
    // qtiles is an array just in case we support iterating on quantiles_to_do,
    // but that would only work for approx, since we won't redo bins here.
    boolean done = false;
    boolean interpolated = false;

    qtiles[0] = Double.NaN;
    if( hcnt2.length < 2 ) return false;
    assert !_isEnum;
    if ( _totalRows==0 ) return false;
    assert _totalRows >= 0 : _totalRows;

    double newValStart = Double.NaN;
    double newValEnd = Double.NaN;
    double newValRange = Double.NaN;
    double newValBinSize = Double.NaN;

    boolean forceBestApprox = interpolation_type==-1;

    long newValLowCnt;
    long maxBinCnt = _valMaxBinCnt;
    assert maxBinCnt>1;
    long desiredBinCnt = maxBinCnt - 1;

    double threshold = quantiles_to_do[0];

    // x!=Double.NaN is always true, so use isNaN for a meaningful check.
    assert !Double.isNaN(_valEnd) : _valEnd;
    assert !Double.isNaN(_valStart) : _valStart;
    assert !Double.isNaN(_valBinSize) : _valBinSize;
    if ( _valStart==_valEnd )
      Log.debug("exactQuantilesMultiPass: start/end are equal. "+_valStart+" "+_valEnd);
    else
      assert _valBinSize!=0 && !Double.isNaN(_valBinSize) : _valBinSize;

    // Everything should either be in low, the bins, or high.
    long totalBinnedRows = htot2(hcnt2_low, hcnt2_high);
    Log.debug("Q_ totalRows check: "+_totalRows+" "+totalBinnedRows+" "+
      hcnt2_low+" "+hcnt2_high+" "+_valStart+" "+_valEnd);
    assert _totalRows==totalBinnedRows :
      _totalRows+" "+totalBinnedRows+" "+hcnt2_low+" "+hcnt2_high;

    // Find the row count we want to hit, within some bin.
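    // Worked example (illustrative): for threshold=0.5 with _totalRows=10,
    // targetCntFull = 0.5*(10-1) = 4.5, so targetCntInt=4 and targetCntFract=0.5:
    // the median sits halfway between the 5th and 6th smallest values (zero-based
    // rows 4 and 5). With _totalRows=11, targetCntFull = 5.0 and fract=0: the
    // quantile lands exactly on row 5 and no interpolation is needed.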
    long currentCnt = hcnt2_low;
    double targetCntFull = threshold * (_totalRows-1); // zero-based indexing
    long targetCntInt = (long) Math.floor(targetCntFull);
    double targetCntFract = targetCntFull - (double) targetCntInt;
    assert (targetCntFract>=0) && (targetCntFract<=1);
    Log.debug("Q_ targetCntInt: "+targetCntInt+" targetCntFract: "+targetCntFract);

    // Walk through and find out which bin to look inside.
    int k = 0;
    while( k!=maxBinCnt && ((currentCnt + hcnt2[k]) <= targetCntInt) ) {
      // Log.debug("Q_ Looping for k: "+threshold+" "+k+" "+maxBinCnt+" "+currentCnt+" "+targetCntInt+
      //   " "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]);
      currentCnt += hcnt2[k];
      ++k;
      // Note the loop condition covers the breakout condition
      // (currentCnt==targetCntInt && hcnt2[k]!=0)
      // and also keeps us from going past the array bounds.
    }
    Log.debug("Q_ Found k: "+threshold+" "+k+" "+currentCnt+" "+targetCntInt+
      " "+_totalRows+" "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]);

    assert (currentCnt + hcnt2[k]) > targetCntInt :
      targetCntInt+" "+currentCnt+" "+k+" "+" "+maxBinCnt;
    assert hcnt2[k]!=1 || hcnt2_min[k]==hcnt2_max[k];

    // Do mean or linear interpolation if we don't land on a row.
    // WATCH OUT when comparing results with linear interpolation: it depends on
    // the number of rows in the dataset, not just adjacent values. So if you skipped a row
    // for some reason (header guess?) in a comparison tool, you can get small errors.
    // Both type 2 and type 7 give exact answers that match alternate tools
    // (if they do type 2 and 7). scikit-learn doesn't do type 2 but does do type 7
    // (though not by default in mquantiles()).

    // The linear interpolation for k between row a (vala) and row b (valb) is:
    //   pctDiff = (k-a)/(b-a)
    //   dDiff   = pctDiff * (valb - vala)
    //   result  = vala + dDiff
    double guess = Double.NaN;
    double pctDiff, dDiff;

    // -1 is for the single pass approximation.
    assert (interpolation_type==2) || (interpolation_type==7) || (interpolation_type==-1) :
      "Unsupported type "+interpolation_type;

    // Special cases. If the desired row is the last of equal values in this bin (2 or more),
    // we will need to interpolate with a nextK out-of-bin value.
    // We can't iterate, since it won't improve things and the bin size would be zero!
    // Trying to resolve the binsize=0 case on the next pass, after this, is flawed thinking.
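    // Worked example of the two types (illustrative): suppose the target falls
    // between adjacent sorted values 3.0 and 5.0 with targetCntFract=0.25.
    //   type 2 (mean):   guess = (3.0 + 5.0) / 2.0        = 4.0
    //   type 7 (linear): guess = 3.0 + 0.25 * (5.0 - 3.0) = 3.5
    // These follow R's quantile() type-2 and type-7 conventions.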
    // These cases imply the values are not all the same; at the end of a bin, interpolate to the next.
    boolean atStartOfBin = hcnt2[k]>=1 && (currentCnt == targetCntInt);
    boolean atEndOfBin = !atStartOfBin && (hcnt2[k]>=2 && ((currentCnt + hcnt2[k] - 1) == targetCntInt));
    boolean inMidOfBin = !atStartOfBin && !atEndOfBin && (hcnt2[k]>=3) && (hcnt2_min[k]==hcnt2_max[k]);

    boolean interpolateEndNeeded = false;
    if ( atEndOfBin ) {
      if ( targetCntFract != 0 ) {
        interpolateEndNeeded = true;
      }
      else {
        guess = hcnt2_max[k];
        done = true;
        Log.debug("Q_ Guess M "+guess);
      }
    }
    else if ( inMidOfBin ) {
      // If we know there is something before and after us with the same value,
      // we never need to interpolate (only allowed when min==max).
      guess = hcnt2_min[k];
      done = true;
      Log.debug("Q_ Guess N "+guess);
    }

    if ( !done && atStartOfBin ) {
      // No interpolation needed.
      if ( hcnt2[k]>2 && (hcnt2_min[k]==hcnt2_max[k]) ) {
        guess = hcnt2_min[k];
        done = true;
        Log.debug("Q_ Guess A "+guess);
      }
      // min/max can be equal or not equal here.
      else if ( hcnt2[k]==2 ) {
        // Interpolate between min/max for the two-value bin.
        if ( interpolation_type==2 ) { // type 2 (mean)
          guess = (hcnt2_max[k] + hcnt2_min[k]) / 2.0;
        }
        else { // default to type 7 (linear interpolation)
          // Unlike the mean, which just depends on two adjacent values, this adjustment
          // adds possible errors related to the arithmetic on the total # of rows.
          dDiff = hcnt2_max[k] - hcnt2_min[k]; // two adjacent, as if sorted!
          // targetCntFract is the fraction of total rows.
          guess = hcnt2_min[k] + (targetCntFract * dDiff);
        }
        done = true;
        interpolated = true;
        Log.debug("Q_ Guess B "+guess+" with type "+interpolation_type+
          " targetCntFract: "+targetCntFract);
      }
      // No interpolation needed.
      else if ( (hcnt2[k]==1) && (targetCntFract==0) ) {
        assert hcnt2_min[k]==hcnt2_max[k];
        guess = hcnt2_min[k];
        done = true;
        Log.debug("Q_ Guess C "+guess);
      }
    }

    // Interpolate into a nextK value.
    // All the qualification is so we don't set done when we're not, for multipass.
    // Interpolate from a single bin, the end of a two-entry bin, or for approx.
    boolean stillCanGetIt = atStartOfBin && hcnt2[k]==1 && targetCntFract!=0;
    if ( !done && (stillCanGetIt || interpolateEndNeeded || forceBestApprox) ) {
      if ( hcnt2[k]==1 ) {
        assert hcnt2_min[k]==hcnt2_max[k];
        Log.debug("Q_ Single value in this bin, but fractional means we need to interpolate to next non-zero");
      }
      if ( interpolateEndNeeded ) {
        Log.debug("Q_ Interpolating off the end of a bin!");
      }
      int nextK;
      if ( k<maxBinCnt ) nextK = k + 1; // could put it over maxBinCnt
      else nextK = k;
      // We definitely see values going into the extra bin, so search that too!
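      // Illustrative: if bins k+1..k+3 are empty and bin k+4 holds the next value,
      // the loop below skips the zero-count bins so nextVal comes from the first
      // non-empty bin; if none exists before maxBinCnt, we fall back to
      // hcnt2_high_min (or _valEnd for the single-pass approx).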
      while ( (nextK<maxBinCnt) && (hcnt2[nextK]==0) ) ++nextK;

      assert nextK > k : k+" "+nextK; // we have the "extra bin" for this
      double nextVal;
      if ( nextK >= maxBinCnt ) {
        // Assume we didn't set hcnt2_high_min on the first pass, because of tighter start/end bounds.
        if ( forceBestApprox ) {
          Log.debug("Q_ Using _valEnd for approx interpolate: "+_valEnd);
          nextVal = _valEnd;
        }
        else {
          assert hcnt2_high!=0;
          Log.debug("Q_ Using hcnt2_high_min for interpolate: "+hcnt2_high_min);
          nextVal = hcnt2_high_min;
        }
      }
      else {
        Log.debug("Q_ Using nextK for interpolate: "+nextK);
        assert hcnt2[nextK]!=0;
        nextVal = hcnt2_min[nextK];
      }

      Log.debug("Q_ k hcnt2_max[k] nextVal");
      Log.debug("Q_ "+k+" "+hcnt2_max[k]+" "+nextVal);
      Log.debug("Q_ \nInterpolating result using nextK: "+nextK+" nextVal: "+nextVal);

      // type 7 (linear interpolation), or
      // single pass approx with an unresolved bin.
      if ( (forceBestApprox && stillCanGetIt) || interpolation_type==7 ) { // was bitwise &; && is intended
        dDiff = nextVal - hcnt2_max[k]; // two adjacent, as if sorted!
        // targetCntFract is the fraction of total rows.
        guess = hcnt2_max[k] + (targetCntFract * dDiff);
      }
      else if ( forceBestApprox ) {
        // Single pass approx, with an unresolved bin.
        // Best to use hcnt2_max[k] instead of nextVal here, to keep
        // within the guaranteed worst case error bounds.
        dDiff = (hcnt2_max[k] - hcnt2_min[k]) / hcnt2[k];
        guess = hcnt2_min[k] + (targetCntFull-currentCnt) * dDiff;
      }
      else { // type 2 (mean)
        guess = (hcnt2_max[k] + nextVal) / 2.0;
      }

      interpolated = true;
      done = true; // has to be one above us when needed (or we're at the end)
      Log.debug("Q_ Guess D "+guess+" with type "+interpolation_type+
        " targetCntFull: "+targetCntFull+" targetCntFract: "+targetCntFract+
        " _totalRows: " + _totalRows+" "+stillCanGetIt+" "+forceBestApprox);
    }

    if ( !done && !forceBestApprox ) { // not needed for the 1-pass approx
      // Possible bin leakage at start/end edges due to fp arithmetic.
      // The bin index arithmetic may resolve OVER the boundary created by the
      // hcnt2_high compare.
      // I suppose just one value should be in the desiredBinCnt+1 bin: the end value?
      // To cover possible fp issues:
      // See if there's a non-zero bin below (min) or above (max) you, to avoid shrinking wrong.
      // Just need to check the one bin below and above k, if they exist.
      // They might have zero entries, but then it's okay to ignore them.
      // Update: use the closest edge in the next bin: better forward progress for small bin counts.
      // This code may make the practical min bin count around 4 or so (not 2).
      // What has a length-1 hcnt2 that makes this fail? Enums? We shouldn't get here.
      newValStart = hcnt2_min[k];
      if ( k > 0 ) {
        if ( hcnt2[k-1]>0 && (hcnt2_max[k-1]<hcnt2_min[k]) ) {
          newValStart = hcnt2_max[k-1];
        }
      }

      // Subtle: we do sometimes put stuff in the extra end bin (see above),
      // and k might be pointing to one less than that (like k=0 for the 1-bin case).
      newValEnd = hcnt2_max[k];
      if ( k < (maxBinCnt-1) ) {
        assert k+1 < hcnt2.length : k+" "+hcnt2.length+" "+_valMaxBinCnt+" "+_isEnum+" "+_isInt;
        if ( hcnt2[k+1]>0 && (hcnt2_min[k+1]>hcnt2_max[k]) ) {
          newValEnd = hcnt2_min[k+1];
        }
      }

      newValRange = newValEnd - newValStart;
      // maxBinCnt is always binCount + 1, since we might cover over due to rounding/fp issues?
      newValBinSize = newValRange / (desiredBinCnt + 0.0);
      newValLowCnt = currentCnt - 1; // is this right? not used for anything (debug?)
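      // Worked example of the narrowing (illustrative numbers): pass 1 bins
      // [0,100] into 1000 bins of width 0.1. If the target lands in a bin with
      // hcnt2_min[k]=25.31 and hcnt2_max[k]=25.39 (and the neighbor-edge checks
      // above don't widen it), pass 2 rebins [25.31, 25.39] with
      // newValBinSize = 0.08/1000 = 0.00008, shrinking the range ~1000x per pass.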
      // Since we may always need an interpolation, it seems bad to get here with !done.
      if ( newValBinSize==0 ) {
        Log.debug("Q_ Assuming done because newValBinSize is 0.");
        Log.debug("Q_ newValRange: "+newValRange+
          " hcnt2[k]: "+hcnt2[k]+
          " hcnt2_min[k]: "+hcnt2_min[k]+
          " hcnt2_max[k]: "+hcnt2_max[k]);
        guess = newValStart;
        Log.debug("Q_ Guess G "+guess);
        // Maybe make this assert false, to see?
        assert true : "Should never get newValBinSize==0 in !done branch";
        done = true;
      }
    }

    Log.debug("Q_ guess: "+guess+" done: "+done+" hcnt2[k]: "+hcnt2[k]);
    Log.debug("Q_ currentCnt: "+currentCnt+" targetCntInt: "+targetCntInt+
      " hcnt2_low: "+hcnt2_low+" hcnt2_high: "+hcnt2_high);
    Log.debug("Q_ was "+_valStart+" "+_valEnd+" "+_valRange+" "+_valBinSize);
    Log.debug("Q_ next "+newValStart+" "+newValEnd+" "+newValRange+" "+newValBinSize);

    qtiles[0] = guess;
    // We want to leave the histograms now! We reuse them in exec for multiple thresholds.
    // hcnt2 = null;
    // hcnt2_min = null;
    // hcnt2_max = null;
    _newValStart = newValStart;
    _newValEnd = newValEnd;
    _interpolated = interpolated;
    return done;
  }

  // This won't be used with a multipass iteration of qbins, so it always has to return a best guess.
  // It also needs to interpolate for bins with different values that aren't resolved by min/max,
  // so we give it a special interpolation type (-1) that we decode and use above.
  private boolean approxQuantilesOnePass(double[] qtiles, double[] quantiles_to_do, int interpolation_type) {
    exactQuantilesMultiPass(qtiles, quantiles_to_do, -1);
    return true;
  }
}
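// Illustrative multipass driver (a sketch, not part of this class; `fr` and
// MAX_ITER are assumptions). It shows how _done/_newValStart/_newValEnd are
// meant to feed the next iteration:
//
//   Vec vec = fr.vecs()[0];
//   double start = vec.min(), end = vec.max();
//   Quantiles q = null;
//   for (int iter = 0; iter < MAX_ITER; iter++) {
//     q = new Quantiles.BinTask2(1000, start, end).doAll(fr)._qbins[0];
//     q.finishUp(vec, new double[]{0.5}, 7, true); // type 7, multipass
//     if (q._done) break;
//     start = q._newValStart; // narrow the binning range and go again
//     end = q._newValEnd;
//   }
//   double median = q._pctile[0];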