Summary2.java example

Explorer
h2o-2-master
package hex;

import water.*;
import water.api.*;
import water.api.Request.API;
import water.fvec.*;
import water.exec.Flow;
import water.util.Utils;
import water.util.Log;

import java.util.Arrays;

/**
 * Summary of a column.
 */
public class Summary2 extends Iced {
  static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields
  static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.

  // This Request supports the HTML 'GET' command, and this is the help text
  // for GET.
  static final String DOC_GET = "Returns a summary of a fluid-vec frame";

  // Enum columns whose domain is smaller than this get one histogram bin per level
  // (see the constructor); larger domains fall through to the numeric binning path.
  public static final int    MAX_HIST_SZ = H2O.DATA_MAX_FACTOR_LEVELS;
  // Upper bound on the length of the _mins / _maxs arrays allocated by the constructor.
  public static final int    NMAX = 5;
  // updated boundaries to be 0.1% 1%...99%, 99.9% so R code didn't have to change
  // ideally we extend the array here, and just update the R extraction of 25/50/75 percentiles
  // note python tests (junit?) may look at result
  public static final double DEFAULT_PERCENTILES[] = {0.001,0.01,0.10,0.25,0.33,0.50,0.66,0.75,0.90,0.99,0.999};
  // Column type codes stored in _type.
  private static final int   T_REAL = 0;
  private static final int   T_INT  = 1;
  private static final int   T_ENUM = 2;
  public BasicStat           _stat0;     /* Basic Vec stats collected by PrePass. */
  public final int           _type;      // 0 - real; 1 - int; 2 - enum
  public double[]            _mins;      // smallest values seen so far (ascending, NaN-padded for numerics)
  public double[]            _maxs;      // largest values seen so far (ascending until finishUp reverses them)
  long                       _gprows;    // non-empty rows per group

  final transient String[]   _domain;    // vec.domain() for enum columns, null otherwise
  final transient double     _start;     // origin of the visible histogram (hcnt)
  final transient double     _start2;    // origin of the finer histogram (hcnt2)
  final transient double     _binsz;     // bin width of the visible histogram (hcnt)
  final transient double     _binsz2;    // 2nd finer grained histogram used for quantile estimates for numerics
  transient int              _len1;      /* Size of filled elements in a chunk. */
  transient double[]         _pctile;    // quantile estimates, allocated and filled in finishUp for non-enum columns


  /**
   * Common base for the per-column statistics payload exposed through the
   * {@code stats} output field. Carries only a textual type tag; the concrete
   * subclasses in this file pass "Enum" or "Numeric".
   */
  static abstract class Stats extends Iced {
    static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
    static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields

    @API(help="stats type"   ) public String type;
    /** @param type textual tag describing which kind of statistics this object holds */
    Stats(String type) { this.type = type; }
  }
  // An internal JSON-output-only class
  /**
   * Statistics payload for enum (categorical) columns: just the cardinality.
   * Tagged with the type string "Enum".
   */
  @SuppressWarnings("unused")
  static class EnumStats extends Stats {
    static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
    static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields
    /** @param card number of levels; finishUp passes {@code vec.domain().length} */
    public EnumStats( int card ) {
      super("Enum");
      this.cardinality = card;
    }
    @API(help="cardinality"  ) public final int     cardinality;
  }

  /**
   * Statistics payload for numeric columns. Tagged with the type string
   * "Numeric". The arrays are stored by reference (no defensive copy), and
   * {@code pct} always aliases the shared {@code DEFAULT_PERCENTILES} array.
   */
  static class NumStats extends Stats {
    static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
    static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields
    /**
     * @param mean   column mean
     * @param sigma  column standard deviation (stored in {@code sd})
     * @param zeros  number of zero values
     * @param mins   smallest values
     * @param maxs   largest values
     * @param pctile quantile estimates, one per entry of {@code DEFAULT_PERCENTILES}
     */
    public NumStats( double mean, double sigma, long zeros, double[] mins, double[] maxs, double[] pctile) {
      super("Numeric");
      this.mean  = mean;
      this.sd    = sigma;
      this.zeros = zeros;
      this.mins  = mins;
      this.maxs  = maxs;
      this.pctile = pctile;
      this.pct   = DEFAULT_PERCENTILES;
    }
    @API(help="mean"        ) public final double   mean;
    @API(help="sd"          ) public final double   sd;
    @API(help="#zeros"      ) public final long     zeros;
    @API(help="min elements") public final double[] mins; // min N elements
    @API(help="max elements") public final double[] maxs; // max N elements
    @API(help="percentile thresholds" ) public final double[] pct;
    @API(help="percentiles" ) public final double[] pctile;
  }
  // OUTPUTS
  // Basic info
  @API(help="name"        ) public String    colname;   // set by the constructor from its 'name' argument
  @API(help="type"        ) public String    type;
  // Basic stats
  @API(help="NAs"         ) public long      nacnt;     // copied from _stat0._nas in finishUp
  @API(help="Base Stats"  ) public Stats     stats;     // EnumStats or NumStats, built in finishUp

  // Visible histogram. hstart/hstep/hbrk are filled in by finishUp; hcnt is
  // allocated by the constructor and incremented by add(double).
  @API(help="histogram start")    public double    hstart;
  @API(help="histogram bin step") public double    hstep;
  @API(help="histogram headers" ) public String[]  hbrk;
  @API(help="histogram bin values") public long[]  hcnt;
  // Finer histogram used for the quantile estimates. The three arrays are
  // allocated with the same length; min/max entries are meaningful only where
  // the matching hcnt2 count is non-zero.
  public long[]  hcnt2; // finer histogram. not visible
  public double[]  hcnt2_min; // min actual for each bin
  public double[]  hcnt2_max; // max actual for each bin

  /**
   * Basic per-column counters and extrema gathered in a first pass over the
   * data: row count, NA / NaN / +Inf / -Inf / zero counts and the finite
   * min/max. Instances are accumulated per chunk with {@link #add(Chunk)},
   * merged with {@link #add(BasicStat)}, and completed with {@link #finishUp()}.
   */
  public static class BasicStat extends Iced {
    public long _len;   /* length of vec */
    public long _nas;   /* number of NA's */
    public long _nans;   /* number of NaN's */
    public long _pinfs;   /* number of positive infinity's */
    public long _ninfs;   /* number of negative infinity's */
    public long _zeros;   /* number of zeros */
    public double _min1;   /* if there's -Inf, then -Inf, o/w min2 */
    public double _max1;   /* if there's Inf, then Inf, o/w max2 */
    public double _min2;   /* min of the finite numbers. NaN if there's none. */
    public double _max2;   /* max of the finite numbers. NaN if there's none. */

    /** Creates an empty accumulator: all counters zero, all extrema NaN. */
    public BasicStat( ) {
      _len = 0;
      _nas = 0;
      _nans = 0;
      _pinfs = 0;
      _ninfs = 0;
      _zeros = 0;
      _min1 = Double.NaN;
      _max1 = Double.NaN;
      _min2 = Double.NaN;
      _max2 = Double.NaN;
    }

    /**
     * Folds every row of {@code chk} into this accumulator.
     * NA rows only bump {@code _nas}. For UUID columns nothing but NAs and the
     * row count is recorded. NaN and +/-Inf values are counted separately and
     * never contribute to {@code _min2} / {@code _max2}.
     *
     * @param chk chunk to scan
     * @return this, for chaining
     */
    public BasicStat add(Chunk chk) {
      // Accumulate (rather than overwrite) so that _len behaves like every
      // other counter if more than one chunk is added to the same instance.
      _len += chk._len;
      final boolean isUUID = chk._vec.isUUID();   // loop-invariant
      for(int i = 0; i < chk._len; i++) {
        if (chk.isNA0(i)) { _nas++; continue; }
        if( isUUID ) continue;
        final double val = chk.at0(i);
        if (Double.isNaN(val)) { _nans++; continue; }
        if      (val == Double.POSITIVE_INFINITY) _pinfs++;
        else if (val == Double.NEGATIVE_INFINITY) _ninfs++;
        else {
          // NaN in _min2/_max2 means "no finite value seen yet"
          _min2 = Double.isNaN(_min2)? val : Math.min(_min2,val);
          _max2 = Double.isNaN(_max2)? val : Math.max(_max2,val);
          if (val == .0) _zeros++;
        }
      }
      return this;
    }

    /**
     * Merges another accumulator into this one: counters are summed and the
     * finite extrema combined, treating NaN as "no value".
     *
     * @param other accumulator to merge in
     * @return this, for chaining
     */
    public BasicStat add(BasicStat other) {
      _len += other._len;
      _nas += other._nas;
      _nans += other._nans;
      _pinfs += other._pinfs;
      _ninfs += other._ninfs;
      _zeros += other._zeros;
      if (Double.isNaN(_min2)) _min2 = other._min2;
      else if (!Double.isNaN(other._min2)) _min2 = Math.min(_min2, other._min2);
      if (Double.isNaN(_max2)) _max2 = other._max2;
      else if (!Double.isNaN(other._max2)) _max2 = Math.max(_max2, other._max2);
      return this;
    }

    /**
     * Derives {@code _min1} / {@code _max1} (extrema including infinities)
     * from the infinity counters and the finite extrema.
     *
     * @return this, for chaining
     */
    public BasicStat finishUp() {
      _min1 = _ninfs>0?               Double.NEGATIVE_INFINITY   /* there's -Inf */
              : !Double.isNaN(_min2)? _min2                      /* min is finite */
              : _pinfs>0?             Double.POSITIVE_INFINITY   /* Only Infs exist */
              :                       Double.NaN;                /* All NaN's or NAs */
      _max1 = _pinfs>0?               Double.POSITIVE_INFINITY   /* there's Inf */
              : !Double.isNaN(_max2)? _max2                      /* max is finite */
              : _ninfs>0?             Double.NEGATIVE_INFINITY   /* Only -Infs exist */
              :                       Double.NaN;                /* All NaN's or NAs */
      return this;
    }

  }

  /**
   * First pass over a frame: produces one {@link BasicStat} per column.
   * Call {@link #finishUp()} on the result to complete each column's stats.
   */
  public static class PrePass extends MRTask2<PrePass> {
    public BasicStat _basicStats[];

    /** Builds a fresh BasicStat for every column chunk handed to this map call. */
    @Override public void map(Chunk[] cs) {
      _basicStats = new BasicStat[cs.length];
      int col = 0;
      for (Chunk chunk : cs) {
        _basicStats[col++] = new BasicStat().add(chunk);
      }
    }

    /** Merges the other task's per-column stats into this task's, column by column. */
    @Override public void reduce(PrePass other){
      int col = 0;
      for (BasicStat mine : _basicStats) {
        mine.add(other._basicStats[col++]);
      }
    }

    /**
     * Completes every per-column BasicStat.
     * @return this, for chaining
     */
    public PrePass finishUp() {
      for (int col = 0; col < _basicStats.length; col++) {
        _basicStats[col].finishUp();
      }
      return this;
    }
  }
  /**
   * Second pass over a frame: builds one {@link Summary2} per column, using the
   * BasicStat array produced by a prior pass to size the histograms.
   */
  public static class SummaryTask2 extends MRTask2<SummaryTask2> {
    private BasicStat[] _basics;
    private int _max_qbins;
    public Summary2 _summaries[];

    /**
     * @param basicStats per-column basic stats, indexed like the frame's columns
     * @param max_qbins  bin-count goal for the finer quantile histogram
     */
    public SummaryTask2 (BasicStat[] basicStats, int max_qbins) {
      _basics = basicStats;
      _max_qbins = max_qbins;
    }

    /** Creates a Summary2 per column and feeds it the matching chunk. */
    @Override public void map(Chunk[] cs) {
      final int ncols = cs.length;
      _summaries = new Summary2[ncols];
      for (int col = 0; col < ncols; col++) {
        Summary2 colSummary = new Summary2(_fr.vecs()[col], _fr.names()[col], _basics[col], _max_qbins);
        _summaries[col] = colSummary.add(cs[col]);
      }
    }

    /** Merges the other task's per-column summaries into this task's. */
    @Override public void reduce(SummaryTask2 other) {
      int col = 0;
      for (Summary2 mine : _summaries) {
        mine.add(other._summaries[col++]);
      }
    }
  }

  // Entry point for the Flow passes, to allow easy percentiles on filtered GroupBy
  /**
   * Row-at-a-time variant of the summary computation. The public constructor
   * yields an instance without summaries; {@link #make()} creates a working
   * instance with one Summary2 per column of the frame.
   */
  public static class SummaryPerRow extends Flow.PerRow<SummaryPerRow> {
    public final Frame _fr;
    public final Summary2 _summaries[];

    public SummaryPerRow( Frame fr ) { this(fr,null); }

    private SummaryPerRow( Frame fr, Summary2[] sums ) {
      _fr = fr;
      _summaries = sums;
    }

    /** Feeds one row: value i goes to the summary of column i. */
    @Override public void mapreduce( double ds[] ) {
      int col = 0;
      for( double d : ds ) {
        _summaries[col++].add(d);
      }
    }

    /** Merges that instance's per-column summaries into this one's. */
    @Override public void reduce( SummaryPerRow that ) {
      int col = 0;
      for( Summary2 mine : _summaries ) {
        mine.add(that._summaries[col++]);
      }
    }

    /** Runs a PrePass over the frame and returns a new instance holding fresh summaries. */
    @Override public SummaryPerRow make() {
      final Vec[] vecs = _fr.vecs();
      final Summary2[] sums = new Summary2[vecs.length];
      final BasicStat[] basics = new PrePass().doAll(_fr).finishUp()._basicStats;
      int col = 0;
      for( Vec v : vecs ) {
        sums[col] = new Summary2(v, _fr._names[col], basics[col]);
        col++;
      }
      return new SummaryPerRow(_fr,sums);
    }

    /** One line per column: the column name, a space, and that column's summary text. */
    @Override public String toString() {
      StringBuilder sb = new StringBuilder();
      for( int col=0; col<_summaries.length; col++ ) {
        sb.append(_fr._names[col]).append(" ").append(_summaries[col]).append("\n");
      }
      return sb.toString();
    }

    /** Finalizes every column summary against its Vec. */
    public void finishUp() {
      final Vec[] vecs = _fr.vecs();
      int col = 0;
      for( Vec v : vecs ) {
        _summaries[col++].finishUp(v);
      }
    }
  }

  /**
   * Renders the computed statistics: for numeric columns a
   * "pct%=value, " entry per percentile threshold, for enum columns
   * "cardinality=N".
   *
   * @return the rendered text, or the empty string when {@code stats} has not
   *         been populated yet (i.e. before {@code finishUp(Vec)} has run)
   */
  @Override public String toString() {
    // stats is only assigned in finishUp(Vec); printing an unfinished summary
    // (e.g. from a debugger or SummaryPerRow.toString) must not throw.
    if( stats == null ) return "";
    StringBuilder sb = new StringBuilder();
    if( stats instanceof NumStats ) {
      NumStats ns = (NumStats)stats;
      double pct   [] = ns.pct   ;
      double pctile[] = ns.pctile;
      for( int i=0; i<pct.length; i++ )
        sb.append(pct[i]*100).append("%=").append(pctile[i]).append(", ");
    } else {
      sb.append("cardinality=").append(((EnumStats)stats).cardinality);
    }
    return sb.toString();
  }

  /**
   * Turns the accumulated counters into the published outputs: NA count,
   * quantiles (numeric) or majorities (enum), trimmed min/max lists, the
   * {@code stats} payload and the visible histogram's start, step and labels.
   *
   * @param vec the column this summary describes; supplies mean/sigma or the domain
   */
  public void finishUp(Vec vec) {
    final boolean isEnum = (_type == T_ENUM);
    nacnt = _stat0._nas;
    if (!isEnum) {
      _pctile = new double[DEFAULT_PERCENTILES.length];
      approxQuantiles(_pctile, DEFAULT_PERCENTILES, _stat0._min2, _stat0._max2);
    } else {
      // Compute majority items for enum data
      computeMajorities();
    }

    // Cut _mins at its first NaN, if it has one.
    int cut = 0;
    while (cut < _mins.length && !Double.isNaN(_mins[cut])) cut++;
    if (cut < _mins.length) _mins = Arrays.copyOf(_mins, cut);

    // Same for _maxs.
    cut = 0;
    while (cut < _maxs.length && !Double.isNaN(_maxs[cut])) cut++;
    if (cut < _maxs.length) _maxs = Arrays.copyOf(_maxs, cut);

    // Reverse _maxs in place.
    for (int lo = 0, hi = _maxs.length - 1; lo < hi; lo++, hi--) {
      double tmp = _maxs[lo];
      _maxs[lo] = _maxs[hi];
      _maxs[hi] = tmp;
    }

    if (isEnum) {
      this.stats = new EnumStats(vec.domain().length);
      this.hstart = 0;
      this.hstep = 1;
      this.hbrk = _domain;
    } else {
      this.stats = new NumStats(vec.mean(), vec.sigma(), _stat0._zeros, _mins, _maxs, _pctile);
      this.hstart = _start;
      this.hstep  = _binsz;
      this.hbrk = new String[hcnt.length];
      // Each bin is labelled with its start value; bin 0 uses _start directly.
      for (int bin = 0; bin < hbrk.length; bin++) {
        double binStart = (bin == 0) ? _start : binValue(bin);
        hbrk[bin] = Utils.p2d(binStart);
      }
    }
  }

  /**
   * Creates an empty summary for one column and sizes its two histograms from
   * the pre-computed basic stats. Three cases are handled:
   * <ul>
   *   <li>enum column with fewer than MAX_HIST_SZ levels: one bin per level
   *       (at least one bin), bin width 1, start 0;</li>
   *   <li>column with a finite min and max in {@code stat0}: a visible
   *       histogram ({@code hcnt}) with a dynamically chosen bin size, plus a
   *       finer histogram ({@code hcnt2}) of roughly {@code max_qbins} bins
   *       starting at the finite min;</li>
   *   <li>otherwise (no finite values): single-bin histograms with NaN start
   *       and bin size.</li>
   * </ul>
   *
   * @param vec       the column being summarized (type, domain, sigma, length are read)
   * @param name      column name, stored in {@code colname}
   * @param stat0     basic stats for the column, stored in {@code _stat0}
   * @param max_qbins bin-count goal for the finer histogram; checked by an
   *                  assert to be in (0, 10000000]
   */
  public Summary2(Vec vec, String name, BasicStat stat0, int max_qbins) {
    colname = name;
    _stat0 = stat0;
    _type = vec.isEnum()?T_ENUM:vec.isInt()?T_INT:T_REAL;
    _domain = vec.isEnum() ? vec.domain() : null;
    _gprows = 0;
    // a NaN sigma is treated as 0 for the bin-size heuristic below
    double sigma = Double.isNaN(vec.sigma()) ? 0 : vec.sigma();
    if ( _type != T_ENUM ) {
      // numeric: NaN marks an unused slot in _mins/_maxs
      _mins = MemoryManager.malloc8d((int)Math.min(vec.length(),NMAX));
      _maxs = MemoryManager.malloc8d((int)Math.min(vec.length(),NMAX));
      Arrays.fill(_mins, Double.NaN);
      Arrays.fill(_maxs, Double.NaN);
    } else {
      // enum: arrays are left as allocated (not NaN-filled)
      _mins = MemoryManager.malloc8d(Math.min(_domain.length,NMAX));
      _maxs = MemoryManager.malloc8d(Math.min(_domain.length,NMAX));
    }

    if( vec.isEnum() && _domain.length < MAX_HIST_SZ ) {
      _start = 0;
      _start2 = 0;
      _binsz = 1;
      _binsz2 = 1;
      // hack for now. if there are no enum values, keep these length 1, for consistency
      // in asserts below
      int dlength = _domain.length==0 ? 1 : _domain.length;
      hcnt = new long[dlength];
      hcnt2 = new long[dlength];
      hcnt2_min = new double[dlength];
      hcnt2_max = new double[dlength];
    }
    else if ( !(Double.isNaN(stat0._min2) || Double.isNaN(stat0._max2)) ) {
      // guard against improper parse (date type) or zero c._sigma
      // N = number of finite values
      long N = _stat0._len - stat0._nas - stat0._nans - stat0._pinfs - stat0._ninfs;
      // b = raw bin width from sigma and N (floored at 1e-4); d = b snapped to 1, 5 or 10 times a power of ten
      double b = Math.max(1e-4,3.5 * sigma/ Math.cbrt(N));
      double d = Math.pow(10, Math.floor(Math.log10(b)));
      if (b > 20*d/3)
        d *= 10;
      else if (b > 5*d/3)
        d *= 5;

      // tweak for integers
      if (d < 1. && vec.isInt()) d = 1.;

      // Result from the dynamic bin sizing equations
      double startSuggest = d * Math.floor(stat0._min2 / d);
      double binszSuggest = d;
      int nbinSuggest = (int) Math.ceil((stat0._max2 - startSuggest)/d) + 1;

      // Protect against massive binning. browser doesn't need
      int BROWSER_BIN_TARGET = 100;

      //  _binsz/_start is used in the histogramming.
      // nbin is used in the array declaration. must be big enough.
      // the resulting nbin, could be really large number. We need to cap it.
      // should also be obsessive and check that it's not 0 and force to 1.
      // Since nbin is implied by _binsz, ratio _binsz and recompute nbin
      int binCase = 0; // keep track in case we assert
      double start;
      if ( stat0._max2==stat0._min2) {
        binszSuggest = 0; // fixed next with other 0 cases.
        start = stat0._min2;
        binCase = 1;
      }
      // minimum 2 if min/max different
      else if ( stat0._max2!=stat0._min2 && nbinSuggest<2 ) {
        binszSuggest = (stat0._max2 - stat0._min2) / 2.0;
        start = stat0._min2;
        binCase = 2;
      }
      else if (nbinSuggest<1 || nbinSuggest>BROWSER_BIN_TARGET ) {
        // switch to a static equation with a fixed bin count, and recompute binszSuggest
        // one more bin than necessary for the range (99 exact. causes one extra
        binszSuggest = (stat0._max2 - stat0._min2) / (BROWSER_BIN_TARGET - 1.0);
        start = binszSuggest * Math.floor(stat0._min2 / binszSuggest);
        binCase = 3;
      }
      else {
        // align to binszSuggest boundary. (this is for reals)
        start = binszSuggest * Math.floor(stat0._min2 / binszSuggest);
        binCase = 4;
      }

      // _binsz = 0 means min/max are equal for reals?. Just make it a little number
      // this won't show up in browser display, since bins are labelled by start value

      // Now that we know the best bin size that will fit..Floor the _binsz if integer so visible
      // histogram looks good for integers. This is our final best bin size.
      double binsz = (binszSuggest!=0) ? binszSuggest : (vec.isInt() ? 1 : 1e-13d);
      _binsz = vec.isInt() ? Math.floor(binsz) : binsz;
      // make integers start on an integer too!
      _start = vec.isInt() ? Math.floor(start) : start;

      // This equation creates possibility of some of the first bins being empty
      // also: _binsz means many _binsz2 could be empty at the start if we reused _start there
      // FIX! is this okay if the dynamic range is > 2**32
      // align to bin size?
      int nbin = (int) Math.ceil((stat0._max2 - _start)/_binsz) + 1;
      double impliedBinEnd = _start + (nbin * _binsz);
      String assertMsg = _start+" "+_stat0._min2+" "+_stat0._max2+
        " "+impliedBinEnd+" "+_binsz+" "+nbin+" "+startSuggest+" "+nbinSuggest+" "+binCase;

      // Log.debug("Summary2 bin1. "+assertMsg);
      assert _start <= _stat0._min2 : assertMsg;
      // just in case, make sure it's big enough
      assert nbin > 0: assertMsg;
      // just for double checking we're okay (nothing outside the bin range)
      assert impliedBinEnd>=_stat0._max2 : assertMsg;

      // create a 2nd finer grained histogram for quantile estimates.
      // okay if it is approx. 1000 bins (+-1)
      // update: we allow api to change max_qbins. default 1000. larger = more accuracy
      assert max_qbins > 0 && max_qbins <= 10000000 : "max_qbins must be >0 and <= 10000000";

      // okay if 1 more than max_qbins gets created
      double d2 = (stat0._max2 - stat0._min2) / max_qbins;
      // _binsz2 = 0 means min/max are equal for reals?. Just make it a little number
      // this won't show up in browser display, since bins are labelled by start value
      _binsz2 = (d2!=0) ? d2 : (vec.isInt() ? 1 : 1e-13d);
      _start2 = stat0._min2;
      int nbin2 = (int) Math.ceil((stat0._max2 - _start2)/_binsz2) + 1;
      double impliedBinEnd2 = _start2 + (nbin2 * _binsz2);

      assertMsg = _start2+" "+_stat0._min2+" "+_stat0._max2+
        " "+impliedBinEnd2+" "+_binsz2+" "+nbin2;
      // Log.debug("Summary2 bin2. "+assertMsg);
      assert _start2 <= stat0._min2 : assertMsg;
      assert nbin2 > 0 : assertMsg;
      // can't make any assertion about _start2 vs _start  (either can be smaller due to fp issues)
      assert impliedBinEnd2>=_stat0._max2 : assertMsg;

      hcnt = new long[nbin];
      hcnt2 = new long[nbin2];
      hcnt2_min = new double[nbin2];
      hcnt2_max = new double[nbin2];

      // Log.debug("Finer histogram has "+nbin2+" bins. Visible histogram has "+nbin);
      // Log.debug("Finer histogram starts at "+_start2+" Visible histogram starts at "+_start);
      // Log.debug("stat0._min2 "+stat0._min2+" stat0._max2 "+stat0._max2);

    }
    else { // vec does not contain finite numbers
      Log.debug("Summary2: NaN in stat0._min2: "+stat0._min2+" or stat0._max2: "+stat0._max2);
      // vec.min() wouldn't be any better here. It could be NaN? 4/13/14
      // _start = vec.min();
      // _start2 = vec.min();
      // _binsz = Double.POSITIVE_INFINITY;
      // _binsz2 = Double.POSITIVE_INFINITY;
      _start = Double.NaN;
      _start2 = Double.NaN;
      _binsz = Double.NaN;
      _binsz2 = Double.NaN;
      hcnt = new long[1];
      hcnt2 = new long[1];
      hcnt2_min = new double[1];
      hcnt2_max = new double[1];
    }
  }

  /**
   * Convenience constructor: same as the four-argument constructor with
   * {@code max_qbins} set to 1000.
   *
   * @param vec   the column being summarized
   * @param name  column name
   * @param stat0 basic stats for the column
   */
  public Summary2(Vec vec, String name, BasicStat stat0) {
    this(vec, name, stat0, 1000);
  }

  /**
   * Feeds every row of a chunk into this summary via {@link #add(double)}.
   * Chunks belonging to a UUID column are skipped entirely.
   *
   * @param chk chunk to accumulate
   * @return this, for chaining
   */
  public Summary2 add(Chunk chk) {
    if( !chk._vec.isUUID() ) {
      for (int row = 0; row < chk._len; row++) {
        add(chk.at0(row));
      }
    }
    return this;
  }
  /**
   * Accumulates a single value. NaN and +/-Infinity are dropped. For
   * non-enum columns the value is offered to the running min/max lists and
   * counted in the finer histogram ({@code hcnt2}, with per-bin min/max); for
   * every column type it is counted in the visible histogram ({@code hcnt}).
   *
   * @param val the value to add
   */
  public void add(double val) {
    if( Double.isNaN(val) ) return;
    // can get infinity due to bad enum parse to real
    // histogram is sized ok, but the index calc below will be too big
    // just drop them. not sure if something better to do?
    if( Double.isInfinite(val) ) return;
    _len1++; _gprows++;

    if ( _type != T_ENUM ) {
      // _mins is kept ascending with unused slots as trailing NaN.
      // Insert val if it beats the current last entry or a slot is still free.
      final int lastMin = _mins.length-1;
      if (val < _mins[lastMin] || Double.isNaN(_mins[lastMin])) {
        int index = Arrays.binarySearch(_mins, val);
        if (index < 0) {                       // not already present
          index = -(index + 1);                // insertion point
          // shift the tail up by one, dropping the last entry
          System.arraycopy(_mins, index, _mins, index+1, lastMin-index);
          _mins[index] = val;
        }
      }
      // _maxs is also ascending with trailing NaN while not yet full.
      final boolean hasNan = Double.isNaN(_maxs[_maxs.length-1]);
      if (val > _maxs[0] || hasNan) {
        int index = Arrays.binarySearch(_maxs, val);
        if (index < 0) {
          index = -(index + 1);
          if (hasNan) {
            // room left: shift the tail up into the free slot
            System.arraycopy(_maxs, index, _maxs, index+1, _maxs.length-1-index);
            _maxs[index] = val;
          } else {
            // full: drop the smallest entry by shifting the head down
            System.arraycopy(_maxs, 1, _maxs, 0, index-1);
            _maxs[index-1] = val;
          }
        }
      }

      // update the finer histogram (used for quantile estimates on numerics)
      final int binIdx2Int = (hcnt2.length==1) ? 0 : (int) Math.floor((val - _start2) / _binsz2);
      assert (_start2 <= val) : "Why is val < _start2? val:"+val+" _start2:"+_start2;
      assert (binIdx2Int >= 0 && binIdx2Int < hcnt2.length) :
        "binIdx2Int too big for hcnt2 "+binIdx2Int+" "+hcnt2.length+" "+val+" "+_start2+" "+_binsz2;

      if (hcnt2[binIdx2Int] == 0) {
        // first value in this bin: a zero count acts as the "not yet valid" marker
        hcnt2_min[binIdx2Int] = val;
        hcnt2_max[binIdx2Int] = val;
      }
      else {
        if (val < hcnt2_min[binIdx2Int]) hcnt2_min[binIdx2Int] = val;
        if (val > hcnt2_max[binIdx2Int]) hcnt2_max[binIdx2Int] = val;
      }
      ++hcnt2[binIdx2Int];
    }

    // update the histogram the browser/json uses
    // (infinities never reach this point; they were dropped above)
    final int binIdxInt = (hcnt.length == 1) ? 0 : (int) Math.floor((val - _start) / _binsz);
    assert (_start <= val) : "Why is val < _start? val:"+val+" _start:"+_start;
    assert (binIdxInt >= 0 && binIdxInt < hcnt.length) :
        "binIdxInt bad for hcnt. binIdxInt:"+binIdxInt+" hcnt.length:"+hcnt.length+" val:"+val+" _start:"+_start+" _binsz:"+_binsz;
    ++hcnt[binIdxInt];
  }

  /**
   * Merges another summary of the same column into this one. The visible
   * histogram counts and {@code _gprows} are summed for every column type;
   * enum columns stop there. For numeric columns the finer histogram's
   * per-bin min/max are merged (using a non-zero count as the validity
   * marker) before its counts are summed, and then the running min and max
   * lists are merged.
   *
   * @param other summary to merge in; its arrays are indexed alongside this
   *              summary's, so they are expected to have matching lengths
   * @return this, for chaining
   */
  public Summary2 add(Summary2 other) {
    // merge hcnt and hcnt just by adding
    if (hcnt != null)
      Utils.add(hcnt, other.hcnt);

    _gprows += other._gprows;

    if (_type == T_ENUM) return this;

    // merge hcnt2 per-bin mins
    // other must be same length, but use it's length for safety
    // could add assert on lengths?
    for (int k = 0; k < other.hcnt2_min.length; k++) {
      // for now..die on NaNs
      // NOTE(review): hcnt2 is long[], so the two isNaN checks on hcnt2 entries can never fire
      assert !Double.isNaN(other.hcnt2_min[k]) : "NaN in other.hcnt2_min merging";
      assert !Double.isNaN(other.hcnt2[k]) : "NaN in hcnt2_min merging";
      assert !Double.isNaN(hcnt2_min[k]) : "NaN in hcnt2_min merging";
      assert !Double.isNaN(hcnt2[k]) : "NaN in hcnt2_min merging";

      // cover the initial case (relying on initial min = 0 to work is wrong)
      // Only take the new max if it's hcnt2 is non-zero. like a valid bit
      // can hcnt2 ever be null here?
      if (other.hcnt2[k] > 0) {
        if ( hcnt2[k]==0 || ( other.hcnt2_min[k] < hcnt2_min[k] )) {
          hcnt2_min[k] = other.hcnt2_min[k];
        }
      }
    }

    // merge hcnt2 per-bin maxs
    // other must be same length, but use it's length for safety
    for (int k = 0; k < other.hcnt2_max.length; k++) {
      // for now..die on NaNs
      assert !Double.isNaN(other.hcnt2_max[k]) : "NaN in other.hcnt2_max merging";
      assert !Double.isNaN(other.hcnt2[k]) : "NaN in hcnt2_min merging";
      assert !Double.isNaN(hcnt2_max[k]) : "NaN in hcnt2_max merging";
      assert !Double.isNaN(hcnt2[k]) : "NaN in hcnt2_max merging";

      // cover the initial case (relying on initial min = 0 to work is wrong)
      // Only take the new max if it's hcnt2 is non-zero. like a valid bit
      // can hcnt2 ever be null here?
      if (other.hcnt2[k] > 0) {
        if ( hcnt2[k]==0 || ( other.hcnt2_max[k] > hcnt2_max[k] )) {
          hcnt2_max[k] = other.hcnt2_max[k];
        }
      }
    }

    // can hcnt2 ever be null here?. Inc last, so the zero case is detected above
    // seems like everything would fail if hcnt2 doesn't exist here
    if (hcnt2 != null)
      Utils.add(hcnt2, other.hcnt2);

    // merge hcnt mins
    // Two-way merge of the ascending _mins lists into ds, keeping only the
    // first _mins.length entries. i and j each advance at most once per k,
    // so both stay within bounds.
    double[] ds = MemoryManager.malloc8d(_mins.length);
    int i = 0, j = 0;
    for (int k = 0; k < ds.length; k++)
      if (_mins[i] < other._mins[j])
        ds[k] = _mins[i++];
      else if (Double.isNaN(other._mins[j]))
        ds[k] = _mins[i++];
      else {            // _min[i] >= other._min[j]
        if (_mins[i] == other._mins[j]) i++;   // equal values are kept once
        ds[k] = other._mins[j++];
      }
    System.arraycopy(ds,0,_mins,0,ds.length);

    // Find the index of the last non-NaN entry in each _maxs list (-1 if all NaN).
    for (i = _maxs.length - 1; Double.isNaN(_maxs[i]); i--) if (i == 0) {i--; break;}
    for (j = _maxs.length - 1; Double.isNaN(other._maxs[j]); j--) if (j == 0) {j--; break;}

    ds = MemoryManager.malloc8d(i + j + 2);
    // merge hcnt maxs, also deduplicating against mins?
    // Ascending two-way merge of the valid prefixes; equal values are kept once.
    int k = 0, ii = 0, jj = 0;
    while (ii <= i && jj <= j) {
      if (_maxs[ii] < other._maxs[jj])
        ds[k] = _maxs[ii++];
      else if (_maxs[ii] > other._maxs[jj])
        ds[k] = other._maxs[jj++];
      else { // _maxs[ii] == other.maxs[jj]
        ds[k] = _maxs[ii++];
        jj++;
      }
      k++;
    }
    while (ii <= i) ds[k++] = _maxs[ii++];
    while (jj <= j) ds[k++] = other._maxs[jj++];
    // Keep the largest _maxs.length merged values (the tail of ds) and pad with NaN.
    System.arraycopy(ds,Math.max(0, k - _maxs.length),_maxs,0,Math.min(k,_maxs.length));
    for (int t = k; t < _maxs.length; t++) _maxs[t] = Double.NaN;
    return this;
  }

  /**
   * Start value of bin {@code b} of the visible histogram ({@code hcnt}),
   * computed as {@code _start + b*_binsz}.
   *
   * @param b bin index
   * @return the lower edge of that bin
   */
  public double binValue(int b) { return _start + b*_binsz; }
  // can we assert against something here?
  // assert _gprows==htot2(0, 0) : "_gprows: "+_gprows+" htot2(): "+htot2(0, 0);

  /**
   * Total row count represented by the finer histogram plus any rows that
   * fell outside it. Uses long arithmetic so counts above 4B are representable.
   *
   * @param low  rows below the binned range (0 for a single pass)
   * @param high rows above the binned range (0 for a single pass)
   * @return sum of all hcnt2 bins plus low plus high
   */
  private long htot2(long low, long high) {
    long total = 0;
    for (long binCount : hcnt2) {
      total += binCount;
    }
    // add the stuff outside the bins, 0,0 for single pass
    return total + low + high;
  }


  //******************************************************************************
  // NOTE: only works on a backfilled hcnt2, unlike Quantiles. eliminates nextK search
  // The backfill is not done here, so it's only done once (because 10 calls here)
  /**
   * Estimates the value at quantile {@code threshold} from the finer
   * histogram ({@code hcnt2}) and its per-bin min/max, using {@code _gprows}
   * as the total row count. The target row is located by walking the bins;
   * the estimate is then taken from the bin's min/max directly or by linear
   * interpolation, depending on where the target row falls in the bin.
   *
   * @param threshold quantile to estimate, as a fraction (multiplied by {@code _gprows-1})
   * @param valStart  not referenced in this method body
   * @param valEnd    used as the "next value" when the target bin is the last bin
   * @return the estimated quantile value; an assert checks it is not NaN
   */
  private double approxLikeInQuantiles(double threshold, double valStart, double valEnd) {
    // Code is lifted from Quantiles.java, with only a little jiggering
    // on the branches around forceBestApprox/interpolation type, and use of globals
    // that have different names. Need to merge sometime.
    // the 'intent' is to be the same as the single pass Quantiles approx, interpolation_type==-1

    // max_qbins was the goal for sizing.
    // nbins2 was what was used for size, after various calcs
    // just assume hcnt2 is the right length!
    // Don't need at least two bins..since we'll always have 'some' answer
    // are we being called on constant 0?
    int maxBinCnt = hcnt2.length;

    // Find the row count we want to hit, within some bin.
    long currentCnt = 0;
    double targetCntFull = threshold * (_gprows-1);  //  zero based indexing
    long targetCntInt = (long) Math.floor(targetCntFull);
    double targetCntFract = targetCntFull  - (double) targetCntInt;
    assert (targetCntFract>=0) && (targetCntFract<=1);
    // Log.debug("QS_ targetCntInt: "+targetCntInt+" targetCntFract: "+targetCntFract);

    // walk thru and find out what bin to look inside
    // currentCnt ends up as the number of rows in the bins before bin k
    int k = 0;
    while(k!=maxBinCnt && ((currentCnt + hcnt2[k]) <= targetCntInt)) {
      // Log.debug("Q_ Looping for k: "+threshold+" "+k+" "+maxBinCnt+" "+currentCnt+" "+targetCntInt+
      //   " "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]);
      currentCnt += hcnt2[k];
      ++k;
      // Note the loop condition covers the breakout condition:
      // (currentCnt==targetCntInt && (hcnt2[k]!=0)
      // also: don't go pass array bounds
    }


    // NOTE(review): if the loop above ran to k==maxBinCnt, the hcnt2[k] reads below would be out of bounds
    assert hcnt2[k]!=0;
    // Log.debug("QS_ Found k (approx): "+threshold+" "+k+" "+currentCnt+" "+targetCntInt+
    //  " "+_gprows+" "+hcnt2[k]+" "+hcnt2_min[k]+" "+hcnt2_max[k]);

    assert (currentCnt + hcnt2[k]) > targetCntInt : targetCntInt+" "+currentCnt+" "+k+" "+" "+maxBinCnt;
    assert hcnt2[k]!=1 || hcnt2_min[k]==hcnt2_max[k];

    boolean done = false;
    double guess = Double.NaN;
    boolean interpolated = false;   // assigned below but never read in this method
    double dDiff;

    // special cases. If the desired row is the last of equal values in this bin (2 or more)
    // we will need to intepolate with a nextK out-of-bin value
    // we can't iterate, since it won't improve things and the bin-size will be zero!
    // trying to resolve case of binsize=0 for next pass, after this, is flawed thinking.
    // implies the values are not the same..end of bin interpolate to next
    boolean atStartOfBin = hcnt2[k]>=1 && (currentCnt == targetCntInt);
    boolean atEndOfBin = !atStartOfBin && (hcnt2[k]>=2 && ((currentCnt + hcnt2[k] - 1) == targetCntInt));
    boolean inMidOfBin = !atStartOfBin && !atEndOfBin && (hcnt2[k]>=3) && (hcnt2_min[k]==hcnt2_max[k]);

    boolean interpolateEndNeeded = false;
    if ( atEndOfBin ) {
      if ( targetCntFract != 0 ) {
        interpolateEndNeeded = true;
      }
      else {
        // target is exactly the last row of the bin: that row is the bin max
        guess = hcnt2_max[k];
        done = true;
        // Log.debug("QS_ Guess M "+guess);
      }
    }
    else if ( inMidOfBin ) {
      // if we know there is something before and after us with same value,
      // we never need to interpolate (only allowed when min=max
      guess = hcnt2_min[k];
      done = true;
      // Log.debug("QS_ Guess N "+guess);
    }

    if ( !done && atStartOfBin ) {
      // no interpolation needed
      if ( hcnt2[k]>2 && (hcnt2_min[k]==hcnt2_max[k]) ) {
        guess = hcnt2_min[k];
        done = true;
        // Log.debug("QS_ Guess A "+guess);
      }
      // min/max can be equal or not equal here
      else if ( hcnt2[k]==2 ) { // interpolate between min/max for the two value bin
        // type 7 (linear interpolation)
        // Unlike mean, which just depends on two adjacent values, this adjustment
        // adds possible errors related to the arithmetic on the total # of rows.
        dDiff = hcnt2_max[k] - hcnt2_min[k]; // two adjacent..as if sorted!
        // targetCntFract is fraction of total rows
        guess = hcnt2_min[k] + (targetCntFract * dDiff);

        done = true;
        interpolated = true;
        // Log.debug("QS_ Guess B "+guess+" targetCntFract: "+targetCntFract);
      }
      // no interpolation needed
      else if ( (hcnt2[k]==1) && (targetCntFract==0) ) {
        assert hcnt2_min[k]==hcnt2_max[k];
        guess = hcnt2_min[k];
        done = true;
        // Log.debug("QS_ Guess C "+guess);
      }
    }

    // interpolate into a nextK value
    // all the qualification is so we don't set done when we're not, for multipass
    // interpolate from single bin, end of two entry bin, or for approx
    boolean stillCanGetIt = atStartOfBin && hcnt2[k]==1 && targetCntFract!=0;
    if ( !done ) {
      if ( hcnt2[k]==1 ) {
        assert hcnt2_min[k]==hcnt2_max[k];
        // Log.debug("QS_ Single value in this bin, but fractional means we need to interpolate to next non-zero");
      }
      if ( interpolateEndNeeded ) {
        // Log.debug("QS_ Interpolating off the end of a bin!");
      }

      // nextVal is only consumed by the stillCanGetIt branch below
      double nextVal;
      int nextK;
      // if we're at the end
      assert k < maxBinCnt : k+" "+maxBinCnt;
      if ( (k+1)==maxBinCnt) {
        // Log.debug("QS_ Using valEnd for approx interpolate: "+valEnd);
        nextVal = valEnd; // just in case the binning didn't max in a bin before the last
      }
      else {
        nextK = k + 1;
        nextVal = hcnt2_min[nextK];
        // Log.debug("QS_ Using nextK for interpolate: "+nextK+" "+hcnt2_min[nextK]);
        // hcnt2[nextK] may be zero here if we backfilled
      }

      // can still get an exact interpolation, when hcnt2[k]=2
      if ( stillCanGetIt ) {
        dDiff = nextVal - hcnt2_max[k]; // two adjacent, as if sorted!
        // targetCntFract is fraction of total rows
        guess = hcnt2_max[k] + (targetCntFract * dDiff);
        interpolated = true;
        done = true; //  has to be one above us when needed. (or we're at end)
        // Log.debug("QS_ Guess D "+guess+" "+nextVal+" "+hcnt2_min[k]+" "+hcnt2_max[k]+" "+hcnt2[k]+" "+nextVal+
        //   " targetCntFull: "+targetCntFull+" targetCntFract: "+targetCntFract+
        //   " _gprows: " + _gprows+" "+stillCanGetIt);

      }
      else { // single pass approx..with unresolved bin
        assert hcnt2[k]!=0 : hcnt2[k]+" "+k;
        // use max within this bin, to stay within the guaranteed error bounds
        // i.e. spread the bin's rows evenly between its observed min and max
        dDiff = (hcnt2_max[k] - hcnt2_min[k]) / hcnt2[k];
        guess = hcnt2_min[k] + (targetCntFull-currentCnt) * dDiff;
        interpolated = true;
        done = true; //  has to be one above us when needed. (or we're at end)
        // Log.debug("QS_ Guess E "+guess+" "+nextVal+" "+hcnt2_min[k]+" "+hcnt2_max[k]+" "+hcnt2[k]+" "+nextVal+
        //   " targetCntFull: "+targetCntFull+" targetCntFract: "+targetCntFract+
        //   " _gprows: " + _gprows);
      }
    }
    assert !Double.isNaN(guess); // covers positive/negative inf also (if we divide by 0)
    return guess;
  }
  //******************************************************************************
  /**
   * Fills {@code qtiles[j]} with the approximate quantile for each
   * {@code thres[j]}, computed from the fine-grained histogram
   * ({@code hcnt2} / {@code hcnt2_min}) via {@code approxLikeInQuantiles}.
   *
   * Side effect: {@code hcnt2_min} is back-filled for empty bins (see below).
   * Returns without touching {@code qtiles} when the histogram is empty.
   *
   * @param qtiles   output array, written at the same indices as {@code thres}
   * @param thres    quantile thresholds, each asserted to lie in [0,1]
   * @param valStart passed through to {@code approxLikeInQuantiles}
   * @param valEnd   passed through to {@code approxLikeInQuantiles}; also the
   *                 back-fill value for empty bins at the top of the histogram
   */
  private void approxQuantiles(double[] qtiles, double[] thres, double valStart, double valEnd){
    // Enum columns never go through here.
    assert _type != T_ENUM;

    // hcnt2 may have been sized differently than the max_qbins goal.
    final int maxBinCnt = hcnt2.length;
    // No bins at all, or a lone empty bin: we didn't collect anything usable
    // (maybe a real column with all NAs?), so leave qtiles as they are.
    if ( maxBinCnt==0 || (maxBinCnt==1 && hcnt2[0]==0) ) return;

    // Perf hack that is currently different than Quantiles.java:
    // walk the bins from the top down and, for every empty bin, copy in the
    // min of the nearest non-empty bin above it (valEnd when there is none).
    // hcnt2[k] stays 0 for those bins, so they are still recognizable as
    // empty; the back-filled min only serves as a ready-made "nextK" value.
    // That way extracting several quantiles from one histogram does not
    // search towards the end of the bins for every nextK lookup.
    //
    // Using nextK, rather than just our own bin, improves accuracy for
    // various cases (mirroring what Quantiles does for perfect answers).
    //
    // Bin 0 never needs the fill, but it is done anyway for consistency.
    double nextMin = valEnd;
    int bin = maxBinCnt;
    while ( bin-- > 0 ) {
      if ( hcnt2[bin] != 0 ) {
        nextMin = hcnt2_min[bin];
      }
      else {
        hcnt2_min[bin] = nextMin;
      }
    }

    int slot = 0;
    for ( double threshold : thres ) {
      // 0 okay for threshold?
      assert 0 <= threshold && threshold <= 1;
      qtiles[slot++] = approxLikeInQuantiles(threshold, valStart, valEnd);
    }
  }

  //******************************************************************************
  // Compute majority categories for enums only
  /**
   * For enum columns, stores in {@code _mins} the level indices with the
   * smallest histogram counts (smallest count first) and in {@code _maxs} the
   * level indices with the largest counts (largest count first). Does nothing
   * for non-enum columns.
   *
   * The previous implementation could report the same level twice and could
   * miss true extremes: its replacement cursor started at slot 0 instead of at
   * the worst seeded slot, it re-visited the seeded levels while scanning, and
   * its final ordering pass never compared the last slot.
   */
  public void computeMajorities() {
    if ( _type != T_ENUM ) return;
    rankLevelsByCount(_mins, true);
    rankLevelsByCount(_maxs, false);
  }

  /**
   * Writes into {@code out} the indices of the {@code out.length} best-ranked
   * entries of {@code hcnt}, best first. Ties keep the lower index first.
   *
   * Every slot is first seeded with its own position, as the original code
   * did, so slots beyond the number of available levels keep that seed.
   *
   * @param out         receives level indices (stored as doubles)
   * @param rarestFirst true to rank by ascending count, false by descending
   */
  private void rankLevelsByCount(double[] out, boolean rarestFirst) {
    for (int s = 0; s < out.length; s++) out[s] = s;
    int filled = 0; // out[0..filled) is always a valid, ordered ranking
    for (int level = 0; level < hcnt.length; level++) {
      // Walk back from the end of the ranking to find where this level belongs.
      int pos = filled;
      while (pos > 0) {
        int ranked = (int) out[pos - 1];
        boolean ahead = rarestFirst ? hcnt[level] < hcnt[ranked]
                                    : hcnt[level] > hcnt[ranked];
        if (!ahead) break;
        pos--;
      }
      // Ranking is full and this level does not beat its last entry.
      if (pos == out.length) continue;
      // Shift the tail down one slot; when full this drops the worst entry.
      for (int s = Math.min(filled, out.length - 1); s > pos; s--) out[s] = out[s - 1];
      out[pos] = level;
      if (filled < out.length) filled++;
    }
  }

  /**
   * Looks up one of the computed percentile values.
   *
   * @param idx index into {@code _pctile}
   * @return {@code Double.NaN} for enum columns, otherwise {@code _pctile[idx]}
   */
  public double percentileValue(int idx) {
    return ( _type == T_ENUM ) ? Double.NaN : _pctile[idx];
  }

  /**
   * Appends an HTML rendering of this column summary to {@code sb}: a heading,
   * a "Base Stats" table, the histogram (at most 1000 bins) and, for numeric
   * columns, the percentile table. When every row is NA only the heading and
   * an "Empty column" notice are written.
   *
   * NOTE(review): {@code cname} and the enum domain labels are inserted into
   * the markup without HTML escaping; confirm callers pass trusted or
   * pre-escaped names.
   *
   * @param vec   column being summarized; its domain supplies the enum
   *              histogram labels and the cardinality
   * @param cname column name, used in the element id and the heading
   * @param sb    builder that receives the markup
   */
  public void toHTML( Vec vec, String cname, StringBuilder sb ) {
    // should be a better way/place to decode this back to string.
    String typeStr;
    if ( _type == T_REAL) typeStr = "Real";
    else if ( _type == T_INT) typeStr = "Int";
    else if ( _type == T_ENUM) typeStr = "Enum";
    else typeStr = "Undefined";

    // NOTE(review): 'heigth' in the style below is presumably a typo for
    // 'height'. Browsers ignore the unknown property, so it is left untouched
    // here to avoid changing the rendered layout.
    sb.append("<div class='table' id='col_").append(cname)
      .append("' style='width:90%;heigth:90%;border-top-style:solid;'>")
      .append("<div class='alert-success'><h4>Column: ").append(cname)
      .append(" (type: ").append(typeStr).append(")</h4></div>\n");
    if ( _stat0._len == _stat0._nas ) {
      sb.append("<div class='alert'>Empty column, no summary!</div></div>\n");
      return;
    }
    // Base stats
    if( _type != T_ENUM ) {
      NumStats stats = (NumStats)this.stats;
      sb.append("<div style='width:100%;'><table class='table-bordered'>");
      sb.append("<tr><th colspan='").append(20).append("' style='text-align:center;'>Base Stats</th></tr>");
      sb.append("<tr>");
      sb.append("<th>NAs</th>  <td>").append(nacnt).append("</td>");
      sb.append("<th>mean</th><td>").append(Utils.p2d(stats.mean)).append("</td>");
      sb.append("<th>sd</th><td>").append(Utils.p2d(stats.sd)).append("</td>");
      sb.append("<th>zeros</th><td>").append(stats.zeros).append("</td>");
      // Close the previous row before opening the next one.
      sb.append("</tr><tr>");
      sb.append("<th>min[").append(stats.mins.length).append("]</th>");
      for( double min : stats.mins ) {
        sb.append("<td>").append(Utils.p2d(min)).append("</td>");
      }
      sb.append("</tr><tr>");
      sb.append("<th>max[").append(stats.maxs.length).append("]</th>");
      for( double max : stats.maxs ) {
        sb.append("<td>").append(Utils.p2d(max)).append("</td>");
      }
      // End of base stats
      sb.append("</tr> </table>");
      sb.append("</div>");
    } else {                    // Enums
      sb.append("<div style='width:100%'><table class='table-bordered'>");
      sb.append("<tr><th colspan='").append(4).append("' style='text-align:center;'>Base Stats</th></tr>");
      sb.append("<tr><th>NAs</th>  <td>").append(nacnt).append("</td>");
      sb.append("<th>cardinality</th>  <td>").append(vec.domain().length).append("</td></tr>");
      sb.append("</table></div>");
    }
    // Histogram
    final int MAX_HISTO_BINS_DISPLAYED = 1000;
    int len = Math.min(hcnt.length,MAX_HISTO_BINS_DISPLAYED);
    sb.append("<div style='width:100%;overflow-x:auto;'>");
    // Warn only when bins were actually dropped, and keep the notice outside
    // the <table>, where a <div> is not a permitted child.
    if( hcnt.length > MAX_HISTO_BINS_DISPLAYED )
      sb.append("<div class='alert'>Histogram for this column was too big and was truncated to 1000 values!</div>");
    sb.append("<table class='table-bordered'>");
    sb.append("<tr> <th colspan='").append(len).append("' style='text-align:center'>Histogram</th></tr>");
    // Header row: level names for enums, bin values for numerics.
    sb.append("<tr>");
    if ( _type == T_ENUM )
       for( int i=0; i<len; i++ ) sb.append("<th>").append(vec.domain(i)).append("</th>");
    else
       for( int i=0; i<len; i++ ) sb.append("<th>").append(Utils.p2d(i==0?_start:binValue(i))).append("</th>");
    sb.append("</tr>");
    // Raw counts per bin.
    sb.append("<tr>");
    for( int i=0; i<len; i++ ) sb.append("<td>").append(hcnt[i]).append("</td>");
    sb.append("</tr>");
    // Each bin's count as a percentage of _stat0._len.
    sb.append("<tr>");
    for( int i=0; i<len; i++ )
      sb.append(String.format("<td>%.1f%%</td>",(100.0*hcnt[i]/_stat0._len)));
    sb.append("</tr>");
    sb.append("</table></div>");

    if (_type != T_ENUM) {
      NumStats stats = (NumStats)this.stats;
      // Percentiles
      sb.append("<div style='width:100%;overflow-x:auto;'><table class='table-bordered'>");
      sb.append("<tr> <th colspan='").append(stats.pct.length).append("' ")
        .append("style='text-align:center' ")
        .append(">Percentiles</th></tr>");
      sb.append("<tr><th>Threshold(%)</th>");
      for (double pc : stats.pct)
        sb.append("<td>").append(Utils.p2d(pc * 100.0)).append("</td>");
      sb.append("</tr>");
      sb.append("<tr><th>Value</th>");
      for (double pv : stats.pctile)
        sb.append("<td>").append(pv).append("</td>");
      sb.append("</tr>");
      sb.append("</table>");
      sb.append("</div>");
    }
    sb.append("</div>\n");
  }
}