package water.fvec;
import water.Futures;
import jsr166y.CountedCompleter;
import jsr166y.ForkJoinTask;
import water.*;
import water.H2O.H2OCallback;
import water.H2O.H2OCountedCompleter;
import water.nbhm.NonBlockingHashMap;
import water.parser.Categorical;
import water.parser.BufferedString;
import water.util.ArrayUtils;
import water.util.Log;
import java.util.Arrays;
/** A class to compute the rollup stats. These are computed lazily, thrown
* away if the Vec is written into, and then recomputed lazily. Error to ask
* for them if the Vec is actively being written into. It is common for all
* cores to ask for the same Vec rollup at once, so it is crucial that it be
* computed once across the cluster.
*
* Rollups are kept in the K/V store, which also controls who manages the
* rollup work and final results. Winner of a DKV CAS/PutIfMatch race gets to
* manage the M/R job computing the rollups. Losers block for the same
* rollup. Remote requests *always* forward to the Rollup Key's master.
*/
final class RollupStats extends Iced {
// Task that waiters join() on while this object is only a "computing" placeholder.
// Assigned in ComputeRollupsTask.makeComputing and joined in ComputeRollupsTask.compute2.
// Declared transient.
volatile transient ForkJoinTask _tsk;
// Computed in 1st pass
/** The count of missing elements.... or -2 if we have active writers and no
* rollup info can be computed (because the vector is being rapidly
* modified!), or -1 if this is a placeholder for rollups that are currently
* being computed (see isMutating / isComputing / isReady). */
volatile long _naCnt; //count(isNA(X))
// _mean is mean(X) over non-NA values.  _sigma holds the running sum of squared
// deviations sum((X-mean(X))^2) during map/reduce; Roll.postGlobal turns it into
// the sample standard deviation sqrt(sum/(rows-1)).
double _mean, _sigma;
long _rows, //count(X) for non-NA values excluding negative/positive infinities (for numeric Vecs)
_nzCnt, //count(X!=0) for non-NA values
_size, //byte size
_pinfs, //count(+inf)
_ninfs; //count(-inf)
// Cleared as soon as a non-integral value (or a String column) is seen
boolean _isInt=true;
// The 5 smallest / 5 largest values seen, kept ordered by min(double) / max(double);
// _mins[0] is the overall minimum and _maxs[0] the overall maximum.
double[] _mins, _maxs;
// XOR-combined across chunks in reduce
long _checksum;
// Expensive histogram & percentiles
// Computed in a 2nd pass, on-demand, by calling computeHisto
private static final int MAX_SIZE = 1000; // Standard bin count; categoricals can have more bins
// the choice of MAX_SIZE being a power of 10 (rather than 1024) just aligns-to-the-grid of the common input of fixed decimal
// precision numbers. It is still an estimate and makes no difference mathematically. It just gives tidier output in some
// simple cases without penalty.
// Histogram bin counts; null until a histogram has been computed (see hasHisto)
volatile long[] _bins;
// Approximate data value closest to the Xth percentile (one entry per Vec.PERCENTILES element, NaN until computed)
double[] _pctiles;
/** @return true once histogram bins have been attached to these rollups. */
public boolean hasHisto() {
  return null != _bins;
}

/** @return true when _naCnt carries the -2 sentinel: the Vec is being written and rollups cannot be asked for. */
boolean isMutating() {
  final long state = _naCnt;
  return state == -2;
}

/** @return true when _naCnt carries the -1 sentinel: rollups are currently being computed. */
private boolean isComputing() {
  final long state = _naCnt;
  return state == -1;
}

/** @return true when _naCnt is a real (non-negative) NA count, i.e. rollups are available. */
private boolean isReady() {
  final long state = _naCnt;
  return !(state < 0);
}
private RollupStats(int mode) {
_mins = new double[5];
_maxs = new double[5];
Arrays.fill(_mins, Double.MAX_VALUE);
Arrays.fill(_maxs,-Double.MAX_VALUE);
_pctiles = new double[Vec.PERCENTILES.length]; Arrays.fill(_pctiles, Double.NaN);
_mean = _sigma = 0;
_size = 0;
_naCnt = mode;
}
/** @return a placeholder whose _naCnt is -1, i.e. "rollups are being computed". */
private static RollupStats makeComputing() {
  final int computing = -1;
  return new RollupStats(computing);
}

/** @return a placeholder whose _naCnt is -2, i.e. "Vec is being modified". */
static RollupStats makeMutating () {
  final int mutating = -2;
  return new RollupStats(mutating);
}
/**
 * Computes the first-pass rollups of a single chunk into this instance.
 * Fast paths handle constant chunks and 0/1 (boolean-like) chunks; UUID and
 * String chunks only get NA/non-zero counts and a checksum; everything else
 * is delegated to RollupStatsHelpers, followed by a correction for the zeros
 * (or NAs) that a sparse chunk does not store explicitly.
 * While in map/reduce, _sigma holds the sum of squared deviations, not the
 * standard deviation.
 * @param c the chunk to summarize
 * @return this
 */
private RollupStats map( Chunk c ) {
  _size = c.byteSize();
  boolean isUUID = c._vec.isUUID();
  boolean isString = c._vec.isString();
  BufferedString tmpStr = new BufferedString();
  if (isString) _isInt = false;
  // Checksum support
  long checksum = 0;
  long start = c._start;
  long l = 81985529216486895L;
  // Check for popular easy cases: All Constant
  double min=c.min(), max=c.max();
  if( min==max ) { // All constant (NaN never compares equal, so NaN min/max does not take this path)
    double d = min; // It's the min, it's the max, it's the alpha and omega
    _checksum = (c.hasFloat()?Double.doubleToRawLongBits(d):(long)d)*c._len;
    Arrays.fill(_mins, d);
    Arrays.fill(_maxs, d);
    // All c._len rows hold the same value, so an infinite constant is c._len
    // infinities - not one.  _pinfs/_ninfs are per-element counts.
    if( d == Double.POSITIVE_INFINITY) _pinfs += c._len;
    else if( d == Double.NEGATIVE_INFINITY) _ninfs += c._len;
    else {
      if( Double.isNaN(d)) _naCnt=c._len;
      else if( d != 0 ) _nzCnt=c._len;
      _mean = d;
      _rows=c._len;
    }
    _isInt = ((long)d) == d;
    _sigma = 0; // No variance for constants
    return this;
  }
  //all const NaNs
  if ((c instanceof C0DChunk && c.isNA_impl(0))) {
    _sigma=0; //count of non-NAs * variance of non-NAs
    _mean = 0; //sum of non-NAs (will get turned into mean)
    _naCnt=c._len;
    _nzCnt=0;
    return this;
  }
  // Check for popular easy cases: Boolean, possibly sparse, possibly NaN
  if( min==0 && max==1 ) {
    int zs = c._len-c.sparseLenZero(); // Easy zeros
    int nans = 0;
    // Hard-count sparse-but-zero (weird case of setting a zero over a non-zero)
    for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) )
      if( c.isNA(i) ) nans++;
      else if( c.at8(i)==0 ) zs++;
    int os = c._len-zs-nans; // Ones
    _nzCnt += os;
    _naCnt += nans;
    for( int i=0; i<Math.min(_mins.length,zs); i++ ) { min(0); max(0); }
    for( int i=0; i<Math.min(_mins.length,os); i++ ) { min(1); max(1); }
    _rows += zs+os;
    _mean = (double)os/_rows;
    _sigma = zs*(0.0-_mean)*(0.0-_mean) + os*(1.0-_mean)*(1.0-_mean);
    return this;
  }
  // Walk the non-zeros
  if( isUUID ) { // UUID columns do not compute min/max/mean/sigma
    for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
      if( c.isNA(i) ) _naCnt++;
      else {
        long lo = c.at16l(i), hi = c.at16h(i);
        if (lo != 0 || hi != 0) _nzCnt++;
        l = lo ^ 37*hi;
      }
      if(l != 0) // ignore 0s in checksum to be consistent with sparse chunks
        checksum ^= (17 * (start+i)) ^ 23*l;
    }
  } else if( isString ) { // String columns do not compute min/max/mean/sigma
    for (int i = c.nextNZ(-1); i < c._len; i = c.nextNZ(i)) {
      if (c.isNA(i)) _naCnt++;
      else {
        _nzCnt++;
        l = c.atStr(tmpStr, i).hashCode();
      }
      if (l != 0) // ignore 0s in checksum to be consistent with sparse chunks
        checksum ^= (17 * (start + i)) ^ 23 * l;
    }
  } else {
    // Work off all numeric rows, or only the nonzeros for sparse.
    // The instanceof chain selects the numericChunkRollup overload for the concrete chunk type.
    if (c instanceof C1Chunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C1Chunk) c, start, checksum);
    else if (c instanceof C1SChunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C1SChunk) c, start, checksum);
    else if (c instanceof C1NChunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C1NChunk) c, start, checksum);
    else if (c instanceof C2Chunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C2Chunk) c, start, checksum);
    else if (c instanceof C2SChunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C2SChunk) c, start, checksum);
    else if (c instanceof C4SChunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C4SChunk) c, start, checksum);
    else if (c instanceof C4FChunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C4FChunk) c, start, checksum);
    else if (c instanceof C4Chunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C4Chunk) c, start, checksum);
    else if (c instanceof C8Chunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C8Chunk) c, start, checksum);
    else if (c instanceof C8DChunk)
      checksum=new RollupStatsHelpers(this).numericChunkRollup((C8DChunk) c, start, checksum);
    else
      checksum=new RollupStatsHelpers(this).numericChunkRollup(c, start, checksum);
    // special case for sparse chunks
    // we need to merge with the mean (0) and variance (0) of the zeros of the sparse chunk - which were skipped above
    // _rows is the count of non-zero rows
    // _mean is the mean of non-zero rows
    // _sigma is the sum of squared deviations of the non-zero rows
    // handle the zeros
    if( c.isSparseZero() ) {
      int zeros = c._len - c.sparseLenZero();
      if (zeros > 0) {
        for( int i=0; i<Math.min(_mins.length,zeros); i++ ) { min(0); max(0); }
        double zeromean = 0;
        double zeroM2 = 0;
        double delta = _mean - zeromean;
        _mean = (_mean * _rows + zeromean * zeros) / (_rows + zeros);
        _sigma += zeroM2 + delta*delta * _rows * zeros / (_rows + zeros); //this is the variance*(N-1), will do sqrt(_sigma/(N-1)) later in postGlobal
        _rows += zeros;
      }
    } else if(c.isSparseNA()){
      _naCnt = c._len - c.sparseLenNA();
    }
  }
  _checksum = checksum;
  // UUID and String columns do not compute min/max/mean/sigma
  if( isUUID || isString) {
    Arrays.fill(_mins,Double.NaN);
    Arrays.fill(_maxs,Double.NaN);
    _mean = _sigma = Double.NaN;
  }
  return this;
}
/**
 * Folds another partial rollup into this one: extremes, counters, size,
 * integer-ness and checksum, plus a pairwise merge of mean and sum of squared
 * deviations (kept in _sigma).
 * @param rs the partial rollup to absorb; not modified
 */
private void reduce( RollupStats rs ) {
  // Extremes: feed the other side's candidates through our top-5 filters (NaN slots are "unused")
  for( int i = 0; i < rs._mins.length; i++ ) {
    final double lo = rs._mins[i];
    if( !Double.isNaN(lo) ) min(lo);
  }
  for( int i = 0; i < rs._maxs.length; i++ ) {
    final double hi = rs._maxs[i];
    if( !Double.isNaN(hi) ) max(hi);
  }
  // Plain counters
  _naCnt += rs._naCnt;
  _nzCnt += rs._nzCnt;
  _pinfs += rs._pinfs;
  _ninfs += rs._ninfs;
  // Mean / squared-deviation merge; must use the row counts from before the add below
  final long mine = _rows;
  final long theirs = rs._rows;
  if( mine == 0 ) {
    _mean = rs._mean;
    _sigma = rs._sigma;
  } else if( theirs != 0 ) {
    final double delta = _mean - rs._mean;
    _mean = (_mean * mine + rs._mean * theirs) / (mine + theirs);
    _sigma += rs._sigma + delta*delta * mine*theirs / (mine+theirs);
  }
  _rows += theirs;
  _size += rs._size;
  _isInt &= rs._isInt;
  _checksum ^= rs._checksum;
}
/**
 * Offers a value to the ascending list of smallest values kept in _mins.
 * @param d candidate value; must not be NaN
 * @return the last (largest) entry of _mins after the offer
 */
double min( double d ) {
  assert(!Double.isNaN(d));
  double carry = d;            // value still looking for a slot
  final int n = _mins.length;
  for( int slot = 0; slot < n; slot++ ) {
    final double held = _mins[slot];
    if( carry < held ) {       // carry takes the slot, previous occupant is pushed down
      _mins[slot] = carry;
      carry = held;
    }
  }
  return _mins[n-1];
}
/**
 * Offers a value to the descending list of largest values kept in _maxs.
 * @param d candidate value; must not be NaN
 * @return the last (smallest) entry of _maxs after the offer
 */
double max( double d ) {
  assert(!Double.isNaN(d));
  double carry = d;            // value still looking for a slot
  final int n = _maxs.length;
  for( int slot = 0; slot < n; slot++ ) {
    final double held = _maxs[slot];
    if( carry > held ) {       // carry takes the slot, previous occupant is pushed down
      _maxs[slot] = carry;
      carry = held;
    }
  }
  return _maxs[n-1];
}
/** MRTask that builds one RollupStats per chunk, merges them, and finalizes sigma, extremes and byte size. */
private static class Roll extends MRTask<Roll> {
  final Key _rskey;
  RollupStats _rs;

  Roll( H2OCountedCompleter cmp, Key rskey ) {
    super(cmp);
    _rskey = rskey;
  }

  @Override
  protected boolean modifiesVolatileVecs() { return false; }

  @Override public void map( Chunk c ) {
    final RollupStats fresh = new RollupStats(0);
    _rs = fresh.map(c);
  }

  @Override public void reduce( Roll roll ) {
    _rs.reduce(roll._rs);
  }

  @Override public void postGlobal() {
    if( _rs != null ) {
      final long n = _rs._rows;
      // Turn the accumulated sum of squared deviations into a sample standard deviation; a single row has none
      _rs._sigma = (n == 1) ? 0 : Math.sqrt(_rs._sigma/(n-1));
      // Fix PUBDEV-150 for files under 5 rows: slots that never saw a real value become NaN
      for( long slot = 4; slot >= n; slot-- ) {
        _rs._maxs[(int)slot] = Double.NaN;
        _rs._mins[(int)slot] = Double.NaN;
      }
    } else {
      _rs = new RollupStats(0);
    }
    // mean & sigma not allowed on more than 2 classes; for 2 classes the assumption is that it's true/false
    final Vec vec = _fr.anyVec();
    final String[] ss = vec.domain();
    if( vec.isCategorical() && ss.length > 2 )
      _rs._mean = _rs._sigma = Double.NaN;
    if( ss == null ) return;
    // Size of base domain array
    long dsz = (2/*hdr*/+1/*len*/+ss.length)*8;
    for( String s : vec.domain() ) {
      if( s == null ) continue;
      dsz += 2*s.length() + (2/*hdr*/+1/*value*/+1/*hash*/+2/*hdr*/+1/*len*/)*8;
    }
    _rs._size += dsz; // Account for domain size in Vec size
    // Account for Chunk key size
    final int keysize = (2/*hdr*/+1/*kb*/+1/*hash*/+2/*hdr*/+1/*len*/)*8+ vec._key._kb.length;
    _rs._size += vec.nChunks()*(keysize*4/*key+value ptr in DKV, plus 50% fill rate*/);
  }

  // Just toooo common to report always. Drowning in multi-megabyte log file writes.
  @Override public boolean logVerbose() { return false; }

  /**
   * Added to avoid deadlocks when running from IDEA in debug mode (evaluating toString on an MR task causes rollups to be computed).
   * @return a short description built from the Vec key only
   */
  @Override public String toString() {
    return "Roll(" + _fr.anyVec()._key +")";
  }
}
/**
 * Kicks off a rollup computation in the background unless suitable rollups
 * are already available, registering the pending call with the given Futures.
 * Does nothing for InteractionWrappedVec.
 * @param vec the Vec to roll up
 * @param fs collects the asynchronous call
 * @param computeHisto whether a histogram is wanted as well (ignored for String Vecs)
 * @throws RuntimeException if the Vec is no longer in the DKV
 */
static void start(final Vec vec, Futures fs, boolean computeHisto) {
  if( vec instanceof InteractionWrappedVec ) return;
  if( DKV.get(vec._key)== null )
    throw new RuntimeException("Rollups not possible, because Vec was deleted: "+vec._key);
  // No histogram for string columns
  final boolean wantHisto = !vec.isString() && computeHisto;
  final Key rskey = vec.rollupStatsKey();
  final RollupStats cached = getOrNull(vec,rskey);
  final boolean satisfied = cached != null && (!wantHisto || cached.hasHisto());
  if( satisfied ) return;
  RPC rpc = new RPC(rskey.home_node(),new ComputeRollupsTask(vec,wantHisto));
  fs.add(rpc.addCompleter(new H2OCallback() {
    @Override public void callback(H2OCountedCompleter h2OCountedCompleter) {
      DKV.get(rskey); // fetch new results via DKV to enable caching of the results.
    }
  }).call());
}
// Rollup RPCs issued from this node that have not finished yet, keyed by rollup key,
// so that concurrent local callers share one remote request.
private static NonBlockingHashMap<Key,RPC> _pendingRollups = new NonBlockingHashMap<>();

/**
 * Returns ready rollups for the Vec, computing them (and optionally the
 * histogram) on the rollup key's home node if they are missing. Blocks until
 * the rollups are available.
 * @param vec the Vec to roll up
 * @param computeHisto whether a histogram is required as well (ignored for String Vecs)
 * @return rollups that are ready, with a histogram if one was required
 * @throws IllegalArgumentException if the Vec is being modified
 * @throws RuntimeException if the Vec was deleted, or wrapping any failure of the remote computation
 */
static RollupStats get(Vec vec, boolean computeHisto) {
  if( DKV.get(vec._key)== null ) throw new RuntimeException("Rollups not possible, because Vec was deleted: "+vec._key);
  if( vec.isString() ) computeHisto = false; // No histogram for string columns
  final Key rskey = vec.rollupStatsKey();
  RollupStats rs = DKV.getGet(rskey);
  while(rs == null || (!rs.isReady() || (computeHisto && !rs.hasHisto()))){
    if(rs != null && rs.isMutating())
      throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (1)");
    // 1. compute only once
    try {
      RPC rpcNew = new RPC(rskey.home_node(),new ComputeRollupsTask(vec, computeHisto));
      RPC rpcOld = _pendingRollups.putIfAbsent(rskey, rpcNew);
      if(rpcOld == null) { // no prior pending task, need to send this one
        try {
          rpcNew.call().get();
        } finally {
          // Always unregister, also on failure: otherwise the failed RPC stays in the
          // map and every later request for this key would wait on it instead of retrying.
          _pendingRollups.remove(rskey);
        }
      } else // rollups computation is already in progress, wait for it to finish
        rpcOld.get();
    } catch( Throwable t ) {
      System.err.println("Remote rollups failed with an exception, wrapping and rethrowing: "+t);
      throw new RuntimeException(t);
    }
    // 2. fetch - done in two steps to go through standard DKV.get and enable local caching
    rs = DKV.getGet(rskey);
  }
  return rs;
}
/**
 * Convenience overload: fetches (computing if necessary) the rollups without
 * requiring a histogram.
 * @param vec the Vec to roll up
 * @return ready rollups
 */
static RollupStats get(Vec vec) {
  final boolean computeHisto = false;
  return get(vec, computeHisto);
}
/**
 * Fetches the rollups if they are present and ready, but never computes them.
 * @param vec the Vec the rollups belong to
 * @param rskey the Vec's rollup key
 * @return ready rollups; fresh empty rollups when nothing is stored and the
 *         Vec has no rows; null when rollups are absent or not ready
 */
static RollupStats getOrNull(Vec vec, final Key rskey ) {
  final Value val = DKV.get(rskey);
  if( val != null ) {
    final RollupStats rs = val.get(RollupStats.class);
    if( rs.isReady() ) return rs;
    return null;               // placeholder (computing / mutating)
  }
  // No rollup stats present
  if( vec.length() > 0 ) return null;   // not computed
  return new RollupStats(0);            // empty vec
}
// Histogram geometry

/** @return the value at which the first histogram bin starts: the overall minimum. */
double h_base() {
  return _mins[0];
}

/** @return the bin width for the histogram currently held in _bins. */
double h_stride() {
  final int nbins = _bins.length;
  return h_stride(nbins);
}

/**
 * @param nbins number of bins the value range is split into
 * @return the bin width; integer columns get one extra unit of range
 */
private double h_stride(int nbins) {
  final double range = _maxs[0]-_mins[0]+(_isInt?1:0);
  return range/nbins;
}
// Compute expensive histogram
/**
 * MRTask that counts non-NA values into equal-width bins.
 * Bin i covers [_base + i*_stride, _base + (i+1)*_stride); values that fall
 * outside the binned range are counted in the nearest edge bin.
 */
private static class Histo extends MRTask<Histo> {
  final double _base, _stride; // Inputs
  final int _nbins; // Inputs
  long[] _bins; // Outputs
  Histo( H2OCountedCompleter cmp, RollupStats rs, int nbins ) { super(cmp);_base = rs.h_base(); _stride = rs.h_stride(nbins); _nbins = nbins; }
  @Override public void map( Chunk c ) {
    _bins = new long[_nbins];
    for( int i=c.nextNZ(-1); i< c._len; i=c.nextNZ(i) ) {
      double d = c.atd(i);
      if( !Double.isNaN(d) ) _bins[idx(d)]++;
    }
    // Sparse? We skipped all the zeros; do them now
    if( c.isSparseZero() )
      _bins[idx(0.0)] += (c._len - c.sparseLenZero());
  }
  /**
   * Maps a non-NaN value to its bin, clamped to [0, _bins.length-1].
   * The lower clamp matters for values below _base: in particular
   * (int) of -Infinity is Integer.MIN_VALUE, which would otherwise be used
   * as a (negative) array index.
   */
  private int idx( double d ) {
    int idx = (int)((d-_base)/_stride);
    return Math.max(0, Math.min(idx,_bins.length-1));
  }
  @Override public void reduce( Histo h ) { ArrayUtils.add(_bins,h._bins); }
  // Just toooo common to report always. Drowning in multi-megabyte log file writes.
  @Override public boolean logVerbose() { return false; }
}
// Task to compute rollups on their home node if needed.
// Only computes the rollups and does not fetch them; the caller should fetch them via the DKV store (to preserve caching).
// Only computes the rollups if needed (i.e. they are null, or they have no histo and a histo is required).
// If a rollup computation is already in progress, it waits for it to finish.
// Throws IAE if the Vec is being modified (or removed) while this task is in progress.
static final class ComputeRollupsTask extends DTask<ComputeRollupsTask>{
final Key _vecKey;
final Key _rsKey;
final boolean _computeHisto;
/**
 * @param v the Vec whose rollups are to be computed; only its key and rollup key are kept
 * @param computeHisto whether the histogram must be computed as well
 */
public ComputeRollupsTask(Vec v, boolean computeHisto){
  // Priority: one above the current thread's when created on an H2O.FJWThr thread,
  // otherwise H2O.MIN_HI_PRIORITY-3.
  super((byte)(Thread.currentThread() instanceof H2O.FJWThr
      ? currThrPriority()+1
      : H2O.MIN_HI_PRIORITY-3));
  this._vecKey = v._key;
  this._rsKey = v.rollupStatsKey();
  this._computeHisto = computeHisto;
}
/**
 * Builds the "computing" placeholder Value for the rollup key. Its _tsk is
 * this task's completer if it has one, otherwise this task itself, so that
 * others can join on it.
 * @return a new Value wrapping the placeholder
 */
private Value makeComputing(){
  final RollupStats placeholder = RollupStats.makeComputing();
  final CountedCompleter parent = getCompleter(); // should be null or RPCCall
  if( parent != null ) assert parent.getCompleter() == null;
  if( parent == null )
    placeholder._tsk = this;
  else
    placeholder._tsk = parent;
  return new Value(_rsKey, placeholder);
}
/**
 * Replaces our "computing" placeholder in the DKV with the finished rollups.
 * @param nnn the placeholder Value we expect to still be installed
 * @param rs the finished rollups
 * @throws IllegalArgumentException if the placeholder is no longer the current value
 */
private void installResponse(Value nnn, RollupStats rs) {
  final Futures pending = new Futures();
  final Value finished = new Value(_rsKey, rs);
  final Value displaced = DKV.DputIfMatch(_rsKey, finished, nnn, pending);
  assert rs.isReady();
  if( displaced == nnn ) {
    pending.blockForPending();
    return;
  }
  throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (2)");
}
// Runs on the rollup key's home node (asserted below).  Loops until the rollups
// stored under _rsKey satisfy this request: each iteration re-reads the current
// value and either finishes, waits for another computation, fails, or tries to
// claim the work by swapping in a "computing" placeholder with DputIfMatch.
// A lost swap simply retries.  Calls tryComplete() on normal exit.
@Override
public void compute2() {
assert _rsKey.home();
final Vec vec = DKV.getGet(_vecKey);
while(true) {
Value v = DKV.get(_rsKey);
RollupStats rs = (v == null) ? null : v.<RollupStats>get();
// Fetched current rs from the DKV, rs can be:
// a) computed
// a.1) has histo or histo not required => do nothing
// a.2) no histo and histo is required => only compute histo
// b) computing => wait for the task computing it to finish and check again
// c) mutating => throw IAE
// d) null => compute new rollups
if (rs != null) {
if (rs.isReady()) {
if (_computeHisto && !rs.hasHisto()) { // a.2 => compute histo only
CountedCompleter cc = getCompleter(); // should be null or RPCCall
if(cc != null) assert cc.getCompleter() == null;
// note: if cc == null then onExceptionalCompletion tasks waiting on this may be woken up before exception handling iff exception is thrown.
Value nnn = makeComputing();
Futures fs = new Futures();
Value oldv = DKV.DputIfMatch(_rsKey, nnn, v, fs);
fs.blockForPending();
if(oldv == v){ // got the lock
// NOTE(review): unlike branch d) below, nothing removes the placeholder if computeHisto throws here
computeHisto(rs, vec, nnn);
break;
} // else someone else is modifying the rollups => try again
} else
break; // a.1 => do nothing
} else if (rs.isComputing()) { // b) => wait for current computation to finish
rs._tsk.join();
} else if(rs.isMutating()) // c) => throw IAE
throw new IllegalArgumentException("Can not compute rollup stats while vec is being modified. (3)");
} else { // d) => compute the rollups
final Value nnn = makeComputing();
Futures fs = new Futures();
Value oldv = DKV.DputIfMatch(_rsKey, nnn, v, fs);
fs.blockForPending();
if(oldv == v){ // got the lock, compute the rollups
try {
Roll r = new Roll(null, _rsKey).doAll(vec);
// computed the stats, now compute histo if needed and install the response and quit
r._rs._checksum ^= vec.length();
if (_computeHisto)
computeHisto(r._rs, vec, nnn);
else
installResponse(nnn, r._rs);
break;
} catch (Exception e) {
// Log, try to take our placeholder back out of the DKV, then propagate
Log.err(e);
cleanupStats(nnn);
throw e;
}
} // else someone else is modifying the rollups => try again
}
}
tryComplete();
}
/**
 * Removes our "computing" placeholder from the DKV after a failed
 * computation, provided it is still the current value.
 * @param current the placeholder Value this task installed
 * @return true if the placeholder was still installed and has been removed;
 *         false if the rollup key held a different value and was left alone
 */
private boolean cleanupStats(Value current) {
  Futures fs = new Futures();
  Value old = DKV.DputIfMatch(_rsKey, null, current, fs);
  // DputIfMatch hands back the expected value when the swap happened
  // (same convention as the "oldv == v" / "old != nnn" checks in this class).
  boolean success = old == current;
  fs.blockForPending();
  return success;
}
// Attaches a histogram (and, when one is actually built, percentiles) to rs and
// installs rs in the DKV in place of the placeholder nnn.
//   rs  - ready first-pass rollups; its _bins and possibly _pctiles are overwritten
//   vec - the Vec the rollups describe
//   nnn - the "computing" placeholder Value that must still be installed
// Every exit path ends in installResponse, which throws IAE if nnn was replaced meanwhile.
final void computeHisto(final RollupStats rs, Vec vec, final Value nnn) {
// All NAs or non-math; histogram has zero bins
if (rs._naCnt == vec.length() || vec.isUUID()) {
rs._bins = new long[0];
installResponse(nnn, rs);
return;
}
// Constant: use a single bin
double span = rs._maxs[0] - rs._mins[0];
final long rows = vec.length() - rs._naCnt;
assert rows > 0 : "rows = " + rows + ", vec.len() = " + vec.length() + ", naCnt = " + rs._naCnt;
if (span == 0) {
rs._bins = new long[]{rows};
installResponse(nnn, rs);
return;
}
// Number of bins: MAX_SIZE by default. For integers, bins for each unique int
// - unless the count gets too high; allow a very high count for categoricals.
int nbins = MAX_SIZE;
if (rs._isInt && span < Integer.MAX_VALUE) {
nbins = (int) span + 1; // 1 bin per int
int lim = vec.isCategorical() ? Categorical.MAX_CATEGORICAL_COUNT : MAX_SIZE;
nbins = Math.min(lim, nbins); // Cap nbins at sane levels
}
// Second pass over the data: count values per bin
Histo histo = new Histo(null, rs, nbins).doAll(vec);
long sum = ArrayUtils.sum(histo._bins);
assert sum == rows:"expected " + rows + " rows, got " + sum;
rs._bins = histo._bins;
// Compute percentiles from histogram
rs._pctiles = new double[Vec.PERCENTILES.length];
int j = 0; // Histogram bin number
int k = 0; // The next non-zero bin after j
long hsum = 0; // Rolling histogram sum
double base = rs.h_base();
double stride = rs.h_stride();
double lastP = -1.0; // any negative value to pass assert below first time
// Single forward sweep: j, k and hsum only ever advance, which is why the percentiles must be non-decreasing
for (int i = 0; i < Vec.PERCENTILES.length; i++) {
final double P = Vec.PERCENTILES[i];
assert P >= 0 && P <= 1 && P >= lastP; // rely on increasing percentiles here. If P has dup then strange but accept, hence >= not >
lastP = P;
double pdouble = 1.0 + P * (rows - 1); // following stats:::quantile.default type 7
long pint = (long) pdouble; // 1-based into bin vector
double h = pdouble - pint; // any fraction h to linearly interpolate between?
assert P != 1 || (h == 0.0 && pint == rows); // i.e. max
while (hsum < pint) hsum += rs._bins[j++];
// j overshot by 1 bin; we added _bins[j-1] and this goes from too low to either exactly right or too big
// pint now falls in bin j-1 (the ++ happened even when hsum==pint), so grab that bin value now
rs._pctiles[i] = base + stride * (j - 1);
if (h > 0 && pint == hsum) {
// linearly interpolate between adjacent non-zero bins
// i) pint is the last of (j-1)'s bin count (>1 when either duplicates exist in input, or stride makes dups at lower accuracy)
// AND ii) h>0 so we do need to find the next non-zero bin
if (k < j) k = j; // if j jumped over the k needed for the last P, catch k up to j
// Saves potentially winding k forward over the same zero stretch many times
while (rs._bins[k] == 0) k++; // find the next non-zero bin
rs._pctiles[i] += h * stride * (k - j + 1);
} // otherwise either h==0 and we know which bin, or fraction is between two positions that fall in the same bin
// this guarantees we are within one bin of the exact answer; i.e. within (max-min)/MAX_SIZE
}
installResponse(nnn, rs);
}
}
}