package ai.h2o.automl.collectors;

import hex.tree.DHistogram;
import hex.tree.SharedTreeModel;
import jsr166y.CountedCompleter;
import water.H2O;
import water.MRTask;
import water.MemoryManager;
import water.fvec.Chunk;
import water.util.ArrayUtils;
import water.util.AtomicUtils;
import water.util.Log;

import java.util.concurrent.atomic.AtomicInteger;

/**
 * Collect metadata over a single column.
 */
public class MetaCollector {

  /**
   * Per-column reduction ops: op folds one value into a local buffer, atomic_op merges
   * buffers across chunks/nodes, and postPass finalizes the statistic over n rows.
   */
  enum COLLECT {
    skew() {  // currently accumulates a simple row count
      @Override void op( double[] d0s, double d1 ) { d0s[0]++; }
      @Override void atomic_op( double[] d0s, double[] d1s ) { d0s[0] += d1s[0]; }
      @Override double postPass( double ds[], long n ) { return ds[0]; }
    },
    kurtosis() {  // currently accumulates a simple mean
      @Override void op( double[] d0s, double d1 ) { d0s[0] += d1; }
      @Override void atomic_op( double[] d0s, double[] d1s ) { d0s[0] += d1s[0]; }
      @Override double postPass( double ds[], long n ) { return ds[0]/n; }
    },
    uniqPerChk() {  // sums the per-chunk values fed to op
      @Override void op( double[] d0s, double d1 ) { d0s[0] += d1; }
      @Override void atomic_op( double[] d0s, double[] d1s ) { d0s[0] += d1s[0]; }
      @Override double postPass( double ds[], long n ) { return ds[0]; }
    },
    timePerChunk() {  // sums the squares of the per-chunk values fed to op
      @Override void op( double[] d0s, double d1 ) { d0s[0] += d1*d1; }
      @Override void atomic_op( double[] d0s, double[] d1s ) { d0s[0] += d1s[0]; }
      @Override double postPass( double ds[], long n ) { return ds[0]; }
    },
    mode() {  // histogram over integer levels; postPass returns the most frequent level
      @Override void op( double[] d0s, double d1 ) { d0s[(int)d1]++; }
      @Override void atomic_op( double[] d0s, double[] d1s ) { ArrayUtils.add(d0s, d1s); }
      @Override double postPass( double ds[], long n ) { return ArrayUtils.maxIndex(ds); }
      @Override double[] initVal(int maxx) { return new double[maxx]; }
    },
    ;
    abstract void op( double[] d0, double d1 );          // fold one value into the local buffer
    abstract void atomic_op( double[] d0, double[] d1 ); // merge a peer buffer into this one
    abstract double postPass( double ds[], long n );     // finalize the statistic over n rows
    double[] initVal(int maxx) { return new double[]{0}; }
  }

  // TODO: add hiddenNAFinder https://0xdata.atlassian.net/browse/STEAM-76

  /**
   * Fork up to MAXP of the given tasks at once; as each task completes, the next
   * queued task is started, so at most MAXP columns are processed concurrently.
   */
  public static class ParallelTasks<T extends H2O.H2OCountedCompleter<T>> extends H2O.H2OCountedCompleter {
    private final AtomicInteger _ctr;  // concurrency control: index of the last task handed out
    private static int MAXP = 100;     // max number of concurrent columns
    private final T[] _tasks;          // task holder (1 per column)

    public ParallelTasks(T[] tasks) {
      _ctr = new AtomicInteger(MAXP-1);
      _tasks = tasks;
    }

    @Override public void compute2() {
      final int nTasks = _tasks.length;
      addToPendingCount(nTasks-1);
      for (int i=0; i < Math.min(MAXP, nTasks); ++i)
        asyncVecTask(i);
    }

    private void asyncVecTask(final int task) {
      _tasks[task].setCompleter(new Callback());
      _tasks[task].fork();
    }

    private class Callback extends H2O.H2OCallback {
      public Callback() { super(ParallelTasks.this); }
      @Override public void callback(H2O.H2OCountedCompleter cc) {
        int i = _ctr.incrementAndGet();  // a task finished; launch the next queued one, if any
        if (i < _tasks.length)
          asyncVecTask(i);
      }
      @Override public boolean onExceptionalCompletion(Throwable ex, CountedCompleter caller) {
        ex.printStackTrace();
        return super.onExceptionalCompletion(ex, caller);
      }
    }
  }
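  /*
   * Illustrative usage sketch (not part of this class's code path): run one task per
   * column with at most MAXP in flight at a time.  The DynamicHisto[] setup and the
   * H2O.submitTask(...).join() driver are assumptions about how a caller might drive
   * ParallelTasks, not an API this class guarantees.
   *
   *   DynamicHisto[] tasks = ...;                          // one pre-built task per column
   *   H2O.submitTask(new ParallelTasks<>(tasks)).join();   // blocks until all columns finish
   */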
  /**
   * A wrapper class around DHistogram.
   *
   * NB: _sums and _ssqs are not the same as those found in DHistogram instances:
   * here they are accumulated over the column's own data rather than over the
   * target column.
   */
  public final static class DynamicHisto extends MRTask<DynamicHisto> {
    public DHistogram _h;
    public double[] _sums;  // different from _h._sums
    public double[] _ssqs;  // different from _h._ssqs

    public DynamicHisto(DHistogram h) { _h = h; }

    DynamicHisto(String name, final int nbins, int nbins_cats, byte isInt, double min, double max) {
      if( !Double.isNaN(min) && !Double.isNaN(max) ) {  // an all-NaN column (NaN min/max) needs no histogram
        _h = makeDHistogram(name, nbins, nbins_cats, isInt, min, max);
      } else {
        Log.info("Ignoring all NaN column -> " + name);
      }
    }

    private static class SharedTreeParameters extends SharedTreeModel.SharedTreeParameters {
      public String algoName() { return "DUM"; }
      public String fullName() { return "dummy"; }
      public String javaName() { return "this.is.unused"; }
    }

    public static DHistogram makeDHistogram(String name, int nbins, int nbins_cats, byte isInt, double min, double max) {
      final double minIn = Math.max(min, -Double.MAX_VALUE);               // inclusive vector min
      final double maxIn = Math.min(max,  Double.MAX_VALUE);               // inclusive vector max
      final double maxEx = DHistogram.find_maxEx(maxIn, isInt==1 ? 1 : 0); // smallest exclusive max
      SharedTreeModel.SharedTreeParameters parms = new SharedTreeParameters();
      parms._nbins = nbins;
      parms._nbins_cats = nbins_cats;
      // DHistogram.make(String name, final int nbins, byte isInt, double min, double maxEx, long seed, SharedTreeModel.SharedTreeParameters parms, Key globalQuantilesKey)
      return DHistogram.make(name, nbins, isInt, minIn, maxEx, 0, parms, null);
    }

    public double binAt(int b) { return _h.binAt(b); }

    // TODO: move into DHistogram
    public double mean(int b) {
      double n = _h.w(b);
      return n > 0 ? _sums[b]/n : _h.binAt(b);
    }

    // TODO: move into DHistogram
    public double var(int b) {  // sample variance within bin b
      double n = _h.w(b);
      if( n <= 1 ) return 0;
      return Math.max(0, (_ssqs[b] - _sums[b]*_sums[b]/n) / (n-1));
    }

    protected void init() {
      _h.init();
      _sums = MemoryManager.malloc8d(_h._nbin);
      _ssqs = MemoryManager.malloc8d(_h._nbin);
    }

    @Override public void setupLocal() { init(); }
    @Override public void map(Chunk c) { accum(c); }
    @Override public void reduce(DynamicHisto ht) {
      merge(ht._h);
      if( _sums != ht._sums ) ArrayUtils.add(_sums, ht._sums);
      if( _ssqs != ht._ssqs ) ArrayUtils.add(_ssqs, ht._ssqs);
    }

    void accum(Chunk C) {
      double min = _h.find_min();
      double max = _h.find_maxIn();
      double[] bins = new double[_h._nbin];  // per-chunk accumulators, folded in atomically below
      double[] sums = new double[_h._nbin];
      double[] ssqs = new double[_h._nbin];
      for( int r=0; r<C._len; ++r ) {
        double colData = C.atd(r);
        if( colData < min ) min = colData;
        if( colData > max ) max = colData;
        int b = _h.bin(colData);
        bins[b] += 1;
        sums[b] += colData;
        ssqs[b] += colData*colData;
      }
      _h.setMin(min);
      _h.setMaxIn(max);
      for( int b=0; b<bins.length; ++b )
        if( bins[b] != 0 ) {
          _h.addWAtomic(b, bins[b]);
          AtomicUtils.DoubleArray.add(_sums, b, sums[b]);
          AtomicUtils.DoubleArray.add(_ssqs, b, ssqs[b]);
        }
    }

    public void merge(DHistogram h) {
      if( _h == h ) return;
      if( _h == null ) _h = h;
      else if( h != null ) _h.add(h);
    }
  }
}
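/*
 * Illustrative DynamicHisto usage sketch, assuming `vec` is a numeric Vec already in the
 * DKV and that the column name "age" and the bin counts 20 / 1024 are caller-chosen
 * values (none of these come from this file):
 *
 *   DynamicHisto dh = new DynamicHisto("age", 20, 1024, (byte)0, vec.min(), vec.max());
 *   dh.doAll(vec);                                   // one MRTask pass over the column
 *   for( int b = 0; b < dh._h._nbin; ++b )
 *     Log.info(dh.binAt(b) + " mean=" + dh.mean(b) + " var=" + dh.var(b));
 */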