package hex.tree;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import water.*;
import water.util.*;
import java.util.Arrays;
import java.util.Random;
/**
* PUBDEV-451: Prove that histogram addition of float-casted doubles leads to reproducible AND accurate histogram counts
*/
public class HistogramTest extends TestUtil {
/** Length of each test histogram, i.e. the number of buckets every Filler adds to. */
final static int BUCKETS = 100; //how many histogram buckets
/** Number of Filler tasks submitted per run; all of them update the same histogram concurrently. */
final static int THREADS = 100; //how many threads
/** Number of complete passes over all buckets that a single Filler performs. */
final static int THREAD_LOOPS = 100; //how much work per thread
/**
 * Runs once before any test of this class and waits via {@code stall_till_cloudsize(1)}
 * (presumably until a cloud of one node is available -- see TestUtil).
 */
@BeforeClass
public static void stall() {
  stall_till_cloudsize(1);
}
/**
 * Fills two histograms concurrently with exactly the same values (same per-task seeds) and compares them.
 * <p>
 * Checks performed:
 * <ul>
 *   <li>the plain double sums differ between the two runs (FP noise: the order of the racing additions is
 *       not fixed),</li>
 *   <li>the sums of float-casted values differ less between the runs than the plain double sums do,</li>
 *   <li>the float-casted sums stay within a relative error of 1e-6 of the plain double sums.</li>
 * </ul>
 * JUnit assertions are used instead of the {@code assert} keyword so the checks also run when the JVM is
 * started without {@code -ea}.
 */
@Test
public void run() {
  final long seed = 0xDECAF;
  Log.info("Histogram size: " + BUCKETS);
  Log.info("Threads: " + THREADS);
  Log.info("Loops per Thread: " + THREAD_LOOPS);

  // Run 1 and Run 2 add the same values; only the interleaving of the racing tasks may differ
  Histo hist = fillHistogram(seed);
  Histo hist2 = fillHistogram(seed);

  // Check that only the float-casted histograms are reproducible
  double maxRelErrorDD = maxRelativeError(hist._sumsD, hist2._sumsD);
  Log.info("Max rel. error between D and D: " + maxRelErrorDD);
  //FP noise leads to indeterminism (max error > double epsilon)
  Assert.assertFalse("double sums were expected to differ between the two runs",
      Arrays.equals(hist._sumsD, hist2._sumsD));

  double maxRelErrorFF = maxRelativeError(hist._sumsF, hist2._sumsF);
  Log.info("Max rel. error between F and F: " + maxRelErrorFF);
  Assert.assertTrue("run-to-run error of double sums (" + maxRelErrorDD
      + ") should exceed that of float-casted sums (" + maxRelErrorFF + ")",
      maxRelErrorDD > maxRelErrorFF);

  // Check that we don't lose accuracy by doing the float-casting
  double maxRelErrorDF = 0;
  for (Histo h : new Histo[]{hist, hist2}) {
    maxRelErrorDF = Math.max(maxRelativeError(h._sumsD, h._sumsF), maxRelErrorDF);
  }
  Log.info("Max rel. error between D and F: " + maxRelErrorDF);
  Assert.assertTrue("float-casting lost too much accuracy: " + maxRelErrorDF, maxRelErrorDF < 1e-6);
}

/** Creates a histogram with BUCKETS buckets and lets THREADS Filler tasks (seeded seed, seed+1, ...) fill it. */
private Histo fillHistogram(long seed) {
  Futures fs = new Futures();
  Histo histo = new Histo(BUCKETS);
  for (int i = 0; i < THREADS; ++i) {
    fs.add(H2O.submitTask(new Filler(histo, seed + i)));
  }
  fs.blockForPending(); // wait for all submitted tasks before the sums are read
  return histo;
}

/** Returns the largest |reference[i] - other[i]| / |reference[i]| over all indices of reference (0 if empty). */
private static double maxRelativeError(double[] reference, double[] other) {
  double worst = 0;
  for (int i = 0; i < reference.length; ++i) {
    worst = Math.max(Math.abs(reference[i] - other[i]) / Math.abs(reference[i]), worst);
  }
  return worst;
}
/**
* Helper class to fill two histograms in the same way as DHistogram
*/
private class Histo {
Histo(int len) {
_sumsD = new double[len];
_sumsF = new double[len];
}
public double _sumsD[];
public double _sumsF[];
public void incrDouble(int b, double y) {
AtomicUtils.DoubleArray.add(_sumsD,b,y);
}
public void incrFloat(int b, double y) {
AtomicUtils.DoubleArray.add(_sumsF,b,(float)y);
}
}
/**
 * Task that feeds a shared histogram: the values it adds are fully determined by its seed, but the
 * additions race with those of the other tasks working on the same histogram.
 */
static public class Filler extends H2O.H2OCountedCompleter<Filler> {
  private final long _seed;
  private final Histo _histo;

  Filler(Histo histo, long seed) {
    _histo = histo;
    _seed = seed;
  }

  @Override
  public void compute2() {
    final Random values = new Random(_seed);
    final int bucketCount = _histo._sumsD.length;
    // repeat the full pass THREAD_LOOPS times so each task has enough work to overlap with the others
    for (int remaining = THREAD_LOOPS; remaining > 0; --remaining) {
      // every pass touches each bucket once, feeding the same value to both accumulators
      for (int bucket = 0; bucket < bucketCount; bucket++) {
        final double next = values.nextDouble();
        _histo.incrDouble(bucket, next);
        _histo.incrFloat(bucket, next);
      }
    }
    tryComplete();
  }
}
/**
 * For every HistogramType, checks that {@code bin()} and {@code binAt()} of a 13-bin histogram agree.
 * <p>
 * The test scans N evenly spaced values from min (inclusive) towards maxEx (exclusive), remembers the
 * first value at which the returned bin index increases, and requires that value to be within 1e-6 of
 * {@code binAt(bin)}. The DKV entry created for the split points is removed in a finally block so that
 * the key is not left behind when an assertion fails.
 */
@Test public void testSplits() {
  int nbins = 13;
  int nbins_cats = nbins;
  byte isInt = 0;
  double min = 1;
  double maxEx = 6.900000000000001;
  for (SharedTreeModel.SharedTreeParameters.HistogramType histoType : SharedTreeModel.SharedTreeParameters.HistogramType.values()) {
    Log.info();
    Log.info("random split points: " + histoType);
    long seed = new Random().nextLong();
    if (histoType == SharedTreeModel.SharedTreeParameters.HistogramType.Random) {
      Log.info("random seed: " + seed); // logged so a failing random run can be investigated
    }
    // explicit split points are only supplied for the QuantilesGlobal type
    double[] splitPts = null;
    if (histoType == SharedTreeModel.SharedTreeParameters.HistogramType.QuantilesGlobal) {
      splitPts = new double[]{1,1.5,2,2.5,3,4,5,6.1,6.2,6.3,6.7,6.8,6.85};
    }
    Key k = Key.make();
    DKV.put(new DHistogram.HistoQuantiles(k,splitPts));
    try {
      DHistogram hist = new DHistogram("myhisto",nbins,nbins_cats,isInt,min,maxEx,0,histoType,seed,k);
      hist.init();
      int N=10000000;
      int bin=-1;
      // l1[b]: first scanned value for which bin() returned b
      double[] l1 = new double[nbins];
      for (int i=0;i<N;++i) {
        double col_data = min + (double)i/N*(maxEx-min);
        int b = hist.bin(col_data);
        if (b>bin) {
          bin=b;
          Log.info("Histogram maps " + col_data + " to bin : " + b);
          l1[b] = col_data;
        }
      }
      // l2[i]: value reported by binAt() for bin i
      double[] l2 = new double[nbins];
      for (int i=0;i<nbins;++i) {
        double col_data = hist.binAt(i);
        Log.info("Histogram maps bin " + i + " to col_data: " + col_data);
        l2[i] = col_data;
      }
      for (int i=0;i<nbins;++i) {
        Assert.assertTrue(histoType + ": bin " + i + " first seen at " + l1[i] + " but binAt returned " + l2[i],
            Math.abs(l1[i]-l2[i]) < 1e-6);
      }
    } finally {
      k.remove(); // clean up the DKV entry even if the checks above fail
    }
  }
}
/**
 * Checks the range of a 13-bin UniformAdaptive histogram built for min=1 and maxEx=6.900000000000001:
 * bin 0 starts exactly at min, the last bin starts below maxEx, min maps to bin 0 and a value just
 * below maxEx maps to the last bin. Uses JUnit assertions so the checks do not depend on {@code -ea}.
 */
@Test public void testUniformAdaptiveRange() {
  int nbins = 13;
  int nbins_cats = nbins;
  byte isInt = 0;
  double min = 1;
  double maxEx = 6.900000000000001;
  long seed = 1234;
  SharedTreeModel.SharedTreeParameters.HistogramType histoType = SharedTreeModel.SharedTreeParameters.HistogramType.UniformAdaptive;
  DHistogram hist = new DHistogram("myhisto", nbins, nbins_cats, isInt, min, maxEx, 0, histoType, seed, null);
  hist.init();
  Assert.assertTrue("bin 0 must start at min", hist.binAt(0) == min);
  Assert.assertTrue("last bin must start below maxEx", hist.binAt(nbins - 1) < maxEx);
  Assert.assertEquals("min must map to bin 0", 0, hist.bin(min));
  Assert.assertEquals("value just below maxEx must map to the last bin", nbins - 1, hist.bin(maxEx - 1e-15));
}
/**
 * Checks the range of a 13-bin Random histogram (seed 1234) built for min=1 and maxEx=6.900000000000001:
 * bin 0 starts exactly at min, the last bin starts below maxEx, min maps to bin 0 and a value just
 * below maxEx maps to the last bin. Uses JUnit assertions so the checks do not depend on {@code -ea}.
 */
@Test public void testRandomRange() {
  int nbins = 13;
  int nbins_cats = nbins;
  byte isInt = 0;
  double min = 1;
  double maxEx = 6.900000000000001;
  long seed = 1234;
  SharedTreeModel.SharedTreeParameters.HistogramType histoType = SharedTreeModel.SharedTreeParameters.HistogramType.Random;
  DHistogram hist = new DHistogram("myhisto", nbins, nbins_cats, isInt, min, maxEx, 0, histoType, seed, null);
  hist.init();
  Assert.assertTrue("bin 0 must start at min", hist.binAt(0) == min);
  Assert.assertTrue("last bin must start below maxEx", hist.binAt(nbins - 1) < maxEx);
  Assert.assertEquals("min must map to bin 0", 0, hist.bin(min));
  Assert.assertEquals("value just below maxEx must map to the last bin", nbins - 1, hist.bin(maxEx - 1e-15));
}
/**
 * Checks the range of a 13-bin QuantilesGlobal histogram whose split points are published in the DKV:
 * bin 0 starts exactly at min, the last bin starts below maxEx, min maps to bin 0 and a value just
 * below maxEx maps to the last bin. Uses JUnit assertions so the checks do not depend on {@code -ea},
 * and removes the DKV key in a finally block so it is not left behind on failure.
 */
@Test public void testQuantilesRange() {
  int nbins = 13;
  int nbins_cats = nbins;
  byte isInt = 0;
  double min = 1;
  double maxEx = 6.900000000000001;
  long seed = 1234;
  SharedTreeModel.SharedTreeParameters.HistogramType histoType = SharedTreeModel.SharedTreeParameters.HistogramType.QuantilesGlobal;
  double[] splitPts = new double[]{1,1.5,2,2.5,3,4,5,6.1,6.2,6.3,6.7,6.8,6.85};
  Key k = Key.make();
  DKV.put(new DHistogram.HistoQuantiles(k,splitPts));
  try {
    DHistogram hist = new DHistogram("myhisto",nbins,nbins_cats,isInt,min,maxEx,0,histoType,seed,k);
    hist.init();
    Assert.assertTrue("bin 0 must start at min", hist.binAt(0) == min);
    Assert.assertTrue("last bin must start below maxEx", hist.binAt(nbins - 1) < maxEx);
    Assert.assertEquals("min must map to bin 0", 0, hist.bin(min));
    Assert.assertEquals("value just below maxEx must map to the last bin", nbins - 1, hist.bin(maxEx - 1e-15));
  } finally {
    k.remove(); // clean up the DKV entry even if a check above fails
  }
}
/**
 * makeUniqueAndLimitToRange with bounds 0.3 and 0.8 on input that extends past both bounds: the expected
 * result starts with 0.3 (not present in the input), contains 0.32 only once and nothing above 0.7.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking() {
  double[] before = new double[]{0.2,0.28,0.31,0.32,0.32,0.4,0.7,0.81,0.84};
  double[] expected = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] after = ArrayUtils.makeUniqueAndLimitToRange(before, 0.3,0.8);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * makeUniqueAndLimitToRange with bounds 0.3 and 0.8 on input that contains both bounds (with duplicates):
 * the expected result keeps a single 0.3 and does not contain 0.8.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking2() {
  double[] before = new double[]{-0.3,0.2,0.28,0.28,0.3,0.3,0.31,0.32,0.32,0.4,0.7,0.7,0.8,0.8,0.81,0.84};
  double[] expected = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] after = ArrayUtils.makeUniqueAndLimitToRange(before, 0.3,0.8);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * makeUniqueAndLimitToRange with bounds 0.3 and 0.9 where every input value is below the upper bound:
 * the expected result is the de-duplicated input from 0.3 upwards, including 0.8, 0.81 and 0.84.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking3() {
  double[] before = new double[]{-0.3,0.2,0.28,0.28,0.3,0.3,0.31,0.32,0.32,0.4,0.7,0.7,0.8,0.8,0.81,0.84};
  double[] expected = new double[]{0.3,0.31,0.32,0.4,0.7,0.8,0.81,0.84};
  double[] after = ArrayUtils.makeUniqueAndLimitToRange(before, 0.3,0.9);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * makeUniqueAndLimitToRange with bounds 0.3 and 0.9 on input that lies entirely inside the bounds:
 * the expected result is the de-duplicated input preceded by the lower bound 0.3.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking4() {
  double[] before = new double[]{0.31,0.32,0.32,0.4,0.7,0.7};
  double[] expected = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] after = ArrayUtils.makeUniqueAndLimitToRange(before, 0.3,0.9);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * limitToRange with a lower bound (0.31) that equals an input element: the expected result starts at
 * that element and drops the smaller 0.3.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking5() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.31,0.32,0.4,0.7};
  double[] after = ArrayUtils.limitToRange(before,0.31,0.9);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * limitToRange with a lower bound (0.305) that falls between two input elements: the expected result
 * still contains the element just below the bound (0.3), i.e. the input is returned unchanged.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking6() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] after = ArrayUtils.limitToRange(before,0.305,0.9);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * limitToRange with bounds 0.305 and 0.699: the expected result keeps 0.3 (the element just below the
 * lower bound) and drops 0.7, which lies above the upper bound.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking7() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.3,0.31,0.32,0.4};
  double[] after = ArrayUtils.limitToRange(before,0.305,0.699);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * limitToRange with a lower bound (0.7) equal to the largest input element: the expected result is
 * that single element.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking8() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.7};
  double[] after = ArrayUtils.limitToRange(before,0.7,0.9);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * limitToRange with a lower bound (0.8) above every input element: the expected result is the largest
 * input element (0.7) alone.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testShrinking9() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.7};
  double[] after = ArrayUtils.limitToRange(before,0.8,0.9);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * padUniformly from 5 to 9 points: the expected result contains the original points plus one extra
 * point in each of the four gaps. The comparison is exact (no tolerance), as in the original check.
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testPadding() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.3,0.305,0.31,0.315,0.32,0.36,0.4,0.55,0.7};
  double[] after = ArrayUtils.padUniformly(before,9);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * padUniformly from 5 to 10 points: compared with the 9-point case the expected result has two extra
 * points (0.3025, 0.3075) in the first gap instead of one. The comparison is exact (no tolerance).
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testPadding2() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.3,0.3025,0.3075,0.31,0.315,0.32,0.36,0.4,0.55,0.7};
  double[] after = ArrayUtils.padUniformly(before,10);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * padUniformly from 5 to 8 points: the expected result has one extra point in each of the first three
 * gaps and none in the last gap (0.4 to 0.7). The comparison is exact (no tolerance).
 * Uses a JUnit assertion so the check does not depend on {@code -ea}.
 */
@Test public void testPadding3() {
  double[] before = new double[]{0.3,0.31,0.32,0.4,0.7};
  double[] expected = new double[]{0.3,0.305,0.31,0.315,0.32,0.36,0.4,0.7};
  double[] after = ArrayUtils.padUniformly(before,8);
  Assert.assertTrue("unexpected result: " + Arrays.toString(after), Arrays.equals(after, expected));
}
/**
 * Timing run (no assertions) for {@code Arrays.binarySearch} on sorted arrays of 20, 50 and 100 values.
 * For each size, one untimed pass of a million lookups is followed by an identical timed pass; the
 * accumulated result and the elapsed time are logged.
 */
@Test public void binarySearch() {
  final int lookups = 1000000;
  final int[] sizes = {20, 50, 100};
  for (int s = 0; s < sizes.length; s++) {
    final int size = sizes[s];
    // sorted[i] = i/size; queries are drawn from the same grid with a fixed-seed generator
    final double[] sorted = new double[size];
    final double[] queries = new double[size];
    final Random rng = RandomUtils.getRNG(123);
    for (int i = 0; i < size; i++) {
      sorted[i] = i * 1.0 / size;
      queries[i] = rng.nextInt(size) * 1. / size;
    }

    long checksum = 0;
    // untimed pass
    for (int n = 0; n < lookups; n++) {
      checksum += Arrays.binarySearch(sorted, queries[n % size]);
    }
    // timed pass
    final long begin = System.currentTimeMillis();
    for (int n = 0; n < lookups; n++) {
      checksum += Arrays.binarySearch(sorted, queries[n % size]);
    }
    final long elapsed = System.currentTimeMillis() - begin;
    Log.info("N=" + size + " Sum:" + checksum + " Time: " + PrettyPrint.msecs(elapsed, true));
  }
}
/**
 * Timing run (no assertions) for {@code ArrayUtils.linearSearch} on sorted arrays of 20, 50 and 100
 * values. For each size, one untimed pass of a million lookups is followed by an identical timed pass;
 * the accumulated result and the elapsed time are logged.
 */
@Test public void linearSearch() {
  final int lookups = 1000000;
  final int[] sizes = {20, 50, 100};
  for (int s = 0; s < sizes.length; s++) {
    final int size = sizes[s];
    // sorted[i] = i/size; queries are drawn from the same grid with a fixed-seed generator
    final double[] sorted = new double[size];
    final double[] queries = new double[size];
    final Random rng = RandomUtils.getRNG(123);
    for (int i = 0; i < size; i++) {
      sorted[i] = i * 1.0 / size;
      queries[i] = rng.nextInt(size) * 1. / size;
    }

    long checksum = 0;
    // untimed pass
    for (int n = 0; n < lookups; n++) {
      checksum += ArrayUtils.linearSearch(sorted, queries[n % size]);
    }
    // timed pass
    final long begin = System.currentTimeMillis();
    for (int n = 0; n < lookups; n++) {
      checksum += ArrayUtils.linearSearch(sorted, queries[n % size]);
    }
    final long elapsed = System.currentTimeMillis() - begin;
    Log.info("N=" + size + " Sum:" + checksum + " Time: " + PrettyPrint.msecs(elapsed, true));
  }
}
}