package com.splout.db.benchmark;
/*
* #%L
* Splout SQL commons
* %%
* Copyright (C) 2012 Datasalt Systems S.L.
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
/**
 * A class for computing approximate histograms with constant resolution, a constant amount of memory, and without
 * the need for pre- or post-processing.
 * <p/>
 * WARNING: It only works for positive values. It will also not work properly if the values are distributed far
 * from 0, since the buckets always cover the full range from 0 up to the current upper limit, so resolution is
 * wasted on the empty low end.
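 * <p/>
 * A minimal usage sketch (the parameters and values below are arbitrary):
 * <pre>
 * PositiveHistogram histogram = new PositiveHistogram(8, 100.0); // 2^8 = 256 buckets over [0, 100)
 * histogram.add(42.0);
 * histogram.add(7.5);
 * double[] normalized = histogram.getNormalizedHistogram();
 * </pre>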
*/
public class PositiveHistogram {
  protected int[] buckets;
protected double upperLimit;
protected long count = 0;
/**
 * 2^bits will be the number of buckets. If known, it is useful to provide an initialUpperLimit; if it is not
 * known, just provide any positive value (e.g. 1), as the histogram resizes itself when bigger values arrive.
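 * <p>
 * For instance (an illustrative choice of parameters, not a recommended default):
 * <pre>
 * new PositiveHistogram(10, 1.0); // 2^10 = 1024 buckets; the limit of 1 grows on demand
 * </pre>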
*/
  public PositiveHistogram(int bits, double initialUpperLimit) {
    if (initialUpperLimit <= 0) {
      throw new IllegalArgumentException("Upper limit must be > 0");
    }
    // Bits is only used to enforce that the number of buckets is a power of two: 2^bits, as documented above
    buckets = new int[1 << bits];
    this.upperLimit = initialUpperLimit;
  }
/**
 * Adds a new value to the histogram, growing the upper limit first if the value exceeds it.
*/
public synchronized void add(double value) {
if (value < 0.) {
throw new RuntimeException("Negatives values not allowed. Provided " + value);
}
if (value > upperLimit) {
resize(value);
}
    buckets[bucketFor(value)]++;
count++;
}
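  /**
   * Enlarges the upper limit by the smallest integer factor that makes the given value fit, merging every
   * 'scale' adjacent buckets into one so that the bucket count, and thus the memory footprint, stays constant.
   */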
protected synchronized void resize(double value) {
int scale = (int) Math.ceil(value / upperLimit);
int[] newBuckets = new int[buckets.length];
for (int i = 0; i < buckets.length; i++) {
int toBucket = (i / scale);
newBuckets[toBucket] += buckets[i];
}
buckets = newBuckets;
upperLimit = upperLimit * scale;
}
/**
* Return the bucket index for a given value.
*/
  public synchronized int bucketFor(double value) {
return Math.min(buckets.length - 1, (int) ((value / upperLimit) * buckets.length));
}
/**
 * Returns the maximum value that can be kept in the current histogram without needing to redistribute the buckets.
*/
public synchronized double getUpperLimit() {
return upperLimit;
}
/**
 * Returns the bucket array, covering the values between 0 and {@link #getUpperLimit()}. Note that this is the
 * live internal array, not a copy.
*/
public synchronized int[] getBuckets() {
return buckets;
}
/**
 * Returns the total sum of the bucket counts, which equals the number of elements added to the histogram.
*/
private synchronized double getTotalSum() {
double ret = 0;
for (int i = 0; i < buckets.length; i++) {
ret += buckets[i];
}
return ret;
}
/**
 * Returns the number of elements that were added to the histogram.
*/
public synchronized long getCount() {
return count;
}
/**
 * Computes the accumulated probability for the values to the right of the provided value.
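 * <p>
 * For instance (an illustrative example, assuming 4 buckets over [0, 100) holding counts {1, 1, 1, 1}): a value
 * of 25 yields 0.75, since the three buckets above 25 hold 3 of the 4 elements.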
*/
  public synchronized double getRightAccumulatedProbability(double value) {
    if (value >= upperLimit) {
      return 0;
    }
    double totalSum = getTotalSum();
    if (totalSum == 0) {
      // Empty histogram: there is nothing to accumulate, and dividing by zero below would return NaN
      return 0;
    }
    int bucket = bucketFor(value);
    double accum = 0.;
    // We only take part of the probability mass from the first bucket: the fraction of it that lies to the
    // right of the value, assuming the data inside each bucket is evenly distributed.
    double doubleBucket = (value / upperLimit) * buckets.length;
    double ceilBucket = Math.ceil(doubleBucket);
    // Edge case where ceil(doubleBucket) == doubleBucket because doubleBucket is an exact integer: the value
    // sits on the left boundary of the bucket, so the whole bucket lies to the right.
    if (ceilBucket == doubleBucket) {
      ceilBucket += 1;
    }
    double bucketRedistribution = ceilBucket - doubleBucket;
    accum += bucketRedistribution * buckets[bucket];
    for (int i = bucket + 1; i < buckets.length; i++) {
      accum += buckets[i];
    }
    return accum / totalSum;
  }
/**
 * Computes the accumulated probability for the values to the left of the provided value.
*/
public synchronized double getLeftAccumulatedProbability(double value) {
    return 1. - getRightAccumulatedProbability(value);
}
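  /**
   * Returns the histogram normalized so that the bucket values sum to 1.
   */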
  public synchronized double[] getNormalizedHistogram() {
    double[] histo = new double[buckets.length];
    double totalSum = getTotalSum();
    if (totalSum == 0) {
      // Empty histogram: return all zeros rather than dividing by zero
      return histo;
    }
    for (int i = 0; i < buckets.length; i++) {
      histo[i] = buckets[i] / totalSum;
    }
    return histo;
  }
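  /**
   * Returns the width of each bucket: {@link #getUpperLimit()} divided by the number of buckets.
   */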
public synchronized double getBucketSize() {
return upperLimit / (double) buckets.length;
}
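  /**
   * Returns a tab-separated dump of the histogram, one line per bucket: the bucket's center value, a tab, and
   * the bucket's count.
   */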
public String toTSV() {
StringBuilder sb = new StringBuilder();
double bucketSize = getBucketSize();
for (int i = 0; i < buckets.length; i++) {
sb.append((bucketSize * i) + (bucketSize / 2));
sb.append("\t");
sb.append(buckets[i]);
sb.append("\n");
}
return sb.toString();
}
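
  /**
   * A minimal usage sketch, not part of the original API: the distribution and parameters below are arbitrary
   * assumptions chosen only to exercise the histogram.
   */
  public static void main(String[] args) {
    // 2^4 = 16 buckets, starting with an upper limit of 10 that grows on demand
    PositiveHistogram histogram = new PositiveHistogram(4, 10);
    java.util.Random random = new java.util.Random(42);
    for (int i = 0; i < 1000; i++) {
      histogram.add(random.nextDouble() * 100); // values beyond the current limit trigger a resize
    }
    System.out.println("Upper limit: " + histogram.getUpperLimit());
    System.out.println("P(X > 50) ~ " + histogram.getRightAccumulatedProbability(50));
    System.out.print(histogram.toTSV());
  }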
}