package mil.nga.giat.geowave.core.store.adapter.statistics.histogram;
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Random;
/**
* Dynamic Histogram:
*
* Derived from work for Hive and based on Yael Ben-Haim and Elad Tom-Tov,
* "A streaming parallel decision tree algorithm", J. Machine Learning Research
* 11 (2010), pp. 849--872.
*
* Note: the paper refers to a bin as a pair (p,m) where p = lower bound and m
* = count. Some of the interpolation treats the pair as a coordinate.
*
* Although there are no approximation guarantees, it appears to work well with
* adequate data and a large number of histogram bins.
*/
public class MinimalBinDistanceHistogram implements
        NumericHistogram
{
    /** Bin budget used by the no-argument constructor. */
    private static final int DEFAULT_BIN_COUNT = 1024;

    /** Tolerance below which two bin positions (or two gaps) are treated as equal. */
    private static final double EPSILON = 1E-12;

    /**
     * Serialized header size in bytes: 8 (totalCount) + 8 (maxValue) + 4
     * (nbins) + 4 (number of used bins).
     */
    private static final int HEADER_BYTES = 24;

    // Class variables
    private int nbins = DEFAULT_BIN_COUNT; // the fixed maximum number of bins to maintain
    private long totalCount; // cache to avoid counting all the bins
    private ArrayList<Bin> bins; // add() and merge() keep this sorted by lowerBound
    private final Random prng; // breaks ties between equally close bin pairs
    private double maxValue; // the maximum value consumed

    /**
     * Creates a new histogram object that maintains at most 1024 bins.
     */
    public MinimalBinDistanceHistogram() {
        this(
                DEFAULT_BIN_COUNT);
    }

    /**
     * Creates a new histogram object.
     *
     * @param size
     *            the maximum number of bins to maintain; also used as the
     *            initial capacity of the bin list
     */
    public MinimalBinDistanceHistogram(
            final int size ) {
        totalCount = 0;
        // init the RNG for breaking ties in histogram merging.
        prng = new Random(
                System.currentTimeMillis());
        bins = new ArrayList<Bin>(
                size);
        nbins = size;
    }

    /**
     * Resets a histogram object to its initial state: no bins, a total count
     * of zero and no recorded maximum. The configured maximum number of bins
     * is kept.
     */
    public void reset() {
        bins.clear();
        totalCount = 0;
        // Without this a reused histogram would keep reporting the maximum of
        // the data it held before the reset.
        maxValue = 0.0;
    }

    /**
     *
     * @return the total number of consumed values
     */
    public long getTotalCount() {
        return totalCount;
    }

    /**
     *
     * @return the number of bins used
     */
    public int getNumBins() {
        return bins.size();
    }

    /**
     *
     * @return the maximum value consumed so far
     */
    public double getMaxValue() {
        return maxValue;
    }

    /**
     *
     * @return the position of the first (lowest) bin, or 0.0 when the
     *         histogram has no bins
     */
    public double getMinValue() {
        return !bins.isEmpty() ? bins.get(0).lowerBound : 0.0;
    }

    /**
     * Merges another histogram into this one (Algorithm #2 from the Ben-Haim
     * and Tom-Tov paper): the bins of both histograms are combined, sorted and
     * then trimmed back to this histogram's maximum number of bins. The other
     * histogram is not modified.
     *
     * @param other
     *            the histogram to merge in; {@code null} is ignored
     * @throws ClassCastException
     *             if {@code other} is not a MinimalBinDistanceHistogram
     */
    public void merge(
            final NumericHistogram other ) {
        if (other == null) {
            return;
        }
        final MinimalBinDistanceHistogram incoming = (MinimalBinDistanceHistogram) other;
        totalCount += incoming.totalCount;
        // Only a histogram that actually holds data has a meaningful maximum;
        // an empty side must not contribute its placeholder value.
        if (!incoming.bins.isEmpty()) {
            maxValue = bins.isEmpty() ? incoming.maxValue : Math.max(
                    incoming.maxValue,
                    maxValue);
        }

        final ArrayList<Bin> mergedBins = new ArrayList<Bin>(
                bins.size() + incoming.bins.size());
        mergedBins.addAll(bins);
        // Always copy the incoming bins: add() and trim() mutate Bin objects in
        // place, so sharing them would corrupt the other histogram.
        for (final Bin theirs : incoming.bins) {
            mergedBins.add(new Bin(
                    theirs.lowerBound,
                    theirs.count));
        }
        Collections.sort(mergedBins);
        bins = mergedBins;
        // Now trim the overstuffed histogram down to the correct number of
        // bins (the constrained bin sizes of the two histograms may differ).
        trim();
    }

    /**
     * Adds a new data point to the histogram approximation. This method
     * implements Algorithm #1 from Ben-Haim and Tom-Tov,
     * "A Streaming Parallel Decision Tree Algorithm", JMLR 2010.
     *
     * @param v
     *            The data point to add to the histogram approximation.
     */
    public void add(
            final double v ) {
        this.add(
                1,
                v);
    }

    /**
     * Adds {@code count} occurrences of the value {@code v} to the histogram
     * approximation.
     *
     * @param count
     *            the number of occurrences to record
     * @param v
     *            the data point to add
     */
    public void add(
            final long count,
            final double v ) {
        // The total must grow by the same weight as the bin does, otherwise
        // quantile() and cdf() normalise by the wrong denominator.
        totalCount += count;
        // The first value consumed seeds the maximum so that all-negative data
        // is not capped at the field's default of 0.0.
        maxValue = bins.isEmpty() ? v : Math.max(
                maxValue,
                v);

        // Binary search to find the closest bucket that v should go into.
        // 'bin' should be interpreted as the bin to shift right in order to
        // accommodate v. As a result, bin is in the range [0,N], where N means
        // that the value v is greater than all the N bins currently in the
        // histogram. It is also possible that a bucket centered at 'v' already
        // exists, so this must be checked in the next step.
        int bin = 0;
        for (int l = 0, r = bins.size(); l < r;) {
            bin = (l + r) / 2;
            final double bound = bins.get(bin).lowerBound;
            if (bound > v) {
                r = bin;
            }
            else if (bound < v) {
                l = ++bin;
            }
            else {
                break; // break loop on equal comparator
            }
        }

        // If we found an exact bin match for value v, then just increment that
        // bin's count. Otherwise, we need to insert a new bin and trim the
        // resulting histogram back to size.
        // A possible optimization here might be to set some threshold under
        // which 'v' is just assumed to be equal to the closest bin. This is
        // not done now because we don't want to make any assumptions about the
        // range of numeric data being analyzed.
        if ((bin < bins.size()) && (Math.abs(bins.get(bin).lowerBound - v) < EPSILON)) {
            bins.get(bin).count += count;
        }
        else {
            bins.add(
                    bin,
                    new Bin(
                            v,
                            count));
            // Trim the bins down to the correct number of bins.
            if (bins.size() > nbins) {
                trim();
            }
        }
    }

    /**
     * Trims a histogram down to 'nbins' bins by iteratively merging the closest
     * bins. If two pairs of bins are equally close to each other, decide
     * uniformly at random which pair to merge, based on a PRNG. At least one
     * bin is always retained, because merging needs a pair of bins.
     */
    private void trim() {
        // Guard against nbins < 1: the loop body reads bins.get(1) and would
        // otherwise fail once only a single bin is left.
        final int target = Math.max(
                nbins,
                1);
        while (bins.size() > target) {
            // Find the closest pair of bins in terms of x coordinates. Break
            // ties randomly.
            double smallestDiff = bins.get(1).lowerBound - bins.get(0).lowerBound;
            int smallestDiffLoc = 0;
            int tieCount = 1;
            final int last = bins.size() - 1;
            for (int i = 1; i < last; i++) {
                final double diff = bins.get(i + 1).lowerBound - bins.get(i).lowerBound;
                if (diff < smallestDiff) {
                    smallestDiff = diff;
                    smallestDiffLoc = i;
                    tieCount = 1;
                }
                // HP Fortify "Insecure Randomness" false positive
                // This random number is not used for any purpose
                // related to security or cryptography
                else if (((diff - smallestDiff) < EPSILON) && (prng.nextDouble() <= (1.0 / ++tieCount))) {
                    smallestDiffLoc = i;
                }
            }

            // Merge the two closest bins into their average x location,
            // weighted by their heights. The height of the new bin is the sum
            // of the heights of the old bins.
            final Bin left = bins.get(smallestDiffLoc);
            final Bin right = bins.get(smallestDiffLoc + 1);
            final double d = left.count + right.count;
            left.lowerBound *= left.count / d;
            left.lowerBound += (right.lowerBound / d) * right.count;
            left.count = d;

            // Shift the remaining bins left one position
            bins.remove(smallestDiffLoc + 1);
        }
    }

    /**
     * Computes the quantiles 1/bins, 2/bins, ..., bins/bins.
     *
     * @param bins
     *            the number of quantiles to compute
     * @return The quantiles over the given number of bins.
     */
    public double[] quantile(
            final int bins ) {
        final double increment = 1.0 / (double) bins;
        final double[] result = new double[bins];
        double val = increment;
        for (int i = 0; i < bins; i++, val += increment) {
            result[i] = quantile(val);
        }
        return result;
    }

    /**
     * Gets an approximate quantile value from the current histogram. Some
     * popular quantiles are 0.5 (median), 0.95, and 0.98.
     *
     * @param q
     *            The requested quantile, must be strictly within the range
     *            (0,1).
     * @return The quantile value; the maximum consumed value when no bin
     *         reaches the requested fraction.
     */
    public double quantile(
            final double q ) {
        assert ((bins != null) && (bins.size() > 0) && (nbins > 0));
        double csum = 0;
        final int binsCount = bins.size();
        for (int b = 0; b < binsCount; b++) {
            final Bin current = bins.get(b);
            csum += current.count;
            if ((csum / totalCount) >= q) {
                if (b == 0) {
                    return current.lowerBound;
                }
                // Interpolate between the previous bin and this one using the
                // share of this bin's count needed to reach q.
                csum -= current.count;
                final Bin previous = bins.get(b - 1);
                return previous.lowerBound
                        + ((((q * totalCount) - csum) * (current.lowerBound - previous.lowerBound)) / (current.count));
            }
        }
        return maxValue; // should not get here
    }

    /**
     * Estimate number of values consumed up to provided value.
     *
     * @param val
     *            the value up to which points are counted
     * @param inclusive
     *            when true, an estimate that is not greater than 1.0 is
     *            reported as 1.0
     * @return the number of estimated points
     */
    public double sum(
            final double val,
            final boolean inclusive ) {
        if (bins.isEmpty()) {
            return 0.0;
        }
        final double minValue = bins.get(0).lowerBound;
        final double range = maxValue - minValue;
        // one value
        if ((range <= 0.0) || (val > maxValue)) {
            return totalCount;
        }
        else if (val < minValue) {
            return 0.0;
        }

        // Accumulate every bin positioned strictly below val; i ends up as the
        // index of the first bin at or above val.
        double foundCount = 0;
        int i = 0;
        for (final Bin coord : bins) {
            if (coord.lowerBound < val) {
                foundCount += coord.count;
            }
            else {
                break;
            }
            i++;
        }

        final double r;
        if (i == 0) {
            // No bin lies below val, so val sits exactly on the first bin and
            // the estimate is half of that bin's count. Interpolating against
            // a made-up lower boundary of 0.0 instead would divide by zero
            // whenever the first bin is positioned at 0.0. Any other input
            // reaching this point is NaN and stays NaN.
            r = (val == minValue) ? (bins.get(0).count / 2.0) : Double.NaN;
        }
        else {
            final Bin lower = bins.get(i - 1);
            final double lowerBoundary = lower.lowerBound;
            final double lowerCount = lower.count;
            final double upperBoundary = (i < getNumBins()) ? bins.get(i).lowerBound : maxValue;
            final double upperCount = (i < getNumBins()) ? bins.get(i).count : 0;
            foundCount -= lowerCount;
            // from paper 'sum' procedure
            // the paper treats Bins like coordinates, taking the area of
            // histogram
            // (lowerBoundary,0) (lowerBoundary,lowerCount)
            // (upperBoundary,upperCount) (upperBoundary,0)
            // divided by (upperBoundary - lowerBoundary).
            final double mb = lowerCount
                    + (((upperCount - lowerCount) / (upperBoundary - lowerBoundary)) * (val - lowerBoundary));
            final double s = (((lowerCount + mb) / 2.0) * (val - lowerBoundary)) / (upperBoundary - lowerBoundary);
            r = foundCount + s + (lowerCount / 2.0);
        }
        return r > 1.0 ? r : (inclusive ? 1.0 : r);
    }

    /**
     * Estimates the fraction of consumed values below the provided value.
     *
     * @param val
     *            the value to evaluate
     * @return the non-inclusive {@link #sum} estimate divided by the total
     *         count
     */
    public double cdf(
            final double val ) {
        return sum(
                val,
                false) / totalCount;
    }

    /**
     * Splits the range between the minimum and maximum value into equally
     * wide intervals and estimates how many values fall into each one.
     *
     * @param bins
     *            the number of intervals
     * @return the estimated count per interval
     */
    public long[] count(
            final int bins ) {
        final long[] result = new long[bins];
        double start = this.getMinValue();
        final double range = maxValue - start;
        final double increment = range / bins;
        start += increment;
        long last = 0;
        for (int bin = 0; bin < bins; bin++, start += increment) {
            // Each interval's count is the difference of two cumulative
            // estimates.
            final long aggSum = (long) Math.ceil(sum(
                    start,
                    false));
            result[bin] = aggSum - last;
            last = aggSum;
        }
        return result;
    }

    /**
     *
     * @return the number of bytes written by {@link #toBinary}
     */
    public int bufferSize() {
        // 24 = 8 bytes for total count, 8 bytes for maxValue, 4 bytes for
        // number of bins, 4 bytes for number of used bins
        return (bins.size() * Bin.bufferSize()) + HEADER_BYTES;
    }

    /**
     * Writes the histogram to the buffer: total count, maximum value, maximum
     * number of bins, number of used bins, then each used bin.
     *
     * @param buffer
     *            the buffer to write to
     */
    public void toBinary(
            final ByteBuffer buffer ) {
        buffer.putLong(totalCount);
        buffer.putDouble(maxValue);
        buffer.putInt(nbins);
        buffer.putInt(bins.size());
        for (final Bin bin : bins) {
            bin.toBuffer(buffer);
        }
    }

    /**
     * Replaces the state of this histogram with the one read from the buffer,
     * in the layout produced by {@link #toBinary}.
     *
     * @param buffer
     *            the buffer to read from
     */
    public void fromBinary(
            final ByteBuffer buffer ) {
        totalCount = buffer.getLong();
        maxValue = buffer.getDouble();
        nbins = buffer.getInt();
        final int usedBinCount = buffer.getInt();
        bins.clear();
        bins.ensureCapacity(nbins);
        for (int i = 0; i < usedBinCount; i++) {
            bins.add(new Bin().fromBuffer(buffer));
        }
    }

    /**
     * The Bin class defines a histogram bin, which is just an (x,y) pair.
     * Bins are ordered by their x value (lowerBound) only.
     */
    static class Bin implements
            Comparable<Bin>
    {
        double lowerBound;
        // Counts can be split fractionally
        double count;

        public Bin() {}

        public Bin(
                final double lowerBound,
                final double count ) {
            super();
            this.lowerBound = lowerBound;
            this.count = count;
        }

        @Override
        public int compareTo(
                final Bin other ) {
            return Double.compare(
                    lowerBound,
                    other.lowerBound);
        }

        /** Writes lowerBound followed by count to the buffer. */
        public void toBuffer(
                final ByteBuffer buffer ) {
            buffer.putDouble(lowerBound);
            buffer.putDouble(count);
        }

        /**
         * Reads lowerBound followed by count from the buffer.
         *
         * @return this bin
         */
        public Bin fromBuffer(
                final ByteBuffer buffer ) {
            lowerBound = buffer.getDouble();
            count = buffer.getDouble();
            return this;
        }

        /**
         * @return the serialized size of one bin in bytes (two doubles)
         */
        static int bufferSize() {
            return 16;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            long temp;
            temp = Double.doubleToLongBits(count);
            result = (prime * result) + (int) (temp ^ (temp >>> 32));
            temp = Double.doubleToLongBits(lowerBound);
            result = (prime * result) + (int) (temp ^ (temp >>> 32));
            return result;
        }

        @Override
        public boolean equals(
                final Object obj ) {
            if (this == obj) {
                return true;
            }
            if ((obj == null) || (getClass() != obj.getClass())) {
                return false;
            }
            final Bin other = (Bin) obj;
            return (Double.doubleToLongBits(count) == Double.doubleToLongBits(other.count))
                    && (Double.doubleToLongBits(lowerBound) == Double.doubleToLongBits(other.lowerBound));
        }
    }

    /**
     * Factory producing {@link MinimalBinDistanceHistogram} instances.
     */
    public static class MinimalBinDistanceHistogramFactory implements
            NumericHistogramFactory
    {
        @Override
        public NumericHistogram create(
                final int bins ) {
            return new MinimalBinDistanceHistogram(
                    bins);
        }

        /**
         * The value range is not used by this histogram type; only the bin
         * count is applied.
         */
        @Override
        public NumericHistogram create(
                final int bins,
                final double minValue,
                final double maxValue ) {
            return new MinimalBinDistanceHistogram(
                    bins);
        }
    }
}