/*
* Copyright (c) 2017 OBiBa. All rights reserved.
*
* This program and the accompanying materials
* are made available under the terms of the GNU Public License v3.0.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package org.obiba.magma.math.stat;
import java.io.Serializable;
import java.math.BigDecimal;
import java.math.MathContext;
import java.math.RoundingMode;
import java.util.SortedSet;
import javax.annotation.Nullable;
import com.google.common.collect.ImmutableSortedSet;
import com.google.common.collect.Sets;
/**
* Computes a frequency distribution of a continuous variable split into constant-sized intervals. Given a lower and
* upper bound and a number of intervals to create, this class will count the frequency of observations for values
* within each interval. This can effectively be used for producing histograms.
*/
public class IntervalFrequency implements Serializable {
private static final long serialVersionUID = 8258609483659122494L;
// Used for rounding computations to 6 significant digits
private final static MathContext CTX = new MathContext(6);
private final SortedSet<Interval> freqTable = Sets.newTreeSet();
private final BigDecimal min;
private final BigDecimal max;
private final BigDecimal intervalSize;
private long n;
/**
* Builds a {@code IntervalFrequency} for values between {@code [lower, upper]} split into {@code intervals} intervals.
* The flag {@code roundToIntegers} controls whether the bounds of intervals should be rounded to the closest integer
* value (usually true when input data are integers).
*
* @param min the lower bound of all values to count
* @param max the upper bound of all values to count
* @param intervals the number of intervals to use, note that the current algorithm may end up using {@code intervals
* + 1} intervals.
* @param roundToIntegers when true, interval bounds and size will be rounded (if necessary) to an integer value
*/
public IntervalFrequency(double min, double max, int intervals, boolean roundToIntegers) {
if(min >= max) throw new IllegalArgumentException("lower bound must be less than upper bound: " + min + ">" + max);
if(intervals < 1) throw new IllegalArgumentException("intervals must be positive");
this.min = maybeRoundToInteger(BigDecimal.valueOf(min), roundToIntegers, RoundingMode.FLOOR)
.round(new MathContext(6, RoundingMode.FLOOR));
this.max = maybeRoundToInteger(BigDecimal.valueOf(max), roundToIntegers, RoundingMode.CEILING)
.round(new MathContext(6, RoundingMode.CEILING));
// (max - min) / intervals
BigDecimal is = this.max.subtract(this.min).divide(BigDecimal.valueOf(intervals), CTX);
is = maybeRoundToInteger(is, roundToIntegers, RoundingMode.HALF_UP);
// When rounding to integer, we may have rounded the interval size to 0: change it to 1, the smallest interval size
if(roundToIntegers && is.compareTo(BigDecimal.ZERO) == 0) {
is = BigDecimal.ONE;
}
// Can't seem to find a case that will actually throw the exception. We'll leave the condition anyway since the rest
// of the code expects non-zero intervalSize value.
if(is.compareTo(BigDecimal.valueOf(Double.MIN_VALUE)) == 0) {
throw new ArithmeticException("computed interval size was 0");
}
intervalSize = is;
BigDecimal lower = this.min;
while(lower.compareTo(this.max) <= 0) {
BigDecimal upper = lower.add(is);
freqTable.add(new Interval(lower, upper));
lower = upper;
}
}
/**
* Builds a {@code IntervalFrequency} for values between {@code [lower, upper]} split into {@code intervals} intervals.
*
* @param min the lower bound of all values to count
* @param max the upper bound of all values to count
* @param intervals the number of intervals to use, note that the current algorithm may end up using {@code intervals
* + 1} intervals.
*/
public IntervalFrequency(double min, double max, int intervals) {
this(min, max, intervals, false);
}
/**
* Adds 1 to the frequency of the interval that contains {@code d}.
*
* @param d
*/
public void add(double d) {
for(Interval interval : freqTable) {
if(interval.increment(d)) {
n++;
return;
}
}
throw new IllegalArgumentException("value is outside [" + min + "," + max + "] bound: " + d);
}
/**
* Returns an unmodifiable view of interval frequency computed by this instance. Note that the iterator will iterate
* on intervals in order ({@code Interval#compareTo(Interval)})
*
* @return an {@code Iterable} over the {@code Interval}
*/
public SortedSet<Interval> intervals() {
return ImmutableSortedSet.copyOfSorted(freqTable);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("[").append(min).append(",").append(max).append("]/").append(freqTable.size()).append('(')
.append(intervalSize).append(')').append(" n:").append(n).append('\n');
for(Interval interval : freqTable) {
sb.append(interval).append('\n');
}
return sb.toString();
}
private BigDecimal maybeRoundToInteger(BigDecimal value, boolean roundToInteger, RoundingMode mode) {
return roundToInteger ? value.setScale(0, mode) : value;
}
/**
* Maintains the frequency of the values between {@code [lower, upper[}
*/
@edu.umd.cs.findbugs.annotations.SuppressWarnings("SE_INNER_CLASS")
public class Interval implements Comparable<Interval>, Serializable {
private static final long serialVersionUID = -166445440512189129L;
private final BigDecimal lower;
private final BigDecimal upper;
private long freq = 0;
private Interval(BigDecimal lower, BigDecimal upper) {
this.lower = lower;
this.upper = upper;
}
public double getLower() {
return lower.doubleValue();
}
public double getUpper() {
return upper.doubleValue();
}
/**
* Returns the absolute frequency of observations in this interval
*
* @return
*/
public long getFreq() {
return freq;
}
/**
* Returns the density of this interval (freq / width)
* <p/>
* This is the value usually plotted in a histogram, because the surface area of an interval is the frequency of
* observations.
*
* @return
*/
public double getDensity() {
return density().doubleValue();
}
/**
* Returns the density percentage of this interval (freq / total / width)
* <p/>
* This value represents the proportion of this interval in regards to all others
*
* @return
*/
public double getDensityPct() {
// freq / width / total
if(n > 0) {
return density().divide(BigDecimal.valueOf(n), CTX).doubleValue();
}
return 0d;
}
/**
* Returns true when {@code d} is within {@code [lower, upper[}, false otherwise
*
* @param d
* @return
*/
public boolean contains(double d) {
BigDecimal dd = BigDecimal.valueOf(d);
return dd.compareTo(lower) >= 0 && dd.compareTo(upper) < 0;
}
@Override
public int compareTo(Interval o) {
return (int) Math.signum(lower.doubleValue() - o.lower.doubleValue());
}
@Override
public boolean equals(@Nullable Object obj) {
if(obj == null) {
return false;
}
if(this == obj) {
return true;
}
if(obj instanceof Interval) {
Interval that = (Interval) obj;
return lower.compareTo(that.lower) == 0 && upper.compareTo(that.upper) == 0;
}
return super.equals(obj);
}
@Override
public int hashCode() {
int hashCode = 17;
hashCode = 37 * hashCode + lower.hashCode();
hashCode = 37 * hashCode + upper.hashCode();
return hashCode;
}
@Override
public String toString() {
return "[" + lower + ',' + upper + "[:" + freq + " (" + density() + ',' + getDensityPct() + ")";
}
/**
* increments the frequency and returns true if {@code d} is within {@code [lower, upper[}. Otherwise returns false
* and frequency remains unchanged.
*
* @param d
* @return
*/
boolean increment(double d) {
boolean contains = contains(d);
if(contains) freq++;
return contains;
}
/**
* Computes the density of this interval and rounds the result using {@code CTX}
*
* @return
*/
protected BigDecimal density() {
// fred / intervalSize, rounded to X significant digits (see CTX)
return BigDecimal.valueOf(freq / intervalSize.doubleValue()).round(CTX);
}
}
}