/* * The MIT License * * Copyright (c) 2009 The Broad Institute * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ package htsjdk.samtools.util; import htsjdk.samtools.util.Histogram.Bin; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.Iterator; import java.util.List; import java.util.TreeMap; import static java.lang.Math.*; /** * Class for computing and accessing histogram type data. Stored internally in * a sorted Map so that keys can be iterated in order. * * @author Tim Fennell */ public class Histogram<K extends Comparable> extends TreeMap<K, Bin> { private String binLabel = "BIN"; private String valueLabel = "VALUE"; private Double mean; /** Constructs a new Histogram with default bin and value labels. */ public Histogram() { } /** Constructs a new Histogram with supplied bin and value labels. */ public Histogram(final String binLabel, final String valueLabel) { this.binLabel = binLabel; this.valueLabel = valueLabel; } /** Constructs a new Histogram that'll use the supplied comparator to sort keys. */ public Histogram(final Comparator<K> comparator) { super(comparator); } /** Constructor that takes labels for the bin and values and a comparator to sort the bins. */ public Histogram(final String binLabel, final String valueLabel, final Comparator<K> comparator) { this(comparator); this.binLabel = binLabel; this.valueLabel = valueLabel; } /** Copy constructor for a histogram. */ public Histogram(final Histogram<K> in) { super(in); this.binLabel = in.binLabel; this.valueLabel = in.valueLabel; this.mean = in.mean; } /** Represents a bin in the Histogram. */ public class Bin { private final K id; private double value = 0; /** Constructs a new bin with the given ID. */ private Bin(final K id) { this.id = id; } /** Gets the ID of this bin. */ public K getId() { return id; } /** Gets the value in the bin. */ public double getValue() { return value; } /** Returns the String format for the value in the bin. */ public String toString() { return String.valueOf(this.value); } /** Checks the equality of the bin by ID and value. */ public boolean equals(final Object o) { if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; final Bin bin = (Bin) o; if (Double.compare(bin.value, value) != 0) return false; if (!id.equals(bin.id)) return false; return true; } @Override public int hashCode() { int result; final long temp; result = id.hashCode(); temp = value != +0.0d ? Double.doubleToLongBits(value) : 0L; result = 31 * result + (int) (temp ^ (temp >>> 32)); return result; } public double getIdValue() { if (id instanceof Number) { return ((Number) id).doubleValue(); } else { throw new UnsupportedOperationException("getIdValue only supported for Histogram<? extends Number>"); } } } /** Prefill the histogram with the supplied set of bins. */ public void prefillBins(final K... ids) { for (final K id : ids) { put(id, new Bin(id)); } } /** Increments the value in the designated bin by 1. */ public void increment(final K id) { increment(id, 1d); } /** Increments the value in the designated bin by the supplied increment. */ public void increment(final K id, final double increment) { Bin bin = get(id); if (bin == null) { bin = new Bin(id); put(id, bin); } bin.value += increment; mean = null; } public String getBinLabel() { return binLabel; } public void setBinLabel(final String binLabel) { this.binLabel = binLabel; } public String getValueLabel() { return valueLabel; } public void setValueLabel(final String valueLabel) { this.valueLabel = valueLabel; } /** Checks that the labels and values in the two histograms are identical. */ public boolean equals(final Object o) { return o != null && (o instanceof Histogram) && ((Histogram) o).binLabel.equals(this.binLabel) && ((Histogram) o).valueLabel.equals(this.valueLabel) && super.equals(o); } public double getMean() { if (mean == null) { mean = getSum() / getCount(); } return mean; } /** * Returns the sum of the products of the histgram bin ids and the number of entries in each bin. */ public double getSum() { double total = 0; for (final Bin bin : values()) { total += bin.getValue() * bin.getIdValue(); } return total; } /** * Returns the sum of the number of entries in each bin. */ public double getSumOfValues() { double total = 0; for (final Bin bin : values()) { total += bin.getValue(); } return total; } public double getStandardDeviation() { final double mean = getMean(); double count = 0; double total = 0; for (final Bin bin : values()) { final double localCount = bin.getValue(); final double value = bin.getIdValue(); count += localCount; total += localCount * pow(value - mean, 2); } return Math.sqrt(total / (count-1)); } /** * Calculates the mean bin size */ public double getMeanBinSize() { return (getSumOfValues() / size()); } /** * Calculates the median bin size */ public double getMedianBinSize() { if (size() == 0) { return 0; } final List<Double> binValues = new ArrayList<Double>(); for (final Bin bin : values()) { binValues.add(bin.getValue()); } Collections.sort(binValues); final int midPoint = binValues.size() / 2; double median = binValues.get(midPoint); if (binValues.size() % 2 == 0) { median = (median + binValues.get(midPoint-1)) / 2; } return median; } /** * Calculates the standard deviation of the bin size */ public double getStandardDeviationBinSize(final double mean) { double total = 0; for(final Bin bin : values()) { total += Math.pow(bin.getValue() - mean, 2); } return Math.sqrt(total / (Math.max(1,values().size()-1))); } /** * Gets the bin in which the given percentile falls. * * @param percentile a value between 0 and 1 * @return the bin value in which the percentile falls */ public double getPercentile(double percentile) { if (percentile <= 0) throw new IllegalArgumentException("Cannot query percentiles of 0 or below"); if (percentile >= 1) throw new IllegalArgumentException("Cannot query percentiles of 1 or above"); double total = getCount(); double sofar = 0; for (Bin bin : values()) { sofar += bin.getValue(); if (sofar / total >= percentile) return bin.getIdValue(); } throw new IllegalStateException("Could not find percentile: " + percentile); } /** * Returns the cumulative probability of observing a value <= v when sampling the * distribution represented by this histogram. */ public double getCumulativeProbability(final double v) { double count = 0; double total = 0; for (final Bin bin : values()) { final double binValue = bin.getIdValue(); if (binValue <= v) count += bin.getValue(); total += bin.getValue(); } return count / total; } public double getMedian() { double total = 0; double count = getCount(); // Base cases if (count == 0) return 0; if (count == 1) return values().iterator().next().getIdValue(); final double midLow, midHigh; if (count % 2 == 0) { midLow = count / 2; midHigh = midLow + 1; } else { midLow = Math.ceil(count / 2); midHigh = midLow; } Double midLowValue = null; Double midHighValue = null; for (final Bin bin : values()) { total += bin.getValue(); if (midLowValue == null && total >= midLow) midLowValue = bin.getIdValue(); if (midHighValue == null && total >= midHigh) midHighValue = bin.getIdValue(); if (midLowValue != null && midHighValue != null) break; } return (midLowValue + midHighValue) / 2; } /** Gets the median absolute deviation of the distribution. */ public double getMedianAbsoluteDeviation() { final double median = getMedian(); final Histogram<Double> deviations = new Histogram<Double>(); for (final Bin bin : values()) { final double dev = abs(bin.getIdValue() - median); deviations.increment(dev, bin.getValue()); } return deviations.getMedian(); } /** * Returns a value that is intended to estimate the mean of the distribution, if the distribution is * essentially normal, by using the median absolute deviation to remove the effect of * erroneous massive outliers. */ public double estimateSdViaMad() { return 1.4826 * getMedianAbsoluteDeviation(); } /** Returns id of the Bin that's the mode of the distribution (i.e. the largest bin). */ public double getMode() { return getModeBin().getIdValue(); } /** Returns the Bin that's the mode of the distribution (i.e. the largest bin). */ private Bin getModeBin() { Bin modeBin = null; for (final Bin bin : values()) { if (modeBin == null || modeBin.value < bin.value) { modeBin = bin; } } return modeBin; } public double getMin() { return firstEntry().getValue().getIdValue(); } public double getMax() { return lastEntry().getValue().getIdValue(); } public double getCount() { double count = 0; for (final Bin bin : values()) { count += bin.value; } return count; } /** Gets the geometric mean of the distribution. */ public double getGeometricMean() { double total = 0; double count = 0; for (final Bin bin : values()) { total += bin.value * log(bin.getIdValue()); count += bin.value; } return exp(total / count); } /** * Trims the histogram when the bins in the tail of the distribution contain fewer than mode/tailLimit items */ public void trimByTailLimit(final int tailLimit) { if (isEmpty()) { return; } final Bin modeBin = getModeBin(); final double mode = modeBin.getIdValue(); final double sizeOfModeBin = modeBin.getValue(); final double minimumBinSize = sizeOfModeBin/tailLimit; Histogram<K>.Bin lastBin = null; final List<K> binsToKeep = new ArrayList<K>(); for (Histogram<K>.Bin bin : values()) { double binId = ((Number)bin.getId()).doubleValue(); if (binId <= mode) { binsToKeep.add(bin.getId()); } else if ((lastBin != null && ((Number)lastBin.getId()).doubleValue() != binId - 1) || bin.getValue() < minimumBinSize) { break; } else { binsToKeep.add(bin.getId()); } lastBin = bin; } final Object keys[] = keySet().toArray(); for (Object binId : keys) { if (!binsToKeep.contains((K)binId)) { remove(binId); } } } /** * Trims the histogram so that only bins <= width are kept. */ public void trimByWidth(final int width) { final Iterator<K> it = descendingKeySet().iterator(); while (it.hasNext()) { if (((Number)it.next()).doubleValue() > width) { it.remove(); } else break; } } /*** * Immutable method that divides the current Histogram by an input Histogram and generates a new one * Throws an exception if the bins don't match up exactly * @param divisorHistogram * @return * @throws IllegalArgumentException */ public Histogram<K> divideByHistogram(final Histogram<K> divisorHistogram) throws IllegalArgumentException{ Histogram<K> output = new Histogram<K>(); if (!this.keySet().equals(divisorHistogram.keySet())) throw new IllegalArgumentException("Attempting to divide Histograms with non-identical bins"); for (final K key : this.keySet()){ Bin dividend = this.get(key); Bin divisor = divisorHistogram.get(key); output.increment(key, dividend.getValue()/divisor.getValue()); } return output; } /*** * Mutable method that allows the addition of a Histogram into the current one. * @param addHistogram */ public void addHistogram(final Histogram<K> addHistogram) { for (final K key : addHistogram.keySet()){ this.increment(key, addHistogram.get(key).getValue()); } } }