/* * chombo: Hadoop Map Reduce utility * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.chombo.util; import java.io.Serializable; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; /** * Histogram that chnges as data gets added * * @author pranab * */ public class HistogramStat implements Serializable { protected double binWidth = -1; protected Map<Integer, Bin> binMap = new TreeMap<Integer, Bin>(); protected int count; protected double sum = 0.0; protected double sumSq = 0.0; protected int sampleCount; protected boolean normalized; protected Map<Double, Double> histogram = new TreeMap<Double, Double>(); protected boolean extendedOutput; protected int outputPrecision = 3; private boolean debugOn = false; private String fieldDelim = ","; /** * @param binWidth */ public HistogramStat() { super(); } /** * @param binWidth */ public HistogramStat(int binWidth) { super(); this.binWidth = binWidth; } /** * @param binWidth */ public HistogramStat(double binWidth) { super(); this.binWidth = binWidth; } public void initialize() { binMap.clear(); histogram.clear(); count = 0; sum = 0; sumSq = 0; normalized = false; } /** * @param binWidth */ public void setBinWidth(int binWidth) { this.binWidth = binWidth; } /** * @param binWidth */ public void setBinWidth(double binWidth) { this.binWidth = binWidth; } /** * @param extendedOutput * @return */ public HistogramStat withExtendedOutput(boolean extendedOutput) { this.extendedOutput = extendedOutput; return this; } /** * @param outputPrecision * @return */ public HistogramStat withOutputPrecision(int outputPrecision) { this.outputPrecision = outputPrecision; return this; } /** * @param outputPrecision * @return */ public HistogramStat withFieldDelim(String fieldDelim) { this.fieldDelim = fieldDelim; return this; } /** * @param value */ public void add(int value) { add(value, 1); } /** * @param value * @param count */ public void add(int value, int count) { int index = (int)(value / binWidth); addToBin(index, value, count); } /** * @param value */ public void add(long value) { add(value, 1); } /** * @param value * @param count */ public void add(long value, int count) { int index = (int)(value / binWidth); addToBin(index, value, count); } /** * @param value */ public void add(float value) { add(value, 1); } /** * @param value * @param count */ public void add(float value, int count) { int index = (int)(value / binWidth); addToBin(index, value, count); } /** * @param value */ public void add(double value) { add(value, 1); } /** * @param value * @param count */ public void add(double value, int count) { int index = (int)(value / binWidth); addToBin(index, value, count); } /** * @param index * @param value */ private void addToBin(int index, double value, int count) { if (debugOn) { System.out.println("index: " + index + " value: " + BasicUtils.formatDouble(value, outputPrecision) + " count: " + count); } Bin bin = binMap.get(index); if (null == bin) { bin = new Bin(index); binMap.put(index, bin); } bin.addCount(count); this.count += count; sum += value * count; sumSq += value * value * count; ++sampleCount; } /** * @param index * @param count */ public void addBin(int index, int count) { Bin bin = binMap.get(index); if (null == bin) { bin = new Bin(index); binMap.put(index, bin); } bin.addCount(count); } /** * @return */ public int getMeanCount() { int sum = 0; for (Integer index : binMap.keySet()) { Bin bin = binMap.get(index); sum += bin.count; } return sum / binMap.size(); } /** * @param confidenceLimitPercent * @return */ public int[] getConfidenceBounds(int confidenceLimitPercent) { int[] confidenceBounds = new int[2]; int mean = (int)getMean(); int meanIndex = (int)(mean / binWidth); int confCount = 0; int confidenceLimit = (count * confidenceLimitPercent) / 100; int binCount = 0; Bin bin = binMap.get(meanIndex); if (null != bin) { confCount += bin.getCount(); ++binCount; } //starting for mean index extend to both sides to include other bins int offset = 1; for(; binCount < binMap.size() ; ++offset) { bin = binMap.get(meanIndex + offset); if (null != bin) { confCount += bin.getCount(); ++binCount; } bin = binMap.get(meanIndex - offset); if (bin != null) { confCount += bin.getCount(); ++binCount; } if (confCount >= confidenceLimit) { break; } } double avBinWidth = binWidth > 1 ? 0.5 : 0.0; confidenceBounds[0] = (int)((((double)(meanIndex - offset)) + avBinWidth) * binWidth); confidenceBounds[1] = (int)((((double)(meanIndex + offset)) + avBinWidth) * binWidth); return confidenceBounds; } /** * @return */ public double getMean() { double mean = sum / count; return mean; } /** * @return */ public double getStdDev() { double mean = getMean(); double stdDev = Math.sqrt(sumSq / count - mean * mean); return stdDev; } /** * @return */ public int getCount() { return count; } /** * @return */ public HistogramStat.Bin[] getSortedBins() { Bin[] bins = new Bin[binMap.size()]; int i = 0; for (Integer index : binMap.keySet()) { Bin bin = binMap.get(index); bins[i++] = bin; } //Arrays.sort(bins); return bins; } /** * @return */ public HistogramStat.Bin[] getSortedBinsByCount() { //sort by count Map<Integer, HistogramStat.Bin> binSotredByCount = new TreeMap<Integer, HistogramStat.Bin>(); for (Integer index : binMap.keySet()) { Bin bin = binMap.get(index); binSotredByCount.put(bin.count, bin); } Bin[] bins = new Bin[binMap.size()]; int i = 0; for (Integer count : binSotredByCount.keySet()) { Bin bin = binSotredByCount.get(count); bins[i++] = bin; } return bins; } /** * @return */ public double getMedian() { return getQuantile(0.5); } /** * @param quantile * @return */ public double getQuantile(double quantile) { double median = 0; int quantileCount = (int)(count * quantile); int curCount = 0; Bin bin = null; for (int binIndex: binMap.keySet()) { curCount += binMap.get(binIndex).count; if (curCount > quantileCount) { bin = binMap.get(binIndex); break; } } //assume uniform distribution within bin median = bin.index * binWidth; int prevCount = curCount - bin.count; median += (binWidth * (quantileCount - prevCount)) / bin.count; return median; } /** * @return */ public double getMode() { double mode = 0; int maxCount = 0; Bin maxBin = null; for (int binIndex: binMap.keySet()) { int thisCount = binMap.get(binIndex).count; if (thisCount > maxCount) { maxCount = thisCount; maxBin = binMap.get(binIndex); } } //average within bin mode = maxBin.index * binWidth + binWidth / 2; return mode; } /** * @return */ public Map<Double, Double> getDistribution() { if (histogram.isEmpty()) { for (Integer index : binMap.keySet()) { double val = index * binWidth + binWidth / 2; histogram.put(val, ((double)binMap.get(index).count) / count); } normalized = true; } return histogram; } /** * @return */ public double getEntropy() { double entropy = 0; getDistribution(); for (double val : histogram.keySet()) { double distrVal = histogram.get(val); entropy -= distrVal * Math.log(distrVal); } return entropy; } /** * @param histStat * @return */ public HistogramStat merge(HistogramStat histStat) { HistogramStat mergedHistStat = new HistogramStat(); mergedHistStat.binWidth = binWidth; mergedHistStat.extendedOutput = extendedOutput; mergedHistStat.outputPrecision = outputPrecision; //bins for (Integer index : binMap.keySet()) { Bin bin = binMap.get(index); mergedHistStat.addBin(index, bin.count); } for (Integer index : histStat.binMap.keySet()) { Bin bin = histStat.binMap.get(index); mergedHistStat.addBin(index, bin.count); } if (debugOn) { System.out.println("merging histogram " + binsToString()); System.out.println("merging histogram " + histStat.binsToString()); System.out.println("merged histogram " + mergedHistStat.binsToString()); } //other stats mergedHistStat.count = count + histStat.count; mergedHistStat.sum = sum + histStat.sum; mergedHistStat.sumSq = sumSq + histStat.sumSq; mergedHistStat.sampleCount = sampleCount + histStat.sampleCount; return mergedHistStat; } /* (non-Javadoc) * @see java.lang.Object#toString() */ public String toString() { StringBuilder stBld = new StringBuilder(); final String delim = ","; if (!normalized) { this.getDistribution(); } //formatting String formatter = "%." + outputPrecision + "f"; //distribution stBld.append(histogram.size()).append(delim); for(double x : histogram.keySet()) { double y = histogram.get(x); stBld.append(BasicUtils.formatDouble(x, formatter)).append(delim). append(BasicUtils.formatDouble(y, formatter)).append(delim); } //other stats if (extendedOutput) { String formMean = BasicUtils.formatDouble(getMean(), formatter); String formMedian = BasicUtils.formatDouble(getMedian(), formatter); String formStdDev = BasicUtils.formatDouble(getStdDev(), formatter); String formMode = BasicUtils.formatDouble(getMode(), formatter); String formQuartPer = BasicUtils.formatDouble(getQuantile(0.25), formatter); String formHalfPer = BasicUtils.formatDouble(getQuantile(0.50), formatter); String formThreeQuartPer = BasicUtils.formatDouble(getQuantile(0.75), formatter); stBld.append(formMean).append(delim).append(formMedian).append(delim). append(formStdDev).append(delim).append(formMode).append(delim). append(formQuartPer).append(delim).append(formHalfPer).append(delim). append(formThreeQuartPer).append(delim); } return stBld.substring(0, stBld.length() - 1); } public String normalizedBinsToString() { StringBuilder stBld = new StringBuilder(); if (!normalized) { this.getDistribution(); } //formatting String formatter = "%." + outputPrecision + "f"; //distribution stBld.append(histogram.size()).append(fieldDelim); for(double x : histogram.keySet()) { double y = histogram.get(x); stBld.append(BasicUtils.formatDouble(x, formatter)).append(fieldDelim). append(BasicUtils.formatDouble(y, formatter)).append(fieldDelim); } return stBld.substring(0, stBld.length() - 1); } /** * @return */ public String binsToString() { StringBuilder stBld = new StringBuilder(); //distribution stBld.append(binMap.size()).append(fieldDelim); for(int x : binMap.keySet()) { Bin y = binMap.get(x); stBld.append(x).append(fieldDelim).append(y.count).append(fieldDelim); } return stBld.substring(0, stBld.length() - 1); } /** * @author pranab * */ public static class Bin implements Comparable<Bin>, Serializable { private int index; private int count; public Bin(int index) { super(); this.index = index; } public Bin(int index, int count) { super(); this.index = index; this.count = count; } public void addCount(int count) { this.count += count; } @Override public int compareTo(Bin that) { return this.index < that.index ? -1 : (this.index > that.index ? 1 : 0); } public int getIndex() { return index; } public int getCount() { return count; } } }