package edu.berkeley.nlp.util; import java.io.BufferedWriter; import java.io.IOException; import java.io.PrintWriter; import java.io.StringWriter; import java.util.ArrayList; import java.util.List; /** * A simple histogram class. It can be used to accumulate a histogram and * calculate statistical information about it. * * @author Simon George * @version 1.0 31 Aug 2001 * * Extended by John DeNero */ public class Histogram { private static final long serialVersionUID = 1L; private static final int DEFAULT_NUM_BINS = 10; private static int currentNumBins = DEFAULT_NUM_BINS; private boolean binsHaveBeenSet; private List<Double> data; public Histogram() { this("Histogram"); } public Histogram(String title) { this.title = title; data = new ArrayList<Double>(); } public static <T> Histogram histogramOfCounts(Counter<T> counter) { Histogram h = new Histogram(); for (T o : counter.keySet()) { h.add(counter.getCount(o)); } return h; } public static <T> Histogram histogramOfValues(Counter<Double> counter) { Histogram h = new Histogram(); for (Double d : counter.keySet()) { double count = counter.getCount(d); for(int i = 0; i < count; i++) { h.add(d); } } return h; } public void add(double value) { data.add(value); } /** * Enter data into the histogram. The fill method takes the given value, works * out which bin this corresponds to, and increments this bin by one. * * @param x * is the value to add in to the histogram */ private void fill(double x) { // use findBin method to work out which bin x falls in BinInfo bin = findBin(x); // check the result of findBin in case it was an overflow or underflow if (bin.isUnderflow) { m_underflow++; } if (bin.isOverflow) { m_overflow++; } if (bin.isInRange) { m_hist[bin.index]++; } // count the number of entries made by the fill method m_entries++; } private class BinInfo { public int index; public boolean isUnderflow; public boolean isOverflow; public boolean isInRange; } /** * Private internal utility method to figure out which bin of the histogram a * number falls in. * * @return info on which bin x falls in. */ private BinInfo findBin(double x) { BinInfo bin = new BinInfo(); bin.isInRange = false; bin.isUnderflow = false; bin.isOverflow = false; // first check if x is outside the range of the normal histogram bins if (x < minValue) { bin.isUnderflow = true; } else if (x > maxValue) { bin.isOverflow = true; } else { bin.isInRange = true; for (int i = 0; i < numBins; i++) { if (x < binUpperBounds[i]) { bin.index = i; break; } } if (x == maxValue) { bin.index = numBins - 1; } } return bin; } /** * Save the histogram data to a file. The file format is very simple, * human-readable text so it can be imported into Excel or cut & pasted into * other applications. * * @param fileName * name of the file to write the histogram to. Note this must be * valid for your operating system, e.g. a unix filename might not * work under windows * @exception IOException * if file cannot be opened or written to. */ public void write(PrintWriter outfile) { setBuckets(); fillHistogram(); writeToPrintWriter(outfile); } private void writeToPrintWriter(PrintWriter outfile) { outfile.println(title); outfile.println("Bins:\t" + numBins); outfile.println("Min:\t" + minValue); outfile.println("Max:\t" + maxValue); outfile.println("Entries:\t" + m_entries); if (m_overflow > 0) { outfile.println("Over:\t" + m_overflow); } if (m_underflow > 0) { outfile.println("Under:\t" + m_underflow); } for (int i = 0; i < numBins; i++) { String l = String.format("%.2f", binLowerBounds[i]); String u = String.format("%.2f", binUpperBounds[i]); outfile.print("[" + l + ", " + u); if (numBins - 1 != i) { outfile.print(")"); } else { outfile.print("]"); } outfile.println(":\t" + m_hist[i]); } outfile.close(); } public String toString() { setBuckets(); fillHistogram(); StringWriter s = new StringWriter(); PrintWriter pw = new PrintWriter(new BufferedWriter(s)); writeToPrintWriter(pw); return s.getBuffer().toString(); } private void fillHistogram() { m_entries = 0; m_overflow = 0; m_underflow = 0; m_hist = new int[numBins]; for (double d : data) { fill(d); } } private void setBuckets() { setBuckets(currentNumBins); } private void setBuckets(int numBins) { if (!binsHaveBeenSet) { setBuckets(numBins, getMin(), getMax()); binsHaveBeenSet = false; } } public void setBuckets(int numBins, double min, double max) { double[] lowers = new double[numBins]; double step = (max - min) / (numBins); for (int i = 0; i < numBins; i++) { lowers[i] = min + i * step; } setBuckets(lowers, min, max); } private void setBuckets(double[] lowers) { setBuckets(lowers, lowers[0], Double.POSITIVE_INFINITY); } public void setBuckets(double[] binLowerBounds, double min, double max) { numBins = binLowerBounds.length; this.binLowerBounds = binLowerBounds; assert (min == binLowerBounds[0]); minValue = min; maxValue = max; updateBinUpperBounds(); binsHaveBeenSet = true; } private void updateBinUpperBounds() { binUpperBounds = new double[numBins]; for (int i = 0; i < numBins - 1; i++) { binUpperBounds[i] = binLowerBounds[i + 1]; } binUpperBounds[numBins - 1] = maxValue; } public double getMax() { double max = Double.NEGATIVE_INFINITY; for (double d : data) { max = Math.max(max, d); } return max; } public double getMin() { double min = Double.POSITIVE_INFINITY; for (double d : data) { min = Math.min(min, d); } return min; } // private data used internally by this class. private int[] m_hist; private String title; private double minValue; private double maxValue; private int numBins; private double[] binLowerBounds, binUpperBounds; private int m_entries; private double m_overflow; private double m_underflow; public static void main(String[] args) { Histogram h = new Histogram(); for (double i = 1; i < 43400; i *= 1.2) { h.add((double) i); } System.out.println(h); double[] lowers = new double[3]; lowers[0] = 0; lowers[1] = 100; lowers[2] = 1000; h.setBuckets(lowers); System.out.println(h); h.setLogBuckets(10); System.out.println(h); } public void setLogBuckets(int numBuckets) { setLogBuckets(numBuckets, getMin(), getMax()); } public void setLogBuckets(int numBuckets, double min, double max) { double step = Math.pow(max - min + 1, 1.0 / numBuckets); double[] lowers = new double[numBuckets]; for (int i = 0; i < numBuckets; i++) { lowers[i] = (min - 1) + Math.pow(step, i); } setBuckets(lowers, min, max); } public void setTitle(String t) { title = t; } public static int getNumBins() { return currentNumBins; } public static void setNumBins(int currentNumBins) { Histogram.currentNumBins = currentNumBins; } }