package edu.brown.statistics;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.log4j.Logger;

import edu.brown.logging.LoggerUtil;
import edu.brown.logging.LoggerUtil.LoggerBoolean;
import edu.brown.utils.MathUtil;

/**
 * Static helper routines for pretty-printing and summarizing a {@link Histogram}.
 */
public abstract class HistogramUtil {
    private static final Logger LOG = Logger.getLogger(HistogramUtil.class);
    private static final LoggerBoolean debug = new LoggerBoolean();
    private static final LoggerBoolean trace = new LoggerBoolean();
    static {
        LoggerUtil.attachObserver(LOG, debug, trace);
    }

    public static final String DELIMITER = "\t";
    public static final String MARKER = "*";
    public static final Integer MAX_CHARS = 80;
    public static final int MAX_VALUE_LENGTH = 20;

    /**
     * Histogram Pretty Print using {@link #MAX_CHARS} and {@link #MAX_VALUE_LENGTH}.
     *
     * @param histogram the histogram to render
     * @return the rendered text, or {@code "<EMPTY>"} if the histogram is empty
     */
    public static <X> String toString(Histogram<X> histogram) {
        return toString(histogram, MAX_CHARS, MAX_VALUE_LENGTH);
    }

    /**
     * Histogram Pretty Print using {@link #MAX_VALUE_LENGTH} for the label width.
     *
     * @param histogram the histogram to render
     * @param max_chars number of {@link #MARKER}s drawn for the value with the largest count
     * @return the rendered text, or {@code "<EMPTY>"} if the histogram is empty
     */
    public static <X> String toString(Histogram<X> histogram, int max_chars) {
        return toString(histogram, max_chars, MAX_VALUE_LENGTH);
    }

    /**
     * Histogram Pretty Print.
     * <p>
     * Emits one line per value: the label (the debug label if one is registered,
     * otherwise {@code toString()} of the value) left-justified to
     * {@code max_length} characters, the count in brackets (followed by the
     * percentage of the total when the histogram has debug percentages enabled),
     * and a bar of {@link #MARKER}s scaled so that the largest count gets
     * {@code max_chars} markers.
     *
     * @param histogram  the histogram to render
     * @param max_chars  number of markers drawn for the value with the largest count
     * @param max_length maximum label width; longer labels are truncated
     * @return the rendered text, or {@code "<EMPTY>"} if the histogram is empty
     */
    public static <X> String toString(Histogram<X> histogram, int max_chars, int max_length) {
        StringBuilder s = new StringBuilder();

        // Figure out the max size of the counts
        int max_ctr_length = 4;
        long total = 0;
        for (X value : histogram.values()) {
            Long ctr = histogram.get(value);
            if (ctr != null) {
                total += ctr.longValue();
                max_ctr_length = Math.max(max_ctr_length, ctr.toString().length());
            }
        } // FOR

        boolean debug_percentages = histogram.hasDebugPercentages();
        long max_count = histogram.getMaxCount();

        // "%-Ns" only pads, it never truncates, so long labels are cut manually below
        String f = "%-" + max_length + "s [%" + max_ctr_length + "d";
        if (debug_percentages) {
            f += " - %4.1f%%";
        }
        f += "] ";

        boolean first = true;
        Map<Object, String> debug_names = histogram.getDebugLabels();
        boolean has_labels = histogram.hasDebugLabels();
        for (X value : histogram.values()) {
            if (!first) s.append("\n");

            String str = null;
            if (has_labels) str = debug_names.get(value);
            if (str == null) str = (value != null ? value.toString() : "null");

            // Don't let any label go longer than max_length chars
            if (str.length() > max_length) {
                if (max_length >= 3) {
                    str = str.substring(0, max_length - 3) + "...";
                } else {
                    // No room for an ellipsis; a plain cut avoids a negative substring index
                    str = str.substring(0, Math.max(0, max_length));
                }
            }

            // Value Label + Count
            // A missing count is rendered as zero, mirroring the null check in the first pass
            Long ctr = (value != null ? histogram.get(value) : null);
            long cnt = (ctr != null ? ctr.longValue() : 0);
            if (debug_percentages) {
                double percent = (cnt / (double) total) * 100;
                s.append(String.format(f, str, cnt, percent));
            } else {
                s.append(String.format(f, str, cnt));
            }

            // Histogram Bar
            int barSize = (int) ((cnt / (double) max_count) * max_chars);
            for (int i = 0; i < barSize; i++) {
                s.append(MARKER);
            }
            first = false;
        } // FOR
        if (histogram.isEmpty()) s.append("<EMPTY>");
        return (s.toString());
    }

    /**
     * Return all the instances of the values stored in the histogram.
     * This means that if there is a value that has a count of three in the histogram,
     * then it will appear three times in the returned collection.
     *
     * @param h the histogram to expand
     * @return a new list holding each value repeated as many times as its count
     */
    public static <X> Collection<X> weightedValues(final Histogram<X> h) {
        List<X> all = new ArrayList<X>();
        for (X x : h.values()) {
            long cnt = h.get(x, 0l);
            // long index so the comparison against the long count cannot wrap
            for (long i = 0; i < cnt; i++) {
                all.add(x);
            } // FOR
        } // FOR
        return (all);
    }

    /**
     * Returns the values sorted in descending order by cardinality.
     * <p>
     * Values with equal counts are all retained and appear in the reverse of
     * the iteration order of {@code h.values()}. The result is a plain list
     * ordered by a comparator that honors the {@link Comparator} contract, so
     * {@code contains()} and friends behave normally on it.
     *
     * @param h the histogram whose values are sorted
     * @return a new collection of the histogram's values, highest count first
     */
    public static <X> Collection<X> sortedValues(final Histogram<X> h) {
        List<X> sorted = new ArrayList<X>(h.values());
        // Collections.sort is stable, so reversing first leaves ties in reverse
        // iteration order
        Collections.reverse(sorted);
        Collections.sort(sorted, new Comparator<X>() {
            public int compare(final X item0, final X item1) {
                final long v0 = h.get(item0, 0l);
                final long v1 = h.get(item1, 0l);
                // Descending by count; equal counts compare as equal
                return (v0 > v1 ? -1 : (v0 < v1 ? 1 : 0));
            }
        });
        return (sorted);
    }

    /**
     * Return the weighted sum of the values within the histogram, i.e. the sum
     * of {@code value.longValue() * count} over all values.
     *
     * @param h the histogram to sum
     * @return the weighted sum
     */
    public static <T extends Number> long sum(Histogram<T> h) {
        long total = 0;
        for (T val : h.values()) {
            long value = val.longValue();
            long weight = h.get(val, 0l);
            total += (value * weight);
        } // FOR
        return (total);
    }

    /**
     * Return the requested percentiles of the values within the histogram.
     * <p>
     * The values are sorted by their natural ordering and expanded by their
     * counts. Each requested percentile is clamped to the range [1, 100] and
     * mapped to the position {@code percentile * (n - 1) / 100} in that
     * expanded list, interpolating linearly between the two neighboring samples.
     *
     * @param h           the histogram to inspect; its values must be mutually comparable
     * @param percentiles the percentiles to compute
     * @return one result per entry of {@code percentiles}; {@code NaN} when the
     *         histogram holds no samples
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static <T extends Number> double[] percentile(Histogram<T> h, int[] percentiles) {
        // Number itself is not Comparable, so the sort has to go through a raw list
        List list = new ArrayList(h.values());
        Collections.sort(list);
        List<T> typedList = new ArrayList<T>(list);

        // Expand each value by its count to get the full sorted sample list
        List<T> values = new ArrayList<T>();
        for (T t : typedList) {
            long count = h.get(t, 0l);
            for (long i = 0; i < count; i++) {
                values.add(t);
            }
        }

        final int n = values.size();
        double[] res = new double[percentiles.length];
        for (int i = 0; i < percentiles.length; i++) {
            int percentile = Math.min(100, Math.max(1, percentiles[i]));

            if (n == 0) {
                res[i] = Double.NaN;
            } else if (n == 1) {
                res[i] = values.get(0).doubleValue();
            } else {
                // Multiply in double: percentile * (n - 1) overflows int for very large n
                double position = percentile * (double) (n - 1) / 100.0;
                // NOTE(review): positions in (0, 1) are clamped to the smallest
                // sample instead of being interpolated between the first two.
                // Kept as-is to avoid changing results for existing callers.
                if (position < 1) {
                    res[i] = values.get(0).doubleValue();
                } else if (position >= n - 1) {
                    res[i] = values.get(n - 1).doubleValue();
                } else {
                    int lower = (int) Math.floor(position);
                    double d = position - lower;
                    double v1 = values.get(lower).doubleValue();
                    double v2 = values.get(lower + 1).doubleValue();
                    res[i] = (v1 + d * (v2 - v1));
                }
            }
        }
        return res;
    }

    /**
     * Return the standard deviation of the values within the histogram, with
     * each value weighted by its count. The samples are expanded into an array
     * and handed to {@code MathUtil.stdev}.
     *
     * @param h the histogram to inspect
     * @return the result of {@code MathUtil.stdev} over the expanded samples
     */
    public static <T extends Number> double stdev(Histogram<T> h) {
        double values[] = new double[h.getSampleCount()];
        int idx = 0;
        for (T val : h.values()) {
            double value = val.doubleValue();
            long weight = h.get(val, 0l);
            for (int i = 0; i < weight; i++) {
                values[idx++] = value;
            }
        } // FOR
        assert (idx == values.length) : idx + "!=" + values.length;
        return (MathUtil.stdev(values));
    }

    /**
     * Return a map where the values of the Histogram are mapped to doubles in
     * the range [-1.0, 1.0].
     * <p>
     * The values are visited in the iteration order of {@code h.values()}; the
     * first one is mapped to -1.0 and each following one is
     * {@code 2 / (valueCount - 1)} higher. A histogram with a single value maps
     * that value to -1.0.
     *
     * @param h the histogram whose values are normalized
     * @return a new map from each value to its normalized position
     */
    public static <T> Map<T, Double> normalize(Histogram<T> h) {
        double delta = 2.0d / (double) (h.getValueCount() - 1);
        if (trace.val) {
            LOG.trace("# of Values = " + h.getValueCount());
            LOG.trace("Delta Step = " + delta);
        }

        // We only want to round the values that we put into the map. If you
        // round the current counter then we will always be off at the end
        Map<T, Double> normalized = new HashMap<T, Double>();
        int precision = 10;
        double current = -1.0d;
        for (T k : h.values()) {
            normalized.put(k, MathUtil.roundToDecimals(current, precision));
            if (trace.val) LOG.trace(k + " => " + current + " / " + normalized.get(k));
            current += delta;
        } // FOR
        assert (h.getValueCount() == normalized.size());
        return (normalized);
    }
}