/* * chombo: Hadoop Map Reduce utility * Author: Pranab Ghosh * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package org.chombo.util; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.TreeMap; /** * Histogram for categorical attributes * @author pranab * */ public class CategoricalHistogramStat { protected Map<String, Integer> binMap = new HashMap<String, Integer>(); protected Map<String, Double> histogram = new HashMap<String, Double>(); protected int sampleCount; protected boolean extendedOutput; protected int outputPrecision = 3; private boolean debugOn = false; /** * */ public void intialize() { binMap.clear(); histogram.clear(); sampleCount = 0; } /** * @param value */ public void add(String value) { add(value, 1); } /** * @param extendedOutput * @return */ public CategoricalHistogramStat withExtendedOutput(boolean extendedOutput) { this.extendedOutput = extendedOutput; return this; } /** * @param outputPrecision * @return */ public CategoricalHistogramStat withOutputPrecision(int outputPrecision) { this.outputPrecision = outputPrecision; return this; } /** * @param value * @param count */ public void add(String value, int count) { Integer curAttrValCount = binMap.get(value); if (null == curAttrValCount) { curAttrValCount = count; } else { curAttrValCount += count; } binMap.put(value, curAttrValCount); sampleCount += count; histogram.clear(); } /** * @return */ public Map<String, Double> getDistribution() { if (histogram.isEmpty()) { for (String attrValue : binMap.keySet()) { histogram.put(attrValue, ((double)binMap.get(attrValue)) / sampleCount); } } return histogram; } /** * @return */ public double getEntropy() { double entropy = 0; getDistribution(); for (String attrValue : histogram.keySet()) { double distrVal = histogram.get(attrValue); entropy -= distrVal * Math.log(distrVal); } return entropy; } /** * @return */ public double getGiniIndex() { double giniIndex = 0; getDistribution(); for (String attrValue : histogram.keySet()) { double distrVal = histogram.get(attrValue); giniIndex += distrVal * distrVal; } giniIndex = 1.0 - giniIndex; return giniIndex; } /** * @return */ public String getMode() { String mode = null; int maxCount = 0; for (String binIndex: binMap.keySet()) { int thisCount = binMap.get(binIndex); if (thisCount > maxCount) { maxCount = thisCount; mode = binIndex; } } return mode; } /** * @param histStat * @return */ public CategoricalHistogramStat merge(CategoricalHistogramStat histStat) { CategoricalHistogramStat mergedHistStat = new CategoricalHistogramStat(); mergedHistStat.extendedOutput = extendedOutput; mergedHistStat.outputPrecision = outputPrecision; //bins for (String catAttrVal : binMap.keySet()) { mergedHistStat.add(catAttrVal, binMap.get(catAttrVal)); } for (String catAttrVal : histStat.binMap.keySet()) { mergedHistStat.add(catAttrVal, histStat.binMap.get(catAttrVal)); } return mergedHistStat; } /** * returns set of items within confidence bound * @param confidenceLimitPercent * @return */ public List<String> getConfidenceBounds(int confidenceLimitPercent) { List<String> confBoundSet = new ArrayList<String>(); int confidenceLimit = (sampleCount * confidenceLimitPercent) / 100; //sort by count TreeMap<Integer, String> countSortedHistogram = new TreeMap<Integer, String>(); for(Map.Entry<String,Integer> entry : binMap.entrySet()) { countSortedHistogram.put(entry.getValue(), entry.getKey()); } //collect high count items double confCount = 0; for(Integer count : countSortedHistogram.descendingKeySet()) { confCount += count; if (confCount < confidenceLimit) { confBoundSet.add(countSortedHistogram.get(count)); } } return confBoundSet; } /* (non-Javadoc) * @see java.lang.Object#toString() */ public String toString() { StringBuilder stBld = new StringBuilder(); final String delim = ","; getDistribution(); //formatting String formatter = "%." + outputPrecision + "f"; //distribution stBld.append(histogram.size()).append(delim); for(String catAttrVal : histogram.keySet()) { double catAttrCount = histogram.get(catAttrVal); stBld.append(catAttrVal).append(delim). append(BasicUtils.formatDouble(catAttrCount, formatter)).append(delim); } //other stats if (extendedOutput) { String formEntropy = BasicUtils.formatDouble(getEntropy(), formatter); stBld.append(getMode()).append(delim).append(formEntropy).append(delim); } return stBld.substring(0, stBld.length() - 1); } }