package com.facebook.hive.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

import java.util.HashMap;

/**
 * Compute the normalized entropy of a histogram. The histogram is assumed to
 * be encoded as a map between values and counts (e.g. the output of
 * FB_HISTOGRAM). The entropy is normalized in the sense that the counts are
 * first normalized so that the sum of all counts equals one (i.e., the
 * histogram is converted to a probability distribution). If the histogram is
 * NULL then NULL is returned. Any NULLs in the histogram itself are ignored.
 * If the histogram contains negative values then NULL is returned. If the
 * total count of entries in the histogram is zero (e.g. if the histogram is
 * empty), then zero is returned.
 *
 * Note that the entropy is computed using base-2 log.
 */
@Description(name = "udfmapentropy",
             value = "_FUNC_(histogram) - Return the normalized entropy of the histogram.")
public class UDFMapEntropy extends UDF {
  private static final double log2 = Math.log(2);

  public Double evaluate(HashMap<String, Double> histogram) {
    if (histogram == null) {
      return null;
    }

    // Sum the counts; any negative count makes the histogram invalid.
    double total = 0.0;
    for (Double value : histogram.values()) {
      if (value != null) {
        if (value >= 0) {
          total += value;
        } else {
          return null;
        }
      }
    }
    if (total == 0) {
      return Double.valueOf(0.0);
    }

    // Accumulate -sum(p * ln(p)) over the normalized counts; zero counts
    // contribute nothing to the entropy.
    double entropy = 0.0;
    for (Double value : histogram.values()) {
      if (value != null && value > 0) {
        entropy -= (value / total) * Math.log(value / total);
      }
    }

    // Clip small negative values caused by floating-point error; otherwise
    // convert from natural log to base-2 log.
    if (entropy < 0) {
      entropy = 0;
    } else {
      entropy /= log2;
    }

    return Double.valueOf(entropy);
  }
}
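
// A minimal usage sketch, kept as a comment so the file stays valid Java. It
// shows how the UDF might be invoked from HiveQL; the function name
// "udfmapentropy" matches the @Description above, but the actual registered
// name depends on how the UDF is set up in a given Hive deployment.
//
//   CREATE TEMPORARY FUNCTION udfmapentropy AS 'com.facebook.hive.udf.UDFMapEntropy';
//
//   -- Two equal counts normalize to p = {0.5, 0.5}, so the base-2 entropy is 1.0.
//   SELECT udfmapentropy(map('a', 2.0, 'b', 2.0));
//
//   -- A single value carries no uncertainty, so the entropy is 0.0.
//   SELECT udfmapentropy(map('a', 4.0));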