package com.facebook.hive.udf; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDAF; import org.apache.hadoop.hive.ql.exec.UDAFEvaluator; /** * Compute the normalized entropy of a series of counts. The input is assumed * to be a column of counts of each value's occurence (if the column contains * draws from a probability distribution rather than the distribution itself, * consider using FB_MAP_ENTROPY). The entropy is normalized in the sense that * the counts are first normalized so that the sum of all counts equals one * (i.e., it is converted to a probability distribution). Any NULLs in the * column are ignored. If the column contains negative values then NULL is * returned. If the total count of entries of in the column is zero then zero * is returned. * * Note that the entropy is computed using base-2 log. */ @Description(name = "entropy", value = "_FUNC_(counts) - Return the normalized entropy of the counts.") public final class UDAFEntropy extends UDAF { /** * Implementation note: this is implemented efficiently in one pass. * * Let S = sum(x) * * -H = sum((x / S) * log(x / S)) * = (1 / S) sum(x * (log(x) - log(S))) * = (1 / S) (sum(x * log(x)) - log(S) * sum(x)) * = (1 / S) sum(x * log(x)) - log(S) */ public static class UDAFEntropyState { private double sum_x; private double sum_x_log_x; private boolean poisoned; } public static class UDAFEntropyEvaluator implements UDAFEvaluator { UDAFEntropyState state; public UDAFEntropyEvaluator() { super(); state = new UDAFEntropyState(); init(); } public void init() { state.sum_x = 0.0; state.sum_x_log_x = 0.0; state.poisoned = false; } private static final double log2 = Math.log(2); public boolean iterate(Double x) { if (x != null && !state.poisoned) { if (x > 0) { state.sum_x += x; state.sum_x_log_x += x * Math.log(x); } else if (x == 0) { // Use this slightly convoluted test to ensure that we poison NaNs. } else { state.poisoned = true; } } return true; } public UDAFEntropyState terminatePartial() { return state; } public boolean merge(UDAFEntropyState o) { state.poisoned |= o.poisoned; state.sum_x += o.sum_x; state.sum_x_log_x += o.sum_x_log_x; return true; } public Double terminate() { if (state.poisoned) { return null; } if (state.sum_x == 0) { return Double.valueOf(0); } double entropy = -state.sum_x_log_x / state.sum_x + Math.log(state.sum_x); // Clip small negative values. if (entropy < 0) { entropy = 0; } else { entropy /= log2; } return Double.valueOf(entropy); } } }