package com.facebook.hive.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.util.Arrays;
/**
* This UDF computes a cumulative sum (initialized at 0). Whenever
* one of the key columns changes, the sum is reinitialized. Rows
* where the value column is NULL do not contribute to the sum.
*
* Queries will typically distribute and sort the data first so that
* each reducer sees the rows for a key in the correct order. For
* example, suppose you have a table with the number of user actions
* in each time bucket and you want the cumulative number of actions
* for each user up to and including each time bucket:
*
* SELECT user, time, CUMSUM(num_actions, user) AS cumulative_actions
* FROM (
*   SELECT user, time, num_actions
*   FROM your_table
*   DISTRIBUTE BY user
*   SORT BY user, time
* ) A
*
* INPUT:
* user time num_actions
* 1 0 1
* 2 0 2
* 1 1 1
* 2 1 1
*
* OUTPUT:
* user time cumulative_actions
* 1 0 1
* 1 1 2
* 2 0 2
* 2 1 3
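*
* To make the function callable as CUMSUM, the jar containing this class
* typically has to be added and the function registered first; the jar
* name below is only illustrative:
*
*   ADD JAR your_udfs.jar;
*   CREATE TEMPORARY FUNCTION CUMSUM AS 'com.facebook.hive.udf.UDFCumsum';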
*/
@Description(name = "udfcumsum",
value = "_FUNC_(VAL, KEYS...) - Computes a cumulative sum on the VAL column. Resets whenever KEYS... changes.")
public class UDFCumsum extends UDF {
  // Key values seen on the previous row; the running sum is reset whenever
  // they differ from the current row's keys.
  private Object[] previous_keys = null;
  // Cumulative sum of the value column since the last key change.
  private Double running_sum;

  public Double evaluate(Double val, Object... keys) {
    // First row, or the key columns changed: start a new running sum.
    if (previous_keys == null || !Arrays.equals(previous_keys, keys)) {
      running_sum = 0.0;
      previous_keys = keys.clone();
    }
    // NULL values do not contribute to the sum.
    if (val != null) {
      running_sum += val;
    }
    return running_sum;
  }
}
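
// A minimal sketch (not part of the UDF itself) of how evaluate() behaves when
// called once per row in the order produced by the query in the class comment.
// The class name and literal values are illustrative only.
class UDFCumsumExample {
  public static void main(String[] args) {
    UDFCumsum cumsum = new UDFCumsum();
    // user 1: actions accumulate to 1.0, then 2.0.
    System.out.println(cumsum.evaluate(1.0, "1")); // 1.0
    System.out.println(cumsum.evaluate(1.0, "1")); // 2.0
    // user 2: the key changed, so the sum restarts at 0 before adding.
    System.out.println(cumsum.evaluate(2.0, "2")); // 2.0
    System.out.println(cumsum.evaluate(1.0, "2")); // 3.0
  }
}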