package com.facebook.hive.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.util.Arrays;
/**
* Number rows. The numbering of rows starts at one and increases by one for
* each row. The arguments to the function are zero or more "keys" for the
* UDF; whenever any of the keys changes values the numbering resets to one.
* This allows it to emulate the behavior of a UDAF (see below).
*
* This UDF is a stateful UDF, that is, the output of one row depends on the
* previous row. As such, it is often necessary to explicitly specify how
* the rows are distributed/sorted in order to get the desired behavior. For
* example, suppose one has a table of (user, action, time) and one desires
* to label each user's ith action with i.
*
* SELECT A.user, A.action, A.time,
* FB_NUMBER_ROWS(A.user) AS i
* FROM (
* SELECT *
* FROM table
* DISTRIBUTE BY user
* SORT BY user, time
* ) A
*
* DISTRIBUTE BY is needed so that all the rows with the same user will be
* seen by the same machine; otherwise each machine operating in parallel will
* number its own rows starting from one. The SORT BY ensures that the
* numbering proceeds in the desired order. Using A.user as the key of the
* operation ensures that the numbering restarts when a new user is seen.
*/
@Description(name = "NUMBER_ROWS",
    value = "_FUNC_(key1, key2, ...) - Number rows starting at 1. Whenever the value of any key changes the numbering is reset to 1.")
public class UDFNumberRows extends UDF {
  /** Empty key list substituted when the caller passes a null key array. */
  private static final Object[] NO_KEYS = new Object[0];

  /**
   * Snapshot of the keys seen on the most recent call, or {@code null} if
   * {@link #evaluate(Object...)} has not been called yet on this instance.
   */
  Object[] previous_keys = null;

  /** Number handed out by the most recent call; 0 before the first call. */
  int previous_index;

  /**
   * Returns the sequence number of the current row within its run of
   * identical keys.
   *
   * <p>The first call, and every call whose keys differ from those of the
   * preceding call, returns 1. Each further call with the same keys returns
   * one more than the previous call did. Calling with no keys at all numbers
   * every row consecutively.
   *
   * <p>This method mutates instance state, so its result depends on the
   * order in which it is invoked.
   *
   * @param keys zero or more values identifying the current group; a null
   *     array is treated the same as an empty key list. Individual keys may
   *     be null.
   * @return the 1-based position of this row within the current run of keys
   */
  public int evaluate(Object... keys) {
    // A null varargs array (e.g. evaluate((Object[]) null)) would otherwise
    // blow up in clone() below; interpret it as "no keys".
    final Object[] current = (keys == null) ? NO_KEYS : keys;

    // deepEquals compares array-valued keys (byte[], Object[], ...) by
    // content rather than by identity, so equal values in distinct array
    // instances do not spuriously restart the numbering. For non-array keys
    // it behaves exactly like Arrays.equals.
    if (previous_keys == null || !Arrays.deepEquals(previous_keys, current)) {
      previous_index = 0;
      // Shallow copy: protects against the caller reusing the array itself,
      // but the key objects are shared.
      // NOTE(review): if the caller mutates key objects in place between
      // rows, a key change would go undetected - confirm how values are
      // passed in by the execution engine.
      previous_keys = current.clone();
    }

    previous_index++;
    return previous_index;
  }
}