package org.notmysock.hive;

import hyperloglog.HyperLogLog;
import hyperloglog.HyperLogLogUtils;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;

/**
 * Hive UDF registered as {@code hyperloglog_value}: turns a serialized
 * HyperLogLog, passed as a binary value, into the number reported by
 * {@link HyperLogLog#count()}.
 */
@Description(name = "hyperloglog_value", value = "_FUNC_(x)")
public class UDFHyperLogLogValue extends UDF {

  /**
   * Deserializes the HyperLogLog held in {@code bw} and returns its count.
   *
   * @param bw serialized HyperLogLog bytes; may be {@code null}
   * @return a new {@link LongWritable} holding {@code count()} of the
   *     deserialized HyperLogLog, or {@code 0} when {@code bw} is {@code null}
   * @throws HiveException wrapping any {@link IOException} raised while
   *     deserializing the bytes
   */
  public LongWritable evaluate(BytesWritable bw) throws HiveException {
    // A null argument is reported as a count of zero rather than as null.
    if (bw == null) {
      return new LongWritable(0);
    }
    final HyperLogLog sketch = readSketch(bw);
    return new LongWritable(sketch.count());
  }

  /**
   * Reads a HyperLogLog from the first {@code getLength()} bytes of the
   * writable, converting I/O failures into {@link HiveException}.
   */
  private static HyperLogLog readSketch(BytesWritable payload) throws HiveException {
    // Bound the stream by getLength() instead of the array length so that only
    // the bytes the writable reports as valid are consumed.
    final ByteArrayInputStream stream =
        new ByteArrayInputStream(payload.getBytes(), 0, payload.getLength());
    try {
      return HyperLogLogUtils.deserializeHLL(stream);
    } catch (IOException ioe) {
      // Keep the original exception as the cause for diagnostics.
      throw new HiveException(ioe);
    }
  }
}