package org.notmysock.hive; import java.util.ArrayList; import java.util.List; import hyperloglog.HyperLogLog; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.AggregationBuffer; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.LongWritable; import org.notmysock.hive.UDAFHyperLogLog.HyperLogLogBuffer; import org.notmysock.hive.UDAFHyperLogLog.HyperLogLogEvaluator; @Description(name = "approx_distinct", value = "_FUNC_(x)") public class UDAFApproximateDistinct extends UDAFHyperLogLog { public static final class CountApproximateDistinctEvaluator extends HyperLogLogEvaluator { @Override public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { ObjectInspector hyperloglog = super.init(m, parameters); if(m == Mode.FINAL || m == Mode.COMPLETE) { return PrimitiveObjectInspectorFactory.writableLongObjectInspector; } return hyperloglog; } @Override public Object terminate(AggregationBuffer agg) throws HiveException { HyperLogLog hll = ((HyperLogLogBuffer)agg).hll; return new LongWritable(hll.count()); } } @Override public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { if (parameters.length != 1) { throw new IllegalArgumentException("Function only takes 1 parameter"); } else if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE && parameters[0].getCategory() != ObjectInspector.Category.STRUCT) { throw new UDFArgumentTypeException(1, "Only primitive/struct rows are accepted but " + parameters[0].getTypeName() + " was passed."); } return new CountApproximateDistinctEvaluator(); } }