package brickhouse.udf.hll; /** * Copyright 2012,2013 Klout, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **/ import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFParameterInfo; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.log4j.Logger; /** * Aggregate and return a HyperLogLog. * <p/> * Uses Clearspring's Stream-lib project */ @Description(name = "hyperloglog", value = "_FUNC_(x, [b]) - Constructs a HyperLogLog++ estimator to estimate reach for large values, " + "with optional bit parameter for specifying precision (b must be in [4,16])." + "\nDefault is b = 6." + "\nReturns a binary value that represents the HyperLogLog++ data structure." ) public class HyperLogLogUDAF extends AbstractGenericUDAFResolver { private static final Logger LOG = Logger.getLogger(HyperLogLogUDAF.class); static final int DEFAULT_PRECISION = 6; static final int MIN_PRECISION = 4; static final int MAX_PRECISION = 16; @SuppressWarnings("deprecation") @Override public GenericUDAFEvaluator getEvaluator(GenericUDAFParameterInfo info) throws SemanticException { TypeInfo[] parameters = info.getParameters(); if (parameters.length != 1 && parameters.length != 2) { throw new UDFArgumentTypeException(parameters.length - 1, "Please specify one or two arguments."); } if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) { throw new UDFArgumentTypeException(0, "Only primitive type arguments are accepted but " + parameters[0].getTypeName() + " was passed as parameter 1."); } if (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.STRING) { throw new UDFArgumentTypeException(0, "Only a string argument is accepted as parameter 1, but " + parameters[0].getTypeName() + " was passed instead."); } if (parameters.length == 2) { // validate the second parameter, which is the precision value if (parameters[1].getCategory() != ObjectInspector.Category.PRIMITIVE) { throw new UDFArgumentTypeException(1, "Only primitive type arguments are accepted but " + parameters[1].getTypeName() + " was passed as parameter 2."); } if (((PrimitiveTypeInfo) parameters[1]).getPrimitiveCategory() != PrimitiveObjectInspector.PrimitiveCategory.INT) { throw new UDFArgumentTypeException(1, "Only an integer argument is accepted as parameter 2, but " + parameters[1].getTypeName() + " was passed instead."); } } if (parameters.length > 2) throw new IllegalArgumentException("Function only takes 1 or 2 parameters."); return new HyperLogLogUDAFEvaluator(); } public static class HyperLogLogUDAFEvaluator extends GenericUDAFEvaluator { private static final Logger LOG = Logger.getLogger(HyperLogLogUDAFEvaluator.class); // For PARTIAL1 and COMPLETE: ObjectInspectors for original data private StringObjectInspector inputStrOI; private IntObjectInspector inputPrecisionIntOI; // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations (binary serialized hll object) private BinaryObjectInspector partialBufferOI; public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { super.init(m, parameters); LOG.debug("evaluator init: mode = " + m.name()); // init input object inspectors if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) { // iterate() gets called.. string and int passed in this.inputStrOI = (StringObjectInspector) parameters[0]; if (parameters.length == 2) { this.inputPrecisionIntOI = (IntObjectInspector) parameters[1]; } } else { // Mode m == Mode.PARTIAL2 || m == Mode.FINAL // merge() gets called ... serialized hll is passed in this.partialBufferOI = (BinaryObjectInspector) parameters[0]; } // init output object inspectors // The partial aggregate type is the same as the final type return PrimitiveObjectInspectorFactory.javaByteArrayObjectInspector; } @Override public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { assert (parameters.length == 1 || parameters.length == 2); if ((parameters.length == 1 && parameters[0] == null) || (parameters.length == 2 && (parameters[0] == null || parameters[1] == null))) { return; } HLLBuffer myagg = (HLLBuffer) agg; // initialize aggregation buffer once if (!myagg.isReady()) { LOG.debug("agg buffer is not ready"); int p = DEFAULT_PRECISION; // If specified, parse out the precision and validate it is in allowed range. if (parameters.length == 2) { p = PrimitiveObjectInspectorUtils.getInt(parameters[1], inputPrecisionIntOI); if (p < MIN_PRECISION || p > MAX_PRECISION) { throw new HiveException(getClass().getSimpleName() + " precision must be in [4,16]," + " but you supplied " + p + "."); } } LOG.debug("initializing agg buffer: p = " + p); // allocate memory for the histogram bins myagg.init(p); } // string object to be added to hll Object strObj = parameters[0]; String str = inputStrOI.getPrimitiveJavaObject(strObj); myagg.addItem(str); } @Override public void merge(AggregationBuffer agg, Object partial) throws HiveException { if (partial == null) { return; } try { HLLBuffer myagg = (HLLBuffer) agg; byte[] partialBuffer = this.partialBufferOI .getPrimitiveJavaObject(partial); myagg.merge(partialBuffer); } catch (Exception e) { throw new HiveException(e); } } @Override public Object terminate(AggregationBuffer agg) throws HiveException { try { HLLBuffer myagg = (HLLBuffer) agg; return myagg.getPartial(); } catch (Exception e) { throw new HiveException(e); } } @Override public Object terminatePartial(AggregationBuffer agg) throws HiveException { LOG.debug("terminatePartial"); return terminate(agg); } @Override public AggregationBuffer getNewAggregationBuffer() throws HiveException { LOG.debug("getNewAggregationBuffer"); HLLBuffer buff = new HLLBuffer(); reset(buff); return buff; } @Override public void reset(AggregationBuffer buff) throws HiveException { LOG.debug("reset"); HLLBuffer hllBuff = (HLLBuffer) buff; hllBuff.reset(); } } }