/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.udf.generic;

import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.util.JavaDataModel;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;

/**
 * Computes an approximate histogram of a numerical column using a user-specified number of bins.
 *
 * The output is an array of (x,y) pairs as Hive struct objects that represents the histogram's
 * bin centers and heights.
 */
@Description(name = "histogram_numeric",
    value = "_FUNC_(expr, nb) - Computes a histogram on numeric 'expr' using nb bins.",
    extended = "Example:\n"
    + "> SELECT histogram_numeric(val, 3) FROM src;\n"
    + "[{\"x\":100,\"y\":14.0},{\"x\":200,\"y\":22.0},{\"x\":290.5,\"y\":11.0}]\n"
    + "The return value is an array of (x,y) pairs representing the centers of the "
    + "histogram's bins. As the value of 'nb' is increased, the histogram approximation "
    + "gets finer-grained, but may yield artifacts around outliers. In practice, 20-40 "
    + "histogram bins appear to work well, with more bins being required for skewed or "
    + "smaller datasets. Note that this function creates a histogram with non-uniform "
    + "bin widths. It offers no guarantees in terms of the mean-squared-error of the "
    + "histogram, but in practice is comparable to the histograms produced by the R/S-Plus "
    + "statistical computing packages.")
public class GenericUDAFHistogramNumeric extends AbstractGenericUDAFResolver {
  // class static variables
  static final Logger LOG = LoggerFactory.getLogger(GenericUDAFHistogramNumeric.class.getName());

  @Override
  public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException {
    if (parameters.length != 2) {
      throw new UDFArgumentTypeException(parameters.length - 1,
          "Please specify exactly two arguments.");
    }

    // validate the first parameter, which is the expression to compute over
    if (parameters[0].getCategory() != ObjectInspector.Category.PRIMITIVE) {
      throw new UDFArgumentTypeException(0,
          "Only primitive type arguments are accepted but "
          + parameters[0].getTypeName() + " was passed as parameter 1.");
    }
    switch (((PrimitiveTypeInfo) parameters[0]).getPrimitiveCategory()) {
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
    case FLOAT:
    case DOUBLE:
    case TIMESTAMP:
    case DECIMAL:
      break;
    case STRING:
    case BOOLEAN:
    case DATE:
    default:
      throw new UDFArgumentTypeException(0,
          "Only numeric type arguments are accepted but "
          + parameters[0].getTypeName() + " was passed as parameter 1.");
    }

    // validate the second parameter, which is the number of histogram bins
    if (parameters[1].getCategory() != ObjectInspector.Category.PRIMITIVE) {
      throw new UDFArgumentTypeException(1,
          "Only primitive type arguments are accepted but "
          + parameters[1].getTypeName() + " was passed as parameter 2.");
    }
    if (((PrimitiveTypeInfo) parameters[1]).getPrimitiveCategory()
        != PrimitiveObjectInspector.PrimitiveCategory.INT) {
      throw new UDFArgumentTypeException(1,
          "Only an integer argument is accepted as parameter 2, but "
          + parameters[1].getTypeName() + " was passed instead.");
    }

    return new GenericUDAFHistogramNumericEvaluator();
  }

  /**
   * Construct a histogram using an algorithm described by Ben-Haim and Tom-Tov.
   *
   * The algorithm is a heuristic adapted from the following paper:
   * Yael Ben-Haim and Elad Tom-Tov, "A streaming parallel decision tree algorithm",
   * J. Machine Learning Research 11 (2010), pp. 849--872. Although there are no approximation
   * guarantees, it appears to work well with adequate data and a large (e.g., 20-80) number
   * of histogram bins.
   */
  public static class GenericUDAFHistogramNumericEvaluator extends GenericUDAFEvaluator {
    // For PARTIAL1 and COMPLETE: ObjectInspectors for original data
    private PrimitiveObjectInspector inputOI;
    private transient PrimitiveObjectInspector nbinsOI;

    // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations (list of doubles)
    private transient ListObjectInspector loi;

    @Override
    public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
      super.init(m, parameters);

      // init input object inspectors
      if (m == Mode.PARTIAL1 || m == Mode.COMPLETE) {
        assert (parameters.length == 2);
        inputOI = (PrimitiveObjectInspector) parameters[0];
        nbinsOI = (PrimitiveObjectInspector) parameters[1];
      } else {
        loi = (ListObjectInspector) parameters[0];
      }

      // init output object inspectors
      if (m == Mode.PARTIAL1 || m == Mode.PARTIAL2) {
        // The output of a partial aggregation is a list of doubles representing the
        // histogram being constructed. The first element in the list is the user-specified
        // number of bins in the histogram, and the histogram itself is represented as (x,y)
        // pairs following the first element, so the list length should *always* be odd.
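        //
        // Illustrative example of that layout (values made up, not from a real run):
        // a histogram allocated with nbins = 3 whose bins are (1.0, 5.0), (2.5, 3.0),
        // (7.0, 1.0) would serialize as [3.0, 1.0, 5.0, 2.5, 3.0, 7.0, 1.0], i.e. a
        // list of length 1 + 2*3 = 7, which is odd as expected.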
        return ObjectInspectorFactory.getStandardListObjectInspector(
            PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
      } else {
        // The output of FINAL and COMPLETE is a full aggregation, which is a
        // list of DoubleWritable structs that represent the final histogram as
        // (x,y) pairs of bin centers and heights.
        ArrayList<ObjectInspector> foi = new ArrayList<ObjectInspector>();
        foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
        foi.add(PrimitiveObjectInspectorFactory.writableDoubleObjectInspector);
        ArrayList<String> fname = new ArrayList<String>();
        fname.add("x");
        fname.add("y");
        return ObjectInspectorFactory.getStandardListObjectInspector(
            ObjectInspectorFactory.getStandardStructObjectInspector(fname, foi));
      }
    }

    @Override
    public Object terminatePartial(AggregationBuffer agg) throws HiveException {
      // Return a single ArrayList where the first element is the number of histogram bins,
      // and subsequent elements represent histogram (x,y) pairs.
      StdAgg myagg = (StdAgg) agg;
      return myagg.histogram.serialize();
    }

    @Override
    public Object terminate(AggregationBuffer agg) throws HiveException {
      StdAgg myagg = (StdAgg) agg;
      if (myagg.histogram.getUsedBins() < 1) {
        // SQL standard - return null for zero elements
        return null;
      } else {
        ArrayList<DoubleWritable[]> result = new ArrayList<DoubleWritable[]>();
        for (int i = 0; i < myagg.histogram.getUsedBins(); i++) {
          DoubleWritable[] bin = new DoubleWritable[2];
          bin[0] = new DoubleWritable(myagg.histogram.getBin(i).x);
          bin[1] = new DoubleWritable(myagg.histogram.getBin(i).y);
          result.add(bin);
        }
        return result;
      }
    }

    @Override
    public void merge(AggregationBuffer agg, Object partial) throws HiveException {
      if (partial == null) {
        return;
      }
      List<DoubleWritable> partialHistogram = (List<DoubleWritable>) loi.getList(partial);
      DoubleObjectInspector doi = (DoubleObjectInspector) loi.getListElementObjectInspector();

      StdAgg myagg = (StdAgg) agg;
      myagg.histogram.merge(partialHistogram, doi);
    }

    @Override
    public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException {
      assert (parameters.length == 2);
      if (parameters[0] == null || parameters[1] == null) {
        return;
      }
      StdAgg myagg = (StdAgg) agg;

      // Parse out the number of histogram bins only once, if we haven't already done
      // so before. We need at least 2 bins; otherwise, there is no point in creating
      // a histogram.
      if (!myagg.histogram.isReady()) {
        int nbins = PrimitiveObjectInspectorUtils.getInt(parameters[1], nbinsOI);
        if (nbins < 2) {
          throw new HiveException(getClass().getSimpleName() + " needs nbins to be at least 2,"
              + " but you supplied " + nbins + ".");
        }

        // allocate memory for the histogram bins
        myagg.histogram.allocate(nbins);
      }

      // Process the current data point
      double v = PrimitiveObjectInspectorUtils.getDouble(parameters[0], inputOI);
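      // NumericHistogram.add() performs the update step of the Ben-Haim/Tom-Tov
      // heuristic cited in the class Javadoc: conceptually, the point first becomes
      // its own (v, 1) bin, and if the bin count then exceeds nbins, the two closest
      // bins are merged into one at their weighted-average center with the sum of
      // their heights.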
      myagg.histogram.add(v);
    }

    // Aggregation buffer definition and manipulation methods
    @AggregationType(estimable = true)
    static class StdAgg extends AbstractAggregationBuffer {
      NumericHistogram histogram; // the histogram object

      @Override
      public int estimate() {
        return histogram.lengthFor(JavaDataModel.get());
      }
    }

    @Override
    public AggregationBuffer getNewAggregationBuffer() throws HiveException {
      StdAgg result = new StdAgg();
      reset(result);
      return result;
    }

    @Override
    public void reset(AggregationBuffer agg) throws HiveException {
      StdAgg myagg = (StdAgg) agg;
      myagg.histogram = new NumericHistogram();
      myagg.histogram.reset();
    }
  }
}