GenericUDAFEvaluator.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.udf.generic;

import java.io.Closeable;
import java.io.IOException;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.MapredContext;
import org.apache.hadoop.hive.ql.exec.PTFPartition;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.ptf.PTFExpressionDef;
import org.apache.hadoop.hive.ql.plan.ptf.WindowFrameDef;
import org.apache.hadoop.hive.ql.udf.UDFType;
import org.apache.hadoop.hive.ql.udf.ptf.BasePartitionEvaluator;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hive.common.util.AnnotationUtils;

/**
 * A Generic User-defined aggregation function (GenericUDAF) for the use with
 * Hive.
 * 
 * New GenericUDAF classes need to inherit from this GenericUDAF class.
 * 
 * The GenericUDAF are superior to normal UDAFs in the following ways: 1. It can
 * accept arguments of complex types, and return complex types. 2. It can accept
 * variable length of arguments. 3. It can accept an infinite number of function
 * signature - for example, it's easy to write a GenericUDAF that accepts
 * array<int>, array<array<int>> and so on (arbitrary levels of nesting).
 */
@UDFType(deterministic = true)
public abstract class GenericUDAFEvaluator implements Closeable {

  @Retention(RetentionPolicy.RUNTIME)
  public static @interface AggregationType {
    boolean estimable() default false;
  }

  public static boolean isEstimable(AggregationBuffer buffer) {
    if (buffer instanceof AbstractAggregationBuffer) {
      Class<? extends AggregationBuffer> clazz = buffer.getClass();
      AggregationType annotation = AnnotationUtils.getAnnotation(clazz, AggregationType.class);
      return annotation != null && annotation.estimable();
    }
    return false;
  }

  /**
   * Mode.
   *
   */
  public static enum Mode {
    /**
     * PARTIAL1: from original data to partial aggregation data: iterate() and
     * terminatePartial() will be called.
     */
    PARTIAL1,
        /**
     * PARTIAL2: from partial aggregation data to partial aggregation data:
     * merge() and terminatePartial() will be called.
     */
    PARTIAL2,
        /**
     * FINAL: from partial aggregation to full aggregation: merge() and
     * terminate() will be called.
     */
    FINAL,
        /**
     * COMPLETE: from original data directly to full aggregation: iterate() and
     * terminate() will be called.
     */
    COMPLETE
  };

  Mode mode;

  /**
   * The constructor.
   */
  public GenericUDAFEvaluator() {
  }

  /**
   * Additionally setup GenericUDAFEvaluator with MapredContext before initializing.
   * This is only called in runtime of MapRedTask.
   *
   * @param mapredContext context
   */
  public void configure(MapredContext mapredContext) {
  }

  /**
   * Initialize the evaluator.
   * 
   * @param m
   *          The mode of aggregation.
   * @param parameters
   *          The ObjectInspector for the parameters: In PARTIAL1 and COMPLETE
   *          mode, the parameters are original data; In PARTIAL2 and FINAL
   *          mode, the parameters are just partial aggregations (in that case,
   *          the array will always have a single element).
   * @return The ObjectInspector for the return value. In PARTIAL1 and PARTIAL2
   *         mode, the ObjectInspector for the return value of
   *         terminatePartial() call; In FINAL and COMPLETE mode, the
   *         ObjectInspector for the return value of terminate() call.
   * 
   *         NOTE: We need ObjectInspector[] (in addition to the TypeInfo[] in
   *         GenericUDAFResolver) for 2 reasons: 1. ObjectInspector contains
   *         more information than TypeInfo; and GenericUDAFEvaluator.init at
   *         execution time. 2. We call GenericUDAFResolver.getEvaluator at
   *         compilation time,
   */
  public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException {
    // This function should be overriden in every sub class
    // And the sub class should call super.init(m, parameters) to get mode set.
    mode = m;
    return null;
  }

  /**
   * The interface for a class that is used to store the aggregation result
   * during the process of aggregation.
   * 
   * We split this piece of data out because there can be millions of instances
   * of this Aggregation in hash-based aggregation process, and it's very
   * important to conserve memory.
   * 
   * In the future, we may completely hide this class inside the Evaluator and
   * use integer numbers to identify which aggregation we are looking at.
   *
   * @deprecated use {@link AbstractAggregationBuffer} instead
   */
  public static interface AggregationBuffer {
  };

  public static abstract class AbstractAggregationBuffer implements AggregationBuffer {
    /**
     * Estimate the size of memory which is occupied by aggregation buffer.
     * Currently, hive assumes that primitives types occupies 16 byte and java object has
     * 64 byte overhead for each. For map, each entry also has 64 byte overhead.
     */
    public int estimate() { return -1; }
  }

  /**
   * Get a new aggregation object.
   */
  public abstract AggregationBuffer getNewAggregationBuffer() throws HiveException;

  /**
   * Reset the aggregation. This is useful if we want to reuse the same
   * aggregation.
   */
  public abstract void reset(AggregationBuffer agg) throws HiveException;

  /**
   * Close GenericUDFEvaluator.
   * This is only called in runtime of MapRedTask.
   */
  public void close() throws IOException {
  }

  /**
   * This function will be called by GroupByOperator when it sees a new input
   * row.
   * 
   * @param agg
   *          The object to store the aggregation result.
   * @param parameters
   *          The row, can be inspected by the OIs passed in init().
   */
  public void aggregate(AggregationBuffer agg, Object[] parameters) throws HiveException {
    if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
      iterate(agg, parameters);
    } else {
      assert (parameters.length == 1);
      merge(agg, parameters[0]);
    }
  }

  /**
   * This function will be called by GroupByOperator when it sees a new input
   * row.
   * 
   * @param agg
   *          The object to store the aggregation result.
   */
  public Object evaluate(AggregationBuffer agg) throws HiveException {
    if (mode == Mode.PARTIAL1 || mode == Mode.PARTIAL2) {
      return terminatePartial(agg);
    } else {
      return terminate(agg);
    }
  }

  /**
   * Iterate through original data.
   * 
   * @param parameters
   *          The objects of parameters.
   */
  public abstract void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException;

  /**
   * Get partial aggregation result.
   * 
   * @return partial aggregation result.
   */
  public abstract Object terminatePartial(AggregationBuffer agg) throws HiveException;

  /**
   * Merge with partial aggregation result. NOTE: null might be passed in case
   * there is no input data.
   * 
   * @param partial
   *          The partial aggregation result.
   */
  public abstract void merge(AggregationBuffer agg, Object partial) throws HiveException;

  /**
   * Get final aggregation result.
   * 
   * @return final aggregation result.
   */
  public abstract Object terminate(AggregationBuffer agg) throws HiveException;

  /**
   * When evaluating an aggregates over a fixed Window, the naive way to compute
   * results is to compute the aggregate for each row. But often there is a way
   * to compute results in a more efficient manner. This method enables the
   * basic evaluator to provide a function object that does the job in a more
   * efficient manner.
   * <p>
   * This method is called after this Evaluator is initialized. The returned
   * Function must be initialized. It is passed the 'window' of aggregation for
   * each row.
   * 
   * @param wFrmDef
   *          the Window definition in play for this evaluation.
   * @return null implies that this fn cannot be processed in Streaming mode. So
   *         each row is evaluated independently.
   */
  public GenericUDAFEvaluator getWindowingEvaluator(WindowFrameDef wFrmDef) {
    return null;
  }

  /**
   * Optional information to add to expression string. Subclasses can override.
   */
  public String getExprString() {
    return "";
  }

  protected BasePartitionEvaluator partitionEvaluator;

  /**
   * When evaluating an aggregates over a fixed Window, streaming is not possible
   * especially for RANGE Window type. For such case, the whole partition data needs
   * to be collected and then to evaluate the aggregates. The naive approach is to
   * calculate a row range for each row and to perform the aggregates. For some
   * functions, a better implementation can be used to reduce the calculation.
   * Note: since the evaluator is reused across different partitions, AggregationBuffer
   * needs reset before aggregating for the new partition in the implementation.
   * @param winFrame    the Window definition in play for this evaluation.
   * @param partition   the partition data
   * @param parameters  the list of the expressions in the function
   * @param outputOI    the output object inspector
   * @return            the evaluator, default to BasePartitionEvaluator which
   *                    implements the naive approach
   */
  public final BasePartitionEvaluator getPartitionWindowingEvaluator(
      WindowFrameDef winFrame,
      PTFPartition partition,
      List<PTFExpressionDef> parameters,
      ObjectInspector outputOI) {
    if (partitionEvaluator == null) {
      partitionEvaluator = createPartitionEvaluator(winFrame, partition, parameters, outputOI);
    }

    return partitionEvaluator;
  }

  /**
   *  This class needs to be overridden by the child class to implement function
   *  specific evaluator.
   */
  protected BasePartitionEvaluator createPartitionEvaluator(
      WindowFrameDef winFrame,
      PTFPartition partition,
      List<PTFExpressionDef> parameters,
      ObjectInspector outputOI) {
    return new BasePartitionEvaluator(this, winFrame, partition, parameters, outputOI);
  }
}