/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.vector;
import java.lang.reflect.Constructor;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.commons.lang.ArrayUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.common.type.HiveChar;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.common.type.HiveIntervalDayTime;
import org.apache.hadoop.hive.common.type.HiveIntervalYearMonth;
import org.apache.hadoop.hive.common.type.HiveVarchar;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
import org.apache.hadoop.hive.ql.exec.FunctionInfo;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector.Type;
import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.ArgumentType;
import org.apache.hadoop.hive.ql.exec.vector.VectorExpressionDescriptor.InputExpressionType;
import org.apache.hadoop.hive.ql.exec.vector.expressions.*;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFAvgDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFAvgTimestamp;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFBloomFilter;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFBloomFilterMerge;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFCount;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFCountMerge;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFCountStar;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFStdPopTimestamp;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFStdSampTimestamp;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFSumDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFVarPopTimestamp;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorUDAFVarSampTimestamp;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFAvgDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFAvgLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMaxDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMaxDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMaxLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMaxString;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMaxTimestamp;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMinDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMinDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMinLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMinString;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFMinTimestamp;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFStdPopDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFStdPopDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFStdPopLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFStdSampDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFStdSampDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFStdSampLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFSumDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFSumLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarPopDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarPopDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarPopLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarSampDecimal;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarSampDouble;
import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.gen.VectorUDAFVarSampLong;
import org.apache.hadoop.hive.ql.exec.vector.expressions.gen.*;
import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFAdaptor;
import org.apache.hadoop.hive.ql.exec.vector.udf.VectorUDFArgDesc;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.optimizer.physical.Vectorizer;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicValueDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.udf.*;
import org.apache.hadoop.hive.ql.udf.generic.*;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.hadoop.hive.serde2.io.DoubleWritable;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.typeinfo.BaseCharTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.hive.common.util.DateUtils;
import com.google.common.annotations.VisibleForTesting;
/**
 * Context class for vectorized execution.
 * Its main role is to map column names to column indices; it also serves as a
 * factory for building vectorized expressions out of expression descriptors.
 */
public class VectorizationContext {
private static final Logger LOG = LoggerFactory.getLogger(
    VectorizationContext.class.getName());
// Name supplied by whoever creates this context.
private final String contextName;
// Nesting depth: 0 for contexts built from scratch, parent level + 1 for a
// projection context derived from another context.
private final int level;
// Created fresh by every constructor.
VectorExpressionDescriptor vMap;
// Names of the columns the batch was initially created with.
private final List<String> initialColumnNames;
// Batch column indices of the currently projected columns, in projection order.
private List<Integer> projectedColumns;
// Names of the currently projected columns, parallel to projectedColumns.
private List<String> projectionColumnNames;
// Projected column name -> batch column index.
private Map<String, Integer> projectionColumnMap;
//columnName to column position map
// private final Map<String, Integer> columnMap;
// Batch index of the first scratch (output) column, i.e. the number of initial columns.
private int firstOutputColumnIndex;
/**
 * Selects when the VectorUDFAdaptor fallback may be used for an expression
 * that has no vectorized implementation: never ({@code NONE}), only for the
 * chosen UDFs ({@code CHOSEN}), or always ({@code ALL}).
 */
public enum HiveVectorAdaptorUsageMode {
  NONE,
  CHOSEN,
  ALL;

  /**
   * Reads {@code HIVE_VECTOR_ADAPTOR_USAGE_MODE} from the configuration and maps it,
   * ignoring case, onto one of the enum constants.
   *
   * @param hiveConf configuration to read the setting from
   * @return the usage mode named by the configured value
   * @throws IllegalArgumentException if the configured value names no constant
   */
  public static HiveVectorAdaptorUsageMode getHiveConfValue(HiveConf hiveConf) {
    String string = HiveConf.getVar(hiveConf,
        HiveConf.ConfVars.HIVE_VECTOR_ADAPTOR_USAGE_MODE);
    // Upper-case with a fixed locale so the mapping never depends on the JVM default locale.
    return valueOf(string.toUpperCase(java.util.Locale.ROOT));
  }
}
// Adaptor usage mode for this context. Left null when the context is built
// without a HiveConf and nothing is copied in from another context.
private HiveVectorAdaptorUsageMode hiveVectorAdaptorUsageMode;
/** Loads the configuration-driven settings of this context from {@code hiveConf}. */
private void setHiveConfVars(HiveConf hiveConf) {
  final HiveVectorAdaptorUsageMode configuredMode =
      HiveVectorAdaptorUsageMode.getHiveConfValue(hiveConf);
  this.hiveVectorAdaptorUsageMode = configuredMode;
}
/** Takes over the configuration-driven settings of another context. */
private void copyHiveConfVars(VectorizationContext vContextEnvironment) {
  final HiveVectorAdaptorUsageMode inheritedMode =
      vContextEnvironment.hiveVectorAdaptorUsageMode;
  this.hiveVectorAdaptorUsageMode = inheritedMode;
}
// Convenient constructor for initial batch creation takes
// a list of columns names and maps them to 0..n-1 indices.
public VectorizationContext(String contextName, List<String> initialColumnNames,
HiveConf hiveConf) {
this.contextName = contextName;
level = 0;
this.initialColumnNames = initialColumnNames;
this.projectionColumnNames = initialColumnNames;
projectedColumns = new ArrayList<Integer>();
projectionColumnMap = new HashMap<String, Integer>();
for (int i = 0; i < this.projectionColumnNames.size(); i++) {
projectedColumns.add(i);
projectionColumnMap.put(projectionColumnNames.get(i), i);
}
int firstOutputColumnIndex = projectedColumns.size();
this.ocm = new OutputColumnManager(firstOutputColumnIndex);
this.firstOutputColumnIndex = firstOutputColumnIndex;
vMap = new VectorExpressionDescriptor();
if (hiveConf != null) {
setHiveConfVars(hiveConf);
}
}
// Same as the (name, columns, HiveConf) constructor, except that the
// configuration-driven settings are taken from an existing context rather than
// read from a HiveConf.
public VectorizationContext(String contextName, List<String> initialColumnNames,
    VectorizationContext vContextEnvironment) {
  this(contextName, initialColumnNames, (HiveConf) null);
  this.hiveVectorAdaptorUsageMode = vContextEnvironment.hiveVectorAdaptorUsageMode;
}
// Test-only convenience constructor: builds the context without a HiveConf, so
// hiveVectorAdaptorUsageMode is left unset (null).
@VisibleForTesting
public VectorizationContext(String contextName, List<String> initialColumnNames) {
  // The cast selects the HiveConf overload; a bare null would be ambiguous.
  this(contextName, initialColumnNames, (HiveConf) null);
}
// Constructor for use with the incremental addInitialColumn method, followed by
// a call to finishedAddingInitialColumns. Starts with no columns at all; the
// output column manager created here begins at index 0.
public VectorizationContext(String contextName, HiveConf hiveConf) {
  this.contextName = contextName;
  this.level = 0;
  this.initialColumnNames = new ArrayList<>();
  this.projectedColumns = new ArrayList<>();
  this.projectionColumnNames = new ArrayList<>();
  this.projectionColumnMap = new HashMap<>();
  this.ocm = new OutputColumnManager(0);
  this.firstOutputColumnIndex = 0;
  this.vMap = new VectorExpressionDescriptor();

  // Without a HiveConf the adaptor usage mode simply stays unset.
  if (hiveConf == null) {
    return;
  }
  setHiveConfVars(hiveConf);
}
// Test-only convenience constructor: an empty context built without a HiveConf,
// so hiveVectorAdaptorUsageMode is left unset (null).
@VisibleForTesting
public VectorizationContext(String contextName) {
  // The cast selects the HiveConf overload; a bare null would be ambiguous.
  this(contextName, (HiveConf) null);
}
// Constructor for making a projection vectorization context on top of an
// existing one. Use with resetProjectionColumns and addProjectionColumn.
// The initial column names, the output column manager and the first output
// column index are shared with the parent; the projection starts out empty.
public VectorizationContext(String contextName, VectorizationContext vContext) {
  this.contextName = contextName;
  this.level = vContext.level + 1;

  // State inherited from the parent context.
  this.initialColumnNames = vContext.initialColumnNames;
  this.ocm = vContext.ocm;
  this.firstOutputColumnIndex = vContext.firstOutputColumnIndex;
  this.hiveVectorAdaptorUsageMode = vContext.hiveVectorAdaptorUsageMode;

  // State owned by this context.
  this.projectedColumns = new ArrayList<>();
  this.projectionColumnNames = new ArrayList<>();
  this.projectionColumnMap = new HashMap<>();
  this.vMap = new VectorExpressionDescriptor();
}
// Registers one more initial column while a vectorized row batch is being
// created. The column gets the next free batch index and is added to the
// projection as well.
public void addInitialColumn(String columnName) {
  this.initialColumnNames.add(columnName);

  final int nextBatchIndex = this.projectedColumns.size();
  this.projectedColumns.add(nextBatchIndex);
  this.projectionColumnNames.add(columnName);
  this.projectionColumnMap.put(columnName, nextBatchIndex);
}
// Completes the context once every initial column has been added: scratch
// columns begin right after the last projected column, and a fresh output
// column manager is created for that starting index.
public void finishedAddingInitialColumns() {
  final int scratchStart = this.projectedColumns.size();
  this.firstOutputColumnIndex = scratchStart;
  this.ocm = new OutputColumnManager(scratchStart);
}
// Drops the current projection by replacing all three projection collections
// with new, empty instances (previously returned collections are not cleared).
public void resetProjectionColumns() {
  this.projectionColumnMap = new HashMap<>();
  this.projectionColumnNames = new ArrayList<>();
  this.projectedColumns = new ArrayList<>();
}
// Appends a column to the projection of a projection vectorization context,
// recording both its name and the batch column index it maps to.
public void addProjectionColumn(String columnName, int vectorBatchColIndex) {
  this.projectedColumns.add(vectorBatchColIndex);
  this.projectionColumnNames.add(columnName);
  this.projectionColumnMap.put(columnName, vectorBatchColIndex);
}
/** Returns the internal list of initial column names (no defensive copy). */
public List<String> getInitialColumnNames() {
  return this.initialColumnNames;
}
/** Returns the internal list of projected batch column indices (no defensive copy). */
public List<Integer> getProjectedColumns() {
  return this.projectedColumns;
}
/** Returns the internal list of projected column names (no defensive copy). */
public List<String> getProjectionColumnNames() {
  return this.projectionColumnNames;
}
/** Returns the internal name-to-batch-index map of the projection (no defensive copy). */
public Map<String, Integer> getProjectionColumnMap() {
  return this.projectionColumnMap;
}
// Case-insensitive patterns over type name strings. Each one is a prefix
// followed by ".*", so it recognizes parameterized names as well as the bare
// type name.

// Type names beginning with "decimal".
public static final Pattern decimalTypePattern = Pattern.compile("decimal.*",
    Pattern.CASE_INSENSITIVE);
// Type names beginning with "char".
public static final Pattern charTypePattern = Pattern.compile("char.*",
    Pattern.CASE_INSENSITIVE);
// Type names beginning with "varchar".
public static final Pattern varcharTypePattern = Pattern.compile("varchar.*",
    Pattern.CASE_INSENSITIVE);
// Type names beginning with either "char" or "varchar".
public static final Pattern charVarcharTypePattern = Pattern.compile("char.*|varchar.*",
    Pattern.CASE_INSENSITIVE);
// Output column manager: hands out scratch (output) column numbers and records
// the type name of each one. Projection contexts share their parent's instance.
private OutputColumnManager ocm;
// UDF classes that perform type casting of data types in row-mode.
private static Set<Class<?>> castExpressionUdfs = new HashSet<Class<?>>();
static {
  final Class<?>[] castUdfClasses = {
      GenericUDFToDecimal.class,
      GenericUDFToBinary.class,
      GenericUDFToDate.class,
      GenericUDFToUnixTimeStamp.class,
      GenericUDFToUtcTimestamp.class,
      GenericUDFToChar.class,
      GenericUDFToVarchar.class,
      GenericUDFTimestamp.class,
      GenericUDFToIntervalYearMonth.class,
      GenericUDFToIntervalDayTime.class,
      UDFToByte.class,
      UDFToBoolean.class,
      UDFToDouble.class,
      UDFToFloat.class,
      UDFToString.class,
      UDFToInteger.class,
      UDFToLong.class,
      UDFToShort.class
  };
  for (Class<?> castUdfClass : castUdfClasses) {
    castExpressionUdfs.add(castUdfClass);
  }
}
// UDF classes that need implicit type casting of decimal parameters.
// Vectorization of mathematical functions currently relies on decimal parameters
// being converted automatically to the return type (see getImplicitCastExpression()),
// which is not correct in the general case. This set restricts the automatic
// conversion to just the functions listed here.
private static Set<Class<?>> udfsNeedingImplicitDecimalCast = new HashSet<Class<?>>();
static {
  final Class<?>[] implicitDecimalCastUdfs = {
      // Generic UDFs: arithmetic, rounding and related math.
      GenericUDFOPPlus.class,
      GenericUDFOPMinus.class,
      GenericUDFOPMultiply.class,
      GenericUDFOPDivide.class,
      GenericUDFOPMod.class,
      GenericUDFRound.class,
      GenericUDFBRound.class,
      GenericUDFFloor.class,
      GenericUDFCbrt.class,
      GenericUDFCeil.class,
      GenericUDFAbs.class,
      GenericUDFPosMod.class,
      GenericUDFPower.class,
      GenericUDFFactorial.class,
      GenericUDFOPPositive.class,
      GenericUDFOPNegative.class,
      // Generic UDFs: value selection.
      GenericUDFCoalesce.class,
      GenericUDFElt.class,
      GenericUDFGreatest.class,
      GenericUDFLeast.class,
      GenericUDFIn.class,
      // Generic UDFs: comparisons.
      GenericUDFOPEqual.class,
      GenericUDFOPEqualNS.class,
      GenericUDFOPNotEqual.class,
      GenericUDFOPLessThan.class,
      GenericUDFOPEqualOrLessThan.class,
      GenericUDFOPGreaterThan.class,
      GenericUDFOPEqualOrGreaterThan.class,
      GenericUDFBetween.class,
      // Old-style UDF classes.
      UDFSqrt.class,
      UDFRand.class,
      UDFLn.class,
      UDFLog2.class,
      UDFSin.class,
      UDFAsin.class,
      UDFCos.class,
      UDFAcos.class,
      UDFLog10.class,
      UDFLog.class,
      UDFExp.class,
      UDFDegrees.class,
      UDFRadians.class,
      UDFAtan.class,
      UDFTan.class,
      UDFOPLongDivide.class
  };
  for (Class<?> udfClass : implicitDecimalCastUdfs) {
    udfsNeedingImplicitDecimalCast.add(udfClass);
  }
}
/**
 * Tells whether the given UDF is one of those for which decimal parameters are
 * implicitly cast. For a {@code GenericUDFBridge} the wrapped UDF class is
 * looked up instead of the bridge class itself.
 *
 * @param udf the UDF to check
 * @return true if the (possibly unwrapped) UDF class is in udfsNeedingImplicitDecimalCast
 */
protected boolean needsImplicitCastForDecimal(GenericUDF udf) {
  final Class<?> lookupClass;
  if (udf instanceof GenericUDFBridge) {
    lookupClass = ((GenericUDFBridge) udf).getUdfClass();
  } else {
    lookupClass = udf.getClass();
  }
  return udfsNeedingImplicitDecimalCast.contains(lookupClass);
}
/**
 * Resolves a projected column name to its batch column index.
 *
 * @param name column name to look up
 * @return the batch column index registered for the name
 * @throws HiveException if the name is null or is not in the projection column map
 */
protected int getInputColumnIndex(String name) throws HiveException {
  if (name == null) {
    throw new HiveException("Null column name");
  }
  if (projectionColumnMap.containsKey(name)) {
    return projectionColumnMap.get(name);
  }
  final String message = String.format(
      "The column %s is not in the vectorization context column map %s.",
      name, projectionColumnMap.toString());
  throw new HiveException(message);
}
/**
 * Resolves a column expression to its batch column index by delegating to the
 * name-based lookup, which performs the error checking.
 *
 * @throws HiveException if the column name is null or unknown to this context
 */
protected int getInputColumnIndex(ExprNodeColumnDesc colExpr) throws HiveException {
  final String columnName = colExpr.getColumn();
  return getInputColumnIndex(columnName);
}
/**
 * Hands out scratch (output) column numbers and keeps track of which of them
 * are currently in use. Columns are remembered by type name so that a freed
 * column can be re-used for a later request of the same type.
 */
private static class OutputColumnManager {
  // Batch index of the first scratch column; a negative value marks test usage.
  private final int initialOutputCol;
  // Number of scratch columns created so far.
  private int outputColCount = 0;
  // The complete list of output columns. These should be added to the
  // vectorized row batch for processing. The index in the row batch is
  // equal to the index in this array plus initialOutputCol.
  // Starts with size 100 and doubles when needed.
  private String[] scratchVectorTypeNames = new String[100];
  // Relative indices of the scratch columns that are currently allocated.
  private final Set<Integer> usedOutputColumns = new HashSet<Integer>();

  protected OutputColumnManager(int initialOutputCol) {
    this.initialOutputCol = initialOutputCol;
  }

  /**
   * Allocates a scratch column for the given type and returns its batch index.
   * Always returns 0 when this manager was created with a negative start index.
   */
  int allocateOutputColumn(TypeInfo typeInfo) throws HiveException {
    if (initialOutputCol < 0) {
      // This is a test calling.
      return 0;
    }
    // CONCERN: We currently differentiate DECIMAL columns by their precision and scale...,
    // which could lead to a lot of extra unnecessary scratch columns.
    final String vectorTypeName = getScratchName(typeInfo);
    return initialOutputCol + allocateOutputColumnInternal(vectorTypeName);
  }

  /** Returns the relative index of a free column of the given type, creating one if needed. */
  private int allocateOutputColumnInternal(String columnType) {
    // First choice: an existing column of the required type that is not in use.
    for (int candidate = 0; candidate < outputColCount; candidate++) {
      if (!usedOutputColumns.contains(candidate)
          && scratchVectorTypeNames[candidate].equalsIgnoreCase(columnType)) {
        usedOutputColumns.add(candidate);
        return candidate;
      }
    }

    // Nothing to re-use: append a new column, doubling the array when it is full.
    if (outputColCount >= scratchVectorTypeNames.length) {
      scratchVectorTypeNames = Arrays.copyOf(scratchVectorTypeNames, 2 * outputColCount);
    }
    final int newIndex = outputColCount++;
    scratchVectorTypeNames[newIndex] = columnType;
    usedOutputColumns.add(newIndex);
    return newIndex;
  }

  /** Marks the scratch column with the given batch index as free again. */
  void freeOutputColumn(int index) {
    if (initialOutputCol < 0) {
      // This is a test
      return;
    }
    final int relativeIndex = index - initialOutputCol;
    // Indices below the scratch range are not scratch columns and are ignored.
    if (relativeIndex >= 0) {
      usedOutputColumns.remove(relativeIndex);
    }
  }

  /** Returns the batch indices of all scratch columns currently in use, in ascending order. */
  public int[] currentScratchColumns() {
    final int[] batchIndices = new int[usedOutputColumns.size()];
    int filled = 0;
    for (Integer relativeIndex : usedOutputColumns) {
      batchIndices[filled++] = initialOutputCol + relativeIndex;
    }
    Arrays.sort(batchIndices);
    return batchIndices;
  }
}
/**
 * Allocates a scratch column for the given type through the output column manager.
 *
 * @return the batch column index of the allocated scratch column
 */
public int allocateScratchColumn(TypeInfo typeInfo) throws HiveException {
  final int scratchColumn = ocm.allocateOutputColumn(typeInfo);
  return scratchColumn;
}
/** Returns the scratch columns currently in use, as reported by the output column manager. */
public int[] currentScratchColumns() {
  final int[] inUse = ocm.currentScratchColumns();
  return inUse;
}
/**
 * Builds the vector expression for a plain column reference.
 * In FILTER mode the column is evaluated as a boolean, going through a cast
 * first when the column is not boolean already; in PROJECTION mode the column
 * is passed through unchanged.
 *
 * @return the expression, or null for a mode other than FILTER or PROJECTION
 * @throws HiveException if the column is unknown or cannot be converted to boolean
 */
private VectorExpression getColumnVectorExpression(ExprNodeColumnDesc
    exprDesc, VectorExpressionDescriptor.Mode mode) throws HiveException {
  final int columnNum = getInputColumnIndex(exprDesc.getColumn());
  switch (mode) {
    case FILTER: {
      final TypeInfo columnType = exprDesc.getTypeInfo();
      final boolean alreadyBoolean = columnType.getCategory() == Category.PRIMITIVE
          && ((PrimitiveTypeInfo) columnType).getPrimitiveCategory() == PrimitiveCategory.BOOLEAN;
      if (alreadyBoolean) {
        return new SelectColumnIsTrue(columnNum);
      }

      // A conversion to boolean is required.
      ArrayList<ExprNodeDesc> exprAsList = new ArrayList<ExprNodeDesc>(1);
      exprAsList.add(exprDesc);

      // The dedicated cast method handles a few special cases; otherwise use the UDF.
      VectorExpression toBoolean = getCastToBoolean(exprAsList);
      if (toBoolean == null) {
        toBoolean = getVectorExpressionForUdf(null, UDFToBoolean.class, exprAsList,
            VectorExpressionDescriptor.Mode.PROJECTION, null);
      }
      if (toBoolean == null) {
        throw new HiveException("Cannot vectorize converting expression " +
            exprDesc.getExprString() + " to boolean");
      }

      // Filter on the output of the cast, which becomes the child expression.
      VectorExpression filterOnCast = new SelectColumnIsTrue(toBoolean.getOutputColumn());
      filterOnCast.setChildExpressions(new VectorExpression[] {toBoolean});
      return filterOnCast;
    }
    case PROJECTION:
      return new IdentityExpression(columnNum, exprDesc.getTypeString());
  }
  // No expression for any other mode.
  return null;
}
/** Vectorizes every expression of the list in PROJECTION mode. */
public VectorExpression[] getVectorExpressions(List<ExprNodeDesc> exprNodes) throws HiveException {
  final VectorExpressionDescriptor.Mode projection = VectorExpressionDescriptor.Mode.PROJECTION;
  return getVectorExpressions(exprNodes, projection);
}
/**
 * Vectorizes every expression of the list in the given mode.
 *
 * @param exprNodes expressions to vectorize; null yields an empty array
 * @param mode mode applied to each expression
 * @return one vector expression per input expression, in the same order
 * @throws HiveException if any of the expressions cannot be vectorized
 */
public VectorExpression[] getVectorExpressions(List<ExprNodeDesc> exprNodes, VectorExpressionDescriptor.Mode mode)
    throws HiveException {
  if (exprNodes == null) {
    return new VectorExpression[0];
  }
  final VectorExpression[] vectorized = new VectorExpression[exprNodes.size()];
  int slot = 0;
  for (ExprNodeDesc exprNode : exprNodes) {
    vectorized[slot] = getVectorExpression(exprNode, mode);
    slot++;
  }
  return vectorized;
}
/** Vectorizes a single expression in PROJECTION mode. */
public VectorExpression getVectorExpression(ExprNodeDesc exprDesc) throws HiveException {
  final VectorExpressionDescriptor.Mode projection = VectorExpressionDescriptor.Mode.PROJECTION;
  return getVectorExpression(exprDesc, projection);
}
/**
 * Returns a vector expression for a given expression
 * description.
 *
 * The descriptor is dispatched on its concrete type: column references,
 * generic function calls, constants and dynamic values are each handled by a
 * dedicated factory method. For generic functions a NOT wrapped around a
 * BETWEEN is first folded into the BETWEEN itself (this modifies the passed
 * descriptor in place), implicit casts are added to the children, and the
 * VectorUDFAdaptor is tried as a fallback when no vectorized class exists and
 * the configured adaptor usage mode permits it.
 *
 * @param exprDesc expression description
 * @param mode mode (filter or projection) the expression is built for
 * @return {@link VectorExpression}
 * @throws HiveException if the expression cannot be vectorized
 */
public VectorExpression getVectorExpression(ExprNodeDesc exprDesc, VectorExpressionDescriptor.Mode mode) throws HiveException {
  VectorExpression ve = null;
  if (exprDesc instanceof ExprNodeColumnDesc) {
    ve = getColumnVectorExpression((ExprNodeColumnDesc) exprDesc, mode);
  } else if (exprDesc instanceof ExprNodeGenericFuncDesc) {
    ExprNodeGenericFuncDesc expr = (ExprNodeGenericFuncDesc) exprDesc;
    // push not through between...
    // A "not" whose only child is a "between" is rewritten into that between with
    // its leading boolean constant flipped, so no separate NOT has to be vectorized.
    if ("not".equals(expr.getFuncText())) {
      if (expr.getChildren() != null && expr.getChildren().size() == 1) {
        ExprNodeDesc child = expr.getChildren().get(0);
        if (child instanceof ExprNodeGenericFuncDesc) {
          ExprNodeGenericFuncDesc childExpr = (ExprNodeGenericFuncDesc) child;
          if ("between".equals(childExpr.getFuncText())) {
            // The first child of between is cast to a constant here (presumably
            // the "invert" flag -- confirm against GenericUDFBetween).
            ExprNodeConstantDesc flag = (ExprNodeConstantDesc) childExpr.getChildren().get(0);
            List<ExprNodeDesc> newChildren = new ArrayList<>();
            if (Boolean.TRUE.equals(flag.getValue())) {
              newChildren.add(new ExprNodeConstantDesc(Boolean.FALSE));
            } else {
              newChildren.add(new ExprNodeConstantDesc(Boolean.TRUE));
            }
            newChildren
                .addAll(childExpr.getChildren().subList(1, childExpr.getChildren().size()));
            // Turn the NOT node itself into the (flipped) BETWEEN. expr and exprDesc
            // are the same object, so the code below sees the rewritten node.
            expr.setTypeInfo(childExpr.getTypeInfo());
            expr.setGenericUDF(childExpr.getGenericUDF());
            expr.setChildren(newChildren);
          }
        }
      }
    }
    // Add cast expression if needed. Child expressions of a udf may return different data types
    // and that would require converting their data types to evaluate the udf.
    // For example decimal column added to an integer column would require integer column to be
    // cast to decimal.
    // Note: this is a no-op for custom UDFs
    List<ExprNodeDesc> childExpressions = getChildExpressionsWithImplicitCast(expr.getGenericUDF(),
        exprDesc.getChildren(), exprDesc.getTypeInfo());
    ve = getGenericUdfVectorExpression(expr.getGenericUDF(),
        childExpressions, mode, exprDesc.getTypeInfo());
    if (ve == null) {
      // Ok, no vectorized class available. No problem -- try to use the VectorUDFAdaptor
      // when configured.
      //
      // NOTE: We assume that if hiveVectorAdaptorUsageMode has not been set, it is because
      // we are executing a test that didn't create a HiveConf, etc. No usage of
      // VectorUDFAdaptor in that case.
      if (hiveVectorAdaptorUsageMode != null) {
        switch (hiveVectorAdaptorUsageMode) {
        case NONE:
          // No VectorUDFAdaptor usage.
          throw new HiveException(
              "Could not vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
              + " because hive.vectorized.adaptor.usage.mode=none");
        case CHOSEN:
          // The adaptor is allowed only for the UDFs accepted by isNonVectorizedPathUDF.
          if (isNonVectorizedPathUDF(expr, mode)) {
            ve = getCustomUDFExpression(expr, mode);
          } else {
            throw new HiveException(
                "Could not vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
                + " because hive.vectorized.adaptor.usage.mode=chosen"
                + " and the UDF wasn't one of the chosen ones");
          }
          break;
        case ALL:
          if (LOG.isDebugEnabled()) {
            LOG.debug("We will try to use the VectorUDFAdaptor for " + exprDesc.toString()
                + " because hive.vectorized.adaptor.usage.mode=all");
          }
          ve = getCustomUDFExpression(expr, mode);
          break;
        default:
          throw new RuntimeException("Unknown hive vector adaptor usage mode " +
              hiveVectorAdaptorUsageMode.name());
        }
        // The adaptor path was permitted but still produced nothing.
        if (ve == null) {
          throw new HiveException(
              "Unable vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString()
              + " even for the VectorUDFAdaptor");
        }
      }
    }
  } else if (exprDesc instanceof ExprNodeConstantDesc) {
    ve = getConstantVectorExpression(((ExprNodeConstantDesc) exprDesc).getValue(), exprDesc.getTypeInfo(),
        mode);
  } else if (exprDesc instanceof ExprNodeDynamicValueDesc) {
    ve = getDynamicValueVectorExpression((ExprNodeDynamicValueDesc) exprDesc, mode);
  }
  // Reached with ve == null for unsupported descriptor types, or when a factory
  // returned null and no adaptor fallback applied.
  if (ve == null) {
    throw new HiveException(
        "Could not vectorize expression (mode = " + mode.name() + "): " + exprDesc.toString());
  }
  if (LOG.isDebugEnabled()) {
    LOG.debug("Input Expression = " + exprDesc.toString()
        + ", Vectorized Expression = " + ve.toString());
  }
  return ve;
}
/**
 * Determines the type to which the children of a UDF should be cast.
 * Comparison UDFs use the common comparison type of their first two children
 * (falling back to the return type when there is none), IN() keeps the type of
 * its first child, and every other UDF uses its return type.
 *
 * @throws HiveException if IN() would require a non-primitive common type or
 *         a cast of the column
 */
private TypeInfo getCommonTypeForChildExpressions(GenericUDF genericUdf,
    List<ExprNodeDesc> children, TypeInfo returnType) throws HiveException {
  if (genericUdf instanceof GenericUDFBaseCompare) {
    // Apply comparison rules
    final TypeInfo leftType = children.get(0).getTypeInfo();
    final TypeInfo rightType = children.get(1).getTypeInfo();
    final TypeInfo comparisonType = FunctionRegistry.getCommonClassForComparison(leftType, rightType);
    if (comparisonType != null) {
      return comparisonType;
    }
    return returnType;
  }

  if (genericUdf instanceof GenericUDFIn) {
    final TypeInfo columnType = children.get(0).getTypeInfo();
    if (columnType.getCategory() != Category.PRIMITIVE) {
      return columnType; // Handled later, only struct will be supported.
    }
    final TypeInfo inListType = GenericUDFUtils.deriveInType(children);
    if (inListType == null || inListType.getCategory() != Category.PRIMITIVE) {
      throw new HiveException("Cannot vectorize IN() - common type is " + inListType);
    }
    final PrimitiveCategory columnCategory = ((PrimitiveTypeInfo) columnType).getPrimitiveCategory();
    final PrimitiveCategory inListCategory = ((PrimitiveTypeInfo) inListType).getPrimitiveCategory();
    if (columnCategory != inListCategory) {
      throw new HiveException("Cannot vectorize IN() - casting a column is not supported. "
          + "Column type is " + columnType + " but the common type is " + inListType);
    }
    return columnType;
  }

  // The children type should be converted to return type
  return returnType;
}
/**
 * Wraps the children of a UDF in cast expressions where needed, because a
 * child may produce a type different from the one the UDF is evaluated with.
 * For ELT the first child is handled separately: it keeps its own type when
 * that is in the int family and is cast to int otherwise.
 *
 * @param genericUDF The given UDF
 * @param children Child expressions of the UDF that might require a cast.
 * @param returnType The return type of the UDF.
 * @return the original list when no cast was added (custom or excluded UDF,
 *         null children, no common type, or nothing to cast); otherwise a new
 *         list with casts in place of the affected children
 */
private List<ExprNodeDesc> getChildExpressionsWithImplicitCast(GenericUDF genericUDF,
    List<ExprNodeDesc> children, TypeInfo returnType) throws HiveException {
  // Custom UDFs cannot take implicit casts; excluded UDFs do not need them.
  if (isCustomUDF(genericUDF.getUdfName()) || isExcludedFromCast(genericUDF)) {
    return children;
  }
  if (children == null) {
    return null;
  }
  final TypeInfo commonType = getCommonTypeForChildExpressions(genericUDF, children, returnType);
  if (commonType == null) {
    // Couldn't determine common type, don't cast
    return children;
  }

  final boolean isElt = genericUDF instanceof GenericUDFElt;
  final List<ExprNodeDesc> castedChildren = new ArrayList<ExprNodeDesc>();
  boolean anyCastAdded = false;
  boolean firstChild = true;
  for (ExprNodeDesc child : children) {
    TypeInfo targetType = commonType;
    if (isElt && firstChild) {
      targetType = isIntFamily(child.getTypeString()) ? child.getTypeInfo() : TypeInfoFactory.intTypeInfo;
    }
    firstChild = false;

    final ExprNodeDesc castExpression = getImplicitCastExpression(genericUDF, child, targetType);
    if (castExpression == null) {
      castedChildren.add(child);
    } else {
      anyCastAdded = true;
      castedChildren.add(castExpression);
    }
  }
  return anyCastAdded ? castedChildren : children;
}
/**
 * Tells whether implicit casts are skipped for the children of the given UDF.
 * That is the case for the cast UDFs themselves, for ROUND and BETWEEN, and
 * for bridged UDFs whose wrapped class is a cast UDF or a UDFSign (sub)class.
 */
private boolean isExcludedFromCast(GenericUDF genericUDF) {
  if (castExpressionUdfs.contains(genericUDF.getClass())
      || genericUDF instanceof GenericUDFRound
      || genericUDF instanceof GenericUDFBetween) {
    return true;
  }
  if (!(genericUDF instanceof GenericUDFBridge)) {
    return false;
  }
  final Class<?> bridgedClass = ((GenericUDFBridge) genericUDF).getUdfClass();
  return castExpressionUdfs.contains(bridgedClass)
      || UDFSign.class.isAssignableFrom(bridgedClass);
}
/**
 * Derives the decimal type to cast an input of the given type to.
 * For a primitive input, a new DecimalTypeInfo is built from the precision and
 * scale associated with that primitive type; for any other input the supplied
 * returnType is handed back unchanged.
 */
private TypeInfo updatePrecision(TypeInfo inputTypeInfo, DecimalTypeInfo returnType) {
  if (inputTypeInfo instanceof PrimitiveTypeInfo) {
    final PrimitiveTypeInfo primitiveInput = (PrimitiveTypeInfo) inputTypeInfo;
    final int precision = getPrecisionForType(primitiveInput);
    // TODO: precision and scale would be practically invalid for string conversion (38,38)
    final int scale = HiveDecimalUtils.getScaleForType(primitiveInput);
    return new DecimalTypeInfo(precision, scale);
  }
  return returnType;
}
/**
 * Produces the cast expression, if any, that converts {@code child} to {@code castType}
 * on behalf of {@code udf}.
 *
 * <ul>
 *   <li>Identical type names: no cast ({@code null}).</li>
 *   <li>Non-decimal input, decimal target: cast to decimal (with precision/scale taken from
 *       {@link #updatePrecision}) when {@code needsImplicitCastForDecimal(udf)} holds.</li>
 *   <li>Decimal input, non-decimal target: cast to the target when
 *       {@code needsImplicitCastForDecimal(udf)} holds.</li>
 *   <li>Otherwise: cast to the exact target type only for COALESCE, NVL and ELT.</li>
 * </ul>
 *
 * @return the wrapping cast expression, or {@code null} when no cast is applied
 * @throws HiveException if no cast UDF exists for the target type
 */
private ExprNodeDesc getImplicitCastExpression(GenericUDF udf, ExprNodeDesc child, TypeInfo castType)
    throws HiveException {
  final TypeInfo sourceType = child.getTypeInfo();
  final String sourceName = sourceType.getTypeName();
  final String targetName = castType.getTypeName();
  if (sourceName.equals(targetName)) {
    // Already the requested type.
    return null;
  }

  final boolean sourceIsDecimal = decimalTypePattern.matcher(sourceName).matches();
  final boolean targetIsDecimal = decimalTypePattern.matcher(targetName).matches();

  final GenericUDF castUdf;
  if (targetIsDecimal && !sourceIsDecimal) {
    if (!needsImplicitCastForDecimal(udf)) {
      return null;
    }
    // When casting to decimal, try not to lose precision for numeric inputs.
    castType = updatePrecision(sourceType, (DecimalTypeInfo) castType);
    final GenericUDFToDecimal toDecimal = new GenericUDFToDecimal();
    toDecimal.setTypeInfo(castType);
    castUdf = toDecimal;
  } else if (sourceIsDecimal && !targetIsDecimal) {
    if (!needsImplicitCastForDecimal(udf)) {
      return null;
    }
    // Decimal input goes to the requested non-decimal type.
    castUdf = getGenericUDFForCast(castType);
  } else {
    // Casts to exact types (including long to double etc.) are needed only in special cases.
    final boolean needsExactTypes = udf instanceof GenericUDFCoalesce
        || udf instanceof GenericUDFNvl
        || udf instanceof GenericUDFElt;
    if (!needsExactTypes) {
      return null;
    }
    castUdf = getGenericUDFForCast(castType);
  }

  final List<ExprNodeDesc> castArguments = new ArrayList<ExprNodeDesc>();
  castArguments.add(child);
  return new ExprNodeGenericFuncDesc(castType, castUdf, castArguments);
}
/**
 * Picks the decimal precision for a primitive type: the maximum decimal precision for
 * types that {@code isFloatFamily} accepts, otherwise whatever {@link HiveDecimalUtils}
 * reports for the type.
 */
private int getPrecisionForType(PrimitiveTypeInfo typeInfo) {
  return isFloatFamily(typeInfo.getTypeName())
      ? HiveDecimal.MAX_PRECISION
      : HiveDecimalUtils.getPrecisionForType(typeInfo);
}
/**
 * Creates the UDF that casts a value to {@code castType}.
 * Some target categories are served by a GenericUDF directly; the rest are served by a
 * legacy {@link UDF} that is wrapped in a {@link GenericUDFBridge}. If the resulting UDF is
 * a {@link SettableUDF}, the target type is pushed into it.
 *
 * @param castType target type; must be a {@link PrimitiveTypeInfo}
 * @return a cast UDF for the target type
 * @throws HiveException if the category has no cast UDF (e.g. VOID or UNKNOWN)
 */
private GenericUDF getGenericUDFForCast(TypeInfo castType) throws HiveException {
  UDF legacyCast = null;
  GenericUDF cast = null;

  switch (((PrimitiveTypeInfo) castType).getPrimitiveCategory()) {
    // Categories handled by a legacy UDF (bridged below).
    case BYTE:
      legacyCast = new UDFToByte();
      break;
    case SHORT:
      legacyCast = new UDFToShort();
      break;
    case INT:
      legacyCast = new UDFToInteger();
      break;
    case LONG:
      legacyCast = new UDFToLong();
      break;
    case FLOAT:
      legacyCast = new UDFToFloat();
      break;
    case DOUBLE:
      legacyCast = new UDFToDouble();
      break;
    case STRING:
      legacyCast = new UDFToString();
      break;
    case BOOLEAN:
      legacyCast = new UDFToBoolean();
      break;

    // Categories handled by a GenericUDF directly.
    case CHAR:
      cast = new GenericUDFToChar();
      break;
    case VARCHAR:
      cast = new GenericUDFToVarchar();
      break;
    case DATE:
      cast = new GenericUDFToDate();
      break;
    case TIMESTAMP:
      cast = new GenericUDFTimestamp();
      break;
    case INTERVAL_YEAR_MONTH:
      cast = new GenericUDFToIntervalYearMonth();
      break;
    case INTERVAL_DAY_TIME:
      cast = new GenericUDFToIntervalDayTime();
      break;
    case BINARY:
      cast = new GenericUDFToBinary();
      break;
    case DECIMAL:
      cast = new GenericUDFToDecimal();
      break;

    case VOID:
    case UNKNOWN:
      // Not expected here; both locals stay null so the exception below is raised.
      break;
  }

  if (cast == null) {
    if (legacyCast == null) {
      throw new HiveException("Could not add implicit cast for type "+castType.getTypeName());
    }
    final GenericUDFBridge bridge = new GenericUDFBridge();
    bridge.setUdfClassName(legacyCast.getClass().getName());
    cast = bridge;
  }

  if (cast instanceof SettableUDF) {
    ((SettableUDF) cast).setTypeInfo(castType);
  }
  return cast;
}
/* Return true if this is one of a small set of functions for which
 * it is significantly easier to use the old code path in vectorized
 * mode instead of implementing a new, optimized VectorExpression.
 *
 * Depending on performance requirements and frequency of use, these
 * may be implemented in the future with an optimized VectorExpression.
 */
public static boolean isNonVectorizedPathUDF(ExprNodeGenericFuncDesc expr,
    VectorExpressionDescriptor.Mode mode) {
  final GenericUDF gudf = expr.getGenericUDF();

  if (gudf instanceof GenericUDFBridge) {
    // Bridged (legacy) UDFs are classified by the wrapped UDF class only.
    final Class<? extends UDF> udfClass = ((GenericUDFBridge) gudf).getUdfClass();
    if (udfClass.equals(UDFHex.class)
        || udfClass.equals(UDFRegExpExtract.class)
        || udfClass.equals(UDFRegExpReplace.class)
        || udfClass.equals(UDFConv.class)) {
      return true;
    }
    if (udfClass.equals(UDFFromUnixTime.class) && isIntFamily(arg0Type(expr))) {
      return true;
    }
    if (isCastToIntFamily(udfClass) && isStringFamily(arg0Type(expr))) {
      return true;
    }
    if (isCastToFloatFamily(udfClass) && isStringFamily(arg0Type(expr))) {
      return true;
    }
    if (udfClass.equals(UDFToString.class)) {
      final String firstArgType = arg0Type(expr);
      return firstArgType.equals("timestamp")
          || firstArgType.equals("double")
          || firstArgType.equals("float");
    }
    return false;
  }

  if (gudf instanceof GenericUDFTimestamp && isStringFamily(arg0Type(expr))) {
    return true;
  }

  /* GenericUDFCase and GenericUDFWhen are implemented with the UDF Adaptor because
   * of their complexity and generality. In the future, variations of these
   * can be optimized to run faster for the vectorized code path. For example,
   * CASE col WHEN 1 then "one" WHEN 2 THEN "two" ELSE "other" END
   * is an example of a GenericUDFCase that has all constant arguments
   * except for the first argument. This is probably a common case and a
   * good candidate for a fast, special-purpose VectorExpression. Then
   * the UDF Adaptor code path could be used as a catch-all for
   * non-optimized general cases.
   */
  if (gudf instanceof GenericUDFCase || gudf instanceof GenericUDFWhen) {
    return true;
  }

  if (gudf instanceof GenericUDFToChar || gudf instanceof GenericUDFToVarchar) {
    final String firstArgType = arg0Type(expr);
    if (firstArgType.equals("timestamp")
        || firstArgType.equals("double")
        || firstArgType.equals("float")) {
      return true;
    }
  }

  // between has 4 args here, but can be vectorized like this
  return gudf instanceof GenericUDFBetween
      && mode == VectorExpressionDescriptor.Mode.PROJECTION;
}
/**
 * Returns true when {@code udfClass} is one of the byte/short/int/long cast UDFs.
 * The boolean cast is purposely not part of this set.
 */
public static boolean isCastToIntFamily(Class<? extends UDF> udfClass) {
  if (udfClass.equals(UDFToByte.class)) {
    return true;
  }
  if (udfClass.equals(UDFToShort.class)) {
    return true;
  }
  if (udfClass.equals(UDFToInteger.class)) {
    return true;
  }
  return udfClass.equals(UDFToLong.class);
}
/** Returns true when {@code udfClass} is the double or the float cast UDF. */
public static boolean isCastToFloatFamily(Class<? extends UDF> udfClass) {
  if (udfClass.equals(UDFToDouble.class)) {
    return true;
  }
  return udfClass.equals(UDFToFloat.class);
}
/** Returns the type string of the expression's first child (argument 0). */
public static String arg0Type(ExprNodeGenericFuncDesc expr) {
  final ExprNodeDesc firstArgument = expr.getChildren().get(0);
  return firstArgument.getTypeString();
}
// Return true if the expression's function is a custom UDF or custom GenericUDF.
// The isCustomUDF methods are for use only in the planner; they will fail in a task.
public static boolean isCustomUDF(ExprNodeGenericFuncDesc expr) {
  final String functionText = expr.getFuncText();
  return isCustomUDF(functionText);
}
/**
 * Looks the function up in the {@link FunctionRegistry} and reports whether it is
 * registered and not native. A null name, a failed lookup (logged as a warning) or an
 * unknown function all yield {@code false}.
 */
private static boolean isCustomUDF(String udfName) {
  if (udfName == null) {
    return false;
  }
  FunctionInfo registered = null;
  try {
    registered = FunctionRegistry.getFunctionInfo(udfName);
  } catch (SemanticException e) {
    LOG.warn("Failed to load " + udfName, e);
  }
  if (registered == null) {
    return false;
  }
  return !registered.isNative();
}
/**
 * Folds the special cases of a cast, unary plus or unary minus applied to a constant
 * (directly, or through a chain of such single-argument functions).
 *
 * @param exprDesc expression to examine
 * @return the same expression if nothing was folded, otherwise a constant expression
 *         carrying the evaluated value and the original expression's type
 * @throws HiveException if evaluating the expression fails
 */
ExprNodeDesc evaluateCastOnConstants(ExprNodeDesc exprDesc) throws HiveException {
  if (!(exprDesc instanceof ExprNodeGenericFuncDesc)) {
    return exprDesc;
  }
  final List<ExprNodeDesc> operands = exprDesc.getChildren();
  if (operands == null || operands.size() != 1) {
    return exprDesc;
  }

  // The single operand must be a constant, either as written or after recursive folding.
  final ExprNodeDesc operand = operands.get(0);
  final ExprNodeDesc reduced =
      (operand instanceof ExprNodeConstantDesc) ? operand : evaluateCastOnConstants(operand);
  if (!(reduced instanceof ExprNodeConstantDesc)) {
    return exprDesc;
  }

  final ObjectInspector operandInspector =
      ((ExprNodeConstantDesc) reduced).getWritableObjectInspector();
  final GenericUDF function = ((ExprNodeGenericFuncDesc) exprDesc).getGenericUDF();

  // Only evaluate +ve/-ve or cast on constant or recursive casting.
  final boolean foldable = function instanceof GenericUDFOPNegative
      || function instanceof GenericUDFOPPositive
      || castExpressionUdfs.contains(function.getClass())
      || (function instanceof GenericUDFBridge
          && castExpressionUdfs.contains(((GenericUDFBridge) function).getUdfClass()));
  if (!foldable) {
    return exprDesc;
  }

  final ExprNodeEvaluator<?> evaluator = ExprNodeEvaluatorFactory.get(exprDesc);
  final ObjectInspector resultInspector = evaluator.initialize(operandInspector);
  final Object evaluated = evaluator.evaluate(null);
  final Object javaValue = ObjectInspectorUtils.copyToStandardJavaObject(evaluated, resultInspector);
  return new ExprNodeConstantDesc(exprDesc.getTypeInfo(), javaValue);
}
/* Applies the cast-on-constant folding to every member of the input list and returns a
 * new list holding the results, in the same order. A null input yields an empty list.
 */
private List<ExprNodeDesc> evaluateCastOnConstants(List<ExprNodeDesc> childExpr)
    throws HiveException {
  final List<ExprNodeDesc> folded = new ArrayList<ExprNodeDesc>();
  if (childExpr == null) {
    return folded;
  }
  for (ExprNodeDesc candidate : childExpr) {
    folded.add(this.evaluateCastOnConstants(candidate));
  }
  return folded;
}
/**
 * Builds the vector expression that represents a constant of the given type.
 *
 * <p>In PROJECTION mode an output column is allocated for the constant; in any other mode
 * the output column is -1. A null constant becomes a null-valued constant expression.
 * Booleans are special: in FILTER mode they become a constant boolean filter, otherwise a
 * constant 1/0 value.
 *
 * <p>String constants are encoded as UTF-8 explicitly rather than with the platform default
 * charset, so the produced bytes do not depend on the JVM's {@code file.encoding}.
 *
 * @param constantValue the Java value of the constant, may be null
 * @param typeInfo the Hive type of the constant
 * @param mode FILTER or PROJECTION
 * @return the constant vector expression
 * @throws HiveException if the type has no vector argument type or is not supported
 */
private VectorExpression getConstantVectorExpression(Object constantValue, TypeInfo typeInfo,
    VectorExpressionDescriptor.Mode mode) throws HiveException {
  String typeName = typeInfo.getTypeName();
  VectorExpressionDescriptor.ArgumentType vectorArgType =
      VectorExpressionDescriptor.ArgumentType.fromHiveTypeName(typeName);
  if (vectorArgType == VectorExpressionDescriptor.ArgumentType.NONE) {
    throw new HiveException("No vector argument type for type name " + typeName);
  }

  // Only projections write their value into an output column.
  int outCol = -1;
  if (mode == VectorExpressionDescriptor.Mode.PROJECTION) {
    outCol = ocm.allocateOutputColumn(typeInfo);
  }
  if (constantValue == null) {
    return new ConstantVectorExpression(outCol, typeName, true);
  }

  // Boolean is special case.
  if (typeName.equalsIgnoreCase("boolean")) {
    if (mode == VectorExpressionDescriptor.Mode.FILTER) {
      if (((Boolean) constantValue).booleanValue()) {
        return new FilterConstantBooleanVectorExpression(1);
      } else {
        return new FilterConstantBooleanVectorExpression(0);
      }
    } else {
      if (((Boolean) constantValue).booleanValue()) {
        return new ConstantVectorExpression(outCol, 1);
      } else {
        return new ConstantVectorExpression(outCol, 0);
      }
    }
  }

  switch (vectorArgType) {
    case INT_FAMILY:
      return new ConstantVectorExpression(outCol, ((Number) constantValue).longValue());
    case DATE:
      return new ConstantVectorExpression(outCol, DateWritable.dateToDays((Date) constantValue));
    case TIMESTAMP:
      return new ConstantVectorExpression(outCol, (Timestamp) constantValue);
    case INTERVAL_YEAR_MONTH:
      return new ConstantVectorExpression(outCol,
          ((HiveIntervalYearMonth) constantValue).getTotalMonths());
    case INTERVAL_DAY_TIME:
      return new ConstantVectorExpression(outCol, (HiveIntervalDayTime) constantValue);
    case FLOAT_FAMILY:
      return new ConstantVectorExpression(outCol, ((Number) constantValue).doubleValue());
    case DECIMAL:
      return new ConstantVectorExpression(outCol, (HiveDecimal) constantValue, typeName);
    case STRING:
      // Explicit charset: the no-arg getBytes() would use the platform default encoding.
      return new ConstantVectorExpression(outCol,
          ((String) constantValue).getBytes(java.nio.charset.StandardCharsets.UTF_8));
    case CHAR:
      return new ConstantVectorExpression(outCol, ((HiveChar) constantValue), typeName);
    case VARCHAR:
      return new ConstantVectorExpression(outCol, ((HiveVarchar) constantValue), typeName);
    default:
      throw new HiveException("Unsupported constant type: " + typeName + ", object class " + constantValue.getClass().getSimpleName());
  }
}
/**
 * Builds the vector expression for a dynamic value. The value's type must map to a vector
 * argument type; in PROJECTION mode an output column is allocated, otherwise -1 is used.
 *
 * @throws HiveException if the type has no vector argument type
 */
private VectorExpression getDynamicValueVectorExpression(ExprNodeDynamicValueDesc dynamicValueExpr,
    VectorExpressionDescriptor.Mode mode) throws HiveException {
  final String typeName = dynamicValueExpr.getTypeInfo().getTypeName();
  if (VectorExpressionDescriptor.ArgumentType.fromHiveTypeName(typeName)
      == VectorExpressionDescriptor.ArgumentType.NONE) {
    throw new HiveException("No vector argument type for type name " + typeName);
  }
  final int outputColumn = (mode == VectorExpressionDescriptor.Mode.PROJECTION)
      ? ocm.allocateOutputColumn(dynamicValueExpr.getTypeInfo())
      : -1;
  return new DynamicValueVectorExpression(
      outputColumn, dynamicValueExpr.getTypeInfo(), dynamicValueExpr.getDynamicValue());
}
/**
 * Fast path for operations that leave their input untouched, such as unary + and casting
 * boolean to long. IdentityExpression and its children are always projections.
 *
 * @param childExprList argument list; only the first element is used
 * @return an identity expression over the column (or over the child expression's output)
 * @throws HiveException if the first argument is neither a function nor a column
 */
private VectorExpression getIdentityExpression(List<ExprNodeDesc> childExprList)
    throws HiveException {
  final ExprNodeDesc operand = childExprList.get(0);

  if (operand instanceof ExprNodeGenericFuncDesc) {
    // The identity reads the child's output column and keeps the child so it is evaluated.
    final VectorExpression inner = getVectorExpression(operand);
    final VectorExpression identity =
        new IdentityExpression(inner.getOutputColumn(), inner.getOutputType());
    identity.setChildExpressions(new VectorExpression [] {inner});
    return identity;
  }

  if (operand instanceof ExprNodeColumnDesc) {
    final ExprNodeColumnDesc column = (ExprNodeColumnDesc) operand;
    return new IdentityExpression(
        getInputColumnIndex(column.getColumn()), column.getTypeString());
  }

  throw new HiveException("Expression not supported: "+operand);
}
/**
 * Looks up and creates the vectorized implementation for a UDF applied to the given children.
 *
 * <p>A FILTER-mode AND/OR with more than two children is handled first: every child must map
 * to a LONG column vector and be a function or a column, in which case the multi-child
 * filter AND/OR class is used. Everything else goes through a descriptor lookup in
 * {@code vMap} keyed by the UDF class, the mode, and each argument's type and kind.
 *
 * @param genericeUdf the GenericUDF instance, or null when the caller substituted the class
 * @param udfClass class used for the lookup
 * @param childExpr argument expressions, may be null
 * @param mode FILTER or PROJECTION
 * @param returnType result type passed through to expression creation
 * @return the vector expression, or {@code null} if no vectorized implementation applies
 * @throws HiveException on a missing child type name or an unsupported child kind
 */
private VectorExpression getVectorExpressionForUdf(GenericUDF genericeUdf,
    Class<?> udfClass, List<ExprNodeDesc> childExpr, VectorExpressionDescriptor.Mode mode,
    TypeInfo returnType) throws HiveException {
  final int argCount = (childExpr == null) ? 0 : childExpr.size();
  final boolean isOr = genericeUdf instanceof GenericUDFOPOr;
  final boolean isAnd = genericeUdf instanceof GenericUDFOPAnd;

  if (argCount > 2 && mode == VectorExpressionDescriptor.Mode.FILTER && (isOr || isAnd)) {
    // Special case handling for Multi-OR and Multi-AND.
    for (ExprNodeDesc operand : childExpr) {
      final String operandTypeString = operand.getTypeString();
      if (operandTypeString == null) {
        throw new HiveException("Null child type name string");
      }
      final TypeInfo operandTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(operandTypeString);
      final Type operandVectorType =
          VectorizationContext.getColumnVectorTypeFromTypeInfo(operandTypeInfo);
      if (operandVectorType != ColumnVector.Type.LONG) {
        return null;
      }
      final boolean producesColumn = operand instanceof ExprNodeGenericFuncDesc
          || operand instanceof ExprNodeColumnDesc;
      if (!producesColumn) {
        return null;
      }
    }
    final Class<?> multiChildClass = isOr ? FilterExprOrExpr.class : FilterExprAndExpr.class;
    return createVectorExpression(
        multiChildClass, childExpr, getChildrenMode(mode, udfClass), returnType);
  }

  if (argCount > VectorExpressionDescriptor.MAX_NUM_ARGUMENTS) {
    return null;
  }

  // Describe the call (mode, arity, per-argument type and kind) for the class lookup.
  final VectorExpressionDescriptor.Builder lookupKey = new VectorExpressionDescriptor.Builder();
  lookupKey.setNumArguments(argCount);
  lookupKey.setMode(mode);
  for (int argIndex = 0; argIndex < argCount; argIndex++) {
    final ExprNodeDesc operand = childExpr.get(argIndex);
    final String operandTypeString = operand.getTypeString();
    if (operandTypeString == null) {
      throw new HiveException("Null child type name string");
    }
    final String plainTypeName = getUndecoratedName(operandTypeString);
    if (plainTypeName == null) {
      throw new HiveException("No match for type string " + operandTypeString + " from undecorated type name method");
    }
    lookupKey.setArgumentType(argIndex, plainTypeName);

    final InputExpressionType operandKind;
    if (operand instanceof ExprNodeGenericFuncDesc || operand instanceof ExprNodeColumnDesc) {
      operandKind = InputExpressionType.COLUMN;
    } else if (operand instanceof ExprNodeConstantDesc) {
      operandKind = InputExpressionType.SCALAR;
    } else if (operand instanceof ExprNodeDynamicValueDesc) {
      operandKind = InputExpressionType.DYNAMICVALUE;
    } else {
      throw new HiveException("Cannot handle expression type: " + operand.getClass().getSimpleName());
    }
    lookupKey.setInputExpressionType(argIndex, operandKind);
  }

  final VectorExpressionDescriptor.Descriptor descriptor = lookupKey.build();
  final Class<?> implementation = this.vMap.getVectorExpressionClass(udfClass, descriptor);
  if (implementation == null) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("No vector udf found for "+udfClass.getSimpleName() + ", descriptor: "+descriptor);
    }
    return null;
  }
  return createVectorExpression(
      implementation, childExpr, getChildrenMode(mode, udfClass), returnType);
}
/**
 * Instantiates {@code vectorClass} for the given argument expressions.
 *
 * <p>Each child contributes one constructor argument: a function child is vectorized
 * recursively and contributes its output column; a column child contributes its input
 * column index (and, in FILTER children mode, a {@code SelectColumnIsTrue} child); a
 * constant contributes its scalar value, or a null constant expression when the scalar is
 * null; a dynamic value contributes the dynamic value object.
 *
 * <p>The output columns of all child expressions are released in the {@code finally} block,
 * whether or not instantiation succeeded.
 *
 * @param vectorClass the vector expression class to instantiate
 * @param childExpr argument expressions, may be null
 * @param childrenMode mode used when vectorizing the children
 * @param returnType result type handed to the instantiation
 * @return the instantiated expression, never null
 * @throws HiveException wrapping any failure, including the case where no expression
 *         could be instantiated for the argument list
 */
private VectorExpression createVectorExpression(Class<?> vectorClass,
    List<ExprNodeDesc> childExpr, VectorExpressionDescriptor.Mode childrenMode, TypeInfo returnType) throws HiveException {
  int numChildren = childExpr == null ? 0: childExpr.size();
  VectorExpression.Type [] inputTypes = new VectorExpression.Type[numChildren];
  List<VectorExpression> children = new ArrayList<VectorExpression>();
  Object[] arguments = new Object[numChildren];
  try {
    for (int i = 0; i < numChildren; i++) {
      ExprNodeDesc child = childExpr.get(i);
      String undecoratedName = getUndecoratedName(child.getTypeInfo().getTypeName());
      inputTypes[i] = VectorExpression.Type.getValue(undecoratedName);
      if (inputTypes[i] == VectorExpression.Type.OTHER){
        throw new HiveException("No vector type for " + vectorClass.getSimpleName() + " argument #" + i + " type name " + undecoratedName);
      }
      if (child instanceof ExprNodeGenericFuncDesc) {
        VectorExpression vChild = getVectorExpression(child, childrenMode);
        children.add(vChild);
        arguments[i] = vChild.getOutputColumn();
      } else if (child instanceof ExprNodeColumnDesc) {
        int colIndex = getInputColumnIndex((ExprNodeColumnDesc) child);
        if (childrenMode == VectorExpressionDescriptor.Mode.FILTER) {
          // In filter mode, the column must be a boolean
          children.add(new SelectColumnIsTrue(colIndex));
        }
        arguments[i] = colIndex;
      } else if (child instanceof ExprNodeConstantDesc) {
        Object scalarValue = getVectorTypeScalarValue((ExprNodeConstantDesc) child);
        arguments[i] = (null == scalarValue) ? getConstantVectorExpression(null, child.getTypeInfo(), childrenMode) : scalarValue;
      } else if (child instanceof ExprNodeDynamicValueDesc) {
        arguments[i] = ((ExprNodeDynamicValueDesc) child).getDynamicValue();
      } else {
        throw new HiveException("Cannot handle expression type: " + child.getClass().getSimpleName());
      }
    }
    VectorExpression vectorExpression = instantiateExpression(vectorClass, returnType, arguments);
    if (vectorExpression == null) {
      // Previously this surfaced as a NullPointerException from setInputTypes(); fail with
      // a message that names the class and the argument count instead.
      throw new HiveException("Could not instantiate " + vectorClass.getSimpleName()
          + ": no expression was created for " + numChildren + " argument(s)");
    }
    vectorExpression.setInputTypes(inputTypes);
    if (!children.isEmpty()) {
      vectorExpression.setChildExpressions(children.toArray(new VectorExpression[0]));
    }
    return vectorExpression;
  } catch (Exception ex) {
    throw new HiveException(ex);
  } finally {
    // Children's scratch columns are released once the parent has been built (or failed).
    for (VectorExpression ve : children) {
      ocm.freeOutputColumn(ve.getOutputColumn());
    }
  }
}
/**
 * Decides the mode in which a UDF's children are vectorized: FILTER only when the parent is
 * itself in FILTER mode and is AND or OR; PROJECTION in every other case.
 */
private VectorExpressionDescriptor.Mode getChildrenMode(VectorExpressionDescriptor.Mode mode, Class<?> udf) {
  if (!mode.equals(VectorExpressionDescriptor.Mode.FILTER)) {
    return VectorExpressionDescriptor.Mode.PROJECTION;
  }
  final boolean isLogicalConnective =
      udf.equals(GenericUDFOPAnd.class) || udf.equals(GenericUDFOPOr.class);
  return isLogicalConnective
      ? VectorExpressionDescriptor.Mode.FILTER
      : VectorExpressionDescriptor.Mode.PROJECTION;
}
/**
 * Formats constructor arguments for an error message: the argument values followed by the
 * simple class name of each argument.
 *
 * <p>A null element is reported as {@code null} in the class list. This method is called
 * while building a failure message, so it must not itself throw and hide the real error.
 *
 * @param args the constructor arguments, may be null
 * @return {@code "arguments: NULL"} for a null array, otherwise the values and class names
 */
private String getNewInstanceArgumentString(Object [] args) {
  if (args == null) {
    return "arguments: NULL";
  }
  ArrayList<String> argClasses = new ArrayList<String>(args.length);
  for (Object obj : args) {
    argClasses.add(obj == null ? "null" : obj.getClass().getSimpleName());
  }
  return "arguments: " + Arrays.toString(args) + ", argument classes: " + argClasses.toString();
}
/** Maximum number of stack frames included by {@link #getStackTraceAsSingleLine}. */
private static final int STACK_LENGTH_LIMIT = 15;

/**
 * Renders a throwable and the top of its stack trace on a single line.
 * The output is the throwable's string form, the text " stack trace: ", then up to
 * {@code STACK_LENGTH_LIMIT} frames separated by ", "; a trailing ", ..." marks truncation.
 * Numbered {@code GeneratedConstructorAccessor} class names are normalized because the
 * numbering varies by VM.
 *
 * @param e the throwable to render
 * @return the single-line description
 */
public static String getStackTraceAsSingleLine(Throwable e) {
  final StackTraceElement[] frames = e.getStackTrace();
  final int shown = Math.min(frames.length, STACK_LENGTH_LIMIT);

  final StringBuilder line = new StringBuilder();
  line.append(e).append(" stack trace: ");
  for (int frameIndex = 0; frameIndex < shown; frameIndex++) {
    if (frameIndex != 0) {
      line.append(", ");
    }
    line.append(frames[frameIndex]);
  }
  if (shown < frames.length) {
    line.append(", ...");
  }

  // Attempt to cleanup stack trace elements that vary by VM.
  return line.toString()
      .replaceAll("GeneratedConstructorAccessor[0-9]*", "GeneratedConstructorAccessor<omitted>");
}
/**
* Reflectively instantiates the vector expression class {@code vclass} using the constructor
* returned by {@code getConstructor(vclass)}.
*
* Three constructor shapes are handled, chosen by comparing the constructor's parameter
* count with the number of supplied arguments:
* - no parameters: the arguments are ignored;
* - as many parameters as arguments: the arguments are passed as-is;
* - one more parameter than arguments: a newly allocated output column is appended as the
* last argument, and the expression's output type is set from the return type.
*
* NOTE(review): if the parameter count matches none of these shapes, no branch runs and the
* method returns null rather than throwing.
*
* @param vclass the vector expression class to instantiate
* @param returnType the result type; may be null, in which case (third shape only) it is
* derived from the output type of a default-constructed instance of vclass
* @param args constructor arguments, excluding any output column
* @return the new expression, or null when no constructor shape matches
* @throws HiveException if reflection or construction fails in any of the three shapes
*/
private VectorExpression instantiateExpression(Class<?> vclass, TypeInfo returnType, Object...args)
throws HiveException {
VectorExpression ve = null;
Constructor<?> ctor = getConstructor(vclass);
int numParams = ctor.getParameterTypes().length;
int argsLength = (args == null) ? 0 : args.length;
if (numParams == 0) {
// Shape 1: no-argument constructor.
try {
ve = (VectorExpression) ctor.newInstance();
} catch (Exception ex) {
throw new HiveException("Could not instantiate " + vclass.getSimpleName() + " with 0 arguments, exception: " +
getStackTraceAsSingleLine(ex));
}
} else if (numParams == argsLength) {
// Shape 2: constructor takes exactly the supplied arguments.
try {
ve = (VectorExpression) ctor.newInstance(args);
} catch (Exception ex) {
throw new HiveException("Could not instantiate " + vclass.getSimpleName() + " with " + getNewInstanceArgumentString(args) + ", exception: " +
getStackTraceAsSingleLine(ex));
}
} else if (numParams == argsLength + 1) {
// Additional argument is needed, which is the outputcolumn.
// newArgs stays null if a failure happens before the copy below; the error message
// formatter handles a null array.
Object [] newArgs = null;
try {
String returnTypeName;
if (returnType == null) {
// No return type supplied: ask a default-constructed instance for its output type,
// mapping the vector type name "long" to the Hive type name "bigint".
returnTypeName = ((VectorExpression) vclass.newInstance()).getOutputType().toLowerCase();
if (returnTypeName.equals("long")) {
returnTypeName = "bigint";
}
returnType = TypeInfoUtils.getTypeInfoFromTypeString(returnTypeName);
} else {
returnTypeName = returnType.getTypeName();
}
// Special handling for decimal because decimal types need scale and precision parameter.
// This special handling should be avoided by using returnType uniformly for all cases.
int outputCol = ocm.allocateOutputColumn(returnType);
// Append the output column as the final constructor argument.
newArgs = Arrays.copyOf(args, numParams);
newArgs[numParams-1] = outputCol;
ve = (VectorExpression) ctor.newInstance(newArgs);
ve.setOutputType(returnTypeName);
} catch (Exception ex) {
throw new HiveException("Could not instantiate " + vclass.getSimpleName() + " with arguments " + getNewInstanceArgumentString(newArgs) + ", exception: " +
getStackTraceAsSingleLine(ex));
}
}
// Add maxLength parameter to UDFs that have CHAR or VARCHAR output.
if (ve instanceof TruncStringOutput) {
TruncStringOutput truncStringOutput = (TruncStringOutput) ve;
if (returnType instanceof BaseCharTypeInfo) {
BaseCharTypeInfo baseCharTypeInfo = (BaseCharTypeInfo) returnType;
truncStringOutput.setMaxLength(baseCharTypeInfo.getLength());
}
}
return ve;
}
/**
 * Vectorizes a GenericUDF call.
 *
 * <p>Casts and unary signs on constant children are folded first. A set of UDFs then get a
 * dedicated builder (BETWEEN in FILTER mode, IN, WHEN, unary plus, COALESCE/NVL, ELT,
 * bridged UDFs, and the casts to decimal, char, varchar and timestamp). When the dedicated
 * builder yields nothing, or the UDF has none, the general descriptor-based lookup is used.
 *
 * @return the vector expression, or {@code null} if the general lookup finds none
 * @throws HiveException if vectorization of the UDF or its children fails
 */
private VectorExpression getGenericUdfVectorExpression(GenericUDF udf,
    List<ExprNodeDesc> childExpr, VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
  final List<ExprNodeDesc> foldedChildren = evaluateCastOnConstants(childExpr);
  childExpr = foldedChildren;

  // First handle special cases. A special-case method that cannot handle the call
  // returns null, which sends us on to the general lookup.
  VectorExpression special = null;
  if (udf instanceof GenericUDFBetween && mode == VectorExpressionDescriptor.Mode.FILTER) {
    special = getBetweenFilterExpression(childExpr, mode, returnType);
  } else if (udf instanceof GenericUDFIn) {
    special = getInExpression(childExpr, mode, returnType);
  } else if (udf instanceof GenericUDFWhen) {
    special = getWhenExpression(childExpr, mode, returnType);
  } else if (udf instanceof GenericUDFOPPositive) {
    special = getIdentityExpression(childExpr);
  } else if (udf instanceof GenericUDFCoalesce || udf instanceof GenericUDFNvl) {
    // Coalesce takes a variable number of arguments; Nvl is a specialization of it.
    special = getCoalesceExpression(childExpr, returnType);
  } else if (udf instanceof GenericUDFElt) {
    // Elt also takes a variable number of arguments.
    special = getEltExpression(childExpr, returnType);
  } else if (udf instanceof GenericUDFBridge) {
    special = getGenericUDFBridgeVectorExpression((GenericUDFBridge) udf, childExpr, mode,
        returnType);
  } else if (udf instanceof GenericUDFToDecimal) {
    special = getCastToDecimal(childExpr, returnType);
  } else if (udf instanceof GenericUDFToChar) {
    special = getCastToChar(childExpr, returnType);
  } else if (udf instanceof GenericUDFToVarchar) {
    special = getCastToVarChar(childExpr, returnType);
  } else if (udf instanceof GenericUDFTimestamp) {
    special = getCastToTimestamp((GenericUDFTimestamp)udf, childExpr, mode, returnType);
  }
  if (special != null) {
    return special;
  }

  // Now do a general lookup. For a bridge, look up by the wrapped UDF class and do not
  // pass the bridge instance itself.
  final boolean bridged = udf instanceof GenericUDFBridge;
  final Class<?> lookupClass = bridged ? ((GenericUDFBridge) udf).getUdfClass() : udf.getClass();
  final GenericUDF lookupUdf = bridged ? null : udf;
  return getVectorExpressionForUdf(lookupUdf, lookupClass, foldedChildren, mode, returnType);
}
/**
 * Vectorizes a cast to timestamp. The regular lookup is used first; when it selects
 * {@code CastLongToTimestamp} but the UDF is not configured to read integers as seconds,
 * the milliseconds-based conversion is created in its place.
 */
private VectorExpression getCastToTimestamp(GenericUDFTimestamp udf,
    List<ExprNodeDesc> childExpr, VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
  final VectorExpression looked =
      getVectorExpressionForUdf(udf, udf.getClass(), childExpr, mode, returnType);
  final boolean wantsMilliseconds =
      !udf.isIntToTimestampInSeconds() && looked instanceof CastLongToTimestamp;
  if (!wantsMilliseconds) {
    return looked;
  }
  // Replace with the milliseconds conversion
  return createVectorExpression(CastMillisecondsLongToTimestamp.class,
      childExpr, VectorExpressionDescriptor.Mode.PROJECTION, returnType);
}
/**
 * Releases the output column of every child expression that is not an
 * {@link IdentityExpression}. A null array is ignored.
 */
private void freeNonColumns(VectorExpression[] vectorChildren) {
  if (vectorChildren != null) {
    for (int idx = 0; idx < vectorChildren.length; idx++) {
      final VectorExpression childExpression = vectorChildren[idx];
      if (childExpression instanceof IdentityExpression) {
        continue;
      }
      ocm.freeOutputColumn(childExpression.getOutputColumn());
    }
  }
}
/**
 * Builds a {@code VectorCoalesce} over the given arguments. Every argument is vectorized as
 * a projection and its output column becomes one input of the coalesce; the children's
 * non-identity output columns are released after the coalesce has been assembled.
 */
private VectorExpression getCoalesceExpression(List<ExprNodeDesc> childExpr, TypeInfo returnType)
    throws HiveException {
  final int[] sourceColumns = new int[childExpr.size()];
  final VectorExpression[] argumentExpressions =
      getVectorExpressions(childExpr, VectorExpressionDescriptor.Mode.PROJECTION);
  for (int pos = 0; pos < argumentExpressions.length; pos++) {
    sourceColumns[pos] = argumentExpressions[pos].getOutputColumn();
  }

  final VectorCoalesce coalesce =
      new VectorCoalesce(sourceColumns, ocm.allocateOutputColumn(returnType));
  coalesce.setOutputType(returnType.getTypeName());
  coalesce.setChildExpressions(argumentExpressions);
  freeNonColumns(argumentExpressions);
  return coalesce;
}
/**
 * Builds a {@code VectorElt} over the given arguments. Every argument is vectorized as a
 * projection and its output column becomes one input of the ELT expression; the children's
 * non-identity output columns are released after the expression has been assembled.
 */
private VectorExpression getEltExpression(List<ExprNodeDesc> childExpr, TypeInfo returnType)
    throws HiveException {
  final int[] sourceColumns = new int[childExpr.size()];
  final VectorExpression[] argumentExpressions =
      getVectorExpressions(childExpr, VectorExpressionDescriptor.Mode.PROJECTION);
  for (int pos = 0; pos < argumentExpressions.length; pos++) {
    sourceColumns[pos] = argumentExpressions[pos].getOutputColumn();
  }

  final VectorElt elt = new VectorElt(sourceColumns, ocm.allocateOutputColumn(returnType));
  elt.setOutputType(returnType.getTypeName());
  elt.setChildExpressions(argumentExpressions);
  freeNonColumns(argumentExpressions);
  return elt;
}
/**
* Groups primitive categories by how their IN-list constants are handled.
* getInConstantTypeFromPrimitiveCategory maps a primitive category to one of these values.
*/
public enum InConstantType {
INT_FAMILY,
TIMESTAMP,
DATE,
FLOAT_FAMILY,
STRING_FAMILY,
DECIMAL
}
/**
 * Maps a primitive category to the {@link InConstantType} group used for IN-list constants.
 *
 * <ul>
 *   <li>BOOLEAN, BYTE, SHORT, INT, LONG: {@code INT_FAMILY}</li>
 *   <li>DATE: {@code DATE}</li>
 *   <li>TIMESTAMP: {@code TIMESTAMP}</li>
 *   <li>FLOAT, DOUBLE: {@code FLOAT_FAMILY}</li>
 *   <li>STRING, CHAR, VARCHAR, BINARY: {@code STRING_FAMILY}</li>
 *   <li>DECIMAL: {@code DECIMAL}</li>
 * </ul>
 *
 * @param primitiveCategory the category to classify
 * @return the matching constant type group
 * @throws RuntimeException for the interval categories and any other category
 */
public static InConstantType getInConstantTypeFromPrimitiveCategory(PrimitiveCategory primitiveCategory) {
  switch (primitiveCategory) {
    case BOOLEAN:
    case BYTE:
    case SHORT:
    case INT:
    case LONG:
      return InConstantType.INT_FAMILY;
    // DATE and TIMESTAMP map to their own groups (they were previously crossed over).
    case DATE:
      return InConstantType.DATE;
    case TIMESTAMP:
      return InConstantType.TIMESTAMP;
    case FLOAT:
    case DOUBLE:
      return InConstantType.FLOAT_FAMILY;
    case STRING:
    case CHAR:
    case VARCHAR:
    case BINARY:
      return InConstantType.STRING_FAMILY;
    case DECIMAL:
      return InConstantType.DECIMAL;
    case INTERVAL_YEAR_MONTH:
    case INTERVAL_DAY_TIME:
      // UNDONE: Fall through for these... they don't appear to be supported yet.
    default:
      throw new RuntimeException("Unexpected primitive type category " + primitiveCategory);
  }
}
/**
* Builds the vector expression for a struct-valued IN: (struct of columns) IN (list of structs).
*
* Each IN-list entry is evaluated to an array of field constants and serialized with
* BinarySortableSerializeWrite into one byte array per entry. The resulting expression
* (a filter or a projection variant, depending on the mode) receives the serialized entries,
* a scratch bytes column, and the struct's field expressions with their column vector types.
*
* Only struct fields of primitive category are handled; any other field makes the method
* return null. Only string-family, int-family and float-family field constants can be
* serialized here; DATE, TIMESTAMP and DECIMAL fields raise an exception.
*
* NOTE(review): the childExpr parameter is not read in this method body.
*
* @param childExpr all children of the IN expression
* @param colExpr the struct expression on the left-hand side; its children are the fields
* @param colTypeInfo type of colExpr; cast to StructTypeInfo
* @param inChildren the IN-list entries (constants or functions evaluating to a struct)
* @param mode FILTER selects the filter variant, anything else the projection variant
* @param returnType result type handed to expression creation
* @return the struct IN expression, or null if a struct field is not primitive
* @throws HiveException wrapping any failure while evaluating or serializing the IN list
*/
private VectorExpression getStructInExpression(List<ExprNodeDesc> childExpr, ExprNodeDesc colExpr,
TypeInfo colTypeInfo, List<ExprNodeDesc> inChildren, VectorExpressionDescriptor.Mode mode, TypeInfo returnType)
throws HiveException {
VectorExpression expr = null;
StructTypeInfo structTypeInfo = (StructTypeInfo) colTypeInfo;
ArrayList<TypeInfo> fieldTypeInfos = structTypeInfo.getAllStructFieldTypeInfos();
final int fieldCount = fieldTypeInfos.size();
ColumnVector.Type[] fieldVectorColumnTypes = new ColumnVector.Type[fieldCount];
InConstantType[] fieldInConstantTypes = new InConstantType[fieldCount];
// Pass 1: classify every struct field, bailing out on non-primitive fields.
for (int f = 0; f < fieldCount; f++) {
TypeInfo fieldTypeInfo = fieldTypeInfos.get(f);
// Only primitive fields are supported for now.
if (fieldTypeInfo.getCategory() != Category.PRIMITIVE) {
return null;
}
// We are going to serialize using the 4 basic types.
ColumnVector.Type fieldVectorColumnType = getColumnVectorTypeFromTypeInfo(fieldTypeInfo);
fieldVectorColumnTypes[f] = fieldVectorColumnType;
// We currently evaluate the IN (..) constants in special ways.
PrimitiveCategory fieldPrimitiveCategory =
((PrimitiveTypeInfo) fieldTypeInfo).getPrimitiveCategory();
InConstantType inConstantType = getInConstantTypeFromPrimitiveCategory(fieldPrimitiveCategory);
fieldInConstantTypes[f] = inConstantType;
}
// Pass 2: serialize each IN-list entry into its own byte array.
Output buffer = new Output();
BinarySortableSerializeWrite binarySortableSerializeWrite =
new BinarySortableSerializeWrite(fieldCount);
final int inChildrenCount = inChildren.size();
byte[][] serializedInChildren = new byte[inChildrenCount][];
try {
for (int i = 0; i < inChildrenCount; i++) {
final ExprNodeDesc node = inChildren.get(i);
final Object[] constants;
if (node instanceof ExprNodeConstantDesc) {
// A constant struct: take its writable field values directly.
ExprNodeConstantDesc constNode = (ExprNodeConstantDesc) node;
ConstantObjectInspector output = constNode.getWritableObjectInspector();
constants = ((List<?>) output.getWritableConstantValue()).toArray();
} else {
// Otherwise the entry is treated as a function and evaluated once here.
ExprNodeGenericFuncDesc exprNode = (ExprNodeGenericFuncDesc) node;
ExprNodeEvaluator<?> evaluator = ExprNodeEvaluatorFactory
.get(exprNode);
// The returned inspector is not used; initialize() is called before evaluate().
ObjectInspector output = evaluator.initialize(exprNode
.getWritableObjectInspector());
constants = (Object[]) evaluator.evaluate(null);
}
// Point the writer at the shared buffer for this entry.
binarySortableSerializeWrite.set(buffer);
for (int f = 0; f < fieldCount; f++) {
Object constant = constants[f];
if (constant == null) {
binarySortableSerializeWrite.writeNull();
} else {
InConstantType inConstantType = fieldInConstantTypes[f];
switch (inConstantType) {
case STRING_FAMILY:
{
// Only Text constants are accepted for string-family fields.
byte[] bytes;
if (constant instanceof Text) {
Text text = (Text) constant;
bytes = text.getBytes();
binarySortableSerializeWrite.writeString(bytes, 0, text.getLength());
} else {
throw new HiveException("Unexpected constant String type " +
constant.getClass().getSimpleName());
}
}
break;
case INT_FAMILY:
{
// Only IntWritable and LongWritable constants are accepted for int-family fields.
long value;
if (constant instanceof IntWritable) {
value = ((IntWritable) constant).get();
} else if (constant instanceof LongWritable) {
value = ((LongWritable) constant).get();
} else {
throw new HiveException("Unexpected constant Long type " +
constant.getClass().getSimpleName());
}
binarySortableSerializeWrite.writeLong(value);
}
break;
case FLOAT_FAMILY:
{
// Only DoubleWritable constants are accepted for float-family fields.
double value;
if (constant instanceof DoubleWritable) {
value = ((DoubleWritable) constant).get();
} else {
throw new HiveException("Unexpected constant Double type " +
constant.getClass().getSimpleName());
}
binarySortableSerializeWrite.writeDouble(value);
}
break;
// UNDONE: DATE, TIMESTAMP and DECIMAL constants are not serialized yet.
case DATE:
case TIMESTAMP:
case DECIMAL:
default:
throw new RuntimeException("Unexpected IN constant type " + inConstantType.name());
}
}
}
// Copy out only the bytes written for this entry.
serializedInChildren[i] = Arrays.copyOfRange(buffer.getData(), 0, buffer.getLength());
}
} catch (Exception e) {
// Every failure above (including the RuntimeException cases) is rethrown as HiveException.
throw new HiveException(e);
}
// Create a single child representing the scratch column where we will
// generate the serialized keys of the batch.
int scratchBytesCol = ocm.allocateOutputColumn(TypeInfoFactory.stringTypeInfo);
Class<?> cl = (mode == VectorExpressionDescriptor.Mode.FILTER ? FilterStructColumnInList.class : StructColumnInList.class);
// The expression is created with no child expressions; its inputs are wired up below.
expr = createVectorExpression(cl, null, VectorExpressionDescriptor.Mode.PROJECTION, returnType);
((IStringInExpr) expr).setInListValues(serializedInChildren);
((IStructInExpr) expr).setScratchBytesColumn(scratchBytesCol);
((IStructInExpr) expr).setStructColumnExprs(this, colExpr.getChildren(),
fieldVectorColumnTypes);
return expr;
}
/**
 * Create a filter or boolean-valued expression for {@code column IN ( <list-of-constants> )}.
 *
 * @param childExpr IN arguments: element 0 is the tested expression, the remaining elements
 *        are the IN list items
 * @param mode FILTER selects the Filter*ColumnInList classes, any other mode the projection ones
 * @param returnType return type handed on to createVectorExpression
 * @return the IN expression; null when the tested type is neither a struct nor a supported
 *         primitive family, which causes execution to fall back to row mode
 * @throws HiveException if an IN item is not a constant after cast evaluation
 */
private VectorExpression getInExpression(List<ExprNodeDesc> childExpr,
    VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
  final ExprNodeDesc testedExpr = childExpr.get(0);
  final List<ExprNodeDesc> listItems = childExpr.subList(1, childExpr.size());
  final String colType = VectorizationContext.mapTypeNameSynonyms(testedExpr.getTypeString());
  final TypeInfo colTypeInfo = TypeInfoUtils.getTypeInfoFromTypeString(colType);

  final Category category = colTypeInfo.getCategory();
  if (category == Category.STRUCT) {
    return getStructInExpression(childExpr, testedExpr, colTypeInfo, listItems, mode, returnType);
  }
  if (category != Category.PRIMITIVE) {
    return null;
  }

  // prepare arguments for createVectorExpression
  final List<ExprNodeDesc> constants = evaluateCastOnConstants(listItems);

  /* This method assumes that the IN list has no NULL entries. That is enforced elsewhere,
   * in the Vectorizer class. If NULL is passed in as a list entry, behavior is not defined.
   * If in the future, NULL values are allowed in the IN list, be sure to handle 3-valued
   * logic correctly. E.g. NOT (col IN (null)) should be considered UNKNOWN, so that would
   * become FALSE in the WHERE clause, and cause the row in question to be filtered out.
   * See the discussion in Jira HIVE-5583.
   */

  // Validate the IN items are only constants.
  for (ExprNodeDesc item : constants) {
    if (!(item instanceof ExprNodeConstantDesc)) {
      throw new HiveException("Vectorizing IN expression only supported for constant values");
    }
  }

  final boolean filterMode = (mode == VectorExpressionDescriptor.Mode.FILTER);
  final VectorExpressionDescriptor.Mode projection = VectorExpressionDescriptor.Mode.PROJECTION;
  // Only the tested expression becomes a child; the list values are set on the expression.
  final List<ExprNodeDesc> testedOnly = childExpr.subList(0, 1);
  final int count = constants.size();

  // TODO: the below assumes that all the arguments to IN are of the same type;
  // non-vectorized validates that explicitly during UDF init.
  if (isIntFamily(colType)) {
    final long[] values = new long[count];
    for (int i = 0; i < count; i++) {
      values[i] = getIntFamilyScalarAsLong((ExprNodeConstantDesc) constants.get(i));
    }
    final VectorExpression in = createVectorExpression(
        filterMode ? FilterLongColumnInList.class : LongColumnInList.class,
        testedOnly, projection, returnType);
    ((ILongInExpr) in).setInListValues(values);
    return in;
  }
  if (isTimestampFamily(colType)) {
    final Timestamp[] values = new Timestamp[count];
    for (int i = 0; i < count; i++) {
      values[i] = getTimestampScalar(constants.get(i));
    }
    final VectorExpression in = createVectorExpression(
        filterMode ? FilterTimestampColumnInList.class : TimestampColumnInList.class,
        testedOnly, projection, returnType);
    ((ITimestampInExpr) in).setInListValues(values);
    return in;
  }
  if (isStringFamily(colType)) {
    final byte[][] values = new byte[count][];
    for (int i = 0; i < count; i++) {
      values[i] = getStringScalarAsByteArray((ExprNodeConstantDesc) constants.get(i));
    }
    final VectorExpression in = createVectorExpression(
        filterMode ? FilterStringColumnInList.class : StringColumnInList.class,
        testedOnly, projection, returnType);
    ((IStringInExpr) in).setInListValues(values);
    return in;
  }
  if (isFloatFamily(colType)) {
    final double[] values = new double[count];
    for (int i = 0; i < count; i++) {
      values[i] = getNumericScalarAsDouble(constants.get(i));
    }
    final VectorExpression in = createVectorExpression(
        filterMode ? FilterDoubleColumnInList.class : DoubleColumnInList.class,
        testedOnly, projection, returnType);
    ((IDoubleInExpr) in).setInListValues(values);
    return in;
  }
  if (isDecimalFamily(colType)) {
    final HiveDecimal[] values = new HiveDecimal[count];
    for (int i = 0; i < count; i++) {
      values[i] = (HiveDecimal) getVectorTypeScalarValue((ExprNodeConstantDesc) constants.get(i));
    }
    final VectorExpression in = createVectorExpression(
        filterMode ? FilterDecimalColumnInList.class : DecimalColumnInList.class,
        testedOnly, projection, returnType);
    ((IDecimalInExpr) in).setInListValues(values);
    return in;
  }
  if (isDateFamily(colType)) {
    // Dates reuse the long IN-list classes; each constant is converted to a Long first.
    final long[] values = new long[count];
    for (int i = 0; i < count; i++) {
      values[i] = (Long) getVectorTypeScalarValue((ExprNodeConstantDesc) constants.get(i));
    }
    final VectorExpression in = createVectorExpression(
        filterMode ? FilterLongColumnInList.class : LongColumnInList.class,
        testedOnly, projection, returnType);
    ((ILongInExpr) in).setInListValues(values);
    return in;
  }

  // No vectorized IN class for this type: return null to cause execution to fall back
  // to row mode.
  return null;
}
/**
 * Returns the scalar value of a constant as a byte array.
 *
 * @param exprNodeConstantDesc the constant to read
 * @return the byte array produced by getScalarValue for the constant
 * @throws HiveException if getScalarValue does not yield a byte array
 */
private byte[] getStringScalarAsByteArray(ExprNodeConstantDesc exprNodeConstantDesc)
    throws HiveException {
  final Object scalar = getScalarValue(exprNodeConstantDesc);
  if (scalar instanceof byte[]) {
    return (byte[]) scalar;
  }
  throw new HiveException("Expected constant argument of type string");
}
/**
 * Maps one of the integer cast UDF classes to the primitive category it casts to.
 *
 * @param udfClass UDFToByte, UDFToShort, UDFToInteger or UDFToLong
 * @return BYTE, SHORT, INT or LONG respectively
 * @throws RuntimeException for any other UDF class
 */
private PrimitiveCategory getAnyIntegerPrimitiveCategoryFromUdfClass(Class<? extends UDF> udfClass) {
  if (udfClass.equals(UDFToByte.class)) {
    return PrimitiveCategory.BYTE;
  }
  if (udfClass.equals(UDFToShort.class)) {
    return PrimitiveCategory.SHORT;
  }
  if (udfClass.equals(UDFToInteger.class)) {
    return PrimitiveCategory.INT;
  }
  if (udfClass.equals(UDFToLong.class)) {
    return PrimitiveCategory.LONG;
  }
  throw new RuntimeException("Unexpected any integery UDF class " + udfClass.getName());
}
/**
 * Invoke special handling for expressions that can't be vectorized by regular
 * descriptor based lookup.
 *
 * @param udf bridge whose wrapped UDF class selects the cast handler
 * @param childExpr arguments of the bridged UDF call
 * @param mode mode passed to the custom UDF fallback
 * @param returnType return type passed to the double and string cast handlers
 * @return the expression built by the matching cast handler, or null when none applies
 */
private VectorExpression getGenericUDFBridgeVectorExpression(GenericUDFBridge udf,
    List<ExprNodeDesc> childExpr, VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {
  final Class<? extends UDF> udfClass = udf.getUdfClass();

  final VectorExpression castExpr;
  if (isCastToIntFamily(udfClass)) {
    castExpr = getCastToLongExpression(childExpr,
        getAnyIntegerPrimitiveCategoryFromUdfClass(udfClass));
  } else if (udfClass.equals(UDFToBoolean.class)) {
    castExpr = getCastToBoolean(childExpr);
  } else if (isCastToFloatFamily(udfClass)) {
    castExpr = getCastToDoubleExpression(udfClass, childExpr, returnType);
  } else if (udfClass.equals(UDFToString.class)) {
    castExpr = getCastToString(childExpr, returnType);
  } else {
    castExpr = null;
  }

  // NOTE(review): childExpr is the argument List, so this instanceof test looks like it can
  // never succeed and the custom-UDF fallback below appears unreachable - confirm what was
  // intended before relying on it.
  if (castExpr != null || !(childExpr instanceof ExprNodeGenericFuncDesc)) {
    return castExpr;
  }
  return getCustomUDFExpression((ExprNodeGenericFuncDesc) childExpr, mode);
}
/**
 * Converts a constant of a numeric primitive type to a HiveDecimal.
 *
 * @param scalar the constant value; null yields null
 * @param type the constant's type, which must be a PrimitiveTypeInfo
 * @return the decimal value, or null when HiveDecimal.create produced null
 * @throws HiveException if the primitive category is not float, double, an integer type
 *         or decimal
 */
private HiveDecimal castConstantToDecimal(Object scalar, TypeInfo type) throws HiveException {
  if (scalar == null) {
    return null;
  }
  final PrimitiveCategory sourceCategory = ((PrimitiveTypeInfo) type).getPrimitiveCategory();

  final HiveDecimal converted;
  switch (sourceCategory) {
  case FLOAT:
  case DOUBLE:
    // Floating point values go through their string form.
    converted = HiveDecimal.create(String.valueOf(scalar));
    break;
  case BYTE:
    converted = HiveDecimal.create((Byte) scalar);
    break;
  case SHORT:
    converted = HiveDecimal.create((Short) scalar);
    break;
  case INT:
    converted = HiveDecimal.create((Integer) scalar);
    break;
  case LONG:
    converted = HiveDecimal.create((Long) scalar);
    break;
  case DECIMAL:
    converted = (HiveDecimal) scalar;
    break;
  default:
    throw new HiveException("Unsupported type " + type.getTypeName() + " for cast to HiveDecimal");
  }

  if (converted == null && LOG.isDebugEnabled()) {
    LOG.debug("Casting constant scalar " + scalar + " to HiveDecimal resulted in null");
  }
  return converted;
}
/**
 * Converts a numeric or decimal constant to its string form.
 *
 * @param scalar the constant value; null yields null
 * @param type the constant's type, which must be a PrimitiveTypeInfo
 * @return the result of toString() on the value
 * @throws HiveException if the primitive category is not numeric or decimal
 */
private String castConstantToString(Object scalar, TypeInfo type) throws HiveException {
  if (scalar == null) {
    return null;
  }
  final PrimitiveCategory sourceCategory = ((PrimitiveTypeInfo) type).getPrimitiveCategory();
  switch (sourceCategory) {
  case DECIMAL:
    return ((HiveDecimal) scalar).toString();
  case FLOAT:
  case DOUBLE:
  case BYTE:
  case SHORT:
  case INT:
  case LONG:
    return ((Number) scalar).toString();
  default:
    throw new HiveException("Unsupported type " + type.getTypeName() + " for cast to String");
  }
}
/**
 * Converts a numeric or decimal constant to a Double.
 *
 * @param scalar the constant value; null yields null
 * @param type the constant's type, which must be a PrimitiveTypeInfo
 * @return the value's doubleValue()
 * @throws HiveException if the primitive category is not numeric or decimal
 */
private Double castConstantToDouble(Object scalar, TypeInfo type) throws HiveException {
  if (scalar == null) {
    return null;
  }
  final PrimitiveCategory sourceCategory = ((PrimitiveTypeInfo) type).getPrimitiveCategory();
  switch (sourceCategory) {
  case DECIMAL:
    return ((HiveDecimal) scalar).doubleValue();
  case FLOAT:
  case DOUBLE:
  case BYTE:
  case SHORT:
  case INT:
  case LONG:
    return ((Number) scalar).doubleValue();
  default:
    throw new HiveException("Unsupported type " + type.getTypeName() + " for cast to Double");
  }
}
/**
 * Converts a numeric or decimal constant to a Long.
 *
 * @param scalar the constant value; null yields null
 * @param type the constant's type, which must be a PrimitiveTypeInfo
 * @param integerPrimitiveCategory target integer category (BYTE, SHORT, INT or LONG); only
 *        consulted for decimal constants
 * @return the long value, or null when a decimal constant fails the isByte/isShort/isInt/isLong
 *         check for the target category
 * @throws HiveException if the source primitive category is not numeric or decimal
 */
private Long castConstantToLong(Object scalar, TypeInfo type,
    PrimitiveCategory integerPrimitiveCategory) throws HiveException {
  if (scalar == null) {
    return null;
  }
  final PrimitiveCategory sourceCategory = ((PrimitiveTypeInfo) type).getPrimitiveCategory();
  switch (sourceCategory) {
  case FLOAT:
  case DOUBLE:
  case BYTE:
  case SHORT:
  case INT:
  case LONG:
    return ((Number) scalar).longValue();
  case DECIMAL:
    // Handled below, after the switch.
    break;
  default:
    throw new HiveException("Unsupported type " + type.getTypeName() + " for cast to Long");
  }

  final HiveDecimal decimalVal = (HiveDecimal) scalar;
  final boolean accurate;
  switch (integerPrimitiveCategory) {
  case BYTE:
    accurate = decimalVal.isByte();
    break;
  case SHORT:
    accurate = decimalVal.isShort();
    break;
  case INT:
    accurate = decimalVal.isInt();
    break;
  case LONG:
    accurate = decimalVal.isLong();
    break;
  default:
    throw new RuntimeException("Unexpected integer primitive type " + integerPrimitiveCategory);
  }
  if (!accurate) {
    // An accurate value of the target integer type cannot be obtained.
    return null;
  }
  // We only store longs in our LongColumnVector.
  return decimalVal.longValue();
}
/**
 * Builds the vector expression for a cast to decimal.
 *
 * @param childExpr cast arguments; only element 0 (the input) is inspected here
 * @param returnType decimal type of the result
 * @return a constant expression for a constant input, a Cast*ToDecimal expression for int,
 *         float, decimal, string-family or timestamp input, otherwise null
 */
private VectorExpression getCastToDecimal(List<ExprNodeDesc> childExpr, TypeInfo returnType)
    throws HiveException {
  final ExprNodeDesc input = childExpr.get(0);
  final String inputType = input.getTypeString();
  final VectorExpressionDescriptor.Mode projection = VectorExpressionDescriptor.Mode.PROJECTION;

  if (input instanceof ExprNodeConstantDesc) {
    // Fold the cast now and return a constant vector expression.
    final HiveDecimal folded =
        castConstantToDecimal(((ExprNodeConstantDesc) input).getValue(), input.getTypeInfo());
    return getConstantVectorExpression(folded, returnType, projection);
  }

  final Class<?> castClass;
  if (isIntFamily(inputType)) {
    castClass = CastLongToDecimal.class;
  } else if (isFloatFamily(inputType)) {
    castClass = CastDoubleToDecimal.class;
  } else if (decimalTypePattern.matcher(inputType).matches()) {
    castClass = CastDecimalToDecimal.class;
  } else if (isStringFamily(inputType)) {
    castClass = CastStringToDecimal.class;
  } else if (inputType.equals("timestamp")) {
    castClass = CastTimestampToDecimal.class;
  } else {
    return null;
  }
  return createVectorExpression(castClass, childExpr, projection, returnType);
}
/**
 * Builds the vector expression for a cast to string.
 *
 * @param childExpr cast arguments; only element 0 (the input) is inspected here
 * @param returnType type of the result
 * @return a constant expression for a constant input, a Cast*ToString expression for boolean,
 *         int, decimal, date or string-family input, otherwise null
 */
private VectorExpression getCastToString(List<ExprNodeDesc> childExpr, TypeInfo returnType)
    throws HiveException {
  final ExprNodeDesc input = childExpr.get(0);
  final String inputType = input.getTypeString();
  final VectorExpressionDescriptor.Mode projection = VectorExpressionDescriptor.Mode.PROJECTION;

  if (input instanceof ExprNodeConstantDesc) {
    // Fold the cast now and return a constant vector expression.
    final String folded =
        castConstantToString(((ExprNodeConstantDesc) input).getValue(), input.getTypeInfo());
    return getConstantVectorExpression(folded, returnType, projection);
  }

  if (inputType.equals("boolean")) {
    // Boolean must come before the integer family. It's a special case.
    // Unlike the other branches, no return type is passed here.
    return createVectorExpression(CastBooleanToStringViaLongToString.class, childExpr, projection, null);
  }

  final Class<?> castClass;
  if (isIntFamily(inputType)) {
    castClass = CastLongToString.class;
  } else if (isDecimalFamily(inputType)) {
    castClass = CastDecimalToString.class;
  } else if (isDateFamily(inputType)) {
    castClass = CastDateToString.class;
  } else if (isStringFamily(inputType)) {
    castClass = CastStringGroupToString.class;
  } else {
    return null;
  }
  return createVectorExpression(castClass, childExpr, projection, returnType);
}
/**
 * Builds the vector expression for a cast to char.
 *
 * @param childExpr cast arguments; only element 0 (the input) is inspected here
 * @param returnType char type of the result
 * @return a Cast*ToChar expression for boolean, int, decimal, date or string-family input;
 *         null for a constant input or any other input type
 */
private VectorExpression getCastToChar(List<ExprNodeDesc> childExpr, TypeInfo returnType)
    throws HiveException {
  final ExprNodeDesc input = childExpr.get(0);
  final String inputType = input.getTypeString();
  if (input instanceof ExprNodeConstantDesc) {
    // Don't do constant folding here. Wait until the optimizer is changed to do it.
    // Family of related JIRAs: HIVE-7421, HIVE-7422, and HIVE-7424.
    return null;
  }

  final Class<?> castClass;
  if (inputType.equals("boolean")) {
    // Boolean must come before the integer family. It's a special case.
    castClass = CastBooleanToCharViaLongToChar.class;
  } else if (isIntFamily(inputType)) {
    castClass = CastLongToChar.class;
  } else if (isDecimalFamily(inputType)) {
    castClass = CastDecimalToChar.class;
  } else if (isDateFamily(inputType)) {
    castClass = CastDateToChar.class;
  } else if (isStringFamily(inputType)) {
    castClass = CastStringGroupToChar.class;
  } else {
    return null;
  }
  return createVectorExpression(castClass, childExpr,
      VectorExpressionDescriptor.Mode.PROJECTION, returnType);
}
/**
 * Builds the vector expression for a cast to varchar.
 *
 * @param childExpr cast arguments; only element 0 (the input) is inspected here
 * @param returnType varchar type of the result
 * @return a Cast*ToVarChar expression for boolean, int, decimal, date or string-family input;
 *         null for a constant input or any other input type
 */
private VectorExpression getCastToVarChar(List<ExprNodeDesc> childExpr, TypeInfo returnType)
    throws HiveException {
  final ExprNodeDesc input = childExpr.get(0);
  final String inputType = input.getTypeString();
  if (input instanceof ExprNodeConstantDesc) {
    // Don't do constant folding here. Wait until the optimizer is changed to do it.
    // Family of related JIRAs: HIVE-7421, HIVE-7422, and HIVE-7424.
    return null;
  }

  final Class<?> castClass;
  if (inputType.equals("boolean")) {
    // Boolean must come before the integer family. It's a special case.
    castClass = CastBooleanToVarCharViaLongToVarChar.class;
  } else if (isIntFamily(inputType)) {
    castClass = CastLongToVarChar.class;
  } else if (isDecimalFamily(inputType)) {
    castClass = CastDecimalToVarChar.class;
  } else if (isDateFamily(inputType)) {
    castClass = CastDateToVarChar.class;
  } else if (isStringFamily(inputType)) {
    castClass = CastStringGroupToVarChar.class;
  } else {
    return null;
  }
  return createVectorExpression(castClass, childExpr,
      VectorExpressionDescriptor.Mode.PROJECTION, returnType);
}
/**
 * Builds the vector expression for a cast to float or double.
 *
 * @param udf the cast UDF class; UDFToFloat selects the float-specific integer cast
 * @param childExpr cast arguments; only element 0 (the input) is inspected here
 * @param returnType type of the result
 * @return a constant expression for a constant input, a cast expression for int or timestamp
 *         input, the identity expression for float-family input, otherwise null
 */
private VectorExpression getCastToDoubleExpression(Class<?> udf, List<ExprNodeDesc> childExpr,
    TypeInfo returnType) throws HiveException {
  final ExprNodeDesc input = childExpr.get(0);
  final String inputType = input.getTypeString();
  final VectorExpressionDescriptor.Mode projection = VectorExpressionDescriptor.Mode.PROJECTION;

  if (input instanceof ExprNodeConstantDesc) {
    // Fold the cast now and return a constant vector expression.
    final Double folded =
        castConstantToDouble(((ExprNodeConstantDesc) input).getValue(), input.getTypeInfo());
    return getConstantVectorExpression(folded, returnType, projection);
  }

  final Class<?> castClass;
  if (isIntFamily(inputType)) {
    // In order to convert from integer to float correctly, we need to apply the float cast
    // not the double cast (HIVE-13338).
    castClass = udf.equals(UDFToFloat.class)
        ? CastLongToFloatViaLongToDouble.class
        : CastLongToDouble.class;
  } else if (inputType.equals("timestamp")) {
    castClass = CastTimestampToDouble.class;
  } else if (isFloatFamily(inputType)) {
    // float types require no conversion, so use a no-op
    return getIdentityExpression(childExpr);
  } else {
    return null;
  }
  return createVectorExpression(castClass, childExpr, projection, returnType);
}
/**
 * Builds the vector expression for a cast to boolean where special handling is needed.
 *
 * @param childExpr cast arguments; only element 0 (the input) is inspected here
 * @return a constant null boolean expression for a null constant, a length-based expression
 *         for string-family input, otherwise null (including non-null constants)
 */
private VectorExpression getCastToBoolean(List<ExprNodeDesc> childExpr)
    throws HiveException {
  final ExprNodeDesc input = childExpr.get(0);
  final String inputType = input.getTypeString();

  if (input instanceof ExprNodeConstantDesc) {
    if (((ExprNodeConstantDesc) input).getValue() != null) {
      // Don't do constant folding here. Wait until the optimizer is changed to do it.
      // Family of related JIRAs: HIVE-7421, HIVE-7422, and HIVE-7424.
      return null;
    }
    return getConstantVectorExpression(null, TypeInfoFactory.booleanTypeInfo, VectorExpressionDescriptor.Mode.PROJECTION);
  }

  // Long and double are handled using descriptors, string needs to be specially handled.
  if (!isStringFamily(inputType)) {
    return null;
  }

  // string casts to false if it is 0 characters long, otherwise true
  final VectorExpression lengthExpr = createVectorExpression(StringLength.class, childExpr,
      VectorExpressionDescriptor.Mode.PROJECTION, null);
  final int booleanCol = ocm.allocateOutputColumn(TypeInfoFactory.longTypeInfo);
  final VectorExpression lengthToBoolean =
      new CastLongToBooleanViaLongToLong(lengthExpr.getOutputColumn(), booleanCol);
  lengthToBoolean.setChildExpressions(new VectorExpression[] {lengthExpr});
  // The intermediate length column is released once it is wired in as a child.
  ocm.freeOutputColumn(lengthExpr.getOutputColumn());
  return lengthToBoolean;
}
/**
 * Builds the vector expression for a cast to an integer type where special handling is needed.
 *
 * @param childExpr cast arguments; only element 0 (the input) is inspected here
 * @param integerPrimitiveCategory target integer category, used when folding a constant
 * @return a constant long expression for a constant input, the identity expression for
 *         int-family input, otherwise null
 */
private VectorExpression getCastToLongExpression(List<ExprNodeDesc> childExpr, PrimitiveCategory integerPrimitiveCategory)
    throws HiveException {
  final ExprNodeDesc input = childExpr.get(0);
  final String inputType = input.getTypeString();

  if (input instanceof ExprNodeConstantDesc) {
    // Fold the cast now and return a constant vector expression.
    final Long folded = castConstantToLong(
        ((ExprNodeConstantDesc) input).getValue(), input.getTypeInfo(), integerPrimitiveCategory);
    return getConstantVectorExpression(folded, TypeInfoFactory.longTypeInfo, VectorExpressionDescriptor.Mode.PROJECTION);
  }

  // Float family, timestamp are handled via descriptor based lookup, int family needs
  // special handling.
  if (!isIntFamily(inputType)) {
    return null;
  }
  // integer and boolean types require no conversion, so use a no-op
  return getIdentityExpression(childExpr);
}
/* Get a [NOT] BETWEEN filter expression. This is treated as a special case
 * because the NOT is actually specified in the expression tree as the first argument,
 * and we don't want any runtime cost for that. So creating the VectorExpression
 * needs to be done differently than the standard way where all arguments are
 * passed to the VectorExpression constructor.
 *
 * childExpr layout: [0] NOT flag constant, [1] tested expression, [2] lower bound,
 * [3] upper bound.
 *
 * Returns null whenever the expression cannot be vectorized here: projection mode, bounds
 * that are neither both dynamic values nor both constants, no common comparison type, or a
 * common type for which no BETWEEN filter class is selected below.
 */
private VectorExpression getBetweenFilterExpression(List<ExprNodeDesc> childExpr, VectorExpressionDescriptor.Mode mode, TypeInfo returnType)
    throws HiveException {

  if (mode == VectorExpressionDescriptor.Mode.PROJECTION) {
    // Projection mode is not yet supported for [NOT] BETWEEN. Return null so Vectorizer
    // knows to revert to row-at-a-time execution.
    return null;
  }

  boolean hasDynamicValues = false;

  // We don't currently support the BETWEEN ends being columns. They must be scalars.
  if ((childExpr.get(2) instanceof ExprNodeDynamicValueDesc) &&
      (childExpr.get(3) instanceof ExprNodeDynamicValueDesc)) {
    hasDynamicValues = true;
  } else if (!(childExpr.get(2) instanceof ExprNodeConstantDesc) ||
      !(childExpr.get(3) instanceof ExprNodeConstantDesc)) {
    return null;
  }

  boolean notKeywordPresent = (Boolean) ((ExprNodeConstantDesc) childExpr.get(0)).getValue();

  // The children after not, might need a cast. Get common types for the two comparisons.
  // Casting for 'between' is handled here as a special case, because the first child is for
  // NOT and doesn't need cast
  TypeInfo commonType = FunctionRegistry.getCommonClassForComparison(childExpr.get(1).getTypeInfo(),
      childExpr.get(2).getTypeInfo());
  if (commonType == null) {
    // Can't vectorize
    return null;
  }
  commonType = FunctionRegistry.getCommonClassForComparison(commonType, childExpr.get(3).getTypeInfo());
  if (commonType == null) {
    // Can't vectorize
    return null;
  }

  // Wrap every child whose type differs from the common type in a cast to that type.
  List<ExprNodeDesc> castChildren = new ArrayList<ExprNodeDesc>();
  for (ExprNodeDesc desc : childExpr.subList(1, 4)) {
    if (commonType.equals(desc.getTypeInfo())) {
      castChildren.add(desc);
    } else {
      GenericUDF castUdf = getGenericUDFForCast(commonType);
      ExprNodeGenericFuncDesc engfd = new ExprNodeGenericFuncDesc(commonType, castUdf,
          Arrays.asList(new ExprNodeDesc[] { desc }));
      castChildren.add(engfd);
    }
  }
  String colType = commonType.getTypeName();

  // prepare arguments for createVectorExpression
  List<ExprNodeDesc> childrenAfterNot = evaluateCastOnConstants(castChildren);

  // determine class
  // NOTE(review): the NOT BETWEEN classes are chosen without consulting hasDynamicValues -
  // confirm that dynamic-value bounds combined with NOT are handled as intended.
  Class<?> cl = null;
  if (isIntFamily(colType)) {
    cl = notKeywordPresent ? FilterLongColumnNotBetween.class
        : (hasDynamicValues ? FilterLongColumnBetweenDynamicValue.class
            : FilterLongColumnBetween.class);
  } else if (isFloatFamily(colType)) {
    cl = notKeywordPresent ? FilterDoubleColumnNotBetween.class
        : (hasDynamicValues ? FilterDoubleColumnBetweenDynamicValue.class
            : FilterDoubleColumnBetween.class);
  } else if (colType.equals("string")) {
    cl = notKeywordPresent ? FilterStringColumnNotBetween.class
        : (hasDynamicValues ? FilterStringColumnBetweenDynamicValue.class
            : FilterStringColumnBetween.class);
  } else if (varcharTypePattern.matcher(colType).matches()) {
    cl = notKeywordPresent ? FilterVarCharColumnNotBetween.class
        : (hasDynamicValues ? FilterVarCharColumnBetweenDynamicValue.class
            : FilterVarCharColumnBetween.class);
  } else if (charTypePattern.matcher(colType).matches()) {
    cl = notKeywordPresent ? FilterCharColumnNotBetween.class
        : (hasDynamicValues ? FilterCharColumnBetweenDynamicValue.class
            : FilterCharColumnBetween.class);
  } else if (colType.equals("timestamp")) {
    cl = notKeywordPresent ? FilterTimestampColumnNotBetween.class
        : (hasDynamicValues ? FilterTimestampColumnBetweenDynamicValue.class
            : FilterTimestampColumnBetween.class);
  } else if (isDecimalFamily(colType)) {
    cl = notKeywordPresent ? FilterDecimalColumnNotBetween.class
        : (hasDynamicValues ? FilterDecimalColumnBetweenDynamicValue.class
            : FilterDecimalColumnBetween.class);
  } else if (isDateFamily(colType)) {
    // Dates share the long BETWEEN classes except for the dynamic-value variant.
    cl = notKeywordPresent ? FilterLongColumnNotBetween.class
        : (hasDynamicValues ? FilterDateColumnBetweenDynamicValue.class
            : FilterLongColumnBetween.class);
  }

  if (cl == null) {
    // No BETWEEN filter class exists for this common type. Return null like the other
    // "can't vectorize" exits instead of handing a null class to createVectorExpression.
    return null;
  }
  return createVectorExpression(cl, childrenAfterNot, VectorExpressionDescriptor.Mode.PROJECTION, returnType);
}
/**
 * Tells whether the expression is a column reference, or a constant whose type string is
 * not "void" (compared ignoring case).
 */
private boolean isColumnOrNonNullConst(ExprNodeDesc exprNodeDesc) {
  if (exprNodeDesc instanceof ExprNodeColumnDesc) {
    return true;
  }
  if (!(exprNodeDesc instanceof ExprNodeConstantDesc)) {
    return false;
  }
  final boolean isVoidTyped = exprNodeDesc.getTypeString().equalsIgnoreCase("void");
  return !isVoidTyped;
}
/**
 * Vectorizes the two-value form of CASE WHEN by treating it as an IF.
 *
 * When we have 2 simple values:
 *     CASE WHEN boolExpr THEN column | const ELSE column | const END
 * then we can convert to:  IF (boolExpr THEN column | const ELSE column | const)
 *
 * @param childExpr WHEN arguments; exactly three are required (condition, THEN, ELSE)
 * @param mode must be PROJECTION
 * @param returnType type of the result
 * @return the IF expression, or null when the shape is not handled by vector classes yet
 */
private VectorExpression getWhenExpression(List<ExprNodeDesc> childExpr,
    VectorExpressionDescriptor.Mode mode, TypeInfo returnType) throws HiveException {

  // For now, we only optimize the 2 value case, and only in projection mode.
  if (mode != VectorExpressionDescriptor.Mode.PROJECTION || childExpr.size() != 3) {
    return null;
  }

  // CONSIDER: Adding a version of IfExpr* than can handle a non-column/const expression in the
  //           THEN or ELSE.
  final ExprNodeDesc thenExpr = childExpr.get(1);
  final ExprNodeDesc elseExpr = childExpr.get(2);
  if (!isColumnOrNonNullConst(thenExpr) || !isColumnOrNonNullConst(elseExpr)) {
    return null;   // Not handled by vector classes yet.
  }

  return getVectorExpressionForUdf(
      new GenericUDFIf(), GenericUDFIf.class, childExpr, mode, returnType);
}
/*
 * Return vector expression for a custom (i.e. not built-in) UDF.
 *
 * The UDF is wrapped in a VectorUDFAdaptor. Function-call and dynamic-value arguments are
 * vectorized as child expressions and passed by output column, column arguments by input
 * column index, and constants as constants. In FILTER mode the result type must be BOOLEAN
 * (otherwise null is returned) and the adaptor is wrapped in a SelectColumnIsTrue.
 */
private VectorExpression getCustomUDFExpression(ExprNodeGenericFuncDesc expr, VectorExpressionDescriptor.Mode mode)
    throws HiveException {

  final boolean isFilter = (mode == VectorExpressionDescriptor.Mode.FILTER);
  if (isFilter) {
    // Is output type a BOOLEAN?
    final TypeInfo resultTypeInfo = expr.getTypeInfo();
    final boolean producesBoolean = resultTypeInfo.getCategory() == Category.PRIMITIVE
        && ((PrimitiveTypeInfo) resultTypeInfo).getPrimitiveCategory() == PrimitiveCategory.BOOLEAN;
    if (!producesBoolean) {
      return null;
    }
  }

  final List<ExprNodeDesc> udfArgs = expr.getChildren();

  // argument descriptors, one per UDF argument
  final VectorUDFArgDesc[] argDescs = new VectorUDFArgDesc[udfArgs.size()];

  // Column numbers of batch corresponding to expression result arguments
  final List<Integer> exprResultColumnNums = new ArrayList<Integer>();

  // Vectorized children, in the same order as exprResultColumnNums
  final List<VectorExpression> childVectorExprs = new ArrayList<VectorExpression>();

  for (int i = 0; i < argDescs.length; i++) {
    final ExprNodeDesc arg = udfArgs.get(i);
    final VectorUDFArgDesc argDesc = new VectorUDFArgDesc();
    argDescs[i] = argDesc;

    if (arg instanceof ExprNodeGenericFuncDesc || arg instanceof ExprNodeDynamicValueDesc) {
      // Non-leaf argument: vectorize it and read its value from its output column.
      final VectorExpression childVE =
          getVectorExpression(arg, VectorExpressionDescriptor.Mode.PROJECTION);
      childVectorExprs.add(childVE);
      final int childOutputCol = childVE.getOutputColumn();
      exprResultColumnNums.add(childOutputCol);
      argDesc.setVariable(childOutputCol);
    } else if (arg instanceof ExprNodeColumnDesc) {
      argDesc.setVariable(getInputColumnIndex(((ExprNodeColumnDesc) arg).getColumn()));
    } else if (arg instanceof ExprNodeConstantDesc) {
      // this is a constant (or null)
      argDesc.setConstant((ExprNodeConstantDesc) arg);
    } else {
      throw new HiveException("Unable to vectorize custom UDF. Encountered unsupported expr desc : "
          + arg);
    }
  }

  // Allocate output column and get column number;
  final String resultTypeName = expr.getTypeInfo().getTypeName();
  final int outputCol = ocm.allocateOutputColumn(expr.getTypeInfo());

  // Make vectorized operator
  final VectorExpression adaptor = new VectorUDFAdaptor(expr, outputCol, resultTypeName, argDescs);

  // Set child expressions (null when every argument is a column or constant)
  final VectorExpression[] childVEs = childVectorExprs.isEmpty()
      ? null
      : childVectorExprs.toArray(new VectorExpression[childVectorExprs.size()]);
  adaptor.setChildExpressions(childVEs);

  // Free output columns if inputs have non-leaf expression trees.
  for (Integer columnNum : exprResultColumnNums) {
    ocm.freeOutputColumn(columnNum);
  }

  if (!isFilter) {
    return adaptor;
  }
  final SelectColumnIsTrue filterVectorExpr = new SelectColumnIsTrue(outputCol);
  filterVectorExpr.setChildExpressions(new VectorExpression[] {adaptor});
  return filterVectorExpr;
}
/**
 * Returns true when the type name is "string" or "string_family" (ignoring case), or matches
 * charVarcharTypePattern.
 */
public static boolean isStringFamily(String resultType) {
  if (resultType.equalsIgnoreCase("string")) {
    return true;
  }
  if (charVarcharTypePattern.matcher(resultType).matches()) {
    return true;
  }
  return resultType.equalsIgnoreCase("string_family");
}
/** Returns true when the type name is "timestamp" or "date", ignoring case. */
public static boolean isDatetimeFamily(String resultType) {
  if (resultType.equalsIgnoreCase("timestamp")) {
    return true;
  }
  return resultType.equalsIgnoreCase("date");
}
/** Returns true when the type name is "timestamp", ignoring case. */
public static boolean isTimestampFamily(String resultType) {
  final String timestampTypeName = "timestamp";
  return resultType.equalsIgnoreCase(timestampTypeName);
}
/** Returns true when the type name is "date", ignoring case. */
public static boolean isDateFamily(String resultType) {
  final String dateTypeName = "date";
  return resultType.equalsIgnoreCase(dateTypeName);
}
/** Returns true when the type name is "interval_year_month", ignoring case. */
public static boolean isIntervalYearMonthFamily(String resultType) {
  final String intervalYearMonthTypeName = "interval_year_month";
  return resultType.equalsIgnoreCase(intervalYearMonthTypeName);
}
/** Returns true when the type name is "interval_day_time", ignoring case. */
public static boolean isIntervalDayTimeFamily(String resultType) {
  final String intervalDayTimeTypeName = "interval_day_time";
  return resultType.equalsIgnoreCase(intervalDayTimeTypeName);
}
/** Returns true if this is any kind of float: "double" or "float", ignoring case. */
public static boolean isFloatFamily(String resultType) {
  if (resultType.equalsIgnoreCase("double")) {
    return true;
  }
  return resultType.equalsIgnoreCase("float");
}
/**
 * Returns true if this data type is handled in the output vector as an integer: "tinyint",
 * "smallint", "int", "bigint", "boolean" or "long", ignoring case.
 */
public static boolean isIntFamily(String resultType) {
  final String[] intFamilyNames = {"tinyint", "smallint", "int", "bigint", "boolean", "long"};
  for (String name : intFamilyNames) {
    if (resultType.equalsIgnoreCase(name)) {
      return true;
    }
  }
  return false;
}
/** Returns true when the type name matches decimalTypePattern. */
public static boolean isDecimalFamily(String colType) {
  final boolean matchesDecimalPattern = decimalTypePattern.matcher(colType).matches();
  return matchesDecimalPattern;
}
/**
 * Returns the value of a constant in the form used when building vector expressions.
 *
 * @param constDesc the constant to read
 * @return the UTF-8 bytes for a "string" constant, Integer 1 or 0 for a "boolean" constant,
 *         and the constant's own value for every other type (decimals included)
 * @throws HiveException wrapping any exception raised while encoding a string constant
 */
private Object getScalarValue(ExprNodeConstantDesc constDesc)
    throws HiveException {
  final String typeString = constDesc.getTypeString();

  if (typeString.equalsIgnoreCase("String")) {
    try {
      return ((String) constDesc.getValue()).getBytes("UTF-8");
    } catch (Exception ex) {
      throw new HiveException(ex);
    }
  }

  if (typeString.equalsIgnoreCase("boolean")) {
    final boolean isTrue = constDesc.getValue().equals(Boolean.TRUE);
    return isTrue ? 1 : 0;
  }

  return constDesc.getValue();
}
/**
 * Returns the value of an integer-family constant widened to a long.
 *
 * Byte, Short, Integer and Long values are accepted; all widen to long without loss.
 *
 * @param constDesc the constant to read
 * @return the constant's value as a long
 * @throws HiveException if the scalar value is null or is not one of the accepted types
 */
private long getIntFamilyScalarAsLong(ExprNodeConstantDesc constDesc)
    throws HiveException {
  Object o = getScalarValue(constDesc);
  if (o instanceof Long || o instanceof Integer || o instanceof Short || o instanceof Byte) {
    return ((Number) o).longValue();
  }
  // Guard the null case so the caller gets a HiveException rather than a
  // NullPointerException from o.getClass().
  String foundType = (o == null) ? "null" : o.getClass().getSimpleName();
  throw new HiveException("Unexpected type when converting to long : " + foundType);
}
/**
 * Returns the value of a numeric constant widened to a double.
 *
 * @param constDesc the constant to read; it is cast to ExprNodeConstantDesc
 * @return the constant's value as a double
 * @throws HiveException if the scalar value is not a Double, Float, Integer or Long
 */
private double getNumericScalarAsDouble(ExprNodeDesc constDesc)
    throws HiveException {
  final Object scalar = getScalarValue((ExprNodeConstantDesc) constDesc);
  final boolean supported = scalar instanceof Double
      || scalar instanceof Float
      || scalar instanceof Integer
      || scalar instanceof Long;
  if (!supported) {
    throw new HiveException("Unexpected type when converting to double");
  }
  return ((Number) scalar).doubleValue();
}
/**
 * Returns the scalar value of a constant, converted for the vector type of the constant.
 *
 * <ul>
 * <li>DATE: the result of {@code DateWritable.dateToDays} on the value, boxed as a Long.</li>
 * <li>INTERVAL_YEAR_MONTH: the result of {@code getTotalMonths()} on the value.</li>
 * <li>Any other type: the value returned by {@code getScalarValue}, unchanged.</li>
 * </ul>
 *
 * @param constDesc the constant to read
 * @return the converted scalar value
 * @throws HiveException if {@code getScalarValue} fails
 */
private Object getVectorTypeScalarValue(ExprNodeConstantDesc constDesc) throws HiveException {
  String t = constDesc.getTypeInfo().getTypeName();
  VectorExpression.Type type = VectorExpression.Type.getValue(t);
  Object scalarValue = getScalarValue(constDesc);
  switch (type) {
  case DATE:
    // Long.valueOf replaces the deprecated boxing constructor new Long(...).
    long days = DateWritable.dateToDays((Date) scalarValue);
    return Long.valueOf(days);
  case INTERVAL_YEAR_MONTH:
    return ((HiveIntervalYearMonth) scalarValue).getTotalMonths();
  default:
    return scalarValue;
  }
}
/**
 * Gets a timestamp from a string constant or cast.
 *
 * @param expr either a generic function expression whose UDF is {@code GenericUDFTimestamp},
 *          or a constant for which {@code isStringFamily} or {@code isDatetimeFamily} holds
 * @return the timestamp the expression evaluates to
 * @throws HiveException if the expression is of neither form or the evaluation fails
 */
private Timestamp getTimestampScalar(ExprNodeDesc expr) throws HiveException {
  // An expression that already is a cast to timestamp is evaluated directly.
  if (expr instanceof ExprNodeGenericFuncDesc) {
    ExprNodeGenericFuncDesc funcExpr = (ExprNodeGenericFuncDesc) expr;
    if (funcExpr.getGenericUDF() instanceof GenericUDFTimestamp) {
      return evaluateCastToTimestamp(expr);
    }
  }

  // Anything else has to be a constant.
  if (!(expr instanceof ExprNodeConstantDesc)) {
    throw new HiveException("Constant timestamp value expected for expression argument. " +
        "Non-constant argument not supported for vectorization.");
  }

  String constantType = ((ExprNodeConstantDesc) expr).getTypeString();
  boolean castable = isStringFamily(constantType) || isDatetimeFamily(constantType);
  if (!castable) {
    throw new HiveException("Udf: unhandled constant type for scalar argument. "
        + "Expecting string/date/timestamp.");
  }

  // Wrap the constant in a cast-to-timestamp expression, then initialize and evaluate it.
  ExprNodeGenericFuncDesc castExpr = new ExprNodeGenericFuncDesc();
  castExpr.setGenericUDF(new GenericUDFTimestamp());
  ArrayList<ExprNodeDesc> castArguments = new ArrayList<ExprNodeDesc>();
  castArguments.add(expr);
  castExpr.setChildren(castArguments);
  return evaluateCastToTimestamp(castExpr);
}
/**
 * Evaluates a generic function expression and returns the result as a {@code Timestamp}.
 *
 * The evaluator is initialized and evaluated with null arguments, and the result is copied
 * to a standard Java object before it is checked.
 *
 * @param expr the expression to evaluate; it is cast to {@code ExprNodeGenericFuncDesc}
 * @return the evaluated timestamp
 * @throws HiveException if the evaluation fails or the result is not a {@code Timestamp}
 */
private Timestamp evaluateCastToTimestamp(ExprNodeDesc expr) throws HiveException {
  final ExprNodeEvaluator castEvaluator =
      ExprNodeEvaluatorFactory.get((ExprNodeGenericFuncDesc) expr);
  final ObjectInspector resultInspector = castEvaluator.initialize(null);
  final Object evaluated = castEvaluator.evaluate(null);
  final Object javaValue =
      ObjectInspectorUtils.copyToStandardJavaObject(evaluated, resultInspector);
  if (javaValue instanceof Timestamp) {
    return (Timestamp) javaValue;
  }
  throw new HiveException("Udf: failed to convert to timestamp");
}
/**
 * Picks the constructor to use for a class.
 *
 * If the class declares exactly one constructor, that one is returned. Otherwise the public
 * no-argument constructor is looked up and the first declared constructor that differs from
 * it is returned.
 *
 * @param cl the class to inspect
 * @return the selected constructor
 * @throws HiveException wrapping any exception raised during the lookup, including the case
 *           where no public no-argument constructor exists
 */
private Constructor<?> getConstructor(Class<?> cl) throws HiveException {
  try {
    final Constructor<?>[] declared = cl.getDeclaredConstructors();
    if (declared.length == 1) {
      return declared[0];
    }
    final Constructor<?> noArgCtor = cl.getConstructor();
    for (int i = 0; i < declared.length; i++) {
      final Constructor<?> candidate = declared[i];
      if (!candidate.equals(noArgCtor)) {
        return candidate;
      }
    }
    throw new HiveException("Only default constructor found");
  } catch (Exception ex) {
    throw new HiveException(ex);
  }
}
/**
 * Returns the scratch column type name for a type.
 *
 * @param typeInfo the type to name
 * @return the full type name for DECIMAL primitives; otherwise the lower-cased name of the
 *         column vector type from {@code getColumnVectorTypeFromTypeInfo}
 * @throws HiveException declared for callers; not thrown by the code in this method
 */
static String getScratchName(TypeInfo typeInfo) throws HiveException {
  // For now, leave DECIMAL precision/scale in the name so DecimalColumnVector scratch columns
  // don't need their precision/scale adjusted...
  if (typeInfo.getCategory() == Category.PRIMITIVE &&
      ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory() == PrimitiveCategory.DECIMAL) {
    return typeInfo.getTypeName();
  }
  Type columnVectorType = VectorizationContext.getColumnVectorTypeFromTypeInfo(typeInfo);
  // Lower-case with Locale.ROOT: under a default locale such as Turkish, the no-argument
  // toLowerCase() maps 'I' to a dotless i and would corrupt names like TIMESTAMP or DECIMAL.
  return columnVectorType.name().toLowerCase(java.util.Locale.ROOT);
}
/**
 * Maps a Hive type name to a fixed name chosen by its vector argument type.
 *
 * @param hiveTypeName the Hive type name to map
 * @return "Long", "Double", "Decimal", "String", "Char", "VarChar", "Binary", "Date" or
 *         "Timestamp" depending on the argument type; for the two interval types the given
 *         {@code hiveTypeName} is returned unchanged
 * @throws HiveException if the argument type of the name is none of the handled ones
 */
static String getUndecoratedName(String hiveTypeName) throws HiveException {
  final VectorExpressionDescriptor.ArgumentType family =
      VectorExpressionDescriptor.ArgumentType.fromHiveTypeName(hiveTypeName);
  final String undecorated;
  switch (family) {
  case INT_FAMILY:
    undecorated = "Long";
    break;
  case FLOAT_FAMILY:
    undecorated = "Double";
    break;
  case DECIMAL:
    undecorated = "Decimal";
    break;
  case STRING:
    undecorated = "String";
    break;
  case CHAR:
    undecorated = "Char";
    break;
  case VARCHAR:
    undecorated = "VarChar";
    break;
  case BINARY:
    undecorated = "Binary";
    break;
  case DATE:
    undecorated = "Date";
    break;
  case TIMESTAMP:
    undecorated = "Timestamp";
    break;
  case INTERVAL_YEAR_MONTH:
  case INTERVAL_DAY_TIME:
    // Interval types keep the name they came in with.
    undecorated = hiveTypeName;
    break;
  default:
    throw new HiveException("Unexpected hive type name " + hiveTypeName);
  }
  return undecorated;
}
/**
 * Lower-cases a type name and replaces the synonyms "long" and "string_family".
 *
 * @param typeName the type name to normalize, in any letter case
 * @return "bigint" for "long", "string" for "string_family", otherwise the lower-cased name
 */
public static String mapTypeNameSynonyms(String typeName) {
  // Locale.ROOT keeps the result independent of the JVM default locale; with a Turkish
  // default, the no-argument toLowerCase() turns 'I' into a dotless i, so "STRING_FAMILY"
  // would not match below and names such as "BIGINT" would come back corrupted.
  final String normalized = typeName.toLowerCase(java.util.Locale.ROOT);
  if (normalized.equals("long")) {
    return "bigint";
  }
  if (normalized.equals("string_family")) {
    return "string";
  }
  return normalized;
}
/**
 * Returns the column vector type used for a type.
 *
 * STRUCT, UNION, LIST and MAP map to the column vector type of the same name. Primitive
 * types map by primitive category: boolean, the integer categories, DATE and
 * INTERVAL_YEAR_MONTH to LONG; FLOAT and DOUBLE to DOUBLE; STRING, CHAR, VARCHAR and BINARY
 * to BYTES; TIMESTAMP, INTERVAL_DAY_TIME and DECIMAL to the type of the same name.
 *
 * @param typeInfo the type to map
 * @return the matching column vector type
 * @throws RuntimeException if the category or the primitive category is not handled
 */
public static ColumnVector.Type getColumnVectorTypeFromTypeInfo(TypeInfo typeInfo) {
  // Complex categories are answered right here; only PRIMITIVE continues to the second switch.
  switch (typeInfo.getCategory()) {
  case PRIMITIVE:
    break;
  case STRUCT:
    return Type.STRUCT;
  case UNION:
    return Type.UNION;
  case LIST:
    return Type.LIST;
  case MAP:
    return Type.MAP;
  default:
    throw new RuntimeException("Unexpected type category " +
        typeInfo.getCategory());
  }

  final PrimitiveCategory primitiveCategory =
      ((PrimitiveTypeInfo) typeInfo).getPrimitiveCategory();
  switch (primitiveCategory) {
  case DECIMAL:
    return ColumnVector.Type.DECIMAL;
  case TIMESTAMP:
    return ColumnVector.Type.TIMESTAMP;
  case INTERVAL_DAY_TIME:
    return ColumnVector.Type.INTERVAL_DAY_TIME;
  case FLOAT:
  case DOUBLE:
    return ColumnVector.Type.DOUBLE;
  case STRING:
  case CHAR:
  case VARCHAR:
  case BINARY:
    return ColumnVector.Type.BYTES;
  case BOOLEAN:
  case BYTE:
  case SHORT:
  case INT:
  case LONG:
  case DATE:
  case INTERVAL_YEAR_MONTH:
    return ColumnVector.Type.LONG;
  default:
    throw new RuntimeException("Unexpected primitive type category " + primitiveCategory);
  }
}
/*
* In the aggregatesDefinition table, Mode is GenericUDAFEvaluator.Mode.
*
* It is the different modes for an aggregate UDAF (User Defined Aggregation Function).
*
* (Notice the these names are a subset of GroupByDesc.Mode...)
*
* PARTIAL1 Original data --> Partial aggregation data
*
* PARTIAL2 Partial aggregation data --> Partial aggregation data
*
* FINAL Partial aggregation data --> Full aggregation data
*
* COMPLETE Original data --> Full aggregation data
*
*
* SIMPLEST CASE --> The data type/semantics of original data, partial aggregation
* data, and full aggregation data ARE THE SAME. E.g. MIN, MAX, SUM. The different
* modes can be handled by one aggregation class.
*
* This case has a null for the Mode.
*
* FOR OTHERS --> The data type/semantics of partial aggregation data and full aggregation data
* ARE THE SAME but different than original data. This results in 2 aggregation classes:
*
* 1) A class that takes original rows and outputs partial/full aggregation
* (PARTIAL1/COMPLETE)
*
* and
*
* 2) A class that takes partial aggregation and produces full aggregation
* (PARTIAL2/FINAL).
*
* E.g. COUNT(*) and COUNT(column)
*
* OTHERWISE FULL --> The data type/semantics of partial aggregation data is different than
* original data and full aggregation data.
*
* E.g. AVG uses a STRUCT with count and sum for partial aggregation data. It divides
* sum by count to produce the average for final aggregation.
*
*/
static ArrayList<AggregateDefinition> aggregatesDefinition = new ArrayList<AggregateDefinition>() {{
// MIN, MAX, and SUM have the same representation for partial and full aggregation, so the
// same class can be used for all modes (PARTIAL1, PARTIAL2, FINAL, and COMPLETE).
add(new AggregateDefinition("min", ArgumentType.INT_DATE_INTERVAL_YEAR_MONTH, null, VectorUDAFMinLong.class));
add(new AggregateDefinition("min", ArgumentType.FLOAT_FAMILY, null, VectorUDAFMinDouble.class));
add(new AggregateDefinition("min", ArgumentType.STRING_FAMILY, null, VectorUDAFMinString.class));
add(new AggregateDefinition("min", ArgumentType.DECIMAL, null, VectorUDAFMinDecimal.class));
add(new AggregateDefinition("min", ArgumentType.TIMESTAMP, null, VectorUDAFMinTimestamp.class));
add(new AggregateDefinition("max", ArgumentType.INT_DATE_INTERVAL_YEAR_MONTH, null, VectorUDAFMaxLong.class));
add(new AggregateDefinition("max", ArgumentType.FLOAT_FAMILY, null, VectorUDAFMaxDouble.class));
add(new AggregateDefinition("max", ArgumentType.STRING_FAMILY, null, VectorUDAFMaxString.class));
add(new AggregateDefinition("max", ArgumentType.DECIMAL, null, VectorUDAFMaxDecimal.class));
add(new AggregateDefinition("max", ArgumentType.TIMESTAMP, null, VectorUDAFMaxTimestamp.class));
add(new AggregateDefinition("sum", ArgumentType.INT_FAMILY, null, VectorUDAFSumLong.class));
add(new AggregateDefinition("sum", ArgumentType.FLOAT_FAMILY, null, VectorUDAFSumDouble.class));
add(new AggregateDefinition("sum", ArgumentType.DECIMAL, null, VectorUDAFSumDecimal.class));
// COUNT(column) doesn't count rows whose column value is NULL.
add(new AggregateDefinition("count", ArgumentType.ALL_FAMILY, Mode.PARTIAL1, VectorUDAFCount.class));
add(new AggregateDefinition("count", ArgumentType.ALL_FAMILY, Mode.COMPLETE, VectorUDAFCount.class));
// COUNT(*) counts all rows regardless of whether the column value(s) are NULL.
add(new AggregateDefinition("count", ArgumentType.NONE, Mode.PARTIAL1, VectorUDAFCountStar.class));
add(new AggregateDefinition("count", ArgumentType.NONE, Mode.COMPLETE, VectorUDAFCountStar.class));
// Merge the counts produced by either COUNT(column) or COUNT(*) modes PARTIAL1 or PARTIAL2.
add(new AggregateDefinition("count", ArgumentType.INT_FAMILY, Mode.PARTIAL2, VectorUDAFCountMerge.class));
add(new AggregateDefinition("count", ArgumentType.INT_FAMILY, Mode.FINAL, VectorUDAFCountMerge.class));
// Since the partial aggregation produced by AVG is a STRUCT with count and sum and the
// STRUCT data type isn't vectorized yet, we currently only support PARTIAL1. When we do
// support STRUCTs for average partial aggregation, we'll need 4 variations:
//
// PARTIAL1 Original data --> STRUCT Average Partial Aggregation
// PARTIAL2 STRUCT Average Partial Aggregation --> STRUCT Average Partial Aggregation
// FINAL STRUCT Average Partial Aggregation --> Full Aggregation
// COMPLETE Original data --> Full Aggregation
//
add(new AggregateDefinition("avg", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFAvgLong.class));
add(new AggregateDefinition("avg", ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFAvgDouble.class));
add(new AggregateDefinition("avg", ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFAvgDecimal.class));
add(new AggregateDefinition("avg", ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFAvgTimestamp.class));
// We haven't had a chance to examine the VAR* and STD* area and expand it beyond PARTIAL1.
add(new AggregateDefinition("variance", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFVarPopLong.class));
add(new AggregateDefinition("var_pop", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFVarPopLong.class));
add(new AggregateDefinition("variance", ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFVarPopDouble.class));
add(new AggregateDefinition("var_pop", ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFVarPopDouble.class));
add(new AggregateDefinition("variance", ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFVarPopDecimal.class));
add(new AggregateDefinition("var_pop", ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFVarPopDecimal.class));
add(new AggregateDefinition("variance", ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFVarPopTimestamp.class));
add(new AggregateDefinition("var_pop", ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFVarPopTimestamp.class));
add(new AggregateDefinition("var_samp", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFVarSampLong.class));
add(new AggregateDefinition("var_samp" , ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFVarSampDouble.class));
add(new AggregateDefinition("var_samp" , ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFVarSampDecimal.class));
add(new AggregateDefinition("var_samp" , ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFVarSampTimestamp.class));
add(new AggregateDefinition("std", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFStdPopLong.class));
add(new AggregateDefinition("stddev", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFStdPopLong.class));
add(new AggregateDefinition("stddev_pop", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFStdPopLong.class));
add(new AggregateDefinition("std", ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFStdPopDouble.class));
add(new AggregateDefinition("stddev", ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFStdPopDouble.class));
add(new AggregateDefinition("stddev_pop", ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFStdPopDouble.class));
add(new AggregateDefinition("std", ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFStdPopDecimal.class));
add(new AggregateDefinition("stddev", ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFStdPopDecimal.class));
add(new AggregateDefinition("stddev_pop", ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFStdPopDecimal.class));
add(new AggregateDefinition("std", ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFStdPopTimestamp.class));
add(new AggregateDefinition("stddev", ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFStdPopTimestamp.class));
add(new AggregateDefinition("stddev_pop", ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFStdPopTimestamp.class));
add(new AggregateDefinition("stddev_samp", ArgumentType.INT_FAMILY, Mode.PARTIAL1, VectorUDAFStdSampLong.class));
add(new AggregateDefinition("stddev_samp", ArgumentType.FLOAT_FAMILY, Mode.PARTIAL1, VectorUDAFStdSampDouble.class));
add(new AggregateDefinition("stddev_samp", ArgumentType.DECIMAL, Mode.PARTIAL1, VectorUDAFStdSampDecimal.class));
add(new AggregateDefinition("stddev_samp", ArgumentType.TIMESTAMP, Mode.PARTIAL1, VectorUDAFStdSampTimestamp.class));
// UDAFBloomFilter. Original data is one type, partial/final is another,
// so this requires 2 aggregation classes (partial1/complete), (partial2/final)
add(new AggregateDefinition("bloom_filter", ArgumentType.ALL_FAMILY, Mode.PARTIAL1, VectorUDAFBloomFilter.class));
add(new AggregateDefinition("bloom_filter", ArgumentType.ALL_FAMILY, Mode.COMPLETE, VectorUDAFBloomFilter.class));
add(new AggregateDefinition("bloom_filter", ArgumentType.BINARY, Mode.PARTIAL2, VectorUDAFBloomFilterMerge.class));
add(new AggregateDefinition("bloom_filter", ArgumentType.BINARY, Mode.FINAL, VectorUDAFBloomFilterMerge.class));
}};
/**
 * Creates the vector aggregate expression for an aggregation descriptor.
 *
 * Every parameter of the aggregation is vectorized in PROJECTION mode. The aggregatesDefinition
 * table is then searched in order for the first entry whose name matches (ignoring case),
 * whose argument type accepts the type of the first parameter (or is NONE when there are no
 * parameters), and whose mode is either null or equal to the descriptor's mode. The class of
 * that entry is instantiated with the first vectorized parameter (null when there is none)
 * and initialized with the descriptor.
 *
 * @param desc the aggregation to vectorize
 * @return the initialized vector aggregate expression
 * @throws HiveException if the first parameter has no vector argument type, if no table entry
 *           matches, or if creating or initializing the aggregate fails
 */
public VectorAggregateExpression getAggregatorExpression(AggregationDesc desc)
    throws HiveException {
  final ArrayList<ExprNodeDesc> parameters = desc.getParameters();
  final VectorExpression[] parameterExpressions = new VectorExpression[parameters.size()];
  int slot = 0;
  for (ExprNodeDesc parameter : parameters) {
    parameterExpressions[slot++] =
        this.getVectorExpression(parameter, VectorExpressionDescriptor.Mode.PROJECTION);
  }

  final String aggregateName = desc.getGenericUDAFName();

  // Only the first parameter decides the argument type; no parameters at all means NONE.
  VectorExpressionDescriptor.ArgumentType inputType = VectorExpressionDescriptor.ArgumentType.NONE;
  if (!parameters.isEmpty()) {
    final String firstTypeString = parameters.get(0).getTypeString();
    inputType = VectorExpressionDescriptor.ArgumentType.fromHiveTypeName(firstTypeString);
    if (inputType == VectorExpressionDescriptor.ArgumentType.NONE) {
      throw new HiveException("No vector argument type for Hive type name " + firstTypeString);
    }
  }

  final GenericUDAFEvaluator.Mode udafEvaluatorMode = desc.getMode();

  for (AggregateDefinition candidate : aggregatesDefinition) {
    if (!aggregateName.equalsIgnoreCase(candidate.getName())) {
      continue;
    }
    final VectorExpressionDescriptor.ArgumentType candidateType = candidate.getType();
    final boolean bothNone = candidateType == VectorExpressionDescriptor.ArgumentType.NONE
        && inputType == VectorExpressionDescriptor.ArgumentType.NONE;
    if (!bothNone && !candidateType.isSameTypeOrFamily(inputType)) {
      continue;
    }

    // A null means all modes are ok.
    final GenericUDAFEvaluator.Mode candidateMode = candidate.getUdafEvaluatorMode();
    if (candidateMode != null && candidateMode != udafEvaluatorMode) {
      continue;
    }

    final Class<? extends VectorAggregateExpression> aggregateClass = candidate.getAggClass();
    try {
      final Constructor<? extends VectorAggregateExpression> aggregateCtor =
          aggregateClass.getConstructor(VectorExpression.class);
      final VectorExpression firstParameter =
          parameterExpressions.length > 0 ? parameterExpressions[0] : null;
      final VectorAggregateExpression aggregate = aggregateCtor.newInstance(firstParameter);
      aggregate.init(desc);
      return aggregate;
    } catch (Exception e) {
      throw new HiveException("Internal exception for vector aggregate : \"" +
          aggregateName + "\" for type: \"" + inputType + "", e);
    }
  }

  final String modeName = (udafEvaluatorMode == null ? "NULL" : udafEvaluatorMode.name());
  throw new HiveException("Vector aggregate not implemented: \"" + aggregateName +
      "\" for type: \"" + inputType.name() +
      " (UDAF evaluator mode = " +
      modeName + ")");
}
/**
 * Returns the value of the {@code firstOutputColumnIndex} field.
 *
 * @return the first output column index held by this context
 */
public int firstOutputColumnIndex() {
  return this.firstOutputColumnIndex;
}
/**
 * Returns the type names of the scratch columns, one per output column of {@code ocm}.
 *
 * The vector type names "bytes" and "long" (in any letter case) are replaced by the Hive
 * type names "string" and "bigint"; every other name is returned as stored.
 *
 * @return a new array of {@code ocm.outputColCount} type names
 */
public String[] getScratchColumnTypeNames() {
  final String[] hiveTypeNames = new String[ocm.outputColCount];
  for (int col = 0; col < ocm.outputColCount; col++) {
    String name = ocm.scratchVectorTypeNames[col];
    if (name.equalsIgnoreCase("bytes")) {
      // Use hive type name.
      name = "string";
    } else if (name.equalsIgnoreCase("long")) {
      // Use hive type name.
      name = "bigint";
    }
    hiveTypeNames[col] = name;
  }
  return hiveTypeNames;
}
/**
 * Describes this context: its name and level, the projection column map inverted and sorted
 * by column number, and the scratch column type names.
 *
 * @return the description, on a single line
 */
@Override
public String toString() {
  // Invert name -> column number into column number -> name; the TreeMap's natural Integer
  // ordering sorts the entries by column number.
  final Map<Integer, String> namesByColumn = new TreeMap<Integer, String>();
  for (Map.Entry<String, Integer> column : projectionColumnMap.entrySet()) {
    namesByColumn.put(column.getValue(), column.getKey());
  }

  final StringBuilder text = new StringBuilder(32);
  text.append("Context name ").append(contextName);
  text.append(", level ").append(level).append(", ");
  text.append("sorted projectionColumnMap ").append(namesByColumn).append(", ");
  text.append("scratchColumnTypeNames ").append(Arrays.toString(getScratchColumnTypeNames()));
  return text.toString();
}
}