/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.optimizer.pcr;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
import org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcFactory;
import org.apache.hadoop.hive.ql.optimizer.ppr.PartExprEvalUtils;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDynamicListDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Expression processor factory for partition condition removal. Each processor tries to
* calculate a result vector from its children's result vectors; each element of that
* vector is the result for one of the pruned partitions.
* It also produces a rewritten expression tree with the partition conditions removed.
*/
public final class PcrExprProcFactory {
/** Class logger; used for debug tracing while generic function nodes are processed. */
public static final Logger LOG = LoggerFactory.getLogger(PcrExprProcFactory.class.getName());
/**
 * Evaluates an expression against a single partition.
 *
 * The row object inspector is taken from the deserializer of the partition's table and
 * the actual evaluation is delegated to {@link PartExprEvalUtils#evalExprWithPart}.
 *
 * @param expr the expression to evaluate
 * @param p the partition supplying the values
 * @param vcs the virtual columns handed through to the evaluator
 * @return whatever the delegate evaluator returns for this partition
 * @throws SemanticException wrapping a {@link SerDeException} raised while obtaining the
 *         object inspector, or a {@link HiveException} raised by the evaluation itself
 */
static Object evalExprWithPart(ExprNodeDesc expr, Partition p, List<VirtualColumn> vcs)
    throws SemanticException {
  final Table owningTable = p.getTable();

  final StructObjectInspector rowInspector;
  try {
    rowInspector = (StructObjectInspector) owningTable.getDeserializer().getObjectInspector();
  } catch (SerDeException serdeFailure) {
    throw new SemanticException(serdeFailure);
  }

  try {
    return PartExprEvalUtils.evalExprWithPart(expr, p, vcs, rowInspector);
  } catch (HiveException evalFailure) {
    throw new SemanticException(evalFailure);
  }
}
/**
 * Determines whether every entry of a boolean result vector carries the same value.
 *
 * @param resultVector one (possibly null) result per partition
 * @return the first entry when all entries are non-null and equal to each other;
 *         {@code null} when the vector is empty, contains a null entry, or contains
 *         two entries that differ
 */
static Boolean ifResultsAgree(Boolean[] resultVector) {
  if (resultVector.length == 0) {
    return null;
  }
  final Boolean first = resultVector[0];
  if (first == null) {
    return null;
  }
  for (int i = 1; i < resultVector.length; i++) {
    final Boolean candidate = resultVector[i];
    if (candidate == null || !first.equals(candidate)) {
      return null;
    }
  }
  return first;
}
/**
 * Determines whether every entry of a result vector is equal to every other entry.
 *
 * Equality is decided by calling {@code equals} on the first entry.
 *
 * @param resultVector one (possibly null) result per partition
 * @return the first entry when all entries are non-null and equal to it;
 *         {@code null} when the vector is empty, contains a null entry, or contains
 *         an entry that differs from the first
 */
static Object ifResultsAgree(Object[] resultVector) {
  if (resultVector.length == 0) {
    return null;
  }
  final Object first = resultVector[0];
  if (first == null) {
    return null;
  }
  for (int i = 1; i < resultVector.length; i++) {
    final Object candidate = resultVector[i];
    if (candidate == null || !first.equals(candidate)) {
      return null;
    }
  }
  return first;
}
/**
 * Turns a per-partition boolean result vector into the wrapper for a function node.
 *
 * When the partitions do not agree (see {@code ifResultsAgree}) the node stays in the
 * tree: it is marked {@code DIVIDED}, keeps the vector, and gets its children replaced
 * via {@code getOutExpr}. When they all agree, the node is replaced by a boolean
 * constant of the function's type and marked {@code TRUE} or {@code FALSE}.
 *
 * @param results one result per partition
 * @param fd the function node being processed
 * @param nodeOutputs wrappers of the node's children
 * @return the wrapper describing the node
 */
static NodeInfoWrapper getResultWrapFromResults(Boolean[] results,
    ExprNodeGenericFuncDesc fd, Object[] nodeOutputs) {
  final Boolean agreed = ifResultsAgree(results);
  if (agreed == null) {
    // No common answer: keep the (child-adjusted) function together with its vector.
    return new NodeInfoWrapper(WalkState.DIVIDED, results, getOutExpr(fd, nodeOutputs));
  }

  final boolean verdict = agreed.booleanValue();
  final WalkState state = verdict ? WalkState.TRUE : WalkState.FALSE;
  final Boolean constantValue = verdict ? Boolean.TRUE : Boolean.FALSE;
  return new NodeInfoWrapper(state, null,
      new ExprNodeConstantDesc(fd.getTypeInfo(), constantValue));
}
/**
 * Private constructor: the class is not meant to be instantiated.
 */
private PcrExprProcFactory() {
// prevent instantiation
}
/**
 * Three-valued logical AND over any number of operands.
 *
 * An operand may be null, for example when someone forgets to quote a string as in
 * {@code select * from some_table where ds > 2012-12-1 and ds < 2012-12-2}.
 *
 * @param ops the operands; individual entries may be null
 * @return {@code FALSE} if any operand is false; otherwise {@code null} if any operand
 *         is null; otherwise {@code TRUE} (which includes the case of no operands)
 */
static Boolean opAnd(Boolean... ops) {
  boolean sawNull = false;
  for (int i = 0; i < ops.length; i++) {
    final Boolean operand = ops[i];
    if (operand == null) {
      sawNull = true;
    } else if (!operand.booleanValue()) {
      // A single false decides the conjunction, whatever the other operands are.
      return Boolean.FALSE;
    }
  }
  return sawNull ? null : Boolean.TRUE;
}
/**
 * Three-valued logical OR over any number of operands.
 *
 * An operand may be null, for example when someone forgets to quote a string as in
 * {@code select * from some_table where ds > 2012-12-1 or ds < 2012-12-2}.
 *
 * @param ops the operands; individual entries may be null
 * @return {@code TRUE} if any operand is true; otherwise {@code null} if any operand
 *         is null; otherwise {@code FALSE} (which includes the case of no operands)
 */
static Boolean opOr(Boolean... ops) {
  boolean sawNull = false;
  for (int i = 0; i < ops.length; i++) {
    final Boolean operand = ops[i];
    if (operand == null) {
      sawNull = true;
    } else if (operand.booleanValue()) {
      // A single true decides the disjunction, whatever the other operands are.
      return Boolean.TRUE;
    }
  }
  return sawNull ? null : Boolean.FALSE;
}
/**
 * Three-valued logical NOT.
 *
 * The operand may be null, for example when someone forgets to quote a string as in
 * {@code select * from some_table where not ds > 2012-12-1}.
 *
 * @param op the operand, possibly null
 * @return {@code null} for a null operand, otherwise the negated value
 */
static Boolean opNot(Boolean op) {
  if (op == null) {
    return null;
  }
  return op.booleanValue() ? Boolean.FALSE : Boolean.TRUE;
}
public enum WalkState {
PART_COL, TRUE, FALSE, CONSTANT, UNKNOWN, DIVIDED, PART_COL_STRUCT
}
/**
 * Result produced by a processor for one expression node: the node's walk state, an
 * optional per-partition result vector, and the expression that should stand in for the
 * node in the rewritten tree.
 */
public static class NodeInfoWrapper {
  /** Walk state of the node. */
  WalkState state;
  /** One result per partition; the processors in this file pass null unless the state is DIVIDED. */
  public Boolean[] ResultVector;
  /** Expression to use for this node in the rewritten tree. */
  public ExprNodeDesc outExpr;

  /**
   * @param state walk state of the node
   * @param resultVector per-partition results, or null when there is no vector
   * @param outExpr expression to use for this node in the rewritten tree
   */
  public NodeInfoWrapper(WalkState state, Boolean[] resultVector, ExprNodeDesc outExpr) {
    this.state = state;
    this.ResultVector = resultVector;
    this.outExpr = outExpr;
  }
}
/**
 * Processor for column expressions.
 *
 * A column is reported as {@code PART_COL} when its table alias matches (ignoring case)
 * the alias held by the walk context and the column is flagged as a partition or virtual
 * column; every other column is reported as {@code UNKNOWN}. The column node itself is
 * always kept as the output expression.
 */
public static class ColumnExprProcessor implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    final ExprNodeColumnDesc column = (ExprNodeColumnDesc) nd;
    final PcrExprProcCtx walkCtx = (PcrExprProcCtx) procCtx;

    final boolean isPartColOfTable =
        column.getTabAlias().equalsIgnoreCase(walkCtx.getTabAlias())
            && column.getIsPartitionColOrVirtualCol();

    final WalkState state = isPartColOfTable ? WalkState.PART_COL : WalkState.UNKNOWN;
    return new NodeInfoWrapper(state, null, column);
  }
}
/**
 * Replaces the children of a function node with the output expressions of its
 * processed children.
 *
 * Note that {@code funcExpr} is modified in place and returned; no copy is made.
 *
 * @param funcExpr the function node whose children are replaced
 * @param nodeOutputs the {@link NodeInfoWrapper}s of the children, in order; when null
 *        the function ends up with an empty child list
 * @return {@code funcExpr} itself
 */
public static ExprNodeGenericFuncDesc getOutExpr(
    ExprNodeGenericFuncDesc funcExpr, Object[] nodeOutputs) {
  final ArrayList<ExprNodeDesc> rewrittenChildren = new ArrayList<ExprNodeDesc>();
  for (int i = 0; nodeOutputs != null && i < nodeOutputs.length; i++) {
    rewrittenChildren.add(((NodeInfoWrapper) nodeOutputs[i]).outExpr);
  }
  funcExpr.setChildren(rewrittenChildren);
  return funcExpr;
}
/**
* Processor for generic functions.
*
* If the function is AND, OR or NOT, we replace the node with the constant true or
* false when the results of the children determine the outcome, or drop a child
* when only partial results are known. When every child has a result vector,
* we calculate the result vector for the node. If all partitions agree on
* a result, we replace the node with the constant true or false. Otherwise, we
* pass the result vector on. For other generic functions, if the function is
* non-deterministic we simply pass it on (with children adjusted based on the results
* from the children). If it is deterministic, we evaluate a result vector if any of the
* children is a partition column. Otherwise, we pass it on as it is.
*/
public static class GenericFuncExprProcessor implements NodeProcessor {
/**
 * Dispatches a generic function node to the handler matching its kind.
 *
 * NOT, AND and OR have dedicated handlers. An IN whose value list contains a dynamic
 * list (a synthetic predicate generated for dynamic partition pruning) is kept and
 * reported as UNKNOWN. A struct whose children are all partition columns is reported as
 * PART_COL_STRUCT. Non-deterministic functions are reported as UNKNOWN. Everything else
 * goes through {@code handleDeterministicUdf}.
 *
 * @param nd the function node; must be an {@link ExprNodeGenericFuncDesc}
 * @param stack the walker's node stack (not used here)
 * @param procCtx the walk context; must be a {@code PcrExprProcCtx}
 * @param nodeOutputs the {@link NodeInfoWrapper}s of the children; a null array is
 *        treated like "no children" by the code in this method
 * @return the {@link NodeInfoWrapper} for the node
 * @throws SemanticException if evaluating the function against a partition fails
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
    Object... nodeOutputs) throws SemanticException {
  PcrExprProcCtx ctx = (PcrExprProcCtx) procCtx;
  ExprNodeGenericFuncDesc fd = (ExprNodeGenericFuncDesc) nd;

  if (LOG.isDebugEnabled()) {
    // Build the trace with a StringBuilder instead of repeated String concatenation.
    StringBuilder trace = new StringBuilder("Processing ")
        .append(fd.getExprString()).append(" ")
        .append(fd.getGenericUDF().getUdfName()).append(" outputs ");
    // getOutExpr tolerates a null child array, so the trace must not fail on one either.
    if (nodeOutputs != null) {
      for (Object child : nodeOutputs) {
        NodeInfoWrapper wrapper = (NodeInfoWrapper) child;
        trace.append("{").append(wrapper.state).append(", ")
            .append(wrapper.outExpr).append("}, ");
      }
    }
    LOG.debug(trace.toString());
  }

  if (FunctionRegistry.isOpNot(fd)) {
    return handleUdfNot(ctx, fd, nodeOutputs);
  }
  if (FunctionRegistry.isOpAnd(fd)) {
    return handleUdfAnd(ctx, fd, nodeOutputs);
  }
  if (FunctionRegistry.isOpOr(fd)) {
    return handleUdfOr(ctx, fd, nodeOutputs);
  }
  if (FunctionRegistry.isIn(fd)) {
    List<ExprNodeDesc> children = fd.getChildren();
    // We should not remove the dynamic partition pruner generated synthetic predicates.
    // The scan starts at index 1, i.e. the first child is not inspected.
    for (int i = 1; i < children.size(); i++) {
      if (children.get(i) instanceof ExprNodeDynamicListDesc) {
        return new NodeInfoWrapper(WalkState.UNKNOWN, null, getOutExpr(fd, nodeOutputs));
      }
    }
    // Otherwise, handle like a normal generic UDF.
    return handleDeterministicUdf(ctx, fd, nodeOutputs);
  }
  if (fd.getGenericUDF() instanceof GenericUDFStruct) {
    // Handle structs composed of partition columns.
    if (nodeOutputs != null) {
      for (Object child : nodeOutputs) {
        NodeInfoWrapper wrapper = (NodeInfoWrapper) child;
        if (wrapper.state != WalkState.PART_COL) {
          return handleDeterministicUdf(ctx, fd, nodeOutputs); // Giving up.
        }
      }
    }
    return new NodeInfoWrapper(WalkState.PART_COL_STRUCT, null, getOutExpr(fd, nodeOutputs));
  }
  if (!FunctionRegistry.isDeterministic(fd.getGenericUDF())) {
    // A non-deterministic UDF cannot be evaluated ahead of time: report it as unknown.
    return new NodeInfoWrapper(WalkState.UNKNOWN, null, getOutExpr(fd, nodeOutputs));
  }
  return handleDeterministicUdf(ctx, fd, nodeOutputs);
}
/**
 * Handles a deterministic function node.
 *
 * <ul>
 * <li>If any child is UNKNOWN, the node is UNKNOWN.</li>
 * <li>If no child is a partition column (or struct of partition columns), or the
 * function's type is not primitive, the function is constant-folded when possible;
 * a folded boolean constant yields TRUE/FALSE, anything else yields CONSTANT.</li>
 * <li>Otherwise the function is evaluated once per partition. For a boolean function the
 * result vector is handed to {@code getResultWrapFromResults}; for any other type the
 * node becomes a CONSTANT when all partitions agree and UNKNOWN when they do not.</li>
 * </ul>
 *
 * @param ctx the walk context supplying the partition list and virtual columns
 * @param fd the function node
 * @param nodeOutputs the {@link NodeInfoWrapper}s of the children
 * @return the {@link NodeInfoWrapper} for the node
 * @throws SemanticException if evaluating the function against a partition fails
 */
private Object handleDeterministicUdf(PcrExprProcCtx ctx,
    ExprNodeGenericFuncDesc fd, Object... nodeOutputs)
    throws SemanticException {
  final Boolean touchesPartCol = checkForPartColsAndUnknown(fd, nodeOutputs);
  if (touchesPartCol == null) {
    return new NodeInfoWrapper(WalkState.UNKNOWN, null, getOutExpr(fd, nodeOutputs));
  }

  if (!touchesPartCol.booleanValue()
      || fd.getTypeInfo().getCategory() != Category.PRIMITIVE) {
    // Nothing to evaluate per partition: try to fold, otherwise keep the expression.
    final ExprNodeGenericFuncDesc rebuilt = getOutExpr(fd, nodeOutputs);
    final ExprNodeDesc folded = ConstantPropagateProcFactory.foldExpr(rebuilt);
    if (!(folded instanceof ExprNodeConstantDesc)) {
      return new NodeInfoWrapper(WalkState.CONSTANT, null, rebuilt);
    }
    final ExprNodeConstantDesc constant = (ExprNodeConstantDesc) folded;
    final Object foldedValue = constant.getValue();
    final WalkState foldedState;
    if (Boolean.TRUE.equals(foldedValue)) {
      foldedState = WalkState.TRUE;
    } else if (Boolean.FALSE.equals(foldedValue)) {
      foldedState = WalkState.FALSE;
    } else {
      foldedState = WalkState.CONSTANT;
    }
    return new NodeInfoWrapper(foldedState, null, constant);
  }

  // From here on the function has to be evaluated for every pruned partition.
  final int partCount = ctx.getPartList().size();

  if (fd.getTypeInfo().equals(TypeInfoFactory.booleanTypeInfo)) {
    // Boolean function: if all partitions agree the node collapses to TRUE or FALSE,
    // otherwise it keeps the result vector.
    final Boolean[] verdicts = new Boolean[partCount];
    for (int i = 0; i < partCount; i++) {
      verdicts[i] = (Boolean) evalExprWithPart(fd, ctx.getPartList().get(i),
          ctx.getVirtualColumns());
    }
    return getResultWrapFromResults(verdicts, fd, nodeOutputs);
  }

  // Non-boolean function: only a value shared by all partitions is useful.
  final Object[] values = new Object[partCount];
  for (int i = 0; i < partCount; i++) {
    values[i] = evalExprWithPart(fd, ctx.getPartList().get(i), ctx.getVirtualColumns());
  }
  final Object agreedValue = ifResultsAgree(values);
  if (agreedValue == null) {
    // The partitions disagree, so the condition is not removed. Potentially this misses
    // cases like "where ds % 3 == 1 or ds % 3 == 2".
    // TODO: handle this case by making result vector to handle all
    // constant values.
    return new NodeInfoWrapper(WalkState.UNKNOWN, null, getOutExpr(fd, nodeOutputs));
  }
  return new NodeInfoWrapper(WalkState.CONSTANT, null,
      new ExprNodeConstantDesc(fd.getTypeInfo(), agreedValue));
}
/**
 * Inspects the states of a function's children.
 *
 * @param fd the function node (not used by the check itself)
 * @param nodeOutputs the {@link NodeInfoWrapper}s of the children; may be null, which
 *        is treated as "no children" in the same way {@code getOutExpr} treats it
 * @return {@code null} as soon as a child in state UNKNOWN is found; otherwise
 *         {@code true} if at least one child is PART_COL or PART_COL_STRUCT, and
 *         {@code false} if none is (including when there are no children)
 */
private Boolean checkForPartColsAndUnknown(ExprNodeGenericFuncDesc fd,
    Object... nodeOutputs) {
  if (nodeOutputs == null) {
    // No children: nothing is unknown and nothing refers to a partition column.
    return Boolean.FALSE;
  }
  boolean hasPartCol = false;
  for (Object child : nodeOutputs) {
    NodeInfoWrapper wrapper = (NodeInfoWrapper) child;
    if (wrapper.state == WalkState.UNKNOWN) {
      return null;
    }
    if (wrapper.state == WalkState.PART_COL
        || wrapper.state == WalkState.PART_COL_STRUCT) {
      hasPartCol = true;
    }
  }
  return hasPartCol;
}
/**
 * Handles an OR node.
 *
 * A TRUE child decides the node and is returned as-is. FALSE children are dropped. If
 * nothing remains the node becomes the constant false; if one child remains that child
 * replaces the node. Otherwise, when every original child was DIVIDED, the children's
 * result vectors are OR-ed per partition and the outcome is built by
 * {@code getResultWrapFromResults}; in all other cases the node is UNKNOWN with the
 * remaining children.
 *
 * @param ctx the walk context supplying the partition list
 * @param fd the OR node
 * @param nodeOutputs the {@link NodeInfoWrapper}s of the children
 * @return the {@link NodeInfoWrapper} for the node
 */
private Object handleUdfOr(PcrExprProcCtx ctx, ExprNodeGenericFuncDesc fd,
    Object... nodeOutputs) {
  boolean sawUnknown = false;
  boolean everyChildDivided = true;
  final List<NodeInfoWrapper> kept = new ArrayList<NodeInfoWrapper>(nodeOutputs.length);

  for (Object output : nodeOutputs) {
    final NodeInfoWrapper child = (NodeInfoWrapper) output;
    final WalkState childState = child.state;
    if (childState == WalkState.TRUE) {
      return child;
    }
    sawUnknown |= (childState == WalkState.UNKNOWN);
    // NOTE(review): a FALSE child is dropped below but still clears this flag, so
    // e.g. (DIVIDED or DIVIDED or FALSE) ends up UNKNOWN instead of being merged.
    // Conservative; confirm whether that is intended.
    everyChildDivided &= (childState == WalkState.DIVIDED);
    if (childState != WalkState.FALSE) {
      kept.add(child);
    }
  }

  // If all of them were false, return false
  if (kept.isEmpty()) {
    return new NodeInfoWrapper(WalkState.FALSE, null,
        new ExprNodeConstantDesc(fd.getTypeInfo(), Boolean.FALSE));
  }
  // If we are left with a single child, return the child
  if (kept.size() == 1) {
    return kept.get(0);
  }

  final Object[] remaining = kept.toArray();
  if (sawUnknown || !everyChildDivided) {
    return new NodeInfoWrapper(WalkState.UNKNOWN, null, getOutExpr(fd, remaining));
  }

  // Every child has a result vector: combine them partition by partition.
  final int partCount = ctx.getPartList().size();
  final Boolean[] merged = new Boolean[partCount];
  for (int part = 0; part < partCount; part++) {
    final Boolean[] operands = new Boolean[remaining.length];
    for (int c = 0; c < remaining.length; c++) {
      operands[c] = ((NodeInfoWrapper) remaining[c]).ResultVector[part];
    }
    merged[part] = opOr(operands);
  }
  return getResultWrapFromResults(merged, fd, remaining);
}
/**
 * Handles an AND node.
 *
 * A FALSE child decides the node and is returned as-is. TRUE children are dropped. If
 * nothing remains the node becomes the constant true; if one child remains that child
 * replaces the node. Otherwise, when every original child was DIVIDED, the children's
 * result vectors are AND-ed per partition and the outcome is built by
 * {@code getResultWrapFromResults}; in all other cases the node is UNKNOWN with the
 * remaining children.
 *
 * @param ctx the walk context supplying the partition list
 * @param fd the AND node
 * @param nodeOutputs the {@link NodeInfoWrapper}s of the children
 * @return the {@link NodeInfoWrapper} for the node
 */
private Object handleUdfAnd(PcrExprProcCtx ctx, ExprNodeGenericFuncDesc fd,
    Object... nodeOutputs) {
  boolean sawUnknown = false;
  boolean everyChildDivided = true;
  final List<NodeInfoWrapper> kept = new ArrayList<NodeInfoWrapper>(nodeOutputs.length);

  for (Object output : nodeOutputs) {
    final NodeInfoWrapper child = (NodeInfoWrapper) output;
    final WalkState childState = child.state;
    if (childState == WalkState.FALSE) {
      return child;
    }
    sawUnknown |= (childState == WalkState.UNKNOWN);
    // NOTE(review): a TRUE child is dropped below but still clears this flag, so
    // e.g. (DIVIDED and DIVIDED and TRUE) ends up UNKNOWN instead of being merged.
    // Conservative; confirm whether that is intended.
    everyChildDivided &= (childState == WalkState.DIVIDED);
    if (childState != WalkState.TRUE) {
      kept.add(child);
    }
  }

  // If all of them were true, return true
  if (kept.isEmpty()) {
    return new NodeInfoWrapper(WalkState.TRUE, null,
        new ExprNodeConstantDesc(fd.getTypeInfo(), Boolean.TRUE));
  }
  // If we are left with a single child, return the child
  if (kept.size() == 1) {
    return kept.get(0);
  }

  final Object[] remaining = kept.toArray();
  if (sawUnknown || !everyChildDivided) {
    return new NodeInfoWrapper(WalkState.UNKNOWN, null, getOutExpr(fd, remaining));
  }

  // Every child has a result vector: combine them partition by partition.
  final int partCount = ctx.getPartList().size();
  final Boolean[] merged = new Boolean[partCount];
  for (int part = 0; part < partCount; part++) {
    final Boolean[] operands = new Boolean[remaining.length];
    for (int c = 0; c < remaining.length; c++) {
      operands[c] = ((NodeInfoWrapper) remaining[c]).ResultVector[part];
    }
    merged[part] = opAnd(operands);
  }
  return getResultWrapFromResults(merged, fd, remaining);
}
/**
 * Handles a NOT node, which is expected to have exactly one child.
 *
 * A TRUE or FALSE child is flipped into the opposite boolean constant (typed like the
 * child's output expression). A DIVIDED child has its result vector negated entry by
 * entry and the node stays DIVIDED. For any other child state the node simply inherits
 * that state.
 *
 * @param ctx the walk context supplying the partition list
 * @param fd the NOT node
 * @param nodeOutputs the {@link NodeInfoWrapper} of the single child
 * @return the {@link NodeInfoWrapper} for the node
 */
private Object handleUdfNot(PcrExprProcCtx ctx, ExprNodeGenericFuncDesc fd,
    Object... nodeOutputs) {
  assert (nodeOutputs.length == 1);
  final NodeInfoWrapper operand = (NodeInfoWrapper) nodeOutputs[0];

  if (operand.state == WalkState.TRUE) {
    return new NodeInfoWrapper(WalkState.FALSE, null,
        new ExprNodeConstantDesc(operand.outExpr.getTypeInfo(), Boolean.FALSE));
  }
  if (operand.state == WalkState.FALSE) {
    return new NodeInfoWrapper(WalkState.TRUE, null,
        new ExprNodeConstantDesc(operand.outExpr.getTypeInfo(), Boolean.TRUE));
  }
  if (operand.state != WalkState.DIVIDED) {
    // No per-partition information to negate: keep the child's state for the NOT node.
    return new NodeInfoWrapper(operand.state, null, getOutExpr(fd, nodeOutputs));
  }

  final int partCount = ctx.getPartList().size();
  final Boolean[] negated = new Boolean[partCount];
  for (int part = 0; part < partCount; part++) {
    negated[part] = opNot(operand.ResultVector[part]);
  }
  return new NodeInfoWrapper(WalkState.DIVIDED, negated, getOutExpr(fd, nodeOutputs));
}
};
/**
 * Processor for field expressions.
 *
 * The field node is UNKNOWN if any of its children is UNKNOWN and CONSTANT otherwise;
 * the node itself is always kept as the output expression.
 */
public static class FieldExprProcessor implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    final ExprNodeFieldDesc field = (ExprNodeFieldDesc) nd;
    for (Object output : nodeOutputs) {
      if (((NodeInfoWrapper) output).state == WalkState.UNKNOWN) {
        return new NodeInfoWrapper(WalkState.UNKNOWN, null, field);
      }
    }
    return new NodeInfoWrapper(WalkState.CONSTANT, null, field);
  }
}
/**
 * Fallback processor for nodes not matched by any rule, such as constants and null
 * expressions. The node is returned unchanged: a constant node is marked CONSTANT,
 * anything else is marked UNKNOWN.
 */
public static class DefaultExprProcessor implements NodeProcessor {
  @Override
  public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
      Object... nodeOutputs) throws SemanticException {
    final WalkState state =
        (nd instanceof ExprNodeConstantDesc) ? WalkState.CONSTANT : WalkState.UNKNOWN;
    return new NodeInfoWrapper(state, null, (ExprNodeDesc) nd);
  }
}
/**
 * @return a new {@link DefaultExprProcessor}
 */
public static NodeProcessor getDefaultExprProcessor() {
return new DefaultExprProcessor();
}
/**
 * @return a new {@link GenericFuncExprProcessor}
 */
public static NodeProcessor getGenericFuncProcessor() {
return new GenericFuncExprProcessor();
}
/**
 * @return a new {@link FieldExprProcessor}
 */
public static NodeProcessor getFieldProcessor() {
return new FieldExprProcessor();
}
/**
 * @return a new {@link ColumnExprProcessor}
 */
public static NodeProcessor getColumnProcessor() {
return new ColumnExprProcessor();
}
/**
 * Remove partition conditions when necessary from the expression tree.
 *
 * The predicate is walked with one rule per node kind (column, field, generic
 * function); nodes matching none of the rules go to the default processor.
 *
 * @param tabAlias
 *          the table alias
 * @param parts
 *          the list of all pruned partitions for the table
 * @param vcs
 *          virtual columns referenced
 * @param pred
 *          expression tree of the target filter operator
 * @return the node information of the root expression
 * @throws SemanticException
 *           if walking the expression tree fails
 */
public static NodeInfoWrapper walkExprTree(
    String tabAlias, ArrayList<Partition> parts, List<VirtualColumn> vcs, ExprNodeDesc pred)
    throws SemanticException {
  // Context shared by all processors during this walk.
  final PcrExprProcCtx walkCtx = new PcrExprProcCtx(tabAlias, parts, vcs);

  // Insertion-ordered rule table: node class name pattern -> processor.
  final Map<Rule, NodeProcessor> rules = new LinkedHashMap<Rule, NodeProcessor>();
  rules.put(new RuleRegExp("R1", ExprNodeColumnDesc.class.getName() + "%"),
      getColumnProcessor());
  rules.put(new RuleRegExp("R2", ExprNodeFieldDesc.class.getName() + "%"),
      getFieldProcessor());
  rules.put(new RuleRegExp("R5", ExprNodeGenericFuncDesc.class.getName() + "%"),
      getGenericFuncProcessor());

  // The dispatcher fires the processor corresponding to the closest matching
  // rule and passes the context along.
  final Dispatcher dispatcher =
      new DefaultRuleDispatcher(getDefaultExprProcessor(), rules, walkCtx);
  final GraphWalker walker = new DefaultGraphWalker(dispatcher);

  final HashMap<Node, Object> outputsByNode = new HashMap<Node, Object>();
  final List<Node> roots = new ArrayList<Node>();
  roots.add(pred);
  walker.startWalking(roots, outputsByNode);

  // The wrapper recorded for the predicate itself is the result of the whole walk.
  return (NodeInfoWrapper) outputsByNode.get(pred);
}
}