/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.optimizer; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.FilterOperator; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; import org.apache.hadoop.hive.ql.lib.Dispatcher; import org.apache.hadoop.hive.ql.lib.ForwardWalker; import org.apache.hadoop.hive.ql.lib.GraphWalker; import org.apache.hadoop.hive.ql.lib.Node; import org.apache.hadoop.hive.ql.lib.NodeProcessor; import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker; import org.apache.hadoop.hive.ql.lib.Rule; import org.apache.hadoop.hive.ql.lib.RuleRegExp; import org.apache.hadoop.hive.ql.lib.TypeRule; import org.apache.hadoop.hive.ql.parse.ParseContext; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory; /** * This optimization will take a Filter expression, and if its predicate contains * an IN operator whose children are constant structs or structs containing constant fields, * it will try to generate predicate with IN clauses containing only partition columns. * This predicate is in turn used by the partition pruner to prune the columns that are not * part of the original IN(STRUCT(..)..) predicate. */ public class PartitionColumnsSeparator extends Transform { private static final Log LOG = LogFactory.getLog(PartitionColumnsSeparator.class); private static final String IN_UDF = GenericUDFIn.class.getAnnotation(Description.class).name(); private static final String STRUCT_UDF = GenericUDFStruct.class.getAnnotation(Description.class).name(); private static final String AND_UDF = GenericUDFOPAnd.class.getAnnotation(Description.class).name(); @Override public ParseContext transform(ParseContext pctx) throws SemanticException { // 1. Trigger transformation Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); opRules.put(new RuleRegExp("R1", FilterOperator.getOperatorName() + "%"), new StructInTransformer()); Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null); GraphWalker ogw = new ForwardWalker(disp); List<Node> topNodes = new ArrayList<Node>(); topNodes.addAll(pctx.getTopOps().values()); ogw.startWalking(topNodes, null); return pctx; } private class StructInTransformer implements NodeProcessor { @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { FilterOperator filterOp = (FilterOperator) nd; ExprNodeDesc predicate = filterOp.getConf().getPredicate(); // Generate the list bucketing pruning predicate as 2 separate IN clauses // containing the partitioning and non-partitioning columns. ExprNodeDesc newPredicate = generateInClauses(predicate); if (newPredicate != null) { // Replace filter in current FIL with new FIL if (LOG.isDebugEnabled()) { LOG.debug("Generated new predicate with IN clause: " + newPredicate); } final List<ExprNodeDesc> subExpr = new ArrayList<ExprNodeDesc>(2); subExpr.add(predicate); subExpr.add(newPredicate); ExprNodeGenericFuncDesc newFilterPredicate = new ExprNodeGenericFuncDesc( TypeInfoFactory.booleanTypeInfo, FunctionRegistry.getFunctionInfo(AND_UDF).getGenericUDF(), subExpr); filterOp.getConf().setPredicate(newFilterPredicate); } return null; } private ExprNodeDesc generateInClauses(ExprNodeDesc predicate) throws SemanticException { Map<Rule, NodeProcessor> exprRules = new LinkedHashMap<Rule, NodeProcessor>(); exprRules.put(new TypeRule(ExprNodeGenericFuncDesc.class), new StructInExprProcessor()); // The dispatcher fires the processor corresponding to the closest matching // rule and passes the context along Dispatcher disp = new DefaultRuleDispatcher(null, exprRules, null); GraphWalker egw = new PreOrderOnceWalker(disp); List<Node> startNodes = new ArrayList<Node>(); startNodes.add(predicate); HashMap<Node, Object> outputMap = new HashMap<Node, Object>(); egw.startWalking(startNodes, outputMap); return (ExprNodeDesc) outputMap.get(predicate); } } /** * The StructInExprProcessor processes the IN clauses of the following format : * STRUCT(T1.a, T1.b, T2.b, T2.c) IN (STRUCT(1, 2, 3, 4) , STRUCT(2, 3, 4, 5)) * where T1.a, T1.b, T2.c are all partition columns and T2.b is a non-partition * column. The resulting additional predicate generated after * StructInExprProcessor.process() looks like : * STRUCT(T1.a, T1.b) IN (STRUCT(1, 2), STRUCT(2, 3)) * AND * STRUCT(T2.b) IN (STRUCT(4), STRUCT(5)) * The additional predicate generated is used to prune the partitions that are * part of the given query. Once the partitions are pruned, the partition condition * remover is expected to remove the redundant predicates from the plan. */ private class StructInExprProcessor implements NodeProcessor { /** TableInfo is populated in PASS 1 of process(). It contains the information required * to generate an IN clause of the following format: * STRUCT(T1.a, T1.b) IN (const STRUCT(1, 2), const STRUCT(2, 3)) * In the above e.g. please note that all elements of the struct come from the same table. * The populated TableStructInfo is used to generate the IN clause in PASS 2 of process(). * The table struct information class has the following fields: * 1. Expression Node Descriptor for the Left Hand Side of the IN clause for the table * 2. 2-D List of expression node descriptors which corresponds to the elements of IN clause */ class TableInfo { List<ExprNodeDesc> exprNodeLHSDescriptor; List<List<ExprNodeDesc>> exprNodeRHSStructs; public TableInfo() { exprNodeLHSDescriptor = new ArrayList<ExprNodeDesc>(); exprNodeRHSStructs = new ArrayList<List<ExprNodeDesc>>(); } } // Mapping from expression node to is an expression containing only // partition or virtual column or constants private Map<ExprNodeDesc, Boolean> exprNodeToPartOrVirtualColOrConstExpr = new IdentityHashMap<ExprNodeDesc, Boolean>(); /** * This function iterates through the entire subtree under a given expression node * and makes sure that the expression contain only constant nodes or * partition/virtual columns as leaf nodes. * @param en Expression Node Descriptor for the root node. * @return true if the subtree rooted under en has only partition/virtual columns or * constant values as the leaf nodes. Else, return false. */ private boolean exprContainsOnlyPartitionColOrVirtualColOrConstants(ExprNodeDesc en) { if (en == null) { return true; } if (exprNodeToPartOrVirtualColOrConstExpr.containsKey(en)) { return exprNodeToPartOrVirtualColOrConstExpr.get(en); } if (en instanceof ExprNodeColumnDesc) { boolean ret = ((ExprNodeColumnDesc)en).getIsPartitionColOrVirtualCol(); exprNodeToPartOrVirtualColOrConstExpr.put(en, ret); return ret; } if (en.getChildren() != null) { for (ExprNodeDesc cn : en.getChildren()) { if (!exprContainsOnlyPartitionColOrVirtualColOrConstants(cn)) { exprNodeToPartOrVirtualColOrConstExpr.put(en, false); return false; } } } exprNodeToPartOrVirtualColOrConstExpr.put(en, true); return true; } /** * Check if the expression node satisfies the following : * Has atleast one subexpression containing a partition/virtualcolumn and has * exactly refer to a single table alias. * @param en Expression Node Descriptor * @return true if there is atleast one subexpression with partition/virtual column * and has exactly refer to a single table alias. If not, return false. */ private boolean hasAtleastOneSubExprWithPartColOrVirtualColWithOneTableAlias(ExprNodeDesc en) { if (en == null || en.getChildren() == null) { return false; } for (ExprNodeDesc cn : en.getChildren()) { if (exprContainsOnlyPartitionColOrVirtualColOrConstants(cn) && getTableAlias(cn) != null) { return true; } } return false; } /** * Check if the expression node satisfies the following : * Has all subexpressions containing constants or a partition/virtual column/coming from the * same table * @param en Expression Node Descriptor * @return true/false based on the condition specified in the above description. */ private boolean hasAllSubExprWithConstOrPartColOrVirtualColWithOneTableAlias(ExprNodeDesc en) { if (!exprContainsOnlyPartitionColOrVirtualColOrConstants(en)) { return false; } Set<String> s = new HashSet<String>(); Set<ExprNodeDesc> visited = new HashSet<ExprNodeDesc>(); return getTableAliasHelper(en, s, visited); } /** * Return the expression node descriptor if the input expression node is a GenericUDFIn. * Else, return null. * @param en Expression Node Descriptor * @return The expression node descriptor if the input expression node represents an IN clause. * Else, return null. */ private ExprNodeGenericFuncDesc getInExprNode(ExprNodeDesc en) { if (en == null) { return null; } if (en instanceof ExprNodeGenericFuncDesc && ((ExprNodeGenericFuncDesc)(en)).getGenericUDF() instanceof GenericUDFIn) { return (ExprNodeGenericFuncDesc) en; } return null; } /** * Helper used by getTableAlias * @param en Expression Node Descriptor * @param s Set of the table Aliases associated with the current Expression node. * @param visited Visited ExpressionNode set. * @return true if en has at most one table associated with it, else return false. */ private boolean getTableAliasHelper(ExprNodeDesc en, Set<String> s, Set<ExprNodeDesc> visited) { visited.add(en); // The current expression node is a column, see if the column alias is already a part of // the return set, s. If not and we already have an entry in set s, this is an invalid expression // and return false. if (en instanceof ExprNodeColumnDesc) { if (s.size() > 0 && !s.contains(((ExprNodeColumnDesc)en).getTabAlias())) { return false; } if (s.size() == 0) { s.add(((ExprNodeColumnDesc)en).getTabAlias()); } return true; } if (en.getChildren() == null) { return true; } // Iterative through the children in a DFS manner to see if there is more than 1 table alias // referenced by the current expression node. for (ExprNodeDesc cn : en.getChildren()) { if (visited.contains(cn)) { continue; } if (cn instanceof ExprNodeColumnDesc) { s.add(((ExprNodeColumnDesc) cn).getTabAlias()); } else if (!(cn instanceof ExprNodeConstantDesc)) { if (!getTableAliasHelper(cn, s, visited)) { return false; } } } return true; } /** * If the given expression has just a single table associated with it, * return the table alias associated with it. Else, return null. * @param en * @return The table alias associated with the expression if there is a single table * reference. Else, return null. */ private String getTableAlias(ExprNodeDesc en) { Set<String> s = new HashSet<String>(); Set<ExprNodeDesc> visited = new HashSet<ExprNodeDesc>(); boolean singleTableAlias = getTableAliasHelper(en, s, visited); if (!singleTableAlias || s.size() == 0) { return null; } StringBuilder ans = new StringBuilder(); for (String st : s) { ans.append(st); } return ans.toString(); } /** * The main process method for StructInExprProcessor to generate additional predicates * containing only partition columns. */ @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { ExprNodeGenericFuncDesc fd = getInExprNode((ExprNodeDesc)nd); /***************************************************************************************\ BEGIN : Early terminations for Partition Column Separator /***************************************************************************************/ // 1. If the input node is not an IN operator, we bail out. if (fd == null) { if (LOG.isDebugEnabled()) { LOG.debug("Partition columns not separated for " + fd + ", is not IN operator : "); } return null; } // 2. Check if the input is an IN operator with struct children List<ExprNodeDesc> children = fd.getChildren(); if (!(children.get(0) instanceof ExprNodeGenericFuncDesc) || (!(((ExprNodeGenericFuncDesc) children.get(0)).getGenericUDF() instanceof GenericUDFStruct))) { if (LOG.isDebugEnabled()) { LOG.debug("Partition columns not separated for " + fd + ", children size " + children.size() + ", child expression : " + children.get(0).getExprString()); } return null; } // 3. See if the IN (STRUCT(EXP1, EXP2,..) has atleast one expression with partition // column with single table alias. If not bail out. // We might have expressions containing only partitioning columns, say, T1.A + T2.B // where T1.A and T2.B are both partitioning columns. // However, these expressions should not be considered as valid expressions for separation. if (!hasAtleastOneSubExprWithPartColOrVirtualColWithOneTableAlias(children.get(0))) { if (LOG.isDebugEnabled()) { LOG.debug("Partition columns not separated for " + fd + ", there are no expression containing partition columns in struct fields"); } return null; } // 4. See if all the field expressions of the left hand side of IN are expressions // containing constants or only partition columns coming from same table. // If so, we need not perform this optimization and we should bail out. if (hasAllSubExprWithConstOrPartColOrVirtualColWithOneTableAlias(children.get(0))) { if (LOG.isDebugEnabled()) { LOG.debug("Partition columns not separated for " + fd + ", all fields are expressions containing constants or only partition columns" + "coming from same table"); } return null; } /***************************************************************************************\ END : Early terminations for Partition Column Separator /***************************************************************************************/ /***************************************************************************************\ BEGIN : Actual processing of the IN (STRUCT(..)) expression. /***************************************************************************************/ Map<String, TableInfo> tableAliasToInfo = new HashMap<>(); ExprNodeGenericFuncDesc originalStructDesc = ((ExprNodeGenericFuncDesc) children.get(0)); List<ExprNodeDesc> originalDescChildren = originalStructDesc.getChildren(); /** * PASS 1 : Iterate through the original IN(STRUCT(..)) and populate the tableAlias to * predicate information inside tableAliasToInfo. */ for (int i = 0; i < originalDescChildren.size(); i++) { ExprNodeDesc en = originalDescChildren.get(i); String tabAlias = null; // If the current expression node does not have a virtual/partition column or // single table alias reference, ignore it and move to the next expression node. if (!exprContainsOnlyPartitionColOrVirtualColOrConstants(en) || (tabAlias = getTableAlias(en)) == null) { continue; } TableInfo currTableInfo = null; // If the table alias to information map already contains the current table, // use the existing TableInfo object. Else, create a new one. if (tableAliasToInfo.containsKey(tabAlias)) { currTableInfo = tableAliasToInfo.get(tabAlias); } else { currTableInfo = new TableInfo(); } currTableInfo.exprNodeLHSDescriptor.add(en); // Iterate through the children nodes of the IN clauses starting from index 1, // which corresponds to the right hand side of the IN list. // Insert the value corresponding to the current expression in currExprNodeInfo.exprNodeValues. for (int j = 1; j < children.size(); j++) { ExprNodeDesc currChildStructExpr = children.get(j); ExprNodeDesc newConstStructElement = null; // 1. Get the constant value associated with the current element in the struct. // If the current child struct expression is a constant struct. if (currChildStructExpr instanceof ExprNodeConstantDesc) { List<Object> cnCols = (List<Object>)(((ExprNodeConstantDesc) (children.get(j))).getValue()); newConstStructElement = new ExprNodeConstantDesc(cnCols.get(i)); } else { // This better be a generic struct with constant values as the children. List<ExprNodeDesc> cnChildren = ((ExprNodeGenericFuncDesc) children.get(j)).getChildren(); newConstStructElement = new ExprNodeConstantDesc( (((ExprNodeConstantDesc) (cnChildren.get(i))).getValue())); } // 2. Insert the current constant value into exprNodeStructs list. // If there is no struct corresponding to the current element, create a new one, insert // the constant value into it and add the struct as part of exprNodeStructs. if (currTableInfo.exprNodeRHSStructs.size() < j) { List<ExprNodeDesc> newConstStructList = new ArrayList<ExprNodeDesc>(); newConstStructList.add(newConstStructElement); currTableInfo.exprNodeRHSStructs.add(newConstStructList); } else { // We already have a struct node for the current index. Insert the constant value // into the corresponding struct node. currTableInfo.exprNodeRHSStructs.get(j-1).add(newConstStructElement); } } // Insert the current table alias entry into the map if not already present in tableAliasToInfo. if (!tableAliasToInfo.containsKey(tabAlias)) { tableAliasToInfo.put(tabAlias, currTableInfo); } } /** * PASS 2 : Iterate through the tableAliasToInfo populated via PASS 1 * to generate the new expression. */ // subExpr is the list containing generated IN clauses as a result of this optimization. final List<ExprNodeDesc> subExpr = new ArrayList<ExprNodeDesc>(originalDescChildren.size()+1); for (Entry<String, TableInfo> entry : tableAliasToInfo.entrySet()) { TableInfo currTableInfo = entry.getValue(); List<List<ExprNodeDesc>> currConstStructList = currTableInfo.exprNodeRHSStructs; // IN(STRUCT(..)..) ExprNodeDesc list for the current table alias. List<ExprNodeDesc> currInStructExprList = new ArrayList<ExprNodeDesc>(); // Add the left hand side of the IN clause which contains the struct definition. currInStructExprList.add(ExprNodeGenericFuncDesc.newInstance (FunctionRegistry.getFunctionInfo(STRUCT_UDF).getGenericUDF(), STRUCT_UDF, currTableInfo.exprNodeLHSDescriptor)); // Generate the right hand side of the IN clause for (int i = 0; i < currConstStructList.size(); i++) { List<ExprNodeDesc> currConstStruct = currConstStructList.get(i); // Add the current constant struct to the right hand side of the IN clause. currInStructExprList.add(ExprNodeGenericFuncDesc.newInstance (FunctionRegistry.getFunctionInfo(STRUCT_UDF).getGenericUDF(), STRUCT_UDF, currConstStruct)); } // Add the newly generated IN clause to subExpr. subExpr.add(new ExprNodeGenericFuncDesc( TypeInfoFactory.booleanTypeInfo, FunctionRegistry. getFunctionInfo(IN_UDF).getGenericUDF(), currInStructExprList)); } /***************************************************************************************\ END : Actual processing of the IN (STRUCT(..)) expression. /***************************************************************************************/ // If there is only 1 table ALIAS, return it if (subExpr.size() == 1) { // Return the new expression containing only partition columns return subExpr.get(0); } // Return the new expression containing only partition columns // after concatenating them with AND operator return new ExprNodeGenericFuncDesc( TypeInfoFactory.booleanTypeInfo, FunctionRegistry.getFunctionInfo(AND_UDF).getGenericUDF(), subExpr); } } }