PartitionColumnsSeparator.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.FilterOperator;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.ForwardWalker;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.lib.TypeRule;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFStruct;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

/**
 * This optimization will take a Filter expression, and if its predicate contains
 * an IN operator whose children are constant structs or structs containing constant fields,
 * it will try to generate predicate with IN clauses containing only partition columns.
 * This predicate is in turn used by the partition pruner to prune the columns that are not
 * part of the original IN(STRUCT(..)..) predicate.
 */
public class PartitionColumnsSeparator extends Transform {

  private static final Log LOG = LogFactory.getLog(PartitionColumnsSeparator.class);
  private static final String IN_UDF =
    GenericUDFIn.class.getAnnotation(Description.class).name();
  private static final String STRUCT_UDF =
    GenericUDFStruct.class.getAnnotation(Description.class).name();
  private static final String AND_UDF =
    GenericUDFOPAnd.class.getAnnotation(Description.class).name();

  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {
    // 1. Trigger transformation
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", FilterOperator.getOperatorName() + "%"), new StructInTransformer());

    Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null);
    GraphWalker ogw = new ForwardWalker(disp);

    List<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    return pctx;
  }

  private class StructInTransformer implements NodeProcessor {

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      FilterOperator filterOp = (FilterOperator) nd;
      ExprNodeDesc predicate = filterOp.getConf().getPredicate();

      // Generate the list bucketing pruning predicate as 2 separate IN clauses
      // containing the partitioning and non-partitioning columns.
      ExprNodeDesc newPredicate = generateInClauses(predicate);
      if (newPredicate != null) {
        // Replace filter in current FIL with new FIL
        if (LOG.isDebugEnabled()) {
          LOG.debug("Generated new predicate with IN clause: " + newPredicate);
        }
        final List<ExprNodeDesc> subExpr =
                new ArrayList<ExprNodeDesc>(2);
        subExpr.add(predicate);
        subExpr.add(newPredicate);
        ExprNodeGenericFuncDesc newFilterPredicate = new ExprNodeGenericFuncDesc(
                TypeInfoFactory.booleanTypeInfo,
                FunctionRegistry.getFunctionInfo(AND_UDF).getGenericUDF(), subExpr);
        filterOp.getConf().setPredicate(newFilterPredicate);
      }

      return null;
    }

    private ExprNodeDesc generateInClauses(ExprNodeDesc predicate) throws SemanticException {
      Map<Rule, NodeProcessor> exprRules = new LinkedHashMap<Rule, NodeProcessor>();
      exprRules.put(new TypeRule(ExprNodeGenericFuncDesc.class), new StructInExprProcessor());

      // The dispatcher fires the processor corresponding to the closest matching
      // rule and passes the context along
      Dispatcher disp = new DefaultRuleDispatcher(null, exprRules, null);
      GraphWalker egw = new PreOrderOnceWalker(disp);

      List<Node> startNodes = new ArrayList<Node>();
      startNodes.add(predicate);

      HashMap<Node, Object> outputMap = new HashMap<Node, Object>();
      egw.startWalking(startNodes, outputMap);
      return (ExprNodeDesc) outputMap.get(predicate);
    }
  }

  /**
   * The StructInExprProcessor processes the IN clauses of the following format :
   * STRUCT(T1.a, T1.b, T2.b, T2.c) IN (STRUCT(1, 2, 3, 4) , STRUCT(2, 3, 4, 5))
   * where T1.a, T1.b, T2.c are all partition columns and T2.b is a non-partition
   * column. The resulting additional predicate generated after
   * StructInExprProcessor.process() looks like :
   *    STRUCT(T1.a, T1.b) IN (STRUCT(1, 2), STRUCT(2, 3))
   *    AND
   *    STRUCT(T2.b) IN (STRUCT(4), STRUCT(5))
   * The additional predicate generated is used to prune the partitions that are
   * part of the given query. Once the partitions are pruned, the partition condition
   * remover is expected to remove the redundant predicates from the plan.
   */
  private class StructInExprProcessor implements NodeProcessor {

    /** TableInfo is populated in PASS 1 of process(). It contains the information required
     * to generate an IN clause of the following format:
     * STRUCT(T1.a, T1.b) IN (const STRUCT(1, 2), const STRUCT(2, 3))
     * In the above e.g. please note that all elements of the struct come from the same table.
     * The populated TableStructInfo is used to generate the IN clause in PASS 2 of process().
     * The table struct information class has the following fields:
     * 1. Expression Node Descriptor for the Left Hand Side of the IN clause for the table
     * 2. 2-D List of expression node descriptors which corresponds to the elements of IN clause
     */
    class TableInfo {
      List<ExprNodeDesc> exprNodeLHSDescriptor;
      List<List<ExprNodeDesc>> exprNodeRHSStructs;

      public TableInfo() {
       exprNodeLHSDescriptor = new ArrayList<ExprNodeDesc>();
       exprNodeRHSStructs = new ArrayList<List<ExprNodeDesc>>();
      }
    }

    // Mapping from expression node to is an expression containing only
    // partition or virtual column or constants
    private Map<ExprNodeDesc, Boolean> exprNodeToPartOrVirtualColOrConstExpr =
      new IdentityHashMap<ExprNodeDesc, Boolean>();

    /**
     * This function iterates through the entire subtree under a given expression node
     * and makes sure that the expression contain only constant nodes or
     * partition/virtual columns as leaf nodes.
     * @param en Expression Node Descriptor for the root node.
     * @return true if the subtree rooted under en has only partition/virtual columns or
     * constant values as the leaf nodes. Else, return false.
     */
    private boolean exprContainsOnlyPartitionColOrVirtualColOrConstants(ExprNodeDesc en) {
      if (en == null) {
        return true;
      }
      if (exprNodeToPartOrVirtualColOrConstExpr.containsKey(en)) {
        return exprNodeToPartOrVirtualColOrConstExpr.get(en);
      }
      if (en instanceof ExprNodeColumnDesc) {
        boolean ret = ((ExprNodeColumnDesc)en).getIsPartitionColOrVirtualCol();
        exprNodeToPartOrVirtualColOrConstExpr.put(en, ret);
        return ret;
      }
      if (en.getChildren() != null) {
        for (ExprNodeDesc cn : en.getChildren()) {
          if (!exprContainsOnlyPartitionColOrVirtualColOrConstants(cn)) {
            exprNodeToPartOrVirtualColOrConstExpr.put(en, false);
            return false;
          }
        }
      }
      exprNodeToPartOrVirtualColOrConstExpr.put(en, true);
      return true;
    }


    /**
     * Check if the expression node satisfies the following :
     * Has atleast one subexpression containing a partition/virtualcolumn and has
     * exactly refer to a single table alias.
     * @param en Expression Node Descriptor
     * @return true if there is atleast one subexpression with partition/virtual column
     * and has exactly refer to a single table alias. If not, return false.
     */
    private boolean hasAtleastOneSubExprWithPartColOrVirtualColWithOneTableAlias(ExprNodeDesc en) {
      if (en == null || en.getChildren() == null) {
        return false;
      }
      for (ExprNodeDesc cn : en.getChildren()) {
        if (exprContainsOnlyPartitionColOrVirtualColOrConstants(cn) && getTableAlias(cn) != null) {
          return true;
        }
      }
      return false;
    }


    /**
     * Check if the expression node satisfies the following :
     * Has all subexpressions containing constants or a partition/virtual column/coming from the
     * same table
     * @param en Expression Node Descriptor
     * @return true/false based on the condition specified in the above description.
     */
    private boolean hasAllSubExprWithConstOrPartColOrVirtualColWithOneTableAlias(ExprNodeDesc en) {
      if (!exprContainsOnlyPartitionColOrVirtualColOrConstants(en)) {
        return false;
      }

      Set<String> s = new HashSet<String>();
      Set<ExprNodeDesc> visited = new HashSet<ExprNodeDesc>();

      return getTableAliasHelper(en, s, visited);
    }


    /**
     * Return the expression node descriptor if the input expression node is a GenericUDFIn.
     * Else, return null.
     * @param en Expression Node Descriptor
     * @return The expression node descriptor if the input expression node represents an IN clause.
     * Else, return null.
     */
    private ExprNodeGenericFuncDesc getInExprNode(ExprNodeDesc en) {
      if (en == null) {
        return null;
      }

      if (en instanceof ExprNodeGenericFuncDesc && ((ExprNodeGenericFuncDesc)(en)).getGenericUDF()
          instanceof GenericUDFIn) {
        return (ExprNodeGenericFuncDesc) en;
      }
      return null;
    }


    /**
     * Helper used by getTableAlias
     * @param en Expression Node Descriptor
     * @param s Set of the table Aliases associated with the current Expression node.
     * @param visited Visited ExpressionNode set.
     * @return true if en has at most one table associated with it, else return false.
     */
    private boolean getTableAliasHelper(ExprNodeDesc en, Set<String> s, Set<ExprNodeDesc> visited) {
      visited.add(en);

      // The current expression node is a column, see if the column alias is already a part of
      // the return set, s. If not and we already have an entry in set s, this is an invalid expression
      // and return false.
      if (en instanceof ExprNodeColumnDesc) {
        if (s.size() > 0 &&
           !s.contains(((ExprNodeColumnDesc)en).getTabAlias())) {
          return false;
        }
        if (s.size() == 0) {
          s.add(((ExprNodeColumnDesc)en).getTabAlias());
        }
        return true;
      }
      if (en.getChildren() == null) {
        return true;
      }

      // Iterative through the children in a DFS manner to see if there is more than 1 table alias
      // referenced by the current expression node.
      for (ExprNodeDesc cn : en.getChildren()) {
        if (visited.contains(cn)) {
          continue;
        }
        if (cn instanceof ExprNodeColumnDesc) {
          s.add(((ExprNodeColumnDesc) cn).getTabAlias());
        } else if (!(cn instanceof ExprNodeConstantDesc)) {
          if (!getTableAliasHelper(cn, s, visited)) {
            return false;
          }
        }
      }
      return true;
    }


    /**
     * If the given expression has just a single table associated with it,
     * return the table alias associated with it. Else, return null.
     * @param en
     * @return The table alias associated with the expression if there is a single table
     * reference. Else, return null.
     */
    private String getTableAlias(ExprNodeDesc en) {
      Set<String> s = new HashSet<String>();
      Set<ExprNodeDesc> visited = new HashSet<ExprNodeDesc>();
      boolean singleTableAlias = getTableAliasHelper(en, s, visited);

      if (!singleTableAlias || s.size() == 0) {
        return null;
      }
      StringBuilder ans = new StringBuilder();
      for (String st : s) {
        ans.append(st);
      }
      return ans.toString();
    }


    /**
     * The main process method for StructInExprProcessor to generate additional predicates
     * containing only partition columns.
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      ExprNodeGenericFuncDesc fd = getInExprNode((ExprNodeDesc)nd);

      /***************************************************************************************\
       BEGIN : Early terminations for Partition Column Separator
      /***************************************************************************************/
      // 1. If the input node is not an IN operator, we bail out.
      if (fd == null) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Partition columns not separated for " + fd + ", is not IN operator : ");
        }
        return null;
      }

      // 2. Check if the input is an IN operator with struct children
      List<ExprNodeDesc> children = fd.getChildren();
      if (!(children.get(0) instanceof ExprNodeGenericFuncDesc) ||
          (!(((ExprNodeGenericFuncDesc) children.get(0)).getGenericUDF()
           instanceof GenericUDFStruct))) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Partition columns not separated for " + fd + ", children size " +
           children.size() + ", child expression : " + children.get(0).getExprString());
        }
        return null;
      }

      // 3. See if the IN (STRUCT(EXP1, EXP2,..) has atleast one expression with partition
      // column with single table alias. If not bail out.
      // We might have expressions containing only partitioning columns, say, T1.A + T2.B
      // where T1.A and T2.B are both partitioning columns.
      // However, these expressions should not be considered as valid expressions for separation.
      if (!hasAtleastOneSubExprWithPartColOrVirtualColWithOneTableAlias(children.get(0))) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Partition columns not separated for " + fd +
            ", there are no expression containing partition columns in struct fields");
        }
        return null;
      }

      // 4. See if all the field expressions of the left hand side of IN are expressions 
      // containing constants or only partition columns coming from same table.
      // If so, we need not perform this optimization and we should bail out.
      if (hasAllSubExprWithConstOrPartColOrVirtualColWithOneTableAlias(children.get(0))) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Partition columns not separated for " + fd +
          ", all fields are expressions containing constants or only partition columns"
          + "coming from same table");
        }
        return null;
      }

      /***************************************************************************************\
       END : Early terminations for Partition Column Separator
      /***************************************************************************************/


      /***************************************************************************************\
       BEGIN : Actual processing of the IN (STRUCT(..)) expression.
      /***************************************************************************************/
      Map<String, TableInfo> tableAliasToInfo =
        new HashMap<>();
      ExprNodeGenericFuncDesc originalStructDesc = ((ExprNodeGenericFuncDesc) children.get(0));
      List<ExprNodeDesc> originalDescChildren = originalStructDesc.getChildren();
      /**
       * PASS 1 : Iterate through the original IN(STRUCT(..)) and populate the tableAlias to
       * predicate information inside tableAliasToInfo.
       */
      for (int i = 0; i < originalDescChildren.size(); i++) {
        ExprNodeDesc en =  originalDescChildren.get(i);
        String tabAlias = null;

        // If the current expression node does not have a virtual/partition column or
        // single table alias reference, ignore it and move to the next expression node.
        if (!exprContainsOnlyPartitionColOrVirtualColOrConstants(en) ||
            (tabAlias = getTableAlias(en)) == null) {
          continue;
        }

        TableInfo currTableInfo = null;

        // If the table alias to information map already contains the current table,
        // use the existing TableInfo object. Else, create a new one.
        if (tableAliasToInfo.containsKey(tabAlias)) {
          currTableInfo = tableAliasToInfo.get(tabAlias);
        } else {
          currTableInfo = new TableInfo();
        }
        currTableInfo.exprNodeLHSDescriptor.add(en);

        // Iterate through the children nodes of the IN clauses starting from index 1,
        // which corresponds to the right hand side of the IN list.
        // Insert the value corresponding to the current expression in currExprNodeInfo.exprNodeValues.
        for (int j = 1; j < children.size(); j++) {
          ExprNodeDesc currChildStructExpr = children.get(j);
          ExprNodeDesc newConstStructElement = null;

          // 1. Get the constant value associated with the current element in the struct.
          // If the current child struct expression is a constant struct.
          if (currChildStructExpr instanceof ExprNodeConstantDesc) {
            List<Object> cnCols = (List<Object>)(((ExprNodeConstantDesc) (children.get(j))).getValue());
            newConstStructElement = new ExprNodeConstantDesc(cnCols.get(i));
          } else {
            // This better be a generic struct with constant values as the children.
            List<ExprNodeDesc> cnChildren = ((ExprNodeGenericFuncDesc) children.get(j)).getChildren();
            newConstStructElement = new ExprNodeConstantDesc(
              (((ExprNodeConstantDesc) (cnChildren.get(i))).getValue()));
          }

          // 2. Insert the current constant value into exprNodeStructs list.
          // If there is no struct corresponding to the current element, create a new one, insert
          // the constant value into it and add the struct as part of exprNodeStructs.
          if (currTableInfo.exprNodeRHSStructs.size() < j) {
            List<ExprNodeDesc> newConstStructList = new ArrayList<ExprNodeDesc>();
            newConstStructList.add(newConstStructElement);
            currTableInfo.exprNodeRHSStructs.add(newConstStructList);
          } else {
            // We already have a struct node for the current index. Insert the constant value
            // into the corresponding struct node.
            currTableInfo.exprNodeRHSStructs.get(j-1).add(newConstStructElement);
          }
        }

        // Insert the current table alias entry into the map if not already present in tableAliasToInfo.
        if (!tableAliasToInfo.containsKey(tabAlias)) {
          tableAliasToInfo.put(tabAlias, currTableInfo);
        }
      }

      /**
       * PASS 2 : Iterate through the tableAliasToInfo populated via PASS 1
       * to generate the new expression.
       */
      // subExpr is the list containing generated IN clauses as a result of this optimization.
      final List<ExprNodeDesc> subExpr =
        new ArrayList<ExprNodeDesc>(originalDescChildren.size()+1);

      for (Entry<String, TableInfo> entry :
        tableAliasToInfo.entrySet()) {
        TableInfo currTableInfo = entry.getValue();
        List<List<ExprNodeDesc>> currConstStructList = currTableInfo.exprNodeRHSStructs;

        // IN(STRUCT(..)..) ExprNodeDesc list for the current table alias.
        List<ExprNodeDesc> currInStructExprList = new ArrayList<ExprNodeDesc>();

        // Add the left hand side of the IN clause which contains the struct definition.
        currInStructExprList.add(ExprNodeGenericFuncDesc.newInstance
          (FunctionRegistry.getFunctionInfo(STRUCT_UDF).getGenericUDF(),
          STRUCT_UDF,
          currTableInfo.exprNodeLHSDescriptor));

        // Generate the right hand side of the IN clause
        for (int i = 0; i < currConstStructList.size(); i++) {
          List<ExprNodeDesc> currConstStruct = currConstStructList.get(i);

          // Add the current constant struct to the right hand side of the IN clause.
          currInStructExprList.add(ExprNodeGenericFuncDesc.newInstance
            (FunctionRegistry.getFunctionInfo(STRUCT_UDF).getGenericUDF(),
            STRUCT_UDF,
            currConstStruct));
        }

        // Add the newly generated IN clause to subExpr.
        subExpr.add(new ExprNodeGenericFuncDesc(
          TypeInfoFactory.booleanTypeInfo, FunctionRegistry.
          getFunctionInfo(IN_UDF).getGenericUDF(), currInStructExprList));
      }
      /***************************************************************************************\
       END : Actual processing of the IN (STRUCT(..)) expression.
      /***************************************************************************************/

      // If there is only 1 table ALIAS, return it
      if (subExpr.size() == 1) {
        // Return the new expression containing only partition columns
        return subExpr.get(0);
      }
      // Return the new expression containing only partition columns
      // after concatenating them with AND operator
      return new ExprNodeGenericFuncDesc(
        TypeInfoFactory.booleanTypeInfo,
        FunctionRegistry.getFunctionInfo(AND_UDF).getGenericUDF(), subExpr);
    }
  }
}