/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.ppr;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.Warehouse;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.log.PerfLogger;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.Transform;
import org.apache.hadoop.hive.ql.parse.ErrorMsg;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.PrunedPartitionList;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPOr;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.thrift.TException;

/**
 * The transformation step that does partition pruning.
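 *
 * The {@link #transform} step walks the operator tree looking for a TableScan
 * followed by one or two Filter operators and collects the pruning predicate
 * for each table scan; {@link #prune} later evaluates that predicate to decide
 * which partitions of the table need to be read.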
 */
public class PartitionPruner implements Transform {

  // The log
  private static final Log LOG = LogFactory
      .getLog("hive.ql.optimizer.ppr.PartitionPruner");

  /*
   * (non-Javadoc)
   *
   * @see
   * org.apache.hadoop.hive.ql.optimizer.Transform#transform(org.apache.hadoop
   * .hive.ql.parse.ParseContext)
   */
  @Override
  public ParseContext transform(ParseContext pctx) throws SemanticException {

    // create the context for walking operators
    OpWalkerCtx opWalkerCtx = new OpWalkerCtx(pctx.getOpToPartPruner());

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", "(TS%FIL%)|(TS%FIL%FIL%)"), OpProcFactory
        .getFilterProc());

    // The dispatcher fires the processor corresponding to the closest matching
    // rule and passes the context along
    Dispatcher disp = new DefaultRuleDispatcher(OpProcFactory.getDefaultProc(),
        opRules, opWalkerCtx);
    GraphWalker ogw = new DefaultGraphWalker(disp);

    // Create a list of top op nodes
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getTopOps().values());
    ogw.startWalking(topNodes, null);
    pctx.setHasNonPartCols(opWalkerCtx.getHasNonPartCols());

    return pctx;
  }

  /**
   * Find out whether the condition only contains partitioned columns. Note that
   * if the table is not partitioned, the function always returns true.
   *
   * @param tab
   *          the table object
   * @param expr
   *          the pruner expression for the table
   */
  public static boolean onlyContainsPartnCols(Table tab, ExprNodeDesc expr) {
    if (!tab.isPartitioned() || (expr == null)) {
      return true;
    }

    if (expr instanceof ExprNodeColumnDesc) {
      String colName = ((ExprNodeColumnDesc) expr).getColumn();
      return tab.isPartitionKey(colName);
    }

    // It cannot contain a non-deterministic function
    if ((expr instanceof ExprNodeGenericFuncDesc)
        && !FunctionRegistry.isDeterministic(((ExprNodeGenericFuncDesc) expr)
        .getGenericUDF())) {
      return false;
    }

    // All columns of the expression must be partitioned columns
    List<ExprNodeDesc> children = expr.getChildren();
    if (children != null) {
      for (int i = 0; i < children.size(); i++) {
        if (!onlyContainsPartnCols(tab, children.get(i))) {
          return false;
        }
      }
    }

    return true;
  }

  /**
   * Get the partition list for the table that satisfies the partition pruner
   * condition.
   *
   * @param tab
   *          the table object for the alias
   * @param prunerExpr
   *          the pruner expression for the alias
   * @param conf
   *          for checking whether "strict" mode is on.
   * @param alias
   *          for generating error message only.
   * @return the partition list for the table that satisfies the partition
   *         pruner condition.
   * @throws HiveException
   */
  public static PrunedPartitionList prune(Table tab, ExprNodeDesc prunerExpr,
      HiveConf conf, String alias,
      Map<String, PrunedPartitionList> prunedPartitionsMap)
      throws HiveException {
    LOG.trace("Started pruning partition");
    LOG.trace("dbname = " + tab.getDbName());
    LOG.trace("tabname = " + tab.getTableName());
    LOG.trace("prune Expression = " + prunerExpr);
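
    // The pruned partition list is cached in prunedPartitionsMap under a
    // "<db>.<table>;<pruner expression>" key, so repeated calls for the same
    // table and filter reuse the earlier result.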
    String key = tab.getDbName() + "." + tab.getTableName() + ";";

    if (prunerExpr != null) {
      key = key + prunerExpr.getExprString();
    }
    PrunedPartitionList ret = prunedPartitionsMap.get(key);
    if (ret != null) {
      return ret;
    }

    LinkedHashSet<Partition> true_parts = new LinkedHashSet<Partition>();
    LinkedHashSet<Partition> unkn_parts = new LinkedHashSet<Partition>();
    LinkedHashSet<Partition> denied_parts = new LinkedHashSet<Partition>();

    try {
      StructObjectInspector rowObjectInspector = (StructObjectInspector) tab
          .getDeserializer().getObjectInspector();
      Object[] rowWithPart = new Object[2];

      if (tab.isPartitioned()) {
        // If the "strict" mode is on, we have to provide partition pruner for
        // each table.
        if ("strict".equalsIgnoreCase(HiveConf.getVar(conf,
            HiveConf.ConfVars.HIVEMAPREDMODE))) {
          if (!hasColumnExpr(prunerExpr)) {
            throw new SemanticException(ErrorMsg.NO_PARTITION_PREDICATE
                .getMsg("for Alias \"" + alias + "\" Table \""
                + tab.getTableName() + "\""));
          }
        }

        if (prunerExpr == null) {
          // This can happen when hive.mapred.mode=nonstrict and there are no
          // predicates at all. Add all partitions to true_parts since nothing
          // can be pruned without a predicate.
          true_parts.addAll(Hive.get().getPartitions(tab));
        } else {
          // remove non-partition columns
          ExprNodeDesc compactExpr = prunerExpr.clone();
          compactExpr = compactExpr(compactExpr);
          LOG.debug("Filter w/ compacting: "
              + ((compactExpr != null) ? compactExpr.getExprString() : "null")
              + "; filter w/o compacting: "
              + ((prunerExpr != null) ? prunerExpr.getExprString() : "null"));
          if (compactExpr == null) {
            // This could happen when hive.mapred.mode=nonstrict and all the predicates
            // are on non-partition columns.
            unkn_parts.addAll(Hive.get().getPartitions(tab));
          } else if (Utilities.checkJDOPushDown(tab, compactExpr)) {
            String filter = compactExpr.getExprString();
            String oldFilter = prunerExpr.getExprString();

            if (filter.equals(oldFilter)) {
              // pruneExpr contains only partition columns
              pruneByPushDown(tab, true_parts, filter);
            } else {
              // pruneExpr contains non-partition columns
              pruneByPushDown(tab, unkn_parts, filter);
            }
          } else {
            pruneBySequentialScan(tab, true_parts, unkn_parts, denied_parts,
                prunerExpr, rowObjectInspector);
          }
        }
        LOG.debug("tabname = " + tab.getTableName() + " is partitioned");
      } else {
        true_parts.addAll(Hive.get().getPartitions(tab));
      }
    } catch (HiveException e) {
      throw e;
    } catch (Exception e) {
      throw new HiveException(e);
    }

    // Now return the set of partitions
    ret = new PrunedPartitionList(true_parts, unkn_parts, denied_parts);
    prunedPartitionsMap.put(key, ret);
    return ret;
  }

  /**
   * Taking a partition pruning expression, remove the null operands.
   *
   * @param expr original partition pruning expression.
   * @return partition pruning expression that only contains partition columns.
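   *         For a hypothetical partition column {@code ds}, for example,
   *         {@code and(null, (ds = '2008-04-08'))} compacts to
   *         {@code (ds = '2008-04-08')}, and an expression that is entirely
   *         null compacts to null.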
   */
  static private ExprNodeDesc compactExpr(ExprNodeDesc expr) {
    if (expr instanceof ExprNodeConstantDesc) {
      if (((ExprNodeConstantDesc) expr).getValue() == null) {
        return null;
      } else {
        return expr;
      }
    } else if (expr instanceof ExprNodeGenericFuncDesc) {
      GenericUDF udf = ((ExprNodeGenericFuncDesc) expr).getGenericUDF();
      if (udf instanceof GenericUDFOPAnd || udf instanceof GenericUDFOPOr) {
        List<ExprNodeDesc> children = ((ExprNodeGenericFuncDesc) expr).getChildren();
        ExprNodeDesc left = children.get(0);
        children.set(0, compactExpr(left));
        ExprNodeDesc right = children.get(1);
        children.set(1, compactExpr(right));
        if (children.get(0) == null && children.get(1) == null) {
          return null;
        } else if (children.get(0) == null) {
          return children.get(1);
        } else if (children.get(1) == null) {
          return children.get(0);
        }
      }
      return expr;
    }
    return expr;
  }

  /**
   * Pruning partitions using JDO filtering.
   *
   * @param tab the table containing the partitions.
   * @param true_parts the resulting partitions.
   * @param filter the SQL predicate that involves only partition columns
   * @throws HiveException
   * @throws MetaException
   * @throws NoSuchObjectException
   * @throws TException
   */
  static private void pruneByPushDown(Table tab, Set<Partition> true_parts,
      String filter) throws HiveException, MetaException, NoSuchObjectException, TException {
    Hive db = Hive.get();
    List<Partition> parts = db.getPartitionsByFilter(tab, filter);
    true_parts.addAll(parts);
    return;
  }
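
  // Illustrative note (hypothetical values, not from the original source): for a table
  // partitioned on a column ds, a compacted filter string such as "(ds > '2008-04-08')"
  // is handed to getPartitionsByFilter() above, which lets the metastore evaluate the
  // predicate instead of fetching every partition and evaluating it client-side as
  // pruneBySequentialScan() below does.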

  /**
   * Pruning partitions by getting the partition names first and pruning them using the
   * Hive expression evaluator.
   *
   * @param tab the table containing the partitions.
   * @param true_parts the resulting partitions if the partition pruning expression only
   *        contains partition columns.
   * @param unkn_parts the resulting partitions if the partition pruning expression also
   *        contains non-partition columns.
   * @param denied_parts pruned out partitions.
   * @param prunerExpr the SQL predicate that involves partition columns.
   * @param rowObjectInspector object inspector used by the evaluator
   * @throws Exception
   */
  static private void pruneBySequentialScan(Table tab, Set<Partition> true_parts,
      Set<Partition> unkn_parts, Set<Partition> denied_parts,
      ExprNodeDesc prunerExpr, StructObjectInspector rowObjectInspector)
      throws Exception {

    List<String> trueNames = null;
    List<String> unknNames = null;

    PerfLogger perfLogger = PerfLogger.getPerfLogger();
    perfLogger.PerfLogBegin(LOG, PerfLogger.PRUNE_LISTING);

    List<String> partNames = Hive.get().getPartitionNames(tab.getDbName(),
        tab.getTableName(), (short) -1);

    List<FieldSchema> pCols = tab.getPartCols();
    List<String> partCols = new ArrayList<String>(pCols.size());
    List<String> values = new ArrayList<String>(pCols.size());
    Object[] objectWithPart = new Object[2];

    for (FieldSchema pCol : pCols) {
      partCols.add(pCol.getName());
    }

    Map<PrimitiveObjectInspector, ExprNodeEvaluator> handle = PartExprEvalUtils.prepareExpr(
        prunerExpr, partCols, rowObjectInspector);

    for (String partName : partNames) {
      // Set all the variables here
      LinkedHashMap<String, String> partSpec = Warehouse
          .makeSpecFromName(partName);

      values.clear();
      for (Map.Entry<String, String> kv : partSpec.entrySet()) {
        values.add(kv.getValue());
      }
      objectWithPart[1] = values;

      // evaluate the expression tree
      Boolean r = (Boolean) PartExprEvalUtils.evaluateExprOnPart(handle, objectWithPart);
      if (r == null) {
        if (unknNames == null) {
          unknNames = new LinkedList<String>();
        }
        unknNames.add(partName);
        LOG.debug("retained unknown partition: " + partName);
      } else if (Boolean.TRUE.equals(r)) {
        if (trueNames == null) {
          trueNames = new LinkedList<String>();
        }
        trueNames.add(partName);
        LOG.debug("retained partition: " + partName);
      }
    }
    perfLogger.PerfLogEnd(LOG, PerfLogger.PRUNE_LISTING);

    perfLogger.PerfLogBegin(LOG, PerfLogger.PARTITION_RETRIEVING);
    if (trueNames != null) {
      List<Partition> parts = Hive.get().getPartitionsByNames(tab, trueNames);
      true_parts.addAll(parts);
    }
    if (unknNames != null) {
      List<Partition> parts = Hive.get().getPartitionsByNames(tab, unknNames);
      unkn_parts.addAll(parts);
    }
    perfLogger.PerfLogEnd(LOG, PerfLogger.PARTITION_RETRIEVING);
  }

  /**
   * Whether the expression contains a column node or not.
   */
  public static boolean hasColumnExpr(ExprNodeDesc desc) {
    // Return false for null
    if (desc == null) {
      return false;
    }
    // Return true for exprNodeColumnDesc
    if (desc instanceof ExprNodeColumnDesc) {
      return true;
    }
    // Return true in case one of the children is column expr.
    List<ExprNodeDesc> children = desc.getChildren();
    if (children != null) {
      for (int i = 0; i < children.size(); i++) {
        if (hasColumnExpr(children.get(i))) {
          return true;
        }
      }
    }
    // Return false otherwise
    return false;
  }

}