/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.index;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils;
import org.apache.hadoop.hive.ql.plan.ExprNodeFieldDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBaseCompare;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToBinary;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToChar;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDate;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToDecimal;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUnixTimeStamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToUtcTimestamp;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFToVarchar;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;

/**
 * IndexPredicateAnalyzer decomposes predicates, separating the parts
 * which can be satisfied by an index from the parts which cannot.
 * Currently, it only supports pure conjunctions over binary expressions
 * comparing a column reference with a constant value. It is assumed
 * that all column aliases encountered refer to the same table.
 */
public class IndexPredicateAnalyzer {

  private final Set<String> udfNames;
  private final Map<String, Set<String>> columnToUDFs;
  private FieldValidator fieldValidator;

  private boolean acceptsFields;

  public IndexPredicateAnalyzer() {
    udfNames = new HashSet<String>();
    columnToUDFs = new HashMap<String, Set<String>>();
  }

  public void setFieldValidator(FieldValidator fieldValidator) {
    this.fieldValidator = fieldValidator;
  }
  /**
   * Registers a comparison operator as one which can be satisfied
   * by an index search. Unless this is called, analyzePredicate
   * will never find any indexable conditions.
   *
   * @param udfName name of comparison operator as returned
   * by either {@link GenericUDFBridge#getUdfName} (for simple UDF's)
   * or udf.getClass().getName() (for generic UDF's).
   */
  public void addComparisonOp(String udfName) {
    udfNames.add(udfName);
  }

  /**
   * Clears the set of column names allowed in comparisons. (Until columns
   * are registered via {@link #allowColumnName} or
   * {@link #addComparisonOp(String, String...)}, no conditions are
   * considered indexable.)
   */
  public void clearAllowedColumnNames() {
    columnToUDFs.clear();
  }

  /**
   * Adds a column name to the set of column names allowed.
   *
   * @param columnName name of column to be allowed
   */
  public void allowColumnName(String columnName) {
    columnToUDFs.put(columnName, udfNames);
  }

  /**
   * Registers the comparison operators allowed for a specific column.
   *
   * @param columnName name of the column
   * @param udfs comparison operator names allowed for that column
   */
  public void addComparisonOp(String columnName, String... udfs) {
    Set<String> allowed = columnToUDFs.get(columnName);
    if (allowed == null || allowed == udfNames) {
      // override
      columnToUDFs.put(columnName, new HashSet<String>(Arrays.asList(udfs)));
    } else {
      allowed.addAll(Arrays.asList(udfs));
    }
  }

  /**
   * Analyzes a predicate.
   *
   * @param predicate predicate to be analyzed
   *
   * @param searchConditions receives conditions produced by analysis
   *
   * @return residual predicate which could not be translated to
   * searchConditions
   */
  public ExprNodeDesc analyzePredicate(
    ExprNodeDesc predicate,
    final List<IndexSearchCondition> searchConditions) {

    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    NodeProcessor nodeProcessor = new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
        NodeProcessorCtx procCtx, Object... nodeOutputs)
        throws SemanticException {

        // We can only push down stuff which appears as part of
        // a pure conjunction: reject OR, CASE, etc.
        for (Node ancestor : stack) {
          if (nd == ancestor) {
            break;
          }
          if (!FunctionRegistry.isOpAnd((ExprNodeDesc) ancestor)) {
            return nd;
          }
        }

        return analyzeExpr((ExprNodeGenericFuncDesc) nd, searchConditions, nodeOutputs);
      }
    };

    Dispatcher disp = new DefaultRuleDispatcher(
      nodeProcessor, opRules, null);
    GraphWalker ogw = new DefaultGraphWalker(disp);
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.add(predicate);
    HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
    try {
      ogw.startWalking(topNodes, nodeOutput);
    } catch (SemanticException ex) {
      throw new RuntimeException(ex);
    }
    ExprNodeDesc residualPredicate = (ExprNodeDesc) nodeOutput.get(predicate);
    return residualPredicate;
  }
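  /*
   * Typical driver flow (a sketch, not taken from this file; the column name
   * "key" and the variable names below are illustrative placeholders).
   * A storage or index handler registers the operators and columns it can
   * serve, runs analyzePredicate, and keeps whatever comes back as residual:
   *
   *   IndexPredicateAnalyzer analyzer = IndexPredicateAnalyzer.createAnalyzer(false);
   *   analyzer.allowColumnName("key");
   *
   *   List<IndexSearchCondition> conditions = new ArrayList<IndexSearchCondition>();
   *   ExprNodeDesc residual = analyzer.analyzePredicate(predicate, conditions);
   *
   *   // conditions now holds the pushable column-vs-constant comparisons;
   *   // residual (possibly null) must still be evaluated by Hive.
   */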
  // Check if an ExprNodeColumnDesc is wrapped in a cast expression.
  // If so, peel the cast off; otherwise return the expression itself.
  private ExprNodeDesc getColumnExpr(ExprNodeDesc expr) {
    if (expr instanceof ExprNodeColumnDesc) {
      return expr;
    }
    ExprNodeGenericFuncDesc funcDesc = null;
    if (expr instanceof ExprNodeGenericFuncDesc) {
      funcDesc = (ExprNodeGenericFuncDesc) expr;
    }
    if (null == funcDesc) {
      return expr;
    }
    GenericUDF udf = funcDesc.getGenericUDF();
    // check if it's a simple cast expression.
    if ((udf instanceof GenericUDFBridge || udf instanceof GenericUDFToBinary
        || udf instanceof GenericUDFToChar || udf instanceof GenericUDFToVarchar
        || udf instanceof GenericUDFToDecimal || udf instanceof GenericUDFToDate
        || udf instanceof GenericUDFToUnixTimeStamp || udf instanceof GenericUDFToUtcTimestamp)
        && funcDesc.getChildren().size() == 1
        && funcDesc.getChildren().get(0) instanceof ExprNodeColumnDesc) {
      return expr.getChildren().get(0);
    }
    return expr;
  }

  private ExprNodeDesc analyzeExpr(
    ExprNodeGenericFuncDesc expr,
    List<IndexSearchCondition> searchConditions,
    Object... nodeOutputs) throws SemanticException {

    if (FunctionRegistry.isOpAnd(expr)) {
      assert(nodeOutputs.length >= 2);
      List<ExprNodeDesc> residuals = new ArrayList<ExprNodeDesc>();
      for (Object residual : nodeOutputs) {
        if (null != residual) {
          residuals.add((ExprNodeDesc) residual);
        }
      }
      if (residuals.size() == 0) {
        return null;
      } else if (residuals.size() == 1) {
        return residuals.get(0);
      } else if (residuals.size() > 1) {
        return new ExprNodeGenericFuncDesc(
          TypeInfoFactory.booleanTypeInfo,
          FunctionRegistry.getGenericUDFForAnd(),
          residuals);
      }
    }

    GenericUDF genericUDF = expr.getGenericUDF();
    if (!(genericUDF instanceof GenericUDFBaseCompare)) {
      return expr;
    }
    ExprNodeDesc expr1 = (ExprNodeDesc) nodeOutputs[0];
    ExprNodeDesc expr2 = (ExprNodeDesc) nodeOutputs[1];
    // We may need to peel off the GenericUDFBridge that is added by CBO or the user.
    if (expr1.getTypeInfo().equals(expr2.getTypeInfo())) {
      expr1 = getColumnExpr(expr1);
      expr2 = getColumnExpr(expr2);
    }

    ExprNodeDesc[] extracted = ExprNodeDescUtils.extractComparePair(expr1, expr2);
    if (extracted == null || (extracted.length > 2 && !acceptsFields)) {
      return expr;
    }

    ExprNodeColumnDesc columnDesc;
    ExprNodeConstantDesc constantDesc;
    if (extracted[0] instanceof ExprNodeConstantDesc) {
      genericUDF = genericUDF.flip();
      columnDesc = (ExprNodeColumnDesc) extracted[1];
      constantDesc = (ExprNodeConstantDesc) extracted[0];
    } else {
      columnDesc = (ExprNodeColumnDesc) extracted[0];
      constantDesc = (ExprNodeConstantDesc) extracted[1];
    }

    Set<String> allowed = columnToUDFs.get(columnDesc.getColumn());
    if (allowed == null) {
      return expr;
    }

    String udfName = genericUDF.getUdfName();
    if (!allowed.contains(udfName)) {
      return expr;
    }

    String[] fields = null;
    if (extracted.length > 2) {
      ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) extracted[2];
      if (!isValidField(fieldDesc)) {
        return expr;
      }
      fields = ExprNodeDescUtils.extractFields(fieldDesc);
    }

    // We also need to update the expr so that the index query can be generated.
    // Note that Hive does not support UDFToDouble etc. in the query text.
    List<ExprNodeDesc> list = new ArrayList<ExprNodeDesc>();
    list.add(expr1);
    list.add(expr2);
    ExprNodeGenericFuncDesc indexExpr =
        new ExprNodeGenericFuncDesc(expr.getTypeInfo(), expr.getGenericUDF(), list);

    searchConditions.add(
      new IndexSearchCondition(
        columnDesc,
        udfName,
        constantDesc,
        indexExpr,
        expr,
        fields));

    // We converted the expression to a search condition, so remove it from the
    // residual predicate (conditions on struct fields are kept in the residual as well).
    return fields == null ? null : expr;
  }

  private boolean isValidField(ExprNodeFieldDesc field) {
    return fieldValidator == null || fieldValidator.validate(field);
  }

  /**
   * Translates search conditions back to ExprNodeDesc form (as
   * a left-deep conjunction).
   *
   * @param searchConditions (typically produced by analyzePredicate)
   *
   * @return ExprNodeGenericFuncDesc form of search conditions
   */
  public ExprNodeGenericFuncDesc translateSearchConditions(
    List<IndexSearchCondition> searchConditions) {

    ExprNodeGenericFuncDesc expr = null;
    for (IndexSearchCondition searchCondition : searchConditions) {
      if (expr == null) {
        expr = searchCondition.getIndexExpr();
        continue;
      }
      List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
      children.add(expr);
      children.add(searchCondition.getIndexExpr());
      expr = new ExprNodeGenericFuncDesc(
        TypeInfoFactory.booleanTypeInfo,
        FunctionRegistry.getGenericUDFForAnd(),
        children);
    }
    return expr;
  }
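  /*
   * For illustration (not part of the original source): given three search
   * conditions c1, c2, c3, translateSearchConditions above nests the
   * conjunction to the left, i.e. it builds
   *
   *   and(and(c1, c2), c3)
   *
   * where each AND node has exactly two children, rather than a single
   * n-ary AND over all conditions.
   */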
  /**
   * Translates original conditions back to ExprNodeDesc form (as
   * a left-deep conjunction).
   *
   * @param searchConditions (typically produced by analyzePredicate)
   *
   * @return ExprNodeGenericFuncDesc form of the original conditions
   */
  public ExprNodeGenericFuncDesc translateOriginalConditions(
    List<IndexSearchCondition> searchConditions) {

    ExprNodeGenericFuncDesc expr = null;
    for (IndexSearchCondition searchCondition : searchConditions) {
      if (expr == null) {
        expr = searchCondition.getOriginalExpr();
        continue;
      }
      List<ExprNodeDesc> children = new ArrayList<ExprNodeDesc>();
      children.add(expr);
      children.add(searchCondition.getOriginalExpr());
      expr = new ExprNodeGenericFuncDesc(
        TypeInfoFactory.booleanTypeInfo,
        FunctionRegistry.getGenericUDFForAnd(),
        children);
    }
    return expr;
  }

  public void setAcceptsFields(boolean acceptsFields) {
    this.acceptsFields = acceptsFields;
  }

  public interface FieldValidator {
    boolean validate(ExprNodeFieldDesc exprNodeDesc);
  }

  public static IndexPredicateAnalyzer createAnalyzer(boolean equalOnly) {
    IndexPredicateAnalyzer analyzer = new IndexPredicateAnalyzer();
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual");

    if (equalOnly) {
      return analyzer;
    }

    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan");
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan");
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan");
    analyzer.addComparisonOp("org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan");

    return analyzer;
  }
}
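/*
 * Example wiring of the createAnalyzer factory above (a sketch; the "ts"
 * column name is an illustrative placeholder, not taken from this file):
 *
 *   IndexPredicateAnalyzer analyzer = IndexPredicateAnalyzer.createAnalyzer(true);
 *   // Register "ts" with exactly these operators; operators registered only
 *   // through the no-column addComparisonOp path (equality here) are not
 *   // applied to "ts" unless they are listed as well.
 *   analyzer.addComparisonOp("ts",
 *       "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan",
 *       "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan");
 *
 * A column must be registered through allowColumnName or the per-column
 * addComparisonOp before any comparison against it is treated as indexable.
 */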