// MastiffIndexPredicateAnalyzer.java example
//
// Explorer
// ----Data---Storage---master
// - src
package org.apache.hadoop.hive.mastiff;

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator;
import org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorFactory;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.index.IndexSearchCondition;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * IndexPredicateAnalyzer decomposes predicates, separating the parts
 * which can be satisfied by an index from the parts which cannot.
 * Currently, it only supports pure conjunctions over binary expressions
 * comparing a column reference with a constant value. It is assumed
 * that all column aliases encountered refer to the same table. <br>
 * 
 * MastiffIndexPredicateAnalyzer is a small modification of
 * {@link org.apache.hadoop.hive.ql.index.IndexPredicateAnalyzer}:
 * the whitelist of allowed UDFs is replaced by a blacklist of disallowed ones.
 */
public class MastiffIndexPredicateAnalyzer {
  private static final Log LOG = LogFactory.getLog(MastiffIndexPredicateAnalyzer.class.getName());

  /** Names of UDFs whose expressions are never converted into search conditions. */
  private final Set<String> unsupportedUdfNames;

  /** Columns allowed in search conditions; {@code null} means every column is allowed. */
  private Set<String> allowedColumnNames;

  /**
   * Creates an analyzer with an empty blacklist (every UDF is allowed) and
   * no restriction on column names.
   */
  public MastiffIndexPredicateAnalyzer() {
    unsupportedUdfNames = new HashSet<String>();
  }

  /**
   * Registers a comparison operator as one which can not be satisfied
   * by an index search. (Initially, all udfs are allowed.)
   * <p>
   * Note that, despite its name, this method adds the operator to the
   * blacklist of unsupported UDFs: expressions using it are always left
   * in the residual predicate.
   *
   * @param udfName
   *          name of comparison operator as returned
   *          by either {@link GenericUDFBridge#getUdfName} (for simple UDF's)
   *          or udf.getClass().getName() (for generic UDF's).
   */
  public void addComparisonOp(String udfName) {
    unsupportedUdfNames.add(udfName);
  }

  /**
   * Clears the set of column names allowed in comparisons. (Initially, all
   * column names are allowed.) After this call no column is allowed until
   * {@link #allowColumnName(String)} is invoked.
   */
  public void clearAllowedColumnNames() {
    allowedColumnNames = new HashSet<String>();
  }

  /**
   * Adds a column name to the set of column names allowed. The first call
   * switches the analyzer from "all columns allowed" to "only the listed
   * columns allowed".
   *
   * @param columnName
   *          name of column to be allowed
   */
  public void allowColumnName(String columnName) {
    if (allowedColumnNames == null) {
      clearAllowedColumnNames();
    }
    allowedColumnNames.add(columnName);
  }

  /**
   * Analyzes a predicate by walking its expression tree and converting every
   * eligible binary column/constant expression into an
   * {@link IndexSearchCondition}.
   *
   * @param predicate
   *          predicate to be analyzed
   *
   * @param searchConditions
   *          receives conditions produced by analysis
   *
   * @return residual predicate which could not be translated to
   *         searchConditions ({@code null} when the whole predicate was
   *         translated)
   *
   * @throws RuntimeException
   *           wrapping any {@link SemanticException} raised by the walk
   */
  public ExprNodeDesc analyzePredicate(
      ExprNodeDesc predicate,
      final List<IndexSearchCondition> searchConditions) {

    // No rules are registered; nodeProcessor is handed to the dispatcher as
    // its default processor.
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    NodeProcessor nodeProcessor = new NodeProcessor() {
      @Override
      public Object process(Node nd, Stack<Node> stack,
          NodeProcessorCtx procCtx, Object... nodeOutputs)
          throws SemanticException {

        // We can only push down stuff which appears as part of
        // a pure conjunction: reject OR, CASE, etc.
        for (Node ancestor : stack) {
          if (nd == ancestor) {
            break;
          }
          if (!FunctionRegistry.isOpAnd((ExprNodeDesc) ancestor)) {
            // Some ancestor is not an AND: leave this node untouched.
            return nd;
          }
        }

        return analyzeExpr((ExprNodeDesc) nd, searchConditions, nodeOutputs);
      }
    };

    Dispatcher disp = new DefaultRuleDispatcher(
        nodeProcessor, opRules, null);
    GraphWalker ogw = new DefaultGraphWalker(disp);
    List<Node> topNodes = new ArrayList<Node>();
    topNodes.add(predicate);
    HashMap<Node, Object> nodeOutput = new HashMap<Node, Object>();
    try {
      ogw.startWalking(topNodes, nodeOutput);
    } catch (SemanticException ex) {
      throw new RuntimeException(ex);
    }
    // The output recorded for the root node is the residual predicate.
    return (ExprNodeDesc) nodeOutput.get(predicate);
  }

  /**
   * Analyzes a single node whose ancestors (if any) are all ANDs.
   *
   * @param expr
   *          node being visited
   * @param searchConditions
   *          receives the search condition when {@code expr} is converted
   * @param nodeOutputs
   *          results already computed for the children of {@code expr}
   *
   * @return the residual for this node: {@code null} when it was fully
   *         converted, otherwise the expression that must still be evaluated
   */
  private ExprNodeDesc analyzeExpr(
      ExprNodeDesc expr,
      List<IndexSearchCondition> searchConditions,
      Object... nodeOutputs) {

    if (!(expr instanceof ExprNodeGenericFuncDesc)) {
      return expr;
    }
    if (FunctionRegistry.isOpAnd(expr)) {
      // Recombine whatever the children could not push down.
      ExprNodeDesc residual = null;
      for (Object childResidual : nodeOutputs) {
        residual = conjoin(residual, (ExprNodeDesc) childResidual);
      }
      return residual;
    }

    String udfName;
    ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) expr;
    if (funcDesc.getGenericUDF() instanceof GenericUDFBridge) {
      GenericUDFBridge func = (GenericUDFBridge) funcDesc.getGenericUDF();
      udfName = func.getUdfName();
    } else {
      udfName = funcDesc.getGenericUDF().getClass().getName();
    }
    if (unsupportedUdfNames.contains(udfName)) {
      return expr;
    }

    // Because UDFs are blacklisted rather than whitelisted, functions of any
    // arity can reach this point. Only binary expressions can be expressed as
    // "column <op> constant", so everything else stays in the residual
    // (this also avoids indexing past the end of nodeOutputs).
    if (nodeOutputs == null || nodeOutputs.length != 2) {
      return expr;
    }

    ExprNodeDesc child1 = extractConstant((ExprNodeDesc) nodeOutputs[0]);
    ExprNodeDesc child2 = extractConstant((ExprNodeDesc) nodeOutputs[1]);
    ExprNodeColumnDesc columnDesc = null;
    ExprNodeConstantDesc constantDesc = null;
    if ((child1 instanceof ExprNodeColumnDesc)
        && (child2 instanceof ExprNodeConstantDesc)) {
      // COL <op> CONSTANT
      columnDesc = (ExprNodeColumnDesc) child1;
      constantDesc = (ExprNodeConstantDesc) child2;
    } else if ((child2 instanceof ExprNodeColumnDesc)
        && (child1 instanceof ExprNodeConstantDesc)) {
      // CONSTANT <op> COL
      columnDesc = (ExprNodeColumnDesc) child2;
      constantDesc = (ExprNodeConstantDesc) child1;
    }
    if (columnDesc == null) {
      return expr;
    }
    if (allowedColumnNames != null
        && !allowedColumnNames.contains(columnDesc.getColumn())) {
      return expr;
    }
    searchConditions.add(
        new IndexSearchCondition(
            columnDesc,
            udfName,
            constantDesc,
            expr));

    // we converted the expression to a search condition, so
    // remove it from the residual predicate
    return null;
  }

  /**
   * Builds the conjunction of two optional expressions.
   *
   * @return the other operand when one of them is {@code null} (so
   *         {@code null} when both are), otherwise a new boolean AND
   *         expression over {@code left} and {@code right}
   */
  private static ExprNodeDesc conjoin(ExprNodeDesc left, ExprNodeDesc right) {
    if (left == null) {
      return right;
    }
    if (right == null) {
      return left;
    }
    List<ExprNodeDesc> operands = new ArrayList<ExprNodeDesc>(2);
    operands.add(left);
    operands.add(right);
    return new ExprNodeGenericFuncDesc(
        TypeInfoFactory.booleanTypeInfo,
        FunctionRegistry.getGenericUDFForAnd(),
        operands);
  }

  /**
   * Replaces a function expression by its constant value when it can be
   * folded; any other expression is returned unchanged.
   */
  private ExprNodeDesc extractConstant(ExprNodeDesc expr) {
    if (!(expr instanceof ExprNodeGenericFuncDesc)) {
      return expr;
    }
    ExprNodeConstantDesc folded = foldConstant((ExprNodeGenericFuncDesc) expr);
    return folded == null ? expr : folded;
  }

  /**
   * Tries to evaluate a function expression to a constant.
   *
   * @return the folded constant, or {@code null} when the function is
   *         non-deterministic or stateful, requires external files or jars,
   *         has a child that is neither a constant nor a foldable function,
   *         or fails to evaluate
   */
  private ExprNodeConstantDesc foldConstant(ExprNodeGenericFuncDesc func) {
    GenericUDF udf = func.getGenericUDF();
    if (!FunctionRegistry.isDeterministic(udf) || FunctionRegistry.isStateful(udf)) {
      return null;
    }
    try {
      // If the UDF depends on any external resources, we can't fold because the
      // resources may not be available at compile time.
      if (udf instanceof GenericUDFBridge) {
        UDF internal = ReflectionUtils.newInstance(((GenericUDFBridge) udf).getUdfClass(), null);
        if (internal.getRequiredFiles() != null || internal.getRequiredJars() != null) {
          return null;
        }
      } else {
        if (udf.getRequiredFiles() != null || udf.getRequiredJars() != null) {
          return null;
        }
      }

      // Every argument must itself be a constant or fold to one.
      for (ExprNodeDesc child : func.getChildExprs()) {
        if (child instanceof ExprNodeConstantDesc) {
          continue;
        } else if (child instanceof ExprNodeGenericFuncDesc) {
          if (foldConstant((ExprNodeGenericFuncDesc) child) != null) {
            continue;
          }
        }
        return null;
      }
      ExprNodeEvaluator evaluator = ExprNodeEvaluatorFactory.get(func);
      ObjectInspector output = evaluator.initialize(null);

      Object constant = evaluator.evaluate(null);
      Object java = ObjectInspectorUtils.copyToStandardJavaObject(constant, output);

      return new ExprNodeConstantDesc(java);
    } catch (Exception e) {
      // Folding is best-effort: on failure the expression is simply left
      // unfolded, but record why so the decision can be diagnosed.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Unable to fold expression to a constant: " + func, e);
      }
      return null;
    }
  }

  /**
   * Translates search conditions back to ExprNodeDesc form (as
   * a left-deep conjunction).
   *
   * @param searchConditions
   *          (typically produced by analyzePredicate)
   *
   * @return ExprNodeDesc form of search conditions, or {@code null} when
   *         the list is empty
   */
  public ExprNodeDesc translateSearchConditions(
      List<IndexSearchCondition> searchConditions) {

    ExprNodeDesc expr = null;
    for (IndexSearchCondition searchCondition : searchConditions) {
      expr = conjoin(expr, searchCondition.getComparisonExpr());
    }
    return expr;
  }
}