/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer.index;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.MetaStoreUtils;
import org.apache.hadoop.hive.ql.exec.ColumnInfo;
import org.apache.hadoop.hive.ql.exec.FunctionRegistry;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorUtils;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.exec.SelectOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.metadata.Hive;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.metadata.Table;
import org.apache.hadoop.hive.ql.optimizer.ColumnPrunerProcFactory;
import org.apache.hadoop.hive.ql.optimizer.FieldNode;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.AggregationDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.GroupByDesc;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * RewriteQueryUsingAggregateIndexCtx stores the context for
 * {@link RewriteQueryUsingAggregateIndex}, which is used to rewrite the
 * operator plan to scan the index table instead of the base table.
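 * <p>
 * For illustration (a hypothetical example, not drawn from an actual plan): a query such as
 * {@code SELECT key, count(key) FROM t GROUP BY key} is rewritten to read the aggregate
 * index table instead, effectively becoming
 * {@code SELECT key, sum(`_count_of_key`) FROM t_index GROUP BY key}, where {@code t_index}
 * and {@code _count_of_key} stand in for the index table and its stored count column.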
 */
public final class RewriteQueryUsingAggregateIndexCtx implements NodeProcessorCtx {

  private static final Logger LOG =
      LoggerFactory.getLogger(RewriteQueryUsingAggregateIndexCtx.class.getName());

  private RewriteQueryUsingAggregateIndexCtx(ParseContext parseContext, Hive hiveDb,
      RewriteCanApplyCtx canApplyCtx) {
    this.parseContext = parseContext;
    this.hiveDb = hiveDb;
    this.canApplyCtx = canApplyCtx;
    this.indexTableName = canApplyCtx.getIndexTableName();
    this.alias = canApplyCtx.getAlias();
    this.aggregateFunction = canApplyCtx.getAggFunction();
    this.indexKey = canApplyCtx.getIndexKey();
  }

  public static RewriteQueryUsingAggregateIndexCtx getInstance(ParseContext parseContext,
      Hive hiveDb, RewriteCanApplyCtx canApplyCtx) {
    return new RewriteQueryUsingAggregateIndexCtx(parseContext, hiveDb, canApplyCtx);
  }

  // Assumes one instance of this class and single-threaded compilation for each query.
  private final Hive hiveDb;
  private final ParseContext parseContext;
  private final RewriteCanApplyCtx canApplyCtx;
  // We need the GenericUDAFEvaluator for the GenericUDAF function "sum"
  private GenericUDAFEvaluator eval = null;
  private final String indexTableName;
  private final String alias;
  private final String aggregateFunction;
  private ExprNodeColumnDesc aggrExprNode = null;
  private String indexKey;

  public ParseContext getParseContext() {
    return parseContext;
  }

  public Hive getHiveDb() {
    return hiveDb;
  }

  public String getIndexName() {
    return indexTableName;
  }

  public GenericUDAFEvaluator getEval() {
    return eval;
  }

  public void setEval(GenericUDAFEvaluator eval) {
    this.eval = eval;
  }

  public void setAggrExprNode(ExprNodeColumnDesc aggrExprNode) {
    this.aggrExprNode = aggrExprNode;
  }

  public ExprNodeColumnDesc getAggrExprNode() {
    return aggrExprNode;
  }

  public String getAlias() {
    return alias;
  }

  public String getAggregateFunction() {
    return aggregateFunction;
  }

  public String getIndexKey() {
    return indexKey;
  }

  public void setIndexKey(String indexKey) {
    this.indexKey = indexKey;
  }

  public void invokeRewriteQueryProc() throws SemanticException {
    this.replaceTableScanProcess(canApplyCtx.getTableScanOperator());
    // We need aggrExprNode, so replaceGroupByOperatorProcess must run before
    // replaceSelectOperatorProcess.
    for (int index = 0; index < canApplyCtx.getGroupByOperators().size(); index++) {
      this.replaceGroupByOperatorProcess(canApplyCtx.getGroupByOperators().get(index), index);
    }
    for (SelectOperator selectOperator : canApplyCtx.getSelectOperators()) {
      this.replaceSelectOperatorProcess(selectOperator);
    }
  }

  /**
   * This method replaces the original TableScanOperator with a new
   * TableScanOperator and metadata that scans over the index table rather than
   * over the original table.
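   * <p>
   * Sketch of the effect (aliases hypothetical): the entry registered under the base-table
   * alias is removed from {@code ParseContext.getTopOps()} and the same operator is
   * re-registered under the index table's name, now carrying a {@link TableScanDesc} for
   * the index table and a row schema holding only the index key column.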
   */
  private void replaceTableScanProcess(TableScanOperator scanOperator) throws SemanticException {
    RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx = this;
    String alias = rewriteQueryCtx.getAlias();

    // We need to remove the original TableScanOperator from these data structures
    // and add the new one.
    HashMap<String, TableScanOperator> topOps = rewriteQueryCtx.getParseContext().getTopOps();

    // remove the original TableScanOperator
    topOps.remove(alias);

    String indexTableName = rewriteQueryCtx.getIndexName();
    Table indexTableHandle = null;
    try {
      indexTableHandle = rewriteQueryCtx.getHiveDb().getTable(indexTableName);
    } catch (HiveException e) {
      LOG.error("Error while getting the table handle for index table.");
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }

    // construct a new descriptor for the index table scan
    TableScanDesc indexTableScanDesc = new TableScanDesc(indexTableHandle);
    indexTableScanDesc.setGatherStats(false);

    String k = MetaStoreUtils.encodeTableName(indexTableName) + Path.SEPARATOR;
    indexTableScanDesc.setStatsAggPrefix(k);
    scanOperator.setConf(indexTableScanDesc);

    // construct the new RowSchema for the new TableScanOperator,
    // containing only the index key column
    ArrayList<ColumnInfo> sigRS = new ArrayList<ColumnInfo>();
    try {
      StructObjectInspector rowObjectInspector =
          (StructObjectInspector) indexTableHandle.getDeserializer().getObjectInspector();
      StructField field = rowObjectInspector.getStructFieldRef(rewriteQueryCtx.getIndexKey());
      sigRS.add(new ColumnInfo(field.getFieldName(),
          TypeInfoUtils.getTypeInfoFromObjectInspector(field.getFieldObjectInspector()),
          indexTableName, false));
    } catch (SerDeException e) {
      LOG.error("Error while creating the RowSchema for the new TableScanOperator.");
      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
      throw new SemanticException(e.getMessage(), e);
    }
    RowSchema rs = new RowSchema(sigRS);

    // compute the new alias for the index table scan
    String newAlias = indexTableName;
    int index = alias.lastIndexOf(":");
    if (index >= 0) {
      newAlias = alias.substring(0, index) + ":" + indexTableName;
    }

    // the scan operator now points to the index table
    scanOperator.getConf().setAlias(newAlias);
    scanOperator.setAlias(indexTableName);
    topOps.put(newAlias, scanOperator);
    rewriteQueryCtx.getParseContext().setTopOps(topOps);

    ColumnPrunerProcFactory.setupNeededColumns(scanOperator, rs,
        Arrays.asList(new FieldNode(rewriteQueryCtx.getIndexKey())));
  }

  /**
   * This method replaces the original SelectOperator with a new SelectOperator
   * that additionally projects the column feeding the rewritten sum aggregation
   * (the {@code _count_of_<indexed_key>} column of the index table).
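   * <p>
   * Illustrative sketch (names hypothetical): a select list of {@code [key]} becomes
   * {@code [key, _count_of_key]}, where {@code _count_of_key} is the index table's stored
   * count column that the rewritten sum() aggregation reads.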
   */
  private void replaceSelectOperatorProcess(SelectOperator operator) throws SemanticException {
    RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx = this;
    // We need to set the colList, outputColumnNames, colExprMap and rowSchema only
    // for the SelectOperator that precedes the GroupByOperator:
    // count(indexed_key_column) needs to be replaced by
    // sum(`_count_of_indexed_key_column`)
    List<ExprNodeDesc> selColList = operator.getConf().getColList();
    selColList.add(rewriteQueryCtx.getAggrExprNode());

    List<String> selOutputColNames = operator.getConf().getOutputColumnNames();
    selOutputColNames.add(rewriteQueryCtx.getAggrExprNode().getColumn());

    operator.getColumnExprMap().put(rewriteQueryCtx.getAggrExprNode().getColumn(),
        rewriteQueryCtx.getAggrExprNode());

    RowSchema selRS = operator.getSchema();
    List<ColumnInfo> selRSSignature = selRS.getSignature();
    // Need to create a new type for the Column[_count_of_indexed_key_column] node
    PrimitiveTypeInfo pti = TypeInfoFactory.getPrimitiveTypeInfo("bigint");
    pti.setTypeName("bigint");
    ColumnInfo newCI = new ColumnInfo(rewriteQueryCtx.getAggregateFunction(), pti, "", false);
    selRSSignature.add(newCI);
    selRS.setSignature((ArrayList<ColumnInfo>) selRSSignature);
    operator.setSchema(selRS);
  }

  /**
   * We need to replace the count(indexed_key_column) GenericUDAF aggregation
   * function in the group-by construct with the "sum" GenericUDAF. This method
   * creates a new operator tree for a sample query that contains a GroupByOperator
   * with a sum aggregation function, and uses that GroupByOperator's information
   * to replace the original GroupByOperator's aggregation information. It replaces
   * the AggregationDesc (aggregation descriptor) of the old GroupByOperator
   * with the new AggregationDesc of the new GroupByOperator.
   */
  private void replaceGroupByOperatorProcess(GroupByOperator operator, int index)
      throws SemanticException {
    RewriteQueryUsingAggregateIndexCtx rewriteQueryCtx = this;

    // We need to replace the GroupByOperator that comes before the ReduceSinkOperator
    if (index == 0) {
      // the query contains the sum aggregation GenericUDAF
      String selReplacementCommand = "select sum(`" + rewriteQueryCtx.getAggregateFunction()
          + "`)" + " from `" + rewriteQueryCtx.getIndexName()
          + "` group by " + rewriteQueryCtx.getIndexKey() + " ";
      // retrieve the operator tree for the query, and the required GroupByOperator from it
      Operator<?> newOperatorTree = RewriteParseContextGenerator.generateOperatorTree(
          rewriteQueryCtx.getParseContext().getQueryState(), selReplacementCommand);

      // we get our new GroupByOperator here
      GroupByOperator newGbyOperator =
          OperatorUtils.findLastOperatorUpstream(newOperatorTree, GroupByOperator.class);
      if (newGbyOperator == null) {
        throw new SemanticException("Error replacing GroupBy operator.");
      }

      // we need this information to set the correct colList and outputColumnNames
      // in the SelectOperator
      ExprNodeColumnDesc aggrExprNode = null;

      // Construct the new AggregationDesc to get rid of the current
      // internal names and replace them with new internal names
      // as required by the operator tree
      GroupByDesc newConf = newGbyOperator.getConf();
      List<AggregationDesc> newAggrList = newConf.getAggregators();
      if (newAggrList != null && newAggrList.size() > 0) {
        for (AggregationDesc aggregationDesc : newAggrList) {
          rewriteQueryCtx.setEval(aggregationDesc.getGenericUDAFEvaluator());
          aggrExprNode = (ExprNodeColumnDesc) aggregationDesc.getParameters().get(0);
          rewriteQueryCtx.setAggrExprNode(aggrExprNode);
        }
      }
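
      // selReplacementCommand contains a single sum() call, so newAggrList is
      // expected to hold exactly one AggregationDesc; the loop above keeps its
      // evaluator and first parameter for the later SelectOperator rewrite.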
      // Now the GroupByOperator has the new AggregationList:
      // sum(`_count_of_indexed_key`) instead of count(indexed_key)
      GroupByDesc oldConf = operator.getConf();
      oldConf.setAggregators((ArrayList<AggregationDesc>) newAggrList);
      operator.setConf(oldConf);
    } else {
      // we just need to reset the GenericUDAFEvaluator and its name for this
      // GroupByOperator whose parent is the ReduceSinkOperator
      GroupByDesc childConf = operator.getConf();
      List<AggregationDesc> childAggrList = childConf.getAggregators();
      if (childAggrList != null && childAggrList.size() > 0) {
        for (AggregationDesc aggregationDesc : childAggrList) {
          List<ExprNodeDesc> paraList = aggregationDesc.getParameters();
          List<ObjectInspector> parametersOIList = new ArrayList<ObjectInspector>();
          for (ExprNodeDesc expr : paraList) {
            parametersOIList.add(expr.getWritableObjectInspector());
          }
          GenericUDAFEvaluator evaluator =
              FunctionRegistry.getGenericUDAFEvaluator("sum", parametersOIList, false, false);
          aggregationDesc.setGenericUDAFEvaluator(evaluator);
          aggregationDesc.setGenericUDAFName("sum");
        }
      }
    }
  }
}