MetadataOnlyOptimizer.java example

Explorer
hive-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer.physical;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
import org.apache.hadoop.hive.ql.exec.GroupByOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.lib.RuleRegExp;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.TableScanDesc;

/**
 *
 * MetadataOnlyOptimizer determines to which TableScanOperators "metadata only"
 * optimization can be applied. Such operator must use only partition columns
 * (it is easy to check, because we are after column pruning and all places
 * where the data from the operator is used must go through GroupByOperator
 * distinct or distinct-like aggregations. Aggregation is distinct-like if
 * adding distinct wouldn't change the result, for example min, max.
 *
 * We cannot apply the optimization without group by, because the results depend
 * on the numbers of rows in partitions, for example count(hr) will count all
 * rows in matching partitions.
 *
 */
public class MetadataOnlyOptimizer implements PhysicalPlanResolver {
  static final Logger LOG = LoggerFactory.getLogger(MetadataOnlyOptimizer.class.getName());

  static class WalkerCtx implements NodeProcessorCtx {
    /* operators for which there is chance the optimization can be applied */
    private final HashSet<TableScanOperator> possible = new HashSet<TableScanOperator>();
    /* operators for which the optimization will be successful */
    private final HashSet<TableScanOperator> success = new HashSet<TableScanOperator>();

    /**
     * Sets operator as one for which there is a chance to apply optimization
     *
     * @param op
     *          the operator
     */
    public void setMayBeMetadataOnly(TableScanOperator op) {
      possible.add(op);
    }

    /** Convert all possible operators to success */
    public void convertMetadataOnly() {
      success.addAll(possible);
      possible.clear();
    }

    /**
     * Convert all possible operators to banned
     */
    public void convertNotMetadataOnly() {
      possible.clear();
      success.clear();
    }

    /**
     * Returns HashSet of collected operators for which the optimization may be
     * applicable.
     */
    public HashSet<TableScanOperator> getMayBeMetadataOnlyTableScans() {
      return possible;
    }

    /**
     * Returns HashSet of collected operators for which the optimization is
     * applicable.
     */
    public HashSet<TableScanOperator> getMetadataOnlyTableScans() {
      return success;
    }

  }

  static private class TableScanProcessor implements NodeProcessor {
    public TableScanProcessor() {
    }

    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      TableScanOperator tsOp = (TableScanOperator) nd;
      WalkerCtx walkerCtx = (WalkerCtx) procCtx;
      List<Integer> colIDs = tsOp.getNeededColumnIDs();
      TableScanDesc desc = tsOp.getConf();
      boolean noColNeeded = (colIDs == null) || (colIDs.isEmpty());
      boolean noVCneeded = (desc == null) || (desc.getVirtualCols() == null)
                             || (desc.getVirtualCols().isEmpty());
      boolean isSkipHF = desc.isNeedSkipHeaderFooters();
      if (noColNeeded && noVCneeded && !isSkipHF) {
        walkerCtx.setMayBeMetadataOnly(tsOp);
      }
      return nd;
    }
  }

  static private class FileSinkProcessor implements NodeProcessor {
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      WalkerCtx walkerCtx = (WalkerCtx) procCtx;
      // There can be atmost one element eligible to be converted to
      // metadata only
      if (walkerCtx.getMayBeMetadataOnlyTableScans().isEmpty()) {
        return nd;
      }

      for (Node op : stack) {
        if (op instanceof GroupByOperator) {
          GroupByOperator gby = (GroupByOperator) op;
          if (!gby.getConf().isDistinctLike()) {
            // GroupBy not distinct like, disabling
            walkerCtx.convertNotMetadataOnly();
            return nd;
          }
        }
      }

      walkerCtx.convertMetadataOnly();
      return nd;
    }
  }

  @Override
  public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
    Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    opRules.put(new RuleRegExp("R1", TableScanOperator.getOperatorName() + "%"),
      new TableScanProcessor());
    opRules.put(new RuleRegExp("R2",
      GroupByOperator.getOperatorName() + "%.*" + FileSinkOperator.getOperatorName() + "%"),
      new FileSinkProcessor());
    Dispatcher disp = new NullScanTaskDispatcher(pctx, opRules);
    GraphWalker ogw = new DefaultGraphWalker(disp);
    ArrayList<Node> topNodes = new ArrayList<Node>();
    topNodes.addAll(pctx.getRootTasks());
    ogw.startWalking(topNodes, null);
    return pctx;
  }
}