/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.optimizer.physical;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StringInternUtils;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat;
import org.apache.hadoop.hive.ql.io.NullScanFileSystem;
import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat;
import org.apache.hadoop.hive.ql.io.ZeroRowsInputFormat;
import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
import org.apache.hadoop.hive.ql.lib.Dispatcher;
import org.apache.hadoop.hive.ql.lib.GraphWalker;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.PreOrderOnceWalker;
import org.apache.hadoop.hive.ql.lib.Rule;
import org.apache.hadoop.hive.ql.optimizer.physical.MetadataOnlyOptimizer.WalkerCtx;
import org.apache.hadoop.hive.ql.parse.ParseContext;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.NullStructSerDe;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Iterates over all tasks one by one and removes all input paths from a task if the
 * conditions defined in the rules match.
 */
public class NullScanTaskDispatcher implements Dispatcher {

  static final Logger LOG = LoggerFactory.getLogger(NullScanTaskDispatcher.class.getName());

  private final PhysicalContext physicalContext;
  private final Map<Rule, NodeProcessor> rules;

  public NullScanTaskDispatcher(PhysicalContext context, Map<Rule, NodeProcessor> rules) {
    super();
    physicalContext = context;
    this.rules = rules;
  }

  private String getAliasForTableScanOperator(MapWork work, TableScanOperator tso) {
    for (Map.Entry<String, Operator<? extends OperatorDesc>> entry :
        work.getAliasToWork().entrySet()) {
      if (entry.getValue() == tso) {
        return entry.getKey();
      }
    }
    return null;
  }

  private PartitionDesc changePartitionToMetadataOnly(PartitionDesc desc, Path path) {
    if (desc == null) {
      return null;
    }
    boolean isEmpty = false;
    try {
      isEmpty = Utilities.isEmptyPath(physicalContext.getConf(), path);
    } catch (IOException e) {
      LOG.error("Cannot determine if the table is empty", e);
    }
    desc.setInputFileFormatClass(
        isEmpty ? ZeroRowsInputFormat.class : OneNullRowInputFormat.class);
    desc.setOutputFileFormatClass(HiveIgnoreKeyTextOutputFormat.class);
    desc.getProperties().setProperty(serdeConstants.SERIALIZATION_LIB,
        NullStructSerDe.class.getName());
    return desc;
  }

  private void processAlias(MapWork work, Path path, ArrayList<String> aliasesAffected,
      ArrayList<String> aliases) {
    // the aliases that are allowed to map to a null scan.
    ArrayList<String> allowed = new ArrayList<String>();
    for (String alias : aliasesAffected) {
      if (aliases.contains(alias)) {
        allowed.add(alias);
      }
    }
    if (allowed.size() > 0) {
      PartitionDesc partDesc = work.getPathToPartitionInfo().get(path).clone();
      PartitionDesc newPartition = changePartitionToMetadataOnly(partDesc, path);
      // Prefix partition with something to avoid it being a hidden file.
      Path fakePath = new Path(NullScanFileSystem.getBase() + newPartition.getTableName()
          + "/part" + encode(newPartition.getPartSpec()));
      StringInternUtils.internUriStringsInPath(fakePath);
      work.addPathToPartitionInfo(fakePath, newPartition);
      work.addPathToAlias(fakePath, new ArrayList<>(allowed));
      aliasesAffected.removeAll(allowed);
      if (aliasesAffected.isEmpty()) {
        work.removePathToAlias(path);
        work.removePathToPartitionInfo(path);
      }
    }
  }

  private void processAlias(MapWork work, HashSet<TableScanOperator> tableScans) {
    ArrayList<String> aliases = new ArrayList<String>();
    for (TableScanOperator tso : tableScans) {
      // use LinkedHashMap<String, Operator<? extends OperatorDesc>> getAliasToWork()
      // should not apply this for non-native table
      if (tso.getConf().getTableMetadata().getStorageHandler() != null) {
        continue;
      }
      String alias = getAliasForTableScanOperator(work, tso);
      aliases.add(alias);
      tso.getConf().setIsMetadataOnly(true);
    }
    // group path alias according to work
    LinkedHashMap<Path, ArrayList<String>> candidates = new LinkedHashMap<>();
    for (Path path : work.getPaths()) {
      ArrayList<String> aliasesAffected = work.getPathToAliases().get(path);
      if (aliasesAffected != null && aliasesAffected.size() > 0) {
        candidates.put(path, aliasesAffected);
      }
    }
    for (Entry<Path, ArrayList<String>> entry : candidates.entrySet()) {
      processAlias(work, entry.getKey(), entry.getValue(), aliases);
    }
  }

  // considered using URLEncoder, but it seemed too much
  private String encode(Map<String, String> partSpec) {
    return partSpec.toString().replaceAll("[{}:/#\\?, ]+", "_");
  }

  @Override
  public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs)
      throws SemanticException {
    Task<? extends Serializable> task = (Task<? extends Serializable>) nd;

    // create the context for walking operators
    ParseContext parseContext = physicalContext.getParseContext();
    WalkerCtx walkerCtx = new WalkerCtx();

    List<MapWork> mapWorks = new ArrayList<MapWork>(task.getMapWork());
    Collections.sort(mapWorks, new Comparator<MapWork>() {
      @Override
      public int compare(MapWork o1, MapWork o2) {
        return o1.getName().compareTo(o2.getName());
      }
    });

    for (MapWork mapWork : mapWorks) {
      LOG.debug("Looking at: " + mapWork.getName());
      Collection<Operator<? extends OperatorDesc>> topOperators =
          mapWork.getAliasToWork().values();
      if (topOperators.size() == 0) {
        LOG.debug("No top operators");
        return null;
      }

      LOG.debug("Looking for table scans where optimization is applicable");

      // The dispatcher fires the processor corresponding to the closest
      // matching rule and passes the context along
      Dispatcher disp = new DefaultRuleDispatcher(null, rules, walkerCtx);
      GraphWalker ogw = new PreOrderOnceWalker(disp);

      // Create a list of topOp nodes
      ArrayList<Node> topNodes = new ArrayList<Node>();
      // Get the top Nodes for this task
      for (Operator<? extends OperatorDesc> workOperator : topOperators) {
        if (parseContext.getTopOps().values().contains(workOperator)) {
          topNodes.add(workOperator);
        }
      }

      Operator<? extends OperatorDesc> reducer = task.getReducer(mapWork);
      if (reducer != null) {
        topNodes.add(reducer);
      }

      ogw.startWalking(topNodes, null);

      LOG.debug(String.format("Found %d null table scans",
          walkerCtx.getMetadataOnlyTableScans().size()));
      if (walkerCtx.getMetadataOnlyTableScans().size() > 0) {
        processAlias(mapWork, walkerCtx.getMetadataOnlyTableScans());
      }
    }
    return null;
  }
}