/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.optimizer;

import java.io.Serializable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx;
import org.apache.hadoop.hive.ql.parse.SemanticException;
import org.apache.hadoop.hive.ql.plan.BucketMapJoinContext;
import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredLocalWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

/**
 * Operator factory for MapJoin processing.
 */
public final class MapJoinFactory {

  /**
   * Returns the position of the parent found on the stack among the parents of
   * the given mapjoin operator, i.e. which branch of the n-way join is
   * currently being walked.
   */
  public static int getPositionParent(AbstractMapJoinOperator<? extends MapJoinDesc> op,
      Stack<Node> stack) {
    int pos = 0;
    int size = stack.size();
    assert size >= 2 && stack.get(size - 1) == op;
    Operator<? extends OperatorDesc> parent =
        (Operator<? extends OperatorDesc>) stack.get(size - 2);
    List<Operator<? extends OperatorDesc>> parOp = op.getParentOperators();
    pos = parOp.indexOf(parent);
    assert pos < parOp.size();
    return pos;
  }

  /**
   * MapJoin processor.
   * The user can specify a mapjoin hint to indicate that the join should be processed as a
   * mapjoin instead of a map-reduce join. If hive.auto.convert.join is set to true, the
   * user need not specify the hint explicitly; Hive will automatically process a join as a
   * mapjoin whenever possible. However, a join can only be processed as a bucketized
   * map-side join or a sort-merge join if the user has provided the hint explicitly. This
   * will be fixed as part of HIVE-3433, and eventually we should remove support for the
   * mapjoin hint.
   * Currently, the mapjoin hint is processed as follows:
   * An n-way mapjoin has n parents, so the mapjoin operator is encountered n times (once
   * for each parent). Since a reduceSink operator is not allowed before a mapjoin, the
   * task for the mapjoin is always a root task. The task corresponding to the mapjoin is
   * converted to a root task when the operator is encountered for the first time. When the
   * operator is encountered subsequently, the current task is merged with the root task
   * for the mapjoin. Note that the mapjoin task may be performed as a bucketized map-side
   * join (or a sort-merge join); in that case, the mapjoin operator is enhanced to contain
   * the bucketing information when it is encountered.
   */
  private static class TableScanMapJoinProcessor implements NodeProcessor {
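    // Illustrative example (not part of the original source): the hint described in the
    // class comment above is written in the query itself, e.g.
    //   SELECT /*+ MAPJOIN(b) */ a.key, b.value FROM a JOIN b ON (a.key = b.key);
    // which asks Hive to load table b into memory and join it on the map side, so the
    // plan built by this processor needs no reduce phase. With hive.auto.convert.join
    // set to true, the hint can be omitted and Hive decides based on input sizes. The
    // table names a and b are hypothetical and used only for this sketch.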
    public static void setupBucketMapJoinInfo(MapWork plan,
        AbstractMapJoinOperator<? extends MapJoinDesc> currMapJoinOp) {
      if (currMapJoinOp != null) {
        Map<String, Map<String, List<String>>> aliasBucketFileNameMapping =
            currMapJoinOp.getConf().getAliasBucketFileNameMapping();
        if (aliasBucketFileNameMapping != null) {
          MapredLocalWork localPlan = plan.getMapRedLocalWork();
          if (localPlan == null) {
            if (currMapJoinOp instanceof SMBMapJoinOperator) {
              localPlan = ((SMBMapJoinOperator) currMapJoinOp).getConf().getLocalWork();
            }
          } else {
            // the local plan is not null; merge it into the SMBMapJoinOperator's local work
            if (currMapJoinOp instanceof SMBMapJoinOperator) {
              MapredLocalWork smbLocalWork = ((SMBMapJoinOperator) currMapJoinOp).getConf()
                  .getLocalWork();
              if (smbLocalWork != null) {
                localPlan.getAliasToFetchWork().putAll(smbLocalWork.getAliasToFetchWork());
                localPlan.getAliasToWork().putAll(smbLocalWork.getAliasToWork());
              }
            }
          }

          if (localPlan == null) {
            return;
          }

          if (currMapJoinOp instanceof SMBMapJoinOperator) {
            plan.setMapRedLocalWork(null);
            ((SMBMapJoinOperator) currMapJoinOp).getConf().setLocalWork(localPlan);
          } else {
            plan.setMapRedLocalWork(localPlan);
          }
          BucketMapJoinContext bucketMJCxt = new BucketMapJoinContext();
          localPlan.setBucketMapjoinContext(bucketMJCxt);
          bucketMJCxt.setAliasBucketFileNameMapping(aliasBucketFileNameMapping);
          bucketMJCxt.setBucketFileNameMapping(
              currMapJoinOp.getConf().getBigTableBucketNumMapping());
          localPlan.setInputFileChangeSensitive(true);
          bucketMJCxt.setMapJoinBigTableAlias(currMapJoinOp.getConf().getBigTableAlias());
          bucketMJCxt
              .setBucketMatcherClass(org.apache.hadoop.hive.ql.exec.DefaultBucketMatcher.class);
          bucketMJCxt.setBigTablePartSpecToFileMapping(
              currMapJoinOp.getConf().getBigTablePartSpecToFileMapping());

          // BucketizedHiveInputFormat should be used for either a sort-merge join or a
          // bucket map join
          if ((currMapJoinOp instanceof SMBMapJoinOperator)
              || (currMapJoinOp.getConf().isBucketMapJoin())) {
            plan.setUseBucketizedHiveInputFormat(true);
          }
        }
      }
    }

    /**
     * Initialize the current plan by adding it to the root tasks. Since a reduce sink
     * cannot be present before a mapjoin, and the mapjoin operator is being encountered
     * for the first time, the task corresponding to the mapjoin is added to the
     * root tasks.
     *
     * @param op
     *          the mapjoin operator encountered
     * @param currTask
     *          the current task
     * @param opProcCtx
     *          processing context
     * @param local
     *          whether the current branch is a small (local) table of the mapjoin
     */
    private static void initMapJoinPlan(AbstractMapJoinOperator<? extends MapJoinDesc> op,
        Task<? extends Serializable> currTask, GenMRProcContext opProcCtx, boolean local)
        throws SemanticException {
      // The map is overloaded to keep track of mapjoins also
      opProcCtx.getOpTaskMap().put(op, currTask);

      TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
      String currAliasId = opProcCtx.getCurrAliasId();
      GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, local, opProcCtx);
    }

    /**
     * Merge the current task with the task for the current mapjoin. The mapjoin operator
     * has already been encountered.
     *
     * @param oldTask
     *          the old task for the current mapjoin
     * @param opProcCtx
     *          processing context
     * @param local
     *          whether the current branch is a small (local) table of the mapjoin
     */
    private static void joinMapJoinPlan(Task<? extends Serializable> oldTask,
        GenMRProcContext opProcCtx, boolean local)
        throws SemanticException {
      TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
      GenMapRedUtils.mergeInput(currTopOp, opProcCtx, oldTask, local);
    }
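    // Hedged sketch of how this processor is typically registered (the rule name and
    // regular expression below are illustrative, following Hive's rule-dispatch
    // convention, and are not taken from this file):
    //
    //   Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
    //   opRules.put(new RuleRegExp("R1",
    //       TableScanOperator.getOperatorName() + "%.*"
    //       + MapJoinOperator.getOperatorName() + "%"),
    //       MapJoinFactory.getTableScanMapJoin());
    //   Dispatcher disp = new DefaultRuleDispatcher(defaultProc, opRules, procCtx);
    //
    // The graph walker then invokes process() below once per parent branch of the
    // mapjoin: the first invocation creates the root map-only task, and each later
    // invocation merges its branch into that same task.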
    /*
     * The mapjoin operator will be encountered many times (n times for an n-way join).
     * Since a reduceSink operator is not allowed before a mapjoin, the task for the mapjoin
     * will always be a root task. The task corresponding to the mapjoin is converted to a
     * root task when the operator is encountered for the first time. When the operator is
     * encountered subsequently, the current task is merged with the root task for the
     * mapjoin. Note that the mapjoin task may be performed as a bucketized map-side join
     * (or a sort-merge join); in that case, the mapjoin operator is enhanced to contain the
     * bucketing information when it is encountered.
     */
    @Override
    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
        Object... nodeOutputs) throws SemanticException {
      AbstractMapJoinOperator<MapJoinDesc> mapJoin = (AbstractMapJoinOperator<MapJoinDesc>) nd;

      GenMRProcContext ctx = (GenMRProcContext) procCtx;

      // find the branch on which this processor was invoked
      int pos = getPositionParent(mapJoin, stack);

      Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
      GenMapRedCtx mapredCtx = mapCurrCtx.get(mapJoin.getParentOperators().get(pos));
      Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
      MapredWork currPlan = (MapredWork) currTask.getWork();
      String currAliasId = mapredCtx.getCurrAliasId();
      HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap =
          ctx.getOpTaskMap();
      Task<? extends Serializable> oldTask = opTaskMap.get(mapJoin);

      ctx.setCurrAliasId(currAliasId);
      ctx.setCurrTask(currTask);

      // If we are seeing this mapjoin for the first time, initialize the plan.
      // If we are seeing this mapjoin for the second or later time, then at least one of
      // the branches for this mapjoin has already been encountered. Join the plan with the
      // plan created the first time.
      boolean local = pos != mapJoin.getConf().getPosBigTable();
      if (oldTask == null) {
        assert currPlan.getReduceWork() == null;
        initMapJoinPlan(mapJoin, currTask, ctx, local);
      } else {
        // The current plan can be thrown away after being merged with the original plan
        joinMapJoinPlan(oldTask, ctx, local);
        ctx.setCurrTask(currTask = oldTask);
      }
      MapredWork plan = (MapredWork) currTask.getWork();
      setupBucketMapJoinInfo(plan.getMapWork(), mapJoin);

      mapCurrCtx.put(mapJoin, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));

      // local aliases need not hand the context over any further
      return !local;
    }
  }

  public static NodeProcessor getTableScanMapJoin() {
    return new TableScanMapJoinProcessor();
  }

  private MapJoinFactory() {
    // prevent instantiation
  }
}