/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.parse;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Stack;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator;
import org.apache.hadoop.hive.ql.exec.DummyStoreOperator;
import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
import org.apache.hadoop.hive.ql.exec.RowSchema;
import org.apache.hadoop.hive.ql.lib.Node;
import org.apache.hadoop.hive.ql.lib.NodeProcessor;
import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
import org.apache.hadoop.hive.ql.optimizer.GenMapRedUtils;
import org.apache.hadoop.hive.ql.optimizer.ReduceSinkMapJoinProc;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.MergeJoinWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;
import org.apache.hadoop.hive.ql.plan.TezEdgeProperty;
import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType;
import org.apache.hadoop.hive.ql.plan.TezWork;
import org.apache.hadoop.hive.ql.plan.TezWork.VertexType;
import org.apache.hadoop.hive.ql.plan.UnionWork;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * GenTezWork separates the operator tree into Tez tasks.
 * It is called once per leaf operator (an operator that forces
 * a new execution unit) and breaks the operators into work
 * items and tasks along the way.
 */
public class GenTezWork implements NodeProcessor {

  private static final Logger LOG = LoggerFactory.getLogger(GenTezWork.class.getName());

  private final GenTezUtils utils;

  public GenTezWork(GenTezUtils utils) {
    this.utils = utils;
  }

  @Override
  public Object process(Node nd, Stack<Node> stack,
      NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenTezProcContext context = (GenTezProcContext) procContext;

    assert context != null && context.currentTask != null
        && context.currentRootOperator != null;
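    // Overview: this method (1) finds or creates the BaseWork for the current
    // root operator, (2) folds any pending merge-join or map-join state into
    // that work, (3) cuts the operator tree at the current root, and
    // (4) connects the new work to any downstream work that has already been
    // generated.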
    // Operator is a file sink or reduce sink. Something that forces
    // a new vertex.
    Operator<?> operator = (Operator<?>) nd;

    // root is the start of the operator pipeline we're currently
    // packing into a vertex, typically a table scan, union or join
    Operator<?> root = context.currentRootOperator;

    LOG.debug("Root operator: " + root);
    LOG.debug("Leaf operator: " + operator);

    if (context.clonedReduceSinks.contains(operator)) {
      // if we're visiting a terminal we've created ourselves,
      // just skip and keep going
      return null;
    }

    TezWork tezWork = context.currentTask.getWork();

    // Right now the work graph is pretty simple. If there is no
    // preceding work we have a root and will generate a map
    // vertex. If there is preceding work we will generate
    // a reduce vertex.
    BaseWork work;
    if (context.rootToWorkMap.containsKey(root)) {
      // Having seen the root operator before means there was a branch in the
      // operator graph. There are typically two reasons for that: a) mux/demux
      // b) multi insert. Mux/Demux will hit the same leaf again, multi insert
      // will result in a vertex with multiple FS or RS operators.
      if (context.childToWorkMap.containsKey(operator)) {
        // if we've seen both root and child, we can bail.

        // clear out the mapjoin set. we don't need it anymore.
        context.currentMapJoinOperators.clear();

        // clear out the union set. we don't need it anymore.
        context.currentUnionOperators.clear();

        return null;
      } else {
        // At this point we don't have to do anything special. Just
        // run through the regular paces w/o creating a new task.
        work = context.rootToWorkMap.get(root);
      }
    } else {
      // create a new vertex
      if (context.preceedingWork == null) {
        work = utils.createMapWork(context, root, tezWork, null);
      } else {
        work = GenTezUtils.createReduceWork(context, root, tezWork);
      }
      context.rootToWorkMap.put(root, work);
    }

    // this is where we set the sort columns that we will be using for KeyValueInputMerge
    if (operator instanceof DummyStoreOperator) {
      work.addSortCols(root.getOpTraits().getSortCols().get(0));
    }

    if (!context.childToWorkMap.containsKey(operator)) {
      List<BaseWork> workItems = new LinkedList<BaseWork>();
      workItems.add(work);
      context.childToWorkMap.put(operator, workItems);
    } else {
      context.childToWorkMap.get(operator).add(work);
    }

    // this transformation needs to be first because it changes the work item itself,
    // which can affect all downstream transformations.
    if (context.currentMergeJoinOperator != null) {
      // we are currently walking the big table side of the merge join. we need to create or hook up
      // merge join work.
      MergeJoinWork mergeJoinWork = null;
      if (context.opMergeJoinWorkMap.containsKey(context.currentMergeJoinOperator)) {
        // we have found a merge work corresponding to this closing operator. Hook up this work.
        mergeJoinWork = context.opMergeJoinWorkMap.get(context.currentMergeJoinOperator);
      } else {
        // we need to create the merge join work
        mergeJoinWork = new MergeJoinWork();
        mergeJoinWork.setMergeJoinOperator(context.currentMergeJoinOperator);
        tezWork.add(mergeJoinWork);
        context.opMergeJoinWorkMap.put(context.currentMergeJoinOperator, mergeJoinWork);
      }

      // connect the work correctly.
      work.addSortCols(root.getOpTraits().getSortCols().get(0));
      mergeJoinWork.addMergedWork(work, null, context.leafOperatorToFollowingWork);
      Operator<? extends OperatorDesc> parentOp =
          getParentFromStack(context.currentMergeJoinOperator, stack);
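      // The DAG is rewired below so that mergeJoinWork takes the place of
      // 'work':
      //
      //   parent(s) --> work --> child(ren)
      //
      // becomes
      //
      //   parent(s) --> mergeJoinWork --> child(ren)
      //
      // with 'work' removed from the DAG and carried inside mergeJoinWork
      // instead.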
      // Set the big table position. Both the reduce work and the merge join
      // operator should be set with the same value.
      // int pos = context.currentMergeJoinOperator.getTagForOperator(parentOp);
      int pos = context.currentMergeJoinOperator.getConf().getBigTablePosition();
      work.setTag(pos);
      context.currentMergeJoinOperator.getConf().setBigTablePosition(pos);
      tezWork.setVertexType(work, VertexType.MULTI_INPUT_UNINITIALIZED_EDGES);
      for (BaseWork parentWork : tezWork.getParents(work)) {
        TezEdgeProperty edgeProp = tezWork.getEdgeProperty(parentWork, work);
        tezWork.disconnect(parentWork, work);
        tezWork.connect(parentWork, mergeJoinWork, edgeProp);
      }

      for (BaseWork childWork : tezWork.getChildren(work)) {
        TezEdgeProperty edgeProp = tezWork.getEdgeProperty(work, childWork);
        tezWork.disconnect(work, childWork);
        tezWork.connect(mergeJoinWork, childWork, edgeProp);
      }

      tezWork.remove(work);
      context.rootToWorkMap.put(root, mergeJoinWork);
      context.childToWorkMap.get(operator).remove(work);
      context.childToWorkMap.get(operator).add(mergeJoinWork);
      work = mergeJoinWork;
      context.currentMergeJoinOperator = null;
    }

    // remember which mapjoin operator links with which work
    if (!context.currentMapJoinOperators.isEmpty()) {
      for (MapJoinOperator mj : context.currentMapJoinOperators) {
        // For dynamic partitioned hash join, the ReduceSinkMapJoinProc rule may not get run for
        // all of the ReduceSink parents, because the parents of the MapJoin operator get
        // removed later on in this method. Keep track of the parent-to-mapjoin mapping
        // so we can later run the same logic that is run in ReduceSinkMapJoinProc.
        if (mj.getConf().isDynamicPartitionHashJoin()) {
          // Since this is a dynamic partitioned hash join, the work for this join should be a ReduceWork
          ReduceWork reduceWork = (ReduceWork) work;
          int bigTablePosition = mj.getConf().getPosBigTable();
          reduceWork.setTag(bigTablePosition);

          // Use context.mapJoinParentMap to get the original RS parents, because
          // the MapJoin's parents may have been replaced by dummy operators.
          List<Operator<?>> mapJoinOriginalParents = context.mapJoinParentMap.get(mj);
          if (mapJoinOriginalParents == null) {
            throw new SemanticException(
                "Unexpected error - context.mapJoinParentMap did not have an entry for " + mj);
          }
          for (int pos = 0; pos < mapJoinOriginalParents.size(); ++pos) {
            // This processing only needs to happen for the small tables
            if (pos == bigTablePosition) {
              continue;
            }
            Operator<?> parentOp = mapJoinOriginalParents.get(pos);
            context.smallTableParentToMapJoinMap.put(parentOp, mj);

            ReduceSinkOperator parentRS = (ReduceSinkOperator) parentOp;

            // TableDesc needed for dynamic partitioned hash join
            GenMapRedUtils.setKeyAndValueDesc(reduceWork, parentRS);

            // For small table RS parents that have already been processed, we need to
            // register the tag for the parent RS work in the reduce work that contains
            // this map join. This was not being done for normal mapjoins, where the small
            // table typically has its ReduceSink parent removed.
            if (!context.mapJoinToUnprocessedSmallTableReduceSinks.get(mj).contains(parentRS)) {
              // This reduce sink has been processed already, so the work for the parentRS exists
              BaseWork parentWork = ReduceSinkMapJoinProc.getMapJoinParentWork(context, parentRS);
              int tag = parentRS.getConf().getTag();
              tag = (tag == -1 ? 0 : tag);
              reduceWork.getTagToInput().put(tag, parentWork.getName());
            }
          }
        }
        LOG.debug("Processing map join: " + mj);
        // remember the mapping in case we scan another branch of the
        // mapjoin later
        if (!context.mapJoinWorkMap.containsKey(mj)) {
          List<BaseWork> workItems = new LinkedList<BaseWork>();
          workItems.add(work);
          context.mapJoinWorkMap.put(mj, workItems);
        } else {
          context.mapJoinWorkMap.get(mj).add(work);
        }

        /*
         * this happens in case of map join operations.
         * The tree looks like this:
         *
         *        RS <--- we are here perhaps
         *        |
         *     MapJoin
         *     /     \
         *   RS       TS
         *  /
         * TS
         *
         * If we are at the RS marked above and have already visited the
         * RS following the TS, then work has already been generated for the
         * TS-RS branch. We need to hook the current work up to this generated work.
         */
        if (context.linkOpWithWorkMap.containsKey(mj)) {
          Map<BaseWork, TezEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
          if (linkWorkMap != null) {
            // Note: it's not quite clear why this is done inside this if. Seems like it should be on the top level.
            if (context.linkChildOpWithDummyOp.containsKey(mj)) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Adding dummy ops to work: " + work.getName() + ": "
                    + context.linkChildOpWithDummyOp.get(mj));
              }
              for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
                work.addDummyOp((HashTableDummyOperator) dummy);
              }
            }
            for (Entry<BaseWork, TezEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
              BaseWork parentWork = parentWorkMap.getKey();
              LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
              TezEdgeProperty edgeProp = parentWorkMap.getValue();
              tezWork.connect(parentWork, work, edgeProp);
              if (edgeProp.getEdgeType() == EdgeType.CUSTOM_EDGE) {
                tezWork.setVertexType(work, VertexType.INITIALIZED_EDGES);
              }

              // need to set up output name for reduce sink now that we know the name
              // of the downstream work
              for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
                if (!context.mapJoinParentMap.get(mj).contains(r)) {
                  // We might be visiting twice because of reuse of intermediate results.
                  // If that is the case, we do not need to do anything because either we have
                  // already connected this RS operator or we will connect it in a subsequent pass.
                  continue;
                }
                if (r.getConf().getOutputName() != null) {
                  LOG.debug("Cloning reduce sink " + r + " for multi-child broadcast edge");
                  // we've already set this one up. Need to clone for the next work.
                  r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(
                      r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(),
                      new RowSchema(r.getSchema()), r.getParentOperators());
                  context.clonedReduceSinks.add(r);
                }
                r.getConf().setOutputName(work.getName());
                context.connectedReduceSinks.add(r);
              }
            }
          }
        }
      }

      // clear out the set. we don't need it anymore.
      context.currentMapJoinOperators.clear();
    }

    // This is where we cut the tree as described above. We also remember that
    // we might have to connect parent work with this work later.
    for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Removing " + parent + " as parent from " + root);
      }
      context.leafOperatorToFollowingWork.remove(parent);
      context.leafOperatorToFollowingWork.put(parent, work);
      root.removeParent(parent);
    }
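    // Everything above 'root' now belongs to the preceding work. The detached
    // parents are remembered in leafOperatorToFollowingWork so that, once the
    // following work is known, the edge between the two vertices can still be
    // created (see the leafOperatorToFollowingWork handling further below).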
    if (!context.currentUnionOperators.isEmpty()) {
      // if there are union all operators, the walk has passed through a union
      // branch; please see context.currentUnionOperators in GenTezWorkWalker
      // for more details
      UnionWork unionWork;
      if (context.unionWorkMap.containsKey(operator)) {
        // we've seen this terminal before and have created a union work object.
        // just need to add this work to it. There will be no children of this one
        // since we've passed this operator before.
        assert operator.getChildOperators().isEmpty();
        unionWork = (UnionWork) context.unionWorkMap.get(operator);
        // finally connect the union work with work
        connectUnionWorkWithWork(unionWork, work, tezWork, context);
      } else {
        // we've not seen this terminal before. we need to check
        // rootUnionWorkMap, which maps the root operator of a union work
        // to that union work
        unionWork = context.rootUnionWorkMap.get(root);
        if (unionWork == null) {
          // if unionWork is null, it means this is the first time. we need to
          // create a union work object and add this work to it. Subsequent
          // work should reference the union and not the actual work.
          unionWork = GenTezUtils.createUnionWork(context, root, operator, tezWork);
          // finally connect the union work with work
          connectUnionWorkWithWork(unionWork, work, tezWork, context);
        }
      }
      context.currentUnionOperators.clear();
      work = unionWork;
    }

    // We're scanning a tree from roots to leaf (this is not technically
    // correct, demux and mux operators might form a diamond shape, but
    // we will only scan one path and ignore the others, because the
    // diamond shape is always contained in a single vertex). The scan
    // is depth first and because we remove parents when we pack a pipeline
    // into a vertex we will never visit any node twice. But because of that
    // we might have a situation where we need to connect 'work' that comes after
    // the 'work' we're currently looking at.
    //
    // Also note: the concept of leaf and root is reversed in hive for historical
    // reasons. Roots are data sources, leaves are data sinks. I know.
    if (context.leafOperatorToFollowingWork.containsKey(operator)) {

      BaseWork followingWork = context.leafOperatorToFollowingWork.get(operator);
      long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);

      LOG.debug("Second pass. Leaf operator: " + operator
          + " has common downstream work: " + followingWork);

      if (operator instanceof DummyStoreOperator) {
        // this is the small table side.
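        // A DummyStoreOperator marks the end of the small-table pipeline of a
        // sorted merge join. Its work is merged into the MergeJoinWork of the
        // big-table side and tagged, so the merge join operator can tell its
        // inputs apart.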
        assert (followingWork instanceof MergeJoinWork);
        MergeJoinWork mergeJoinWork = (MergeJoinWork) followingWork;
        CommonMergeJoinOperator mergeJoinOp = mergeJoinWork.getMergeJoinOperator();
        work.setTag(mergeJoinOp.getTagForOperator(operator));
        mergeJoinWork.addMergedWork(null, work, context.leafOperatorToFollowingWork);
        tezWork.setVertexType(mergeJoinWork, VertexType.MULTI_INPUT_UNINITIALIZED_EDGES);
        for (BaseWork parentWork : tezWork.getParents(work)) {
          TezEdgeProperty edgeProp = tezWork.getEdgeProperty(parentWork, work);
          tezWork.disconnect(parentWork, work);
          tezWork.connect(parentWork, mergeJoinWork, edgeProp);
        }
        work = mergeJoinWork;
      } else {
        // need to add this branch to the key + value info
        assert operator instanceof ReduceSinkOperator
            && ((followingWork instanceof ReduceWork)
                || (followingWork instanceof MergeJoinWork)
                || followingWork instanceof UnionWork);
        ReduceSinkOperator rs = (ReduceSinkOperator) operator;
        ReduceWork rWork = null;
        if (followingWork instanceof MergeJoinWork) {
          MergeJoinWork mergeJoinWork = (MergeJoinWork) followingWork;
          rWork = (ReduceWork) mergeJoinWork.getMainWork();
        } else if (followingWork instanceof UnionWork) {
          // this can only be possible if there is merge work followed by the union
          UnionWork unionWork = (UnionWork) followingWork;
          int index = getFollowingWorkIndex(tezWork, unionWork, rs);
          BaseWork baseWork = tezWork.getChildren(unionWork).get(index);
          if (baseWork instanceof MergeJoinWork) {
            MergeJoinWork mergeJoinWork = (MergeJoinWork) baseWork;
            // disconnect the connection to union work and connect to merge work
            followingWork = mergeJoinWork;
            rWork = (ReduceWork) mergeJoinWork.getMainWork();
          } else {
            rWork = (ReduceWork) baseWork;
          }
        } else {
          rWork = (ReduceWork) followingWork;
        }
        GenMapRedUtils.setKeyAndValueDesc(rWork, rs);

        // remember which parent belongs to which tag
        int tag = rs.getConf().getTag();
        rWork.getTagToInput().put(tag == -1 ? 0 : tag, work.getName());

        // remember the output name of the reduce sink
        rs.getConf().setOutputName(rWork.getName());

        // For dynamic partitioned hash join, run the ReduceSinkMapJoinProc logic for any
        // ReduceSink parents that we missed.
        MapJoinOperator mj = context.smallTableParentToMapJoinMap.get(rs);
        if (mj != null) {
          // Only need to run the logic for tables we missed
          if (context.mapJoinToUnprocessedSmallTableReduceSinks.get(mj).contains(rs)) {
            // ReduceSinkMapJoinProc logic does not work unless the ReduceSink is connected as
            // a parent of the MapJoin, but at this point we have already removed all of the
            // parents from the MapJoin.
            // Try temporarily adding the RS as a parent
            ArrayList<Operator<?>> tempMJParents = new ArrayList<Operator<?>>();
            tempMJParents.add(rs);
            mj.setParentOperators(tempMJParents);
            // ReduceSink also needs the MapJoin as a child
            List<Operator<?>> rsChildren = rs.getChildOperators();
            rsChildren.add(mj);

            // Since the MapJoin has had all of its other parents removed at this point,
            // it would be bad here if processReduceSinkToHashJoin() tried to do anything
            // with the RS parent based on its position in the list of parents.
            ReduceSinkMapJoinProc.processReduceSinkToHashJoin(rs, mj, context);

            // Remove any parents from the MapJoin again
            mj.removeParents();
            // TODO: do we also need to remove the MapJoin from the list of RS's children?
          }
        }
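        // Finally, create the edge between this work and the following work.
        // When the reduce work has auto reduce parallelism enabled, the edge
        // carries the min/max reducer counts and bytes-per-reducer so Tez can
        // adjust parallelism at runtime.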
        if (!context.connectedReduceSinks.contains(rs)) {
          // add dependency between the two work items
          TezEdgeProperty edgeProp;
          EdgeType edgeType = GenTezUtils.determineEdgeType(work, followingWork, rs);
          if (rWork.isAutoReduceParallelism()) {
            edgeProp = new TezEdgeProperty(context.conf, edgeType, true, rWork.isSlowStart(),
                rWork.getMinReduceTasks(), rWork.getMaxReduceTasks(), bytesPerReducer);
          } else {
            edgeProp = new TezEdgeProperty(edgeType);
            edgeProp.setSlowStart(rWork.isSlowStart());
          }
          tezWork.connect(work, followingWork, edgeProp);
          context.connectedReduceSinks.add(rs);
        }
      }
    } else {
      LOG.debug("First pass. Leaf operator: " + operator);
    }

    // No children means we're at the bottom. If there are more operators to scan
    // the next item will be a new root.
    if (!operator.getChildOperators().isEmpty()) {
      assert operator.getChildOperators().size() == 1;
      context.parentOfRoot = operator;
      context.currentRootOperator = operator.getChildOperators().get(0);
      context.preceedingWork = work;
    }
    return null;
  }

  private int getFollowingWorkIndex(TezWork tezWork, UnionWork unionWork, ReduceSinkOperator rs)
      throws SemanticException {
    int index = 0;
    for (BaseWork baseWork : tezWork.getChildren(unionWork)) {
      TezEdgeProperty edgeProperty = tezWork.getEdgeProperty(unionWork, baseWork);
      if (edgeProperty.getEdgeType() != TezEdgeProperty.EdgeType.CONTAINS) {
        return index;
      }
      index++;
    }
    throw new SemanticException("Following work not found for the reduce sink: " + rs.getName());
  }

  @SuppressWarnings("unchecked")
  private Operator<? extends OperatorDesc> getParentFromStack(Node currentMergeJoinOperator,
      Stack<Node> stack) {
    int pos = stack.indexOf(currentMergeJoinOperator);
    return (Operator<? extends OperatorDesc>) stack.get(pos - 1);
  }

  private void connectUnionWorkWithWork(UnionWork unionWork, BaseWork work, TezWork tezWork,
      GenTezProcContext context) {
    LOG.debug("Connecting union work (" + unionWork + ") with work (" + work + ")");
    TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.CONTAINS);
    tezWork.connect(unionWork, work, edgeProp);
    unionWork.addUnionOperators(context.currentUnionOperators);
    context.workWithUnionOperators.add(work);
  }
}