/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hive.ql.plan;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.tez.DagUtils;
import org.apache.hadoop.hive.ql.plan.Explain.Level;
import org.apache.hadoop.hive.ql.plan.Explain.Vectorization;
import org.apache.hadoop.hive.ql.plan.TezEdgeProperty.EdgeType;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * TezWork. This class encapsulates all the work objects that can be executed
 * in a single tez job. Currently it's basically a tree with MapWork at the
 * leaves and ReduceWork in all other nodes.
 */
@SuppressWarnings("serial")
@Explain(displayName = "Tez", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED },
    vectorization = Vectorization.SUMMARY_PATH)
public class TezWork extends AbstractOperatorDesc {

  public enum VertexType {
    AUTO_INITIALIZED_EDGES, // no custom vertex or edge
    INITIALIZED_EDGES, // custom vertex and custom edge but single MR Input
    MULTI_INPUT_INITIALIZED_EDGES, // custom vertex, custom edge and multi MR Input
    MULTI_INPUT_UNINITIALIZED_EDGES // custom vertex, no custom edge, multi MR Input
    ;

    public static boolean isCustomInputType(VertexType vertex) {
      if ((vertex == null) || (vertex == AUTO_INITIALIZED_EDGES)) {
        return false;
      } else {
        return true;
      }
    }
  }

  private static transient final Logger LOG = LoggerFactory.getLogger(TezWork.class);

  private static final AtomicInteger counter = new AtomicInteger(1);
  private final String dagId;
  private final String queryName;
  private final Set<BaseWork> roots = new HashSet<BaseWork>();
  private final Set<BaseWork> leaves = new HashSet<BaseWork>();
  private final Map<BaseWork, List<BaseWork>> workGraph =
      new HashMap<BaseWork, List<BaseWork>>();
  private final Map<BaseWork, List<BaseWork>> invertedWorkGraph =
      new HashMap<BaseWork, List<BaseWork>>();
  private final Map<Pair<BaseWork, BaseWork>, TezEdgeProperty> edgeProperties =
      new HashMap<Pair<BaseWork, BaseWork>, TezEdgeProperty>();
  private final Map<BaseWork, VertexType> workVertexTypeMap =
      new HashMap<BaseWork, VertexType>();

  public TezWork(String queryId) {
    this(queryId, null);
  }

  public TezWork(String queryId, Configuration conf) {
    this.dagId = queryId + ":" + counter.getAndIncrement();
    String queryName = (conf != null) ? DagUtils.getUserSpecifiedDagName(conf) : null;
    if (queryName == null) {
      queryName = this.dagId;
    }
    this.queryName = queryName;
  }
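  /*
   * Illustrative sketch (not part of the original source): how the Tez compiler
   * might assemble a simple map-reduce shaped DAG with this class. The
   * "mapWork"/"reduceWork" variables and the single-argument TezEdgeProperty
   * constructor are assumptions made for the example.
   *
   *   TezWork tezWork = new TezWork(queryId, conf);
   *   tezWork.add(mapWork);
   *   tezWork.add(reduceWork);
   *   tezWork.connect(mapWork, reduceWork,
   *       new TezEdgeProperty(EdgeType.SIMPLE_EDGE));
   *   // mapWork is now the only root, reduceWork the only leaf.
   */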
  @Explain(displayName = "DagName")
  public String getName() {
    return queryName;
  }

  @Explain(displayName = "DagId")
  public String getDagId() {
    return dagId;
  }

  /**
   * getWorkMap returns a map of "vertex name" to BaseWork
   */
  @Explain(displayName = "Vertices", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED },
      vectorization = Vectorization.SUMMARY_PATH)
  public Map<String, BaseWork> getWorkMap() {
    Map<String, BaseWork> result = new LinkedHashMap<String, BaseWork>();
    for (BaseWork w: getAllWork()) {
      result.put(w.getName(), w);
    }
    return result;
  }

  /**
   * getAllWork returns a topologically sorted list of BaseWork
   */
  public List<BaseWork> getAllWork() {
    List<BaseWork> result = new LinkedList<BaseWork>();
    Set<BaseWork> seen = new HashSet<BaseWork>();
    for (BaseWork leaf: leaves) {
      // make sure all leaves are visited at least once
      visit(leaf, seen, result);
    }
    return result;
  }

  public Collection<BaseWork> getAllWorkUnsorted() {
    return workGraph.keySet();
  }

  private void visit(BaseWork child, Set<BaseWork> seen, List<BaseWork> result) {
    if (seen.contains(child)) {
      // don't visit multiple times
      return;
    }

    seen.add(child);

    for (BaseWork parent: getParents(child)) {
      if (!seen.contains(parent)) {
        visit(parent, seen, result);
      }
    }

    result.add(child);
  }

  /**
   * add all nodes in the collection without any connections
   */
  public void addAll(Collection<BaseWork> c) {
    for (BaseWork w: c) {
      this.add(w);
    }
  }

  /**
   * add all nodes in the collection without any connections
   */
  public void addAll(BaseWork[] bws) {
    for (BaseWork w: bws) {
      this.add(w);
    }
  }

  /**
   * add creates a new node in the graph without any connections
   */
  public void add(BaseWork w) {
    if (workGraph.containsKey(w)) {
      return;
    }
    workGraph.put(w, new LinkedList<BaseWork>());
    invertedWorkGraph.put(w, new LinkedList<BaseWork>());
    roots.add(w);
    leaves.add(w);
  }

  /**
   * disconnect removes an edge between a and b. Both a and
   * b have to be in the graph. If there is no matching edge
   * no change happens.
   */
  public void disconnect(BaseWork a, BaseWork b) {
    workGraph.get(a).remove(b);
    invertedWorkGraph.get(b).remove(a);
    if (getParents(b).isEmpty()) {
      roots.add(b);
    }
    if (getChildren(a).isEmpty()) {
      leaves.add(a);
    }
  }

  /**
   * getRoots returns all nodes that do not have a parent.
   */
  public Set<BaseWork> getRoots() {
    return new HashSet<BaseWork>(roots);
  }

  /**
   * getLeaves returns all nodes that do not have a child
   */
  public Set<BaseWork> getLeaves() {
    return new HashSet<BaseWork>(leaves);
  }

  /**
   * getParents returns all the nodes with edges leading into work
   */
  public List<BaseWork> getParents(BaseWork work) {
    assert invertedWorkGraph.containsKey(work) && invertedWorkGraph.get(work) != null;
    return new LinkedList<BaseWork>(invertedWorkGraph.get(work));
  }

  /**
   * getChildren returns all the nodes with edges leading out of work
   */
  public List<BaseWork> getChildren(BaseWork work) {
    assert workGraph.containsKey(work) && workGraph.get(work) != null;
    return new LinkedList<BaseWork>(workGraph.get(work));
  }
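  /*
   * Illustrative note (not part of the original source): getAllWork() emits
   * parents before children, so for the two-vertex DAG sketched above it would
   * return [mapWork, reduceWork]. Disconnecting that edge again puts reduceWork
   * back into the root set and mapWork back into the leaf set:
   *
   *   tezWork.disconnect(mapWork, reduceWork);
   *   assert tezWork.getRoots().containsAll(Arrays.asList(mapWork, reduceWork));
   *   assert tezWork.getLeaves().containsAll(Arrays.asList(mapWork, reduceWork));
   */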
  /**
   * remove removes a node from the graph and removes all edges with
   * work as start or end point. No change to the graph if the node
   * doesn't exist.
   */
  public void remove(BaseWork work) {
    if (!workGraph.containsKey(work)) {
      return;
    }

    List<BaseWork> children = getChildren(work);
    List<BaseWork> parents = getParents(work);

    for (BaseWork w: children) {
      invertedWorkGraph.get(w).remove(work);
      if (invertedWorkGraph.get(w).size() == 0) {
        roots.add(w);
      }
    }

    for (BaseWork w: parents) {
      workGraph.get(w).remove(work);
      if (workGraph.get(w).size() == 0) {
        leaves.add(w);
      }
    }

    roots.remove(work);
    leaves.remove(work);
    workGraph.remove(work);
    invertedWorkGraph.remove(work);
  }

  /**
   * returns the edge type connecting work a and b
   */
  public EdgeType getEdgeType(BaseWork a, BaseWork b) {
    return edgeProperties.get(new ImmutablePair<BaseWork, BaseWork>(a, b)).getEdgeType();
  }

  /**
   * returns the edge property connecting work a and b
   */
  public TezEdgeProperty getEdgeProperty(BaseWork a, BaseWork b) {
    return edgeProperties.get(new ImmutablePair<BaseWork, BaseWork>(a, b));
  }

  /*
   * Dependency is a class used for explain
   */
  public class Dependency implements Serializable, Comparable<Dependency> {
    public BaseWork w;
    public EdgeType type;

    @Explain(displayName = "Name")
    public String getName() {
      return w.getName();
    }

    @Explain(displayName = "Type")
    public String getType() {
      return type.toString();
    }

    @Override
    public int compareTo(Dependency o) {
      int compare = getName().compareTo(o.getName());
      if (compare == 0) {
        compare = getType().compareTo(o.getType());
      }
      return compare;
    }
  }

  @Explain(displayName = "Edges", explainLevels = { Level.USER, Level.DEFAULT, Level.EXTENDED },
      vectorization = Vectorization.SUMMARY_PATH)
  public Map<String, List<Dependency>> getDependencyMap() {
    Map<String, List<Dependency>> result = new LinkedHashMap<String, List<Dependency>>();
    for (Map.Entry<BaseWork, List<BaseWork>> entry: invertedWorkGraph.entrySet()) {
      List<Dependency> dependencies = new LinkedList<Dependency>();
      for (BaseWork d: entry.getValue()) {
        Dependency dependency = new Dependency();
        dependency.w = d;
        dependency.type = getEdgeType(d, entry.getKey());
        dependencies.add(dependency);
      }
      if (!dependencies.isEmpty()) {
        Collections.sort(dependencies);
        result.put(entry.getKey().getName(), dependencies);
      }
    }
    return result;
  }
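  /*
   * Illustrative note (not part of the original source): getDependencyMap()
   * keys the result by the downstream vertex name, listing the upstream
   * vertices it depends on. For the DAG sketched above, the "Edges" section
   * of an explain plan would contain roughly:
   *
   *   { "Reducer 2" : [ { Name: "Map 1", Type: "SIMPLE_EDGE" } ] }
   *
   * The vertex names here are assumptions; they come from BaseWork.getName().
   */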
  private static final String MR_JAR_PROPERTY = "tmpjars";

  /**
   * Calls configureJobConf on instances of work that are part of this TezWork.
   * Uses the passed job configuration to extract the "tmpjars" they add, so that
   * Tez can add them to the job in the proper Tez way. This is a very hacky way
   * but currently there's no good way to get these JARs - both the storage handler
   * interface and the HBase code would have to change to get the list directly
   * (right now it adds to tmpjars). This will happen in 0.14 hopefully.
   * @param jobConf Job configuration.
   * @return List of files added to tmpjars by storage handlers.
   */
  public String[] configureJobConfAndExtractJars(JobConf jobConf) {
    String[] oldTmpJars = jobConf.getStrings(MR_JAR_PROPERTY);
    jobConf.setStrings(MR_JAR_PROPERTY, new String[0]);
    for (BaseWork work : workGraph.keySet()) {
      work.configureJobConf(jobConf);
    }
    String[] newTmpJars = jobConf.getStrings(MR_JAR_PROPERTY);
    if (oldTmpJars != null || newTmpJars != null) {
      String[] finalTmpJars;
      if (oldTmpJars == null || oldTmpJars.length == 0) {
        // Avoid a copy when oldTmpJars is null or empty
        finalTmpJars = newTmpJars;
      } else if (newTmpJars == null || newTmpJars.length == 0) {
        // Avoid a copy when newTmpJars is null or empty
        finalTmpJars = oldTmpJars;
      } else {
        // Both are non-empty, only copy now
        finalTmpJars = new String[oldTmpJars.length + newTmpJars.length];
        System.arraycopy(oldTmpJars, 0, finalTmpJars, 0, oldTmpJars.length);
        System.arraycopy(newTmpJars, 0, finalTmpJars, oldTmpJars.length, newTmpJars.length);
      }
      jobConf.setStrings(MR_JAR_PROPERTY, finalTmpJars);
      return finalTmpJars;
    }
    return newTmpJars;
  }

  /**
   * connect adds an edge between a and b. Both nodes have
   * to be added prior to calling connect.
   * @param a the parent (upstream) work
   * @param b the child (downstream) work
   * @param edgeProp the property describing the edge between a and b
   */
  public void connect(BaseWork a, BaseWork b, TezEdgeProperty edgeProp) {
    workGraph.get(a).add(b);
    invertedWorkGraph.get(b).add(a);
    roots.remove(b);
    leaves.remove(a);
    ImmutablePair<BaseWork, BaseWork> workPair = new ImmutablePair<BaseWork, BaseWork>(a, b);
    edgeProperties.put(workPair, edgeProp);
  }

  public void setVertexType(BaseWork w, VertexType incomingVertexType) {
    VertexType vertexType = workVertexTypeMap.get(w);
    if (vertexType == null) {
      vertexType = VertexType.AUTO_INITIALIZED_EDGES;
    }
    switch (vertexType) {
    case INITIALIZED_EDGES:
      if (incomingVertexType == VertexType.MULTI_INPUT_UNINITIALIZED_EDGES) {
        vertexType = VertexType.MULTI_INPUT_INITIALIZED_EDGES;
      }
      break;

    case MULTI_INPUT_INITIALIZED_EDGES:
      // nothing to do
      break;

    case MULTI_INPUT_UNINITIALIZED_EDGES:
      if (incomingVertexType == VertexType.INITIALIZED_EDGES) {
        vertexType = VertexType.MULTI_INPUT_INITIALIZED_EDGES;
      }
      break;

    case AUTO_INITIALIZED_EDGES:
      vertexType = incomingVertexType;
      break;

    default:
      break;
    }
    workVertexTypeMap.put(w, vertexType);
  }

  public VertexType getVertexType(BaseWork w) {
    return workVertexTypeMap.get(w);
  }

  public boolean getLlapMode() {
    for (BaseWork work : getAllWork()) {
      if (work.getLlapMode()) {
        return true;
      }
    }
    return false;
  }
}
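/*
 * Illustrative sketch (not part of the original source): how a caller on the
 * Tez execution path might use configureJobConfAndExtractJars to pick up
 * storage-handler jars before building the DAG. The surrounding variables
 * (conf, tezWork) are assumptions made for the example.
 *
 *   JobConf jobConf = new JobConf(conf);
 *   String[] handlerJars = tezWork.configureJobConfAndExtractJars(jobConf);
 *   // handlerJars now holds whatever the individual BaseWork objects appended
 *   // to "tmpjars"; these can then be localized as DAG-level resources.
 */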