/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.spark;

import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collection;
import com.google.common.base.Preconditions;
import org.apache.commons.io.FilenameUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.spark.session.SparkSession;
import org.apache.hadoop.hive.ql.exec.spark.session.SparkSessionManager;
import org.apache.hadoop.hive.ql.io.HiveKey;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.plan.BaseWork;
import org.apache.hadoop.hive.ql.plan.SparkWork;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hive.spark.client.SparkClientUtilities;
import org.apache.spark.Dependency;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.rdd.RDD;
import org.apache.spark.rdd.UnionRDD;
import scala.collection.JavaConversions;

/**
 * Contains utility methods used as part of Spark tasks.
*/
public class SparkUtilities {
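
  /**
   * Returns a copy of the given HiveKey. The framework reuses Writable
   * instances, so a key must be copied before it is retained across iterations.
   */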
public static HiveKey copyHiveKey(HiveKey key) {
HiveKey copy = new HiveKey();
copy.setDistKeyLength(key.getDistKeyLength());
copy.setHashCode(key.hashCode());
copy.set(key);
return copy;
}
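
  /**
   * Returns a copy of the given BytesWritable, for the same Writable-reuse
   * reason as {@link #copyHiveKey(HiveKey)}.
   */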
public static BytesWritable copyBytesWritable(BytesWritable bw) {
BytesWritable copy = new BytesWritable();
copy.set(bw);
return copy;
}

  /**
   * Uploads a local file to HDFS, into the current Spark session's HDFS
   * session directory.
   *
   * @param source URI of the local file to upload
   * @param conf configuration used to resolve the destination FileSystem
   * @return the fully qualified URI of the uploaded file
   * @throws IOException if the copy fails
   */
public static URI uploadToHDFS(URI source, HiveConf conf) throws IOException {
Path localFile = new Path(source.getPath());
Path remoteFile = new Path(SessionState.get().getSparkSession().getHDFSSessionDir(),
getFileName(source));
FileSystem fileSystem = FileSystem.get(remoteFile.toUri(), conf);
    // Overwrite if the remote file already exists. Whether an already-distributed
    // file can be replaced on the executors is up to Spark (spark.files.overwrite).
fileSystem.copyFromLocalFile(false, true, localFile, remoteFile);
Path fullPath = fileSystem.getFileStatus(remoteFile).getPath();
return fullPath.toUri();
}

  /**
   * Checks whether a resource has to be uploaded to HDFS before it can be
   * used in yarn-cluster mode, i.e. the resource is local (not already on
   * hdfs or viewfs) and therefore not visible to the remote driver.
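   *
   * A typical caller looks like this (a sketch only; the variable names are
   * illustrative, not taken from this codebase):
   * <pre>{@code
   * URI source = new File("/tmp/my-udf.jar").toURI();
   * if (SparkUtilities.needUploadToHDFS(source, sparkConf)) {
   *   source = SparkUtilities.uploadToHDFS(source, hiveConf);
   * }
   * }</pre>
   */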
public static boolean needUploadToHDFS(URI source, SparkConf sparkConf) {
String master = sparkConf.get("spark.master");
String deployMode = sparkConf.contains("spark.submit.deployMode") ?
sparkConf.get("spark.submit.deployMode") : null;
    // use constant-first equals so a URI without a scheme does not cause an NPE
    String scheme = source.getScheme();
    return SparkClientUtilities.isYarnClusterMode(master, deployMode) &&
        !("hdfs".equals(scheme) || "viewfs".equals(scheme));
}

  private static String getFileName(URI uri) {
if (uri == null) {
return null;
}
    return FilenameUtils.getName(uri.getPath());
}
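
  /**
   * Returns true if the Spark cluster is dedicated to this client, i.e. the
   * master is yarn or local (as opposed to a shared standalone cluster).
   */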
public static boolean isDedicatedCluster(Configuration conf) {
String master = conf.get("spark.master");
return SparkClientUtilities.isYarnMaster(master) || SparkClientUtilities.isLocalMaster(master);
}
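
  /**
   * Returns the current session's SparkSession, opening one through the given
   * manager if needed. If the Spark configuration has been updated since the
   * session was opened, the existing session is closed first so that a new one
   * picks up the changes.
   */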
public static SparkSession getSparkSession(HiveConf conf,
SparkSessionManager sparkSessionManager) throws HiveException {
SparkSession sparkSession = SessionState.get().getSparkSession();
HiveConf sessionConf = SessionState.get().getConf();
    // If the Spark configuration has been updated, close the existing session
    // to pick up the changes. For async queries, or when confOverlay is not
    // empty, sessionConf and conf are different objects, so check both.
if (sessionConf.getSparkConfigUpdated() || conf.getSparkConfigUpdated()) {
sparkSessionManager.closeSession(sparkSession);
sparkSession = null;
conf.setSparkConfigUpdated(false);
sessionConf.setSparkConfigUpdated(false);
}
sparkSession = sparkSessionManager.getSession(sparkSession, conf, true);
SessionState.get().setSparkSession(sparkSession);
return sparkSession;
}
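
  /**
   * Renders the dependency graph of the given RDD as an indented, multi-line
   * string (one line per RDD, parents indented under the RDDs that depend on
   * them), mainly for logging and debugging.
   */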
public static String rddGraphToString(JavaPairRDD rdd) {
StringBuilder sb = new StringBuilder();
rddToString(rdd.rdd(), sb, "");
return sb.toString();
}

  private static void rddToString(RDD rdd, StringBuilder sb, String offset) {
sb.append(offset).append(rdd.getClass().getCanonicalName()).append("[").append(rdd.hashCode()).append("]");
if (rdd.getStorageLevel().useMemory()) {
sb.append("(cached)");
}
sb.append("\n");
Collection<Dependency> dependencies = JavaConversions.asJavaCollection(rdd.dependencies());
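    // RDD#dependencies() returns an empty Seq rather than null, so in practice
    // this branch is always taken; the UnionRDD case below is only a defensive
    // fallback (a UnionRDD's parents are also reachable through dependencies()).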
if (dependencies != null) {
offset += "\t";
for (Dependency dependency : dependencies) {
RDD parentRdd = dependency.rdd();
rddToString(parentRdd, sb, offset);
}
} else if (rdd instanceof UnionRDD) {
UnionRDD unionRDD = (UnionRDD) rdd;
offset += "\t";
Collection<RDD> parentRdds = JavaConversions.asJavaCollection(unionRDD.rdds());
for (RDD parentRdd : parentRdds) {
rddToString(parentRdd, sb, offset);
}
}
}

  /**
   * Generates a temporary path for dynamic partition pruning in the Spark branch.
   * TODO: no longer needed once we use an accumulator!
   * @param basePath base directory under which the temporary path is created
   * @param id unique identifier, used as the name of the subdirectory
   * @return the temporary path basePath/id
   */
public static Path generateTmpPathForPartitionPruning(Path basePath, String id) {
return new Path(basePath, id);
}

  /**
* Return the ID for this BaseWork, in String form.
* @param work the input BaseWork
* @return the unique ID for this BaseWork
*/
public static String getWorkId(BaseWork work) {
String workName = work.getName();
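    // BaseWork names have the form "<type> <id>", e.g. "Map 1"; the ID is
    // everything after the first space.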
return workName.substring(workName.indexOf(" ") + 1);
}
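
  /**
   * Creates a new SparkTask wrapping an empty SparkWork named after the
   * current query ID.
   */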
public static SparkTask createSparkTask(HiveConf conf) {
return (SparkTask) TaskFactory.get(
new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID)), conf);
}

  public static SparkTask createSparkTask(SparkWork work, HiveConf conf) {
return (SparkTask) TaskFactory.get(work, conf);
}

  /**
   * Recursively finds all operators under root that are of class clazz and
   * puts them in result.
   * @param result collects all operators under root that are of class clazz
   * @param root the root operator under which all operators will be examined
   * @param clazz the class to collect; must NOT be null
   */
public static void collectOp(Collection<Operator<?>> result, Operator<?> root, Class<?> clazz) {
Preconditions.checkArgument(clazz != null, "AssertionError: clazz should not be null");
if (root == null) {
return;
}
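    // Note: exact class match, so subclasses of clazz are not collected.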
if (clazz.equals(root.getClass())) {
result.add(root);
}
for (Operator<?> child : root.getChildOperators()) {
collectOp(result, child, clazz);
}
}
}