/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * <p/> * http://www.apache.org/licenses/LICENSE-2.0 * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hive.ql.exec.spark; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.HashMap; import java.util.Map; import java.util.Properties; import java.util.Set; import org.apache.commons.compress.utils.CharsetNames; import org.apache.hadoop.fs.CommonConfigurationKeysPublic; import org.apache.hadoop.hive.common.LogUtils; import org.apache.hadoop.hive.ql.exec.Utilities; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hive.spark.client.SparkClientUtilities; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hbase.HBaseConfiguration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConfUtil; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; import org.apache.hadoop.hive.ql.io.HiveKey; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Writable; import org.apache.hive.spark.client.rpc.RpcConfiguration; import org.apache.spark.SparkConf; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.base.Strings; import com.google.common.collect.Sets; public class HiveSparkClientFactory { protected static final transient Logger LOG = LoggerFactory.getLogger(HiveSparkClientFactory.class); private static final String SPARK_DEFAULT_CONF_FILE = "spark-defaults.conf"; private static final String SPARK_DEFAULT_MASTER = "yarn"; private static final String SPARK_DEFAULT_DEPLOY_MODE = "cluster"; private static final String SPARK_DEFAULT_APP_NAME = "Hive on Spark"; private static final String SPARK_DEFAULT_SERIALIZER = "org.apache.spark.serializer.KryoSerializer"; private static final String SPARK_DEFAULT_REFERENCE_TRACKING = "false"; private static final String SPARK_WAIT_APP_COMPLETE = "spark.yarn.submit.waitAppCompletion"; private static final String SPARK_DEPLOY_MODE = "spark.submit.deployMode"; public static HiveSparkClient createHiveSparkClient(HiveConf hiveconf) throws Exception { Map<String, String> sparkConf = initiateSparkConf(hiveconf); // Submit spark job through local spark context while spark master is local mode, otherwise submit // spark job through remote spark context. String master = sparkConf.get("spark.master"); if (master.equals("local") || master.startsWith("local[")) { // With local spark context, all user sessions share the same spark context. return LocalHiveSparkClient.getInstance(generateSparkConf(sparkConf)); } else { return new RemoteHiveSparkClient(hiveconf, sparkConf); } } public static Map<String, String> initiateSparkConf(HiveConf hiveConf) { Map<String, String> sparkConf = new HashMap<String, String>(); HBaseConfiguration.addHbaseResources(hiveConf); // set default spark configurations. sparkConf.put("spark.master", SPARK_DEFAULT_MASTER); final String appNameKey = "spark.app.name"; String appName = hiveConf.get(appNameKey); if (appName == null) { appName = SPARK_DEFAULT_APP_NAME; } sparkConf.put(appNameKey, appName); sparkConf.put("spark.serializer", SPARK_DEFAULT_SERIALIZER); sparkConf.put("spark.kryo.referenceTracking", SPARK_DEFAULT_REFERENCE_TRACKING); // load properties from spark-defaults.conf. InputStream inputStream = null; try { inputStream = HiveSparkClientFactory.class.getClassLoader() .getResourceAsStream(SPARK_DEFAULT_CONF_FILE); if (inputStream != null) { LOG.info("loading spark properties from:" + SPARK_DEFAULT_CONF_FILE); Properties properties = new Properties(); properties.load(new InputStreamReader(inputStream, CharsetNames.UTF_8)); for (String propertyName : properties.stringPropertyNames()) { if (propertyName.startsWith("spark")) { String value = properties.getProperty(propertyName); sparkConf.put(propertyName, properties.getProperty(propertyName)); LOG.info(String.format( "load spark property from %s (%s -> %s).", SPARK_DEFAULT_CONF_FILE, propertyName, LogUtils.maskIfPassword(propertyName,value))); } } } } catch (IOException e) { LOG.info("Failed to open spark configuration file:" + SPARK_DEFAULT_CONF_FILE, e); } finally { if (inputStream != null) { try { inputStream.close(); } catch (IOException e) { LOG.debug("Failed to close inputstream.", e); } } } // load properties from hive configurations, including both spark.* properties, // properties for remote driver RPC, and yarn properties for Spark on YARN mode. String sparkMaster = hiveConf.get("spark.master"); if (sparkMaster == null) { sparkMaster = sparkConf.get("spark.master"); hiveConf.set("spark.master", sparkMaster); } String deployMode = null; if (!SparkClientUtilities.isLocalMaster(sparkMaster)) { deployMode = hiveConf.get(SPARK_DEPLOY_MODE); if (deployMode == null) { deployMode = sparkConf.get(SPARK_DEPLOY_MODE); if (deployMode == null) { deployMode = SparkClientUtilities.getDeployModeFromMaster(sparkMaster); } if (deployMode == null) { deployMode = SPARK_DEFAULT_DEPLOY_MODE; } hiveConf.set(SPARK_DEPLOY_MODE, deployMode); } } if (SessionState.get() != null && SessionState.get().getConf() != null) { SessionState.get().getConf().set("spark.master", sparkMaster); if (deployMode != null) { SessionState.get().getConf().set(SPARK_DEPLOY_MODE, deployMode); } } if (SparkClientUtilities.isYarnClusterMode(sparkMaster, deployMode)) { sparkConf.put("spark.yarn.maxAppAttempts", "1"); } for (Map.Entry<String, String> entry : hiveConf) { String propertyName = entry.getKey(); if (propertyName.startsWith("spark")) { String value = hiveConf.get(propertyName); sparkConf.put(propertyName, value); LOG.info(String.format( "load spark property from hive configuration (%s -> %s).", propertyName, LogUtils.maskIfPassword(propertyName,value))); } else if (propertyName.startsWith("yarn") && SparkClientUtilities.isYarnMaster(sparkMaster)) { String value = hiveConf.get(propertyName); // Add spark.hadoop prefix for yarn properties as SparkConf only accept properties // started with spark prefix, Spark would remove spark.hadoop prefix lately and add // it to its hadoop configuration. sparkConf.put("spark.hadoop." + propertyName, value); LOG.info(String.format( "load yarn property from hive configuration in %s mode (%s -> %s).", sparkMaster, propertyName, LogUtils.maskIfPassword(propertyName,value))); } else if (propertyName.equals(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY)) { String value = hiveConf.get(propertyName); if (value != null && !value.isEmpty()) { sparkConf.put("spark.hadoop." + propertyName, value); } } else if (propertyName.startsWith("hbase") || propertyName.startsWith("zookeeper.znode")) { // Add HBase related configuration to Spark because in security mode, Spark needs it // to generate hbase delegation token for Spark. This is a temp solution to deal with // Spark problem. String value = hiveConf.get(propertyName); sparkConf.put("spark.hadoop." + propertyName, value); LOG.info(String.format( "load HBase configuration (%s -> %s).", propertyName, LogUtils.maskIfPassword(propertyName,value))); } else if (propertyName.startsWith("oozie")) { String value = hiveConf.get(propertyName); sparkConf.put("spark." + propertyName, value); LOG.info(String.format( "Pass Oozie configuration (%s -> %s).", propertyName, LogUtils.maskIfPassword(propertyName,value))); } if (RpcConfiguration.HIVE_SPARK_RSC_CONFIGS.contains(propertyName)) { String value = RpcConfiguration.getValue(hiveConf, propertyName); sparkConf.put(propertyName, value); LOG.info(String.format( "load RPC property from hive configuration (%s -> %s).", propertyName, LogUtils.maskIfPassword(propertyName,value))); } } Set<String> classes = Sets.newHashSet( Splitter.on(",").trimResults().omitEmptyStrings().split( Strings.nullToEmpty(sparkConf.get("spark.kryo.classesToRegister")))); classes.add(Writable.class.getName()); classes.add(VectorizedRowBatch.class.getName()); classes.add(BytesWritable.class.getName()); classes.add(HiveKey.class.getName()); sparkConf.put("spark.kryo.classesToRegister", Joiner.on(",").join(classes)); // set yarn queue name final String sparkQueueNameKey = "spark.yarn.queue"; if (SparkClientUtilities.isYarnMaster(sparkMaster) && hiveConf.get(sparkQueueNameKey) == null) { String queueName = hiveConf.get("mapreduce.job.queuename"); if (queueName != null) { sparkConf.put(sparkQueueNameKey, queueName); } } // Disable it to avoid verbose app state report in yarn-cluster mode if (SparkClientUtilities.isYarnClusterMode(sparkMaster, deployMode) && sparkConf.get(SPARK_WAIT_APP_COMPLETE) == null) { sparkConf.put(SPARK_WAIT_APP_COMPLETE, "false"); } // Set the credential provider passwords if found, if there is job specific password // the credential provider location is set directly in the execute method of LocalSparkClient // and submit method of RemoteHiveSparkClient when the job config is created String password = HiveConfUtil.getJobCredentialProviderPassword(hiveConf); if(password != null) { addCredentialProviderPassword(sparkConf, password); } return sparkConf; } private static void addCredentialProviderPassword(Map<String, String> sparkConf, String jobCredstorePassword) { sparkConf.put("spark.yarn.appMasterEnv.HADOOP_CREDSTORE_PASSWORD", jobCredstorePassword); sparkConf.put("spark.executorEnv.HADOOP_CREDSTORE_PASSWORD", jobCredstorePassword); } static SparkConf generateSparkConf(Map<String, String> conf) { SparkConf sparkConf = new SparkConf(false); for (Map.Entry<String, String> entry : conf.entrySet()) { sparkConf.set(entry.getKey(), entry.getValue()); } return sparkConf; } }