/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.llap.daemon.impl;
import org.apache.hadoop.hive.llap.LlapOutputFormatService;
import java.io.IOException;
import java.lang.management.ManagementFactory;
import java.lang.management.MemoryPoolMXBean;
import java.lang.management.MemoryType;
import java.net.InetSocketAddress;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import javax.management.ObjectName;
import javax.net.SocketFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.common.JvmPauseMonitor;
import org.apache.hadoop.hive.common.LogUtils;
import org.apache.hadoop.hive.common.UgiFactory;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.llap.DaemonId;
import org.apache.hadoop.hive.llap.LlapDaemonInfo;
import org.apache.hadoop.hive.llap.LlapUtil;
import org.apache.hadoop.hive.llap.configuration.LlapDaemonConfiguration;
import org.apache.hadoop.hive.llap.daemon.ContainerRunner;
import org.apache.hadoop.hive.llap.daemon.QueryFailedHandler;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.QueryCompleteRequestProto;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.QueryCompleteResponseProto;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.SourceStateUpdatedRequestProto;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.SourceStateUpdatedResponseProto;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.SubmitWorkRequestProto;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.SubmitWorkResponseProto;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.TerminateFragmentRequestProto;
import org.apache.hadoop.hive.llap.daemon.rpc.LlapDaemonProtocolProtos.TerminateFragmentResponseProto;
import org.apache.hadoop.hive.llap.daemon.services.impl.LlapWebServices;
import org.apache.hadoop.hive.llap.io.api.LlapProxy;
import org.apache.hadoop.hive.llap.metrics.LlapDaemonExecutorMetrics;
import org.apache.hadoop.hive.llap.metrics.LlapDaemonJvmMetrics;
import org.apache.hadoop.hive.llap.metrics.LlapMetricsSystem;
import org.apache.hadoop.hive.llap.metrics.MetricsUtils;
import org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService;
import org.apache.hadoop.hive.llap.security.LlapUgiFactoryFactory;
import org.apache.hadoop.hive.llap.security.SecretManager;
import org.apache.hadoop.hive.llap.shufflehandler.ShuffleHandler;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBridge.UdfWhitelistChecker;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.service.CompositeService;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hive.common.util.ShutdownHookManager;
import org.apache.logging.log4j.core.config.Configurator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Preconditions;
import com.google.common.primitives.Ints;
/**
 * The LLAP daemon process: a long-lived service hosting task executors, the shuffle handler,
 * the LLAP I/O layer, metrics, and the RPC endpoints used by Tez AMs. Composes its child
 * services via Hadoop's {@link CompositeService} lifecycle and exposes runtime information
 * over JMX through the {@code LlapDaemonMXBean} interface.
 */
public class LlapDaemon extends CompositeService implements ContainerRunner, LlapDaemonMXBean {

  private static final Logger LOG = LoggerFactory.getLogger(LlapDaemon.class);

  // Copy of the daemon conf specialized for the in-process ShuffleHandler.
  private final Configuration shuffleHandlerConf;
  // Token secret manager; null when security is disabled.
  private final SecretManager secretManager;
  private final LlapProtocolServerImpl server;
  // Executes submitted work fragments; most ContainerRunner calls delegate here.
  private final ContainerRunnerImpl containerRunner;
  // Heartbeats task status back to the owning Tez AMs.
  private final AMReporter amReporter;
  // Service-discovery registration (ZooKeeper-backed); initialized late, see serviceStart().
  private final LlapRegistryService registry;
  // Web UI; null when running in Hive tests (HIVE_IN_TEST).
  private final LlapWebServices webServices;
  private final AtomicLong numSubmissions = new AtomicLong(0);
  private final JvmPauseMonitor pauseMonitor;
  // JMX registration handle, unregistered in shutdown().
  private final ObjectName llapDaemonInfoBean;
  private final LlapDaemonExecutorMetrics metrics;
  // Downloads permanent functions locally; null when LLAP_DAEMON_DOWNLOAD_PERMANENT_FNS is off.
  private final FunctionLocalizer fnLocalizer;

  // Parameters used for JMX
  private final boolean llapIoEnabled;
  private final long executorMemoryPerInstance;
  private final long ioMemoryPerInstance;
  private final int numExecutors;
  private final long maxJvmMemory;
  private final String[] localDirs;

  private final DaemonId daemonId;
  private final SocketFactory socketFactory;

  // TODO Not the best way to share the address
  // Populated by LlapProtocolServerImpl once the RPC/management listeners bind.
  private final AtomicReference<InetSocketAddress> srvAddress = new AtomicReference<>(),
      mngAddress = new AtomicReference<>();
  // Populated in serviceStart() after the shuffle handler picks its port.
  private final AtomicReference<Integer> shufflePort = new AtomicReference<>();
public LlapDaemon(Configuration daemonConf, int numExecutors, long executorMemoryBytes,
boolean ioEnabled, boolean isDirectCache, long ioMemoryBytes, String[] localDirs, int srvPort,
int mngPort, int shufflePort, int webPort, String appName) {
super("LlapDaemon");
printAsciiArt();
Preconditions.checkArgument(numExecutors > 0);
Preconditions.checkArgument(srvPort == 0 || (srvPort > 1024 && srvPort < 65536),
"Server RPC Port must be between 1025 and 65535, or 0 automatic selection");
Preconditions.checkArgument(mngPort == 0 || (mngPort > 1024 && mngPort < 65536),
"Management RPC Port must be between 1025 and 65535, or 0 automatic selection");
Preconditions.checkArgument(localDirs != null && localDirs.length > 0,
"Work dirs must be specified");
Preconditions.checkArgument(shufflePort == 0 || (shufflePort > 1024 && shufflePort < 65536),
"Shuffle Port must be betwee 1024 and 65535, or 0 for automatic selection");
int outputFormatServicePort = HiveConf.getIntVar(daemonConf, HiveConf.ConfVars.LLAP_DAEMON_OUTPUT_SERVICE_PORT);
Preconditions.checkArgument(outputFormatServicePort == 0
|| (outputFormatServicePort > 1024 && outputFormatServicePort < 65536),
"OutputFormatService Port must be between 1024 and 65535, or 0 for automatic selection");
String hosts = HiveConf.getTrimmedVar(daemonConf, ConfVars.LLAP_DAEMON_SERVICE_HOSTS);
if (hosts.startsWith("@")) {
String zkHosts = HiveConf.getTrimmedVar(daemonConf, ConfVars.HIVE_ZOOKEEPER_QUORUM);
LOG.info("Zookeeper Quorum: {}", zkHosts);
Preconditions.checkArgument(zkHosts != null && !zkHosts.trim().isEmpty(),
"LLAP service hosts startswith '@' but hive.zookeeper.quorum is not set." +
" hive.zookeeper.quorum must be set.");
}
String hostName = MetricsUtils.getHostName();
try {
// re-login with kerberos. This makes sure all daemons have the same login user.
if (UserGroupInformation.isSecurityEnabled()) {
final String daemonPrincipal = HiveConf.getVar(daemonConf, ConfVars.LLAP_KERBEROS_PRINCIPAL);
final String daemonKeytab = HiveConf.getVar(daemonConf, ConfVars.LLAP_KERBEROS_KEYTAB_FILE);
LlapUtil.loginWithKerberosAndUpdateCurrentUser(daemonPrincipal, daemonKeytab);
}
String currentUser = UserGroupInformation.getCurrentUser().getShortUserName();
LOG.info("Starting daemon as user: {}", currentUser);
daemonId = new DaemonId(currentUser, LlapUtil.generateClusterName(daemonConf),
hostName, appName, System.currentTimeMillis());
} catch (IOException ex) {
throw new RuntimeException(ex);
}
this.maxJvmMemory = getTotalHeapSize();
this.llapIoEnabled = ioEnabled;
long xmxHeadRoomBytes = determineXmxHeadroom(daemonConf, executorMemoryBytes, maxJvmMemory);
this.executorMemoryPerInstance = executorMemoryBytes - xmxHeadRoomBytes;
this.ioMemoryPerInstance = ioMemoryBytes;
this.numExecutors = numExecutors;
this.localDirs = localDirs;
int waitQueueSize = HiveConf.getIntVar(
daemonConf, ConfVars.LLAP_DAEMON_TASK_SCHEDULER_WAIT_QUEUE_SIZE);
boolean enablePreemption = HiveConf.getBoolVar(
daemonConf, ConfVars.LLAP_DAEMON_TASK_SCHEDULER_ENABLE_PREEMPTION);
final String logMsg = "Attempting to start LlapDaemon with the following configuration: " +
"maxJvmMemory=" + maxJvmMemory + " ("
+ LlapUtil.humanReadableByteCount(maxJvmMemory) + ")" +
", requestedExecutorMemory=" + executorMemoryBytes +
" (" + LlapUtil.humanReadableByteCount(executorMemoryBytes) + ")" +
", llapIoCacheSize=" + ioMemoryBytes + " ("
+ LlapUtil.humanReadableByteCount(ioMemoryBytes) + ")" +
", xmxHeadRoomMemory=" + xmxHeadRoomBytes + " ("
+ LlapUtil.humanReadableByteCount(xmxHeadRoomBytes) + ")" +
", adjustedExecutorMemory=" + executorMemoryPerInstance +
" (" + LlapUtil.humanReadableByteCount(executorMemoryPerInstance) + ")" +
", numExecutors=" + numExecutors +
", llapIoEnabled=" + ioEnabled +
", llapIoCacheIsDirect=" + isDirectCache +
", rpcListenerPort=" + srvPort +
", mngListenerPort=" + mngPort +
", webPort=" + webPort +
", outputFormatSvcPort=" + outputFormatServicePort +
", workDirs=" + Arrays.toString(localDirs) +
", shufflePort=" + shufflePort +
", waitQueueSize= " + waitQueueSize +
", enablePreemption= " + enablePreemption;
LOG.info(logMsg);
final String currTSISO8601 = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSSZ").format(new Date());
// Time based log retrieval may not fetch the above log line so logging to stderr for debugging purpose.
System.err.println(currTSISO8601 + " " + logMsg);
long memRequired =
executorMemoryBytes + (ioEnabled && isDirectCache == false ? ioMemoryBytes : 0);
// TODO: this check is somewhat bogus as the maxJvmMemory != Xmx parameters (see annotation in LlapServiceDriver)
Preconditions.checkState(maxJvmMemory >= memRequired,
"Invalid configuration. Xmx value too small. maxAvailable=" + LlapUtil.humanReadableByteCount(maxJvmMemory) +
", configured(exec + io if enabled)=" + LlapUtil.humanReadableByteCount(memRequired));
this.shuffleHandlerConf = new Configuration(daemonConf);
this.shuffleHandlerConf.setInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, shufflePort);
this.shuffleHandlerConf.set(ShuffleHandler.SHUFFLE_HANDLER_LOCAL_DIRS,
StringUtils.arrayToString(localDirs));
this.shuffleHandlerConf.setBoolean(ShuffleHandler.SHUFFLE_DIR_WATCHER_ENABLED,
HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_DAEMON_SHUFFLE_DIR_WATCHER_ENABLED));
// Less frequently set parameter, not passing in as a param.
int numHandlers = HiveConf.getIntVar(daemonConf, ConfVars.LLAP_DAEMON_RPC_NUM_HANDLERS);
// Initialize the function localizer.
ClassLoader executorClassLoader = null;
if (HiveConf.getBoolVar(daemonConf, ConfVars.LLAP_DAEMON_DOWNLOAD_PERMANENT_FNS)) {
this.fnLocalizer = new FunctionLocalizer(daemonConf, localDirs[0]);
executorClassLoader = fnLocalizer.getClassLoader();
// Set up the hook that will disallow creating non-whitelisted UDFs anywhere in the plan.
// We are not using a specific hook for GenericUDFBridge - that doesn't work in MiniLlap
// because the daemon is embedded, so the client also gets this hook and Kryo is brittle.
SerializationUtilities.setGlobalHook(new LlapGlobalUdfChecker(fnLocalizer));
} else {
this.fnLocalizer = null;
SerializationUtilities.setGlobalHook(new LlapGlobalUdfChecker(new StaticPermanentFunctionChecker(daemonConf)));
executorClassLoader = Thread.currentThread().getContextClassLoader();
}
// Initialize the metrics system
LlapMetricsSystem.initialize("LlapDaemon");
this.pauseMonitor = new JvmPauseMonitor(daemonConf);
pauseMonitor.start();
String displayNameJvm = "LlapDaemonJvmMetrics-" + hostName;
String sessionId = MetricsUtils.getUUID();
LlapDaemonJvmMetrics.create(displayNameJvm, sessionId);
String displayName = "LlapDaemonExecutorMetrics-" + hostName;
daemonConf.set("llap.daemon.metrics.sessionid", sessionId);
String[] strIntervals = HiveConf.getTrimmedStringsVar(daemonConf,
HiveConf.ConfVars.LLAP_DAEMON_TASK_PREEMPTION_METRICS_INTERVALS);
List<Integer> intervalList = new ArrayList<>();
if (strIntervals != null) {
for (String strInterval : strIntervals) {
try {
intervalList.add(Integer.valueOf(strInterval));
} catch (NumberFormatException e) {
LOG.warn("Ignoring task pre-emption metrics interval {} from {} as it is invalid",
strInterval, Arrays.toString(strIntervals));
}
}
}
this.metrics = LlapDaemonExecutorMetrics.create(displayName, sessionId, numExecutors,
Ints.toArray(intervalList));
this.metrics.setMemoryPerInstance(executorMemoryPerInstance);
this.metrics.setCacheMemoryPerInstance(ioMemoryBytes);
this.metrics.setJvmMaxMemory(maxJvmMemory);
this.metrics.setWaitQueueSize(waitQueueSize);
this.metrics.getJvmMetrics().setPauseMonitor(pauseMonitor);
this.llapDaemonInfoBean = MBeans.register("LlapDaemon", "LlapDaemonInfo", this);
LOG.info("Started LlapMetricsSystem with displayName: " + displayName +
" sessionId: " + sessionId);
int maxAmReporterThreads = HiveConf.getIntVar(daemonConf, ConfVars.LLAP_DAEMON_AM_REPORTER_MAX_THREADS);
this.socketFactory = NetUtils.getDefaultSocketFactory(daemonConf);
this.amReporter = new AMReporter(numExecutors, maxAmReporterThreads, srvAddress,
new QueryFailedHandlerProxy(), daemonConf, daemonId, socketFactory);
SecretManager sm = null;
if (UserGroupInformation.isSecurityEnabled()) {
sm = SecretManager.createSecretManager(daemonConf, daemonId.getClusterString());
}
this.secretManager = sm;
this.server = new LlapProtocolServerImpl(secretManager,
numHandlers, this, srvAddress, mngAddress, srvPort, mngPort, daemonId);
UgiFactory fsUgiFactory = null;
try {
fsUgiFactory = LlapUgiFactoryFactory.createFsUgiFactory(daemonConf);
} catch (IOException e) {
throw new RuntimeException(e);
}
this.containerRunner = new ContainerRunnerImpl(daemonConf, numExecutors, waitQueueSize,
enablePreemption, localDirs, this.shufflePort, srvAddress, executorMemoryPerInstance, metrics,
amReporter, executorClassLoader, daemonId, fsUgiFactory, socketFactory);
addIfService(containerRunner);
// Not adding the registry as a service, since we need to control when it is initialized - conf used to pickup properties.
this.registry = new LlapRegistryService(true);
if (HiveConf.getBoolVar(daemonConf, HiveConf.ConfVars.HIVE_IN_TEST)) {
this.webServices = null;
} else {
this.webServices = new LlapWebServices(webPort, this, registry);
addIfService(webServices);
}
// Bring up the server only after all other components have started.
addIfService(server);
// AMReporter after the server so that it gets the correct address. It knows how to deal with
// requests before it is started.
addIfService(amReporter);
}
/**
 * Computes the Xmx headroom in bytes from the LLAP_DAEMON_XMX_HEADROOM setting, which may be
 * either a percentage of the JVM max memory (e.g. "5%") or an absolute size string.
 *
 * @param daemonConf daemon configuration to read the headroom setting from.
 * @param executorMemoryBytes requested executor memory; the headroom must be smaller.
 * @param maxJvmMemory detected JVM heap size, used as the base for percentage headroom.
 * @return the headroom in bytes.
 * @throws RuntimeException if the configured value cannot be parsed as a number.
 * @throws IllegalStateException if a percentage is outside [0, 100).
 * @throws IllegalArgumentException if the headroom is not smaller than the executor memory.
 */
private static long determineXmxHeadroom(
    Configuration daemonConf, long executorMemoryBytes, long maxJvmMemory) {
  String headroomStr = HiveConf.getVar(daemonConf, ConfVars.LLAP_DAEMON_XMX_HEADROOM).trim();
  long xmxHeadRoomBytes = Long.MAX_VALUE;
  try {
    if (headroomStr.endsWith("%")) {
      // Percentage form: take the fraction of the detected JVM heap.
      long percentage = Integer.parseInt(headroomStr.substring(0, headroomStr.length() - 1));
      Preconditions.checkState(percentage >= 0 && percentage < 100,
          "Headroom percentage should be in [0, 100) range; found " + headroomStr);
      xmxHeadRoomBytes = maxJvmMemory * percentage / 100L;
    } else {
      // Absolute form: parse a size string such as "1g" or "512m".
      xmxHeadRoomBytes = HiveConf.toSizeBytes(headroomStr);
    }
  } catch (NumberFormatException ex) {
    // Preserve the original parse failure as the cause (was previously dropped).
    throw new RuntimeException("Invalid headroom configuration " + headroomStr, ex);
  }
  Preconditions.checkArgument(xmxHeadRoomBytes < executorMemoryBytes,
      "LLAP daemon headroom size should be less than daemon max memory size. headRoomBytes: "
      + xmxHeadRoomBytes + " executorMemoryBytes: " + executorMemoryBytes + " (derived from "
      + headroomStr + " out of xmx of " + maxJvmMemory + ")");
  return xmxHeadRoomBytes;
}
/**
 * Initializes log4j2 from the LLAP-specific properties file found on the classpath,
 * optionally enabling async logging, and makes the MDC context inheritable so that
 * routing appenders work in child threads.
 *
 * @param conf configuration used to decide whether async logging is enabled.
 * @throws RuntimeException if the log4j2 properties file is not on the classpath.
 */
private static void initializeLogging(final Configuration conf) {
  final long startTimeMs = System.currentTimeMillis();
  final URL log4j2ConfigUrl = LlapDaemon.class.getClassLoader().getResource(
      LlapConstants.LOG4j2_PROPERTIES_FILE);
  if (log4j2ConfigUrl == null) {
    throw new RuntimeException("Log initialization failed." +
        " Unable to locate " + LlapConstants.LOG4j2_PROPERTIES_FILE + " file in classpath");
  }
  final boolean async = LogUtils.checkAndSetAsyncLogging(conf);
  // required for MDC based routing appender so that child threads can inherit the MDC context
  System.setProperty("isThreadContextMapInheritable", "true");
  Configurator.initialize("LlapDaemonLog4j2", log4j2ConfigUrl.toString());
  LOG.debug("LLAP daemon logging initialized from {} in {} ms. Async: {}",
      log4j2ConfigUrl, (System.currentTimeMillis() - startTimeMs), async);
}
/**
 * Estimates the usable JVM heap by summing the max sizes of all HEAP memory pools,
 * counting the survivor space twice (there are two survivor regions), then rounding
 * the total up to the next whole megabyte.
 *
 * @return the estimated total heap size in bytes, a multiple of 1 MB.
 */
public static long getTotalHeapSize() {
  // runtime.getMax() gives a very different number from the actual Xmx sizing.
  // you can iterate through the
  // http://docs.oracle.com/javase/7/docs/api/java/lang/management/MemoryPoolMXBean.html
  // from java.lang.management to figure this out, but the hard-coded params in the llap run.sh
  // result in 89% usable heap (-XX:NewRatio=8) + a survivor region which is technically not
  // in the usable space.
  long total = 0;
  for (MemoryPoolMXBean mp : ManagementFactory.getMemoryPoolMXBeans()) {
    // NOTE(review): getMax() can be -1 for pools with an undefined max; assumed not to
    // occur for HEAP pools under the LLAP launch settings — confirm for exotic collectors.
    long sz = mp.getUsage().getMax();
    if (mp.getName().contains("Survivor")) {
      sz *= 2; // there are 2 survivor spaces
    }
    if (mp.getType().equals(MemoryType.HEAP)) {
      total += sz;
    }
  }
  // Round up to the next MB boundary. The previous code added (total % 1MB) to the total,
  // which neither rounds up nor produces an MB-aligned value.
  final long oneMb = 1024L * 1024L;
  long remainder = total % oneMb;
  if (remainder != 0) {
    total += oneMb - remainder;
  }
  return total;
}
// Logs an "LLAP" ASCII-art banner at startup so daemon launches are easy to spot in logs.
private void printAsciiArt() {
  final String asciiArt = "" +
      "$$\\ $$\\ $$$$$$\\ $$$$$$$\\\n" +
      "$$ | $$ | $$ __$$\\ $$ __$$\\\n" +
      "$$ | $$ | $$ / $$ |$$ | $$ |\n" +
      "$$ | $$ | $$$$$$$$ |$$$$$$$ |\n" +
      "$$ | $$ | $$ __$$ |$$ ____/\n" +
      "$$ | $$ | $$ | $$ |$$ |\n" +
      "$$$$$$$$\\ $$$$$$$$\\ $$ | $$ |$$ |\n" +
      "\\________|\\________|\\__| \\__|\\__|\n" +
      "\n";
  LOG.info("\n\n" + asciiArt);
}
/**
 * CompositeService init hook: initializes child services, marks this process as an LLAP
 * daemon for LlapProxy, kicks off permanent-function localization (if configured), and
 * initializes the LLAP I/O layer when enabled.
 */
@Override
public void serviceInit(Configuration conf) throws Exception {
  // Child services must be initialized first.
  super.serviceInit(conf);
  LlapProxy.setDaemon(true);
  if (fnLocalizer != null) {
    fnLocalizer.init();
    // Localization runs asynchronously; it does not block daemon startup.
    fnLocalizer.startLocalizeAllFunctions();
  }
  if (isIoEnabled()) {
    LlapProxy.initializeLlapIo(conf);
  }
}
/**
 * CompositeService start hook. Starts the shuffle handler and output-format service before
 * the child services, then writes the actually-bound ports back into the configuration
 * (ports may have been auto-selected), and finally registers this daemon with the registry
 * so AMs only discover it once it is fully up.
 */
@Override
public void serviceStart() throws Exception {
  // Start the Shuffle service before the listener - until it's a service as well.
  ShuffleHandler.initializeAndStart(shuffleHandlerConf);
  LOG.info("Setting shuffle port to: " + ShuffleHandler.get().getPort());
  this.shufflePort.set(ShuffleHandler.get().getPort());
  getConfig()
      .setInt(ConfVars.LLAP_DAEMON_YARN_SHUFFLE_PORT.varname, ShuffleHandler.get().getPort());
  LlapOutputFormatService.initializeAndStart(getConfig(), secretManager);
  super.serviceStart();

  // Setup the actual ports in the configuration.
  getConfig().setInt(ConfVars.LLAP_DAEMON_RPC_PORT.varname, server.getBindAddress().getPort());
  getConfig().setInt(ConfVars.LLAP_MANAGEMENT_RPC_PORT.varname, server.getManagementBindAddress().getPort());
  if (webServices != null) {
    getConfig().setInt(ConfVars.LLAP_DAEMON_WEB_PORT.varname, webServices.getPort());
  }
  getConfig().setInt(ConfVars.LLAP_DAEMON_OUTPUT_SERVICE_PORT.varname, LlapOutputFormatService.get().getPort());

  // Ensure this is set in the config so that the AM can read it.
  getConfig()
      .setIfUnset(ConfVars.LLAP_DAEMON_TASK_SCHEDULER_WAIT_QUEUE_SIZE.varname,
          ConfVars.LLAP_DAEMON_TASK_SCHEDULER_WAIT_QUEUE_SIZE
              .getDefaultValue());
  // Registry registration last: the daemon must be fully started before it is discoverable.
  this.registry.init(getConfig());
  this.registry.start();
  // Fixed typo in the log message ("ShuflePort" -> "ShufflePort").
  LOG.info(
      "LlapDaemon serviceStart complete. RPC Port={}, ManagementPort={}, ShufflePort={}, WebPort={}",
      server.getBindAddress().getPort(), server.getManagementBindAddress().getPort(),
      ShuffleHandler.get().getPort(), (webServices == null ? "" : webServices.getPort()));
}
/**
 * CompositeService stop hook. Unregisters from the registry first so the cluster stops
 * routing new work here, then stops child services, the shuffle handler, local resources
 * (via {@link #shutdown()}), and the output-format service.
 * Note: {@code @Override} was missing here although the sibling lifecycle methods carry it.
 */
@Override
public void serviceStop() throws Exception {
  if (registry != null) {
    this.registry.stop();
  }
  super.serviceStop();
  ShuffleHandler.shutdown();
  shutdown();
  LlapOutputFormatService.get().stop();
  LOG.info("LlapDaemon shutdown complete");
}
/**
 * Releases daemon-local resources: the JMX bean, the JVM pause monitor, the metrics
 * system, the LLAP I/O proxy, and the function localizer. Safe to call even if parts
 * were never initialized (each step is null-guarded). Called from serviceStop() and from
 * main()'s failure path.
 */
public void shutdown() {
  LOG.info("LlapDaemon shutdown invoked");
  if (llapDaemonInfoBean != null) {
    try {
      MBeans.unregister(llapDaemonInfoBean);
    } catch (Throwable ex) {
      // Best-effort: JMX unregistration failure must not abort the rest of the shutdown.
      LOG.info("Error unregistering the bean; ignoring", ex);
    }
  }

  if (pauseMonitor != null) {
    pauseMonitor.stop();
  }

  if (metrics != null) {
    LlapMetricsSystem.shutdown();
  }

  LlapProxy.close();
  if (fnLocalizer != null) {
    fnLocalizer.close();
  }
}
/**
 * Daemon entry point. Reads the YARN container environment, assembles configuration
 * (ports, work dirs, memory sizing via LlapDaemonInfo), initializes logging, then
 * constructs and starts the daemon. On any startup failure the process exits with -1.
 */
public static void main(String[] args) throws Exception {
  Thread.setDefaultUncaughtExceptionHandler(new LlapDaemonUncaughtExceptionHandler());
  LlapDaemon llapDaemon = null;
  try {
    // Cache settings will need to be setup in llap-daemon-site.xml - since the daemons don't read hive-site.xml
    // Ideally, these properties should be part of LlapDameonConf rather than HiveConf
    LlapDaemonConfiguration daemonConf = new LlapDaemonConfiguration();

    // Derive the YARN application name from the container id in the environment, if present.
    String containerIdStr = System.getenv(ApplicationConstants.Environment.CONTAINER_ID.name());

    String appName = null;
    if (containerIdStr != null && !containerIdStr.isEmpty()) {
      daemonConf.set(ConfVars.LLAP_DAEMON_CONTAINER_ID.varname, containerIdStr);
      appName = ConverterUtils.toContainerId(containerIdStr)
          .getApplicationAttemptId().getApplicationId().toString();
    } else {
      daemonConf.unset(ConfVars.LLAP_DAEMON_CONTAINER_ID.varname);
      // Note, we assume production LLAP always runs under YARN.
      LOG.error("Cannot find " + ApplicationConstants.Environment.CONTAINER_ID.toString()
          + "; LLAP tokens may grant access to subsequent instances of the cluster with"
          + " the same name");
      appName = null;
    }

    // Record the NodeManager address if both host and port are available in the environment.
    String nmHost = System.getenv(ApplicationConstants.Environment.NM_HOST.name());
    String nmPort = System.getenv(ApplicationConstants.Environment.NM_PORT.name());
    if (!org.apache.commons.lang3.StringUtils.isBlank(nmHost) && !org.apache.commons.lang3.StringUtils.isBlank(nmPort)) {
      String nmAddress = nmHost + ":" + nmPort;
      daemonConf.set(ConfVars.LLAP_DAEMON_NM_ADDRESS.varname, nmAddress);
    } else {
      daemonConf.unset(ConfVars.LLAP_DAEMON_NM_ADDRESS.varname);
      // Unlikely, but log the actual values in case one of the two was empty/null
      LOG.warn(
          "NodeManager host/port not found in environment. Values retrieved: host={}, port={}",
          nmHost, nmPort);
    }

    // Work directories: from configuration, falling back to the YARN-provided local dirs.
    String workDirsString = System.getenv(ApplicationConstants.Environment.LOCAL_DIRS.name());

    String localDirList = LlapUtil.getDaemonLocalDirString(daemonConf, workDirsString);
    String[] localDirs = (localDirList == null || localDirList.isEmpty()) ?
        new String[0] : StringUtils.getTrimmedStrings(localDirList);
    int rpcPort = HiveConf.getIntVar(daemonConf, ConfVars.LLAP_DAEMON_RPC_PORT);
    int mngPort = HiveConf.getIntVar(daemonConf, ConfVars.LLAP_MANAGEMENT_RPC_PORT);
    int shufflePort = daemonConf
        .getInt(ShuffleHandler.SHUFFLE_PORT_CONFIG_KEY, ShuffleHandler.DEFAULT_SHUFFLE_PORT);
    int webPort = HiveConf.getIntVar(daemonConf, ConfVars.LLAP_DAEMON_WEB_PORT);

    // Memory/executor sizing is resolved centrally by LlapDaemonInfo.
    LlapDaemonInfo.initialize(appName, daemonConf);

    int numExecutors = LlapDaemonInfo.INSTANCE.getNumExecutors();
    long executorMemoryBytes = LlapDaemonInfo.INSTANCE.getExecutorMemory();
    long ioMemoryBytes = LlapDaemonInfo.INSTANCE.getCacheSize();
    boolean isDirectCache = LlapDaemonInfo.INSTANCE.isDirectCache();
    boolean isLlapIo = LlapDaemonInfo.INSTANCE.isLlapIo();

    LlapDaemon.initializeLogging(daemonConf);
    llapDaemon =
        new LlapDaemon(daemonConf, numExecutors, executorMemoryBytes, isLlapIo, isDirectCache,
            ioMemoryBytes, localDirs, rpcPort, mngPort, shufflePort, webPort, appName);

    LOG.info("Adding shutdown hook for LlapDaemon");
    ShutdownHookManager.addShutdownHook(new CompositeServiceShutdownHook(llapDaemon), 1);

    llapDaemon.init(daemonConf);
    llapDaemon.start();
    LOG.info("Started LlapDaemon");
    // Relying on the RPC threads to keep the service alive.
  } catch (Throwable t) {
    // TODO Replace this with a ExceptionHandler / ShutdownHook
    LOG.error("Failed to start LLAP Daemon with exception", t);
    if (llapDaemon != null) {
      llapDaemon.shutdown();
    }
    System.exit(-1);
  }
}
/** Submits a work fragment for execution; counts the submission and delegates to the container runner. */
@Override
public SubmitWorkResponseProto submitWork(
    SubmitWorkRequestProto request) throws IOException {
  numSubmissions.incrementAndGet();
  return containerRunner.submitWork(request);
}

/** Forwards an upstream-source state change notification to the container runner. */
@Override
public SourceStateUpdatedResponseProto sourceStateUpdated(
    SourceStateUpdatedRequestProto request) throws IOException {
  return containerRunner.sourceStateUpdated(request);
}

/** Notifies the container runner that a query has completed so its state can be cleaned up. */
@Override
public QueryCompleteResponseProto queryComplete(
    QueryCompleteRequestProto request) throws IOException {
  return containerRunner.queryComplete(request);
}

/** Requests termination of a running fragment; delegates to the container runner. */
@Override
public TerminateFragmentResponseProto terminateFragment(
    TerminateFragmentRequestProto request) throws IOException {
  return containerRunner.terminateFragment(request);
}
// Total number of fragments submitted via submitWork(); exposed for tests.
@VisibleForTesting
public long getNumSubmissions() {
  return numSubmissions.get();
}

// Address the RPC server is actually bound to (ports may be auto-selected).
public InetSocketAddress getListenerAddress() {
  return server.getBindAddress();
}

// LlapDaemonMXBean methods. Will be exposed via JMX
@Override
public int getRpcPort() {
  return server.getBindAddress().getPort();
}

@Override
public int getNumExecutors() {
  return numExecutors;
}

@Override
public int getShufflePort() {
  return ShuffleHandler.get().getPort();
}

// Comma-joined list of the daemon's work directories.
@Override
public String getLocalDirs() {
  return Joiner.on(",").skipNulls().join(localDirs);
}

@Override
public Set<String> getExecutorsStatus() {
  return containerRunner.getExecutorStatus();
}

@Override
public int getNumActive() {
  return containerRunner.getNumActive();
}

// Executor memory after the Xmx headroom deduction (see constructor).
@Override
public long getExecutorMemoryPerInstance() {
  return executorMemoryPerInstance;
}

@Override
public long getIoMemoryPerInstance() {
  return ioMemoryPerInstance;
}

@Override
public boolean isIoEnabled() {
  return llapIoEnabled;
}

@Override
public long getMaxJvmMemory() {
  return maxJvmMemory;
}
/**
 * A global deserialization hook that validates every GenericUDF/UDF subclass against the
 * function whitelist. It also injects the checker into deserialized GenericUDFBridge
 * instances, so the bridge consults the whitelist before instantiating its wrapped UDF.
 */
private static final class LlapGlobalUdfChecker extends SerializationUtilities.Hook {
  private final UdfWhitelistChecker fnCheckerImpl;

  public LlapGlobalUdfChecker(UdfWhitelistChecker whitelistChecker) {
    this.fnCheckerImpl = whitelistChecker;
  }

  @Override
  public boolean preRead(Class<?> type) {
    // GenericUDFBridge is handled separately in postRead (LlapUdfBridgeChecker); let the
    // post-hook run for it.
    if (type == GenericUDFBridge.class) {
      return true; // Run post-hook.
    }
    boolean isUdfClass =
        GenericUDF.class.isAssignableFrom(type) || UDF.class.isAssignableFrom(type);
    if (isUdfClass && !fnCheckerImpl.isUdfAllowed(type)) {
      throw new SecurityException("UDF " + type.getCanonicalName() + " is not allowed");
    }
    // Everything was validated here; no post-hook needed for this type.
    return false;
  }

  @Override
  public Object postRead(Object deserialized) {
    if (deserialized == null) {
      return deserialized;
    }
    Class<?> actualType = deserialized.getClass();
    if (actualType == GenericUDFBridge.class) {
      ((GenericUDFBridge) deserialized).setUdfChecker(fnCheckerImpl);
    }
    // preRead won't usually have been called on this path, so run the whitelist check here.
    preRead(actualType);
    return deserialized;
  }
}
/**
 * Process-wide uncaught exception handler. Errors terminate the process; an
 * OutOfMemoryError halts immediately without cleanup (shutdown hooks could wedge after
 * OOM). Non-Error throwables also terminate, unless a shutdown is already in progress,
 * in which case the throwable is only logged.
 */
private static class LlapDaemonUncaughtExceptionHandler implements Thread.UncaughtExceptionHandler {
  @Override
  public void uncaughtException(Thread t, Throwable e) {
    LOG.info("UncaughtExceptionHandler invoked");
    if(ShutdownHookManager.isShutdownInProgress()) {
      // Already going down; don't re-trigger termination from a racing thread failure.
      LOG.warn("Thread {} threw a Throwable, but we are shutting down, so ignoring this", t, e);
    } else if(e instanceof Error) {
      try {
        LOG.error("Thread {} threw an Error. Shutting down now...", t, e);
      } catch (Throwable err) {
        // We don't want a logging failure to prevent us from exiting.
      }
      if(e instanceof OutOfMemoryError) {
        //After catching an OOM java says it is undefined behavior, so don't
        //even try to clean up or we can get stuck on shutdown.
        try {
          System.err.println("Halting due to Out Of Memory Error...");
          e.printStackTrace();
        } catch (Throwable err) {
          // Again, we don't want a logging failure to prevent us from exiting.
        }
        // halt() skips shutdown hooks entirely; terminate() runs them.
        ExitUtil.halt(-1);
      } else {
        ExitUtil.terminate(-1);
      }
    } else {
      LOG.error("Thread {} threw an Exception. Shutting down now...", t, e);
      ExitUtil.terminate(-1);
    }
  }
}
/**
 * Adapter handed to the AMReporter (before containerRunner exists) so query-failure
 * notifications are routed to the container runner for cleanup.
 */
private class QueryFailedHandlerProxy implements QueryFailedHandler {
  @Override
  public void queryFailed(QueryIdentifier queryIdentifier) {
    containerRunner.queryFailed(queryIdentifier);
  }
}
}