/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.llap.cli;
import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.text.DecimalFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.EnumSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.hive.common.classification.InterfaceAudience;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.llap.cli.LlapStatusOptionsProcessor.LlapStatusOptions;
import org.apache.hadoop.hive.llap.cli.status.LlapStatusHelpers;
import org.apache.hadoop.hive.llap.cli.status.LlapStatusHelpers.AppStatusBuilder;
import org.apache.hadoop.hive.llap.cli.status.LlapStatusHelpers.LlapInstance;
import org.apache.hadoop.hive.llap.configuration.LlapDaemonConfiguration;
import org.apache.hadoop.hive.llap.registry.ServiceInstance;
import org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService;
import org.apache.hadoop.hive.ql.session.SessionState;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.SystemClock;
import org.apache.slider.api.ClusterDescription;
import org.apache.slider.api.ClusterDescriptionKeys;
import org.apache.slider.api.StateValues;
import org.apache.slider.api.StatusKeys;
import org.apache.slider.api.types.ApplicationDiagnostics;
import org.apache.slider.api.types.ContainerInformation;
import org.apache.slider.client.SliderClient;
import org.apache.slider.common.params.ActionDiagnosticArgs;
import org.apache.slider.core.exceptions.SliderException;
import org.codehaus.jackson.map.ObjectMapper;
import org.codehaus.jackson.map.SerializationConfig;
import org.codehaus.jackson.map.annotate.JsonSerialize;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class LlapStatusServiceDriver {
private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class);
private static final Logger CONSOLE_LOGGER =
    LoggerFactory.getLogger("LlapStatusServiceDriverConsole");

// The configs below are declared here rather than in HiveConf. They are experimental, and
// mainly intended for use once retry handling is fixed in Yarn/Hadoop.
private static final String CONF_PREFIX = "hive.llapcli.";

// Ideally the next two keys would control RM connect timeouts. They do not seem to work
// though; the IPC timeout has to be set instead.
@InterfaceAudience.Private
private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS =
    CONF_PREFIX + "yarn.rm.connect.max-wait-ms";
private static final long CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT = 10_000L;

@InterfaceAudience.Private
private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS =
    CONF_PREFIX + "yarn.rm.connect.retry-interval.ms";
private static final long CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT = 5_000L;

// As of Hadoop 2.7 - this is what actually controls the RM timeout.
@InterfaceAudience.Private
private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES =
    CONF_PREFIX + "ipc.client.max-retries";
private static final int CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 2;

@InterfaceAudience.Private
private static final String CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS =
    CONF_PREFIX + "ipc.client.connect.retry-interval-ms";
private static final long CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT = 1_500L;

// As of Hadoop 2.8 - this retry spec behaves in a strange manner. "2000,1" nominally means
// a sleep of 2000 (ms, going by the sleep described next) with 1 retry. In practice the
// entire spec is applied #retries+2 times: "2000,1" ends up as 3 rounds, each with 1 retry
// and a random 2000ms sleep.
@InterfaceAudience.Private
private static final String CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC =
    CONF_PREFIX + "timeline.service.fs-store.retry.policy.spec";
private static final String CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT =
    "2000, 1";

private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS =
    CONF_PREFIX + "zk-registry.timeout-ms";
private static final long CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT = 20_000L;

// A summary is logged roughly every 15 seconds.
private static final long LOG_SUMMARY_INTERVAL = 15_000L;

// Key under which the LLAP component shows up in the Slider statistics / live-status maps.
private static final String LLAP_KEY = "LLAP";

// Base configuration; adjusted by setupConf() and by user-supplied options in run().
private final Configuration conf;
private final Clock clock = new SystemClock();

// The fields below are populated lazily and reused across invocations of run().
private String appName;
private SliderClient sliderClient;
private Configuration llapRegistryConf;
private LlapRegistryService llapRegistry;

// Status assembled by the most recent run(); serialized by outputJson().
@VisibleForTesting
AppStatusBuilder appStatusBuilder;
/**
 * Creates a driver using the configuration of the current {@link SessionState} when one
 * exists, or a freshly created {@link HiveConf} otherwise, and then applies the
 * CLI-specific settings via {@link #setupConf()}.
 */
public LlapStatusServiceDriver() {
  SessionState sessionState = SessionState.get();
  if (sessionState == null) {
    conf = new HiveConf(SessionState.class);
  } else {
    conf = sessionState.getConf();
  }
  setupConf();
}
/**
 * Loads the LLAP daemon config resources into {@code conf}, copies the CLI-specific
 * ({@code hive.llapcli.*}) timeout/retry settings onto the corresponding YARN, IPC and
 * ZooKeeper keys, and finally snapshots the result into {@code llapRegistryConf}.
 */
private void setupConf() {
  for (String daemonConfigResource : LlapDaemonConfiguration.DAEMON_CONFIGS) {
    conf.addResource(daemonConfigResource);
  }
  conf.reloadConfiguration();

  // Setup timeouts for various services.

  // Once we move to a Hadoop-2.8 dependency, the following parameter can be used.
  // conf.set(YarnConfiguration.TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC);
  String timelineRetryPolicySpec =
      conf.get(CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC,
          CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT);
  conf.set("yarn.timeline-service.entity-group-fs-store.retry-policy-spec",
      timelineRetryPolicySpec);

  long rmMaxWaitMs = conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS,
      CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT);
  conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS, rmMaxWaitMs);

  long rmRetryIntervalMs = conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS,
      CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT);
  conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS, rmRetryIntervalMs);

  int ipcMaxRetries = conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES,
      CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT);
  conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY, ipcMaxRetries);

  long ipcRetryIntervalMs = conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS,
      CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT);
  conf.setLong(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_RETRY_INTERVAL_KEY,
      ipcRetryIntervalMs);

  // The ZooKeeper session timeout is written as a string carrying an explicit "ms" unit.
  long zkRegistryTimeoutMs = conf.getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS,
      CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT);
  HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT,
      zkRegistryTimeoutMs + "ms");

  // The registry client gets its own copy; run() later points it at the resolved app name.
  llapRegistryConf = new Configuration(conf);
}
/**
 * Parses the command line arguments of the status tool.
 *
 * @param args raw command line arguments
 * @return the parsed command line options
 * @throws LlapStatusCliException with {@link ExitCode#INCORRECT_USAGE} when the options
 *         processor fails for any reason; the underlying failure is logged
 */
public LlapStatusOptions parseOptions(String[] args) throws LlapStatusCliException {
  LlapStatusOptionsProcessor processor = new LlapStatusOptionsProcessor();
  try {
    return processor.processOptions(args);
  } catch (Exception e) {
    LOG.info("Failed to parse arguments", e);
    throw new LlapStatusCliException(ExitCode.INCORRECT_USAGE, "Incorrect usage");
  }
}
/**
 * Collects the current LLAP application status into a fresh {@code appStatusBuilder}.
 *
 * <p>On the first invocation the application name is resolved (from the options, or from
 * the {@code LLAP_DAEMON_SERVICE_HOSTS} setting when it has the form {@code @name}) and
 * user supplied configs are applied. Status is then gathered in stages: the YARN
 * application report, Slider status, Slider diagnostics and finally the LLAP registry.
 * Later stages are skipped when the application is not found, still launching, or already
 * complete, or when an earlier stage reports a non-success exit code.
 *
 * @param options parsed command line options
 * @param watchTimeoutMs timeout handed to the LLAP registry lookup
 * @return the integer value of the resulting {@link ExitCode}
 */
public int run(LlapStatusOptions options, long watchTimeoutMs) {
  appStatusBuilder = new AppStatusBuilder();
  try {
    if (appName == null) {
      // user provided configs
      for (Map.Entry<Object, Object> props : options.getConf().entrySet()) {
        conf.set((String) props.getKey(), (String) props.getValue());
      }

      appName = options.getName();
      if (StringUtils.isEmpty(appName)) {
        String serviceHosts = HiveConf.getVar(conf, HiveConf.ConfVars.LLAP_DAEMON_SERVICE_HOSTS);
        // The setting may be absent altogether, so guard against null before inspecting it.
        if (serviceHosts != null && serviceHosts.startsWith("@") && serviceHosts.length() > 1) {
          // This is a valid slider app name. Parse it out.
          appName = serviceHosts.substring(1);
        } else {
          // Invalid app name. Checked right below.
          appName = null;
        }
      }
      if (StringUtils.isEmpty(appName)) {
        String message =
            "Invalid app name. This must be setup via config or passed in as a parameter." +
                " This tool works with clusters deployed by Slider/YARN";
        LOG.info(message);
        return ExitCode.INCORRECT_USAGE.getInt();
      }
      LOG.debug("Using appName: {}", appName);

      llapRegistryConf.set(HiveConf.ConfVars.LLAP_DAEMON_SERVICE_HOSTS.varname, "@" + appName);
    }

    try {
      // Creates the slider client on first use; later invocations reuse it.
      createSliderClient();

      // Get the App report from YARN
      ApplicationReport appReport =
          LlapSliderUtils.getAppReport(appName, sliderClient, options.getFindAppTimeoutMs());

      // Process the report to decide whether to go to slider.
      ExitCode ret = processAppReport(appReport, appStatusBuilder);
      if (ret != ExitCode.SUCCESS) {
        return ret.getInt();
      }
      if (EnumSet.of(LlapStatusHelpers.State.APP_NOT_FOUND, LlapStatusHelpers.State.COMPLETE,
          LlapStatusHelpers.State.LAUNCHING).contains(appStatusBuilder.getState())) {
        // Nothing further to fetch from slider or the registry for these states.
        return ExitCode.SUCCESS.getInt();
      }

      // Get information from slider. On failure, whatever has been constructed so far
      // (from the AppReport) stays in appStatusBuilder.
      ret = populateAppStatusFromSliderStatus(appName, sliderClient, appStatusBuilder);
      if (ret != ExitCode.SUCCESS) {
        return ret.getInt();
      }

      ret = populateAppStatusFromSliderDiagnostics(appName, sliderClient, appStatusBuilder);
      if (ret != ExitCode.SUCCESS) {
        return ret.getInt();
      }

      ret = populateAppStatusFromLlapRegistry(appStatusBuilder, watchTimeoutMs);
      return ret.getInt();
    } catch (LlapStatusCliException e) {
      // Every stage reports failures the same way: log, then surface the exit code.
      logError(e);
      return e.getExitCode().getInt();
    }
  } finally {
    LOG.debug("Final AppState: {}", appStatusBuilder);
  }
}
/**
 * Writes the status collected by the last {@code run} as pretty-printed JSON.
 *
 * @param writer destination for the JSON document (one {@code println} call)
 * @throws LlapStatusCliException with {@link ExitCode#LLAP_JSON_GENERATION_ERROR} when
 *         serialization fails
 */
public void outputJson(PrintWriter writer) throws LlapStatusCliException {
  ObjectMapper jsonMapper = new ObjectMapper();
  jsonMapper.configure(SerializationConfig.Feature.FAIL_ON_EMPTY_BEANS, false);
  // NOTE(review): two inclusion settings are applied back to back; presumably the second
  // one is what takes effect - confirm against the Jackson version in use.
  jsonMapper.setSerializationInclusion(JsonSerialize.Inclusion.NON_NULL);
  jsonMapper.setSerializationInclusion(JsonSerialize.Inclusion.NON_EMPTY);
  try {
    String json =
        jsonMapper.writerWithDefaultPrettyPrinter().writeValueAsString(appStatusBuilder);
    writer.println(json);
  } catch (IOException e) {
    LOG.warn("Failed to create JSON", e);
    throw new LlapStatusCliException(ExitCode.LLAP_JSON_GENERATION_ERROR,
        "Failed to create JSON", e);
  }
}
/**
 * Returns the cached slider client, creating it on first use.
 *
 * @return the slider client held by this driver
 * @throws LlapStatusCliException with {@link ExitCode#SLIDER_CLIENT_ERROR_CREATE_FAILED}
 *         when the client cannot be created
 */
private SliderClient createSliderClient() throws LlapStatusCliException {
  if (sliderClient == null) {
    try {
      sliderClient = LlapSliderUtils.createSliderClient(conf);
    } catch (Exception e) {
      throw new LlapStatusCliException(ExitCode.SLIDER_CLIENT_ERROR_CREATE_FAILED,
          "Failed to create slider client", e);
    }
  }
  return sliderClient;
}
/**
 * Looks up the YARN application report for {@code appName}, polling until it is found or
 * the timeout expires.
 *
 * @param appName name of the application to look for
 * @param sliderClient client used to query YARN
 * @param timeoutMs {@code 0} to look exactly once, a negative value to wait indefinitely,
 *        otherwise the maximum time to keep polling (in steps of at most 500ms)
 * @return the application report, or {@code null} if none was found in time
 * @throws LlapStatusCliException with {@link ExitCode#YARN_ERROR} when the lookup fails or
 *         the wait is interrupted
 */
private ApplicationReport getAppReport(String appName, SliderClient sliderClient,
    long timeoutMs) throws LlapStatusCliException {
  long startTime = clock.getTime();
  // Saturate rather than overflow when a very large timeout is requested.
  long timeoutTime = (timeoutMs < 0 || timeoutMs > Long.MAX_VALUE - startTime)
      ? Long.MAX_VALUE : (startTime + timeoutMs);
  ApplicationReport appReport = null;

  // TODO HIVE-13454 Maybe add an option to wait for a certain amount of time for the app to
  // move to running state. Potentially even wait for the containers to be launched.
  while (appReport == null) {
    try {
      appReport = sliderClient.getYarnAppListClient().findInstance(appName);
      if (timeoutMs == 0 || appReport != null) {
        // Single-shot lookup, or the report has been found.
        break;
      }
      // Otherwise sleep, and try again - unless the deadline has passed.
      long remainingTime = Math.min(timeoutTime - clock.getTime(), 500L);
      if (remainingTime <= 0) {
        break;
      }
      Thread.sleep(remainingTime);
    } catch (InterruptedException e) {
      // Keep the interrupt visible to callers instead of swallowing it.
      Thread.currentThread().interrupt();
      throw new LlapStatusCliException(ExitCode.YARN_ERROR,
          "Interrupted while waiting for Yarn AppReport", e);
    } catch (Exception e) { // No point separating IOException vs YarnException vs others
      throw new LlapStatusCliException(ExitCode.YARN_ERROR,
          "Failed to get Yarn AppReport", e);
    }
  }
  return appReport;
}
/**
 * Translates the YARN application report into the coarse state kept in the
 * {@code appStatusBuilder} (AM info, start/finish time, state and, for finished
 * applications, the diagnostics extracted from the report).
 *
 * @param appReport the report obtained from YARN; {@code null} means no application found
 * @param appStatusBuilder the status being assembled
 * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible
 * @throws LlapStatusCliException with {@link ExitCode#INTERNAL_ERROR} for a YARN state this
 *         method does not know about
 */
private ExitCode processAppReport(ApplicationReport appReport,
    AppStatusBuilder appStatusBuilder) throws LlapStatusCliException {
  if (appReport == null) {
    appStatusBuilder.setState(LlapStatusHelpers.State.APP_NOT_FOUND);
    LOG.info("No Application Found");
    return ExitCode.SUCCESS;
  }

  // TODO Maybe add the YARN URL for the app.
  appStatusBuilder.setAmInfo(
      new LlapStatusHelpers.AmInfo()
          .setAppName(appReport.getName())
          .setAppType(appReport.getApplicationType()));
  appStatusBuilder.setAppStartTime(appReport.getStartTime());

  switch (appReport.getYarnApplicationState()) {
    case NEW:
    case NEW_SAVING:
    case SUBMITTED:
      // The application id is not recorded for these states.
      appStatusBuilder.setState(LlapStatusHelpers.State.LAUNCHING);
      break;
    case ACCEPTED:
      recordAppId(appReport, appStatusBuilder);
      appStatusBuilder.setState(LlapStatusHelpers.State.LAUNCHING);
      break;
    case RUNNING:
      // The state itself is left untouched here: for a running app, additional
      // information is fetched from Slider by the caller.
      recordAppId(appReport, appStatusBuilder);
      break;
    case FINISHED:
    case FAILED:
    case KILLED:
      recordAppId(appReport, appStatusBuilder);
      appStatusBuilder.setAppFinishTime(appReport.getFinishTime());
      appStatusBuilder.setState(LlapStatusHelpers.State.COMPLETE);
      ApplicationDiagnostics yarnDiagnostics =
          LlapSliderUtils.getApplicationDiagnosticsFromYarnDiagnostics(appReport, LOG);
      if (yarnDiagnostics != null) {
        processAppDiagnostics(appStatusBuilder, yarnDiagnostics, true);
      } else {
        LOG.warn("AppDiagnostics not available for YARN application report");
      }
      break;
    default:
      throw new LlapStatusCliException(ExitCode.INTERNAL_ERROR,
          "Unknown Yarn Application State: " + appReport.getYarnApplicationState());
  }
  return ExitCode.SUCCESS;
}

/** Copies the application id of the report into the AM info of the status builder. */
private static void recordAppId(ApplicationReport appReport,
    AppStatusBuilder appStatusBuilder) {
  appStatusBuilder.maybeCreateAndGetAmInfo().setAppId(appReport.getApplicationId().toString());
}
/**
 * Populates information from SliderStatus: configuration paths, start time, the remaining
 * AM info fields, the desired/live container counts of the LLAP component and one running
 * {@link LlapInstance} per live LLAP container reported by Slider.
 *
 * @param appName name of the Slider application
 * @param sliderClient client used to fetch the cluster description
 * @param appStatusBuilder the status being assembled
 * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible
 * @throws LlapStatusCliException with {@link ExitCode#SLIDER_CLIENT_ERROR_OTHER} when the
 *         cluster description cannot be fetched or lacks the LLAP statistics
 */
private ExitCode populateAppStatusFromSliderStatus(String appName, SliderClient sliderClient,
    AppStatusBuilder appStatusBuilder) throws LlapStatusCliException {

  ClusterDescription clusterDescription;
  try {
    clusterDescription = sliderClient.getClusterDescription(appName);
  } catch (SliderException e) {
    throw new LlapStatusCliException(ExitCode.SLIDER_CLIENT_ERROR_OTHER,
        "Failed to get cluster description from slider. SliderErrorCode=" + e.getExitCode(), e);
  } catch (Exception e) {
    throw new LlapStatusCliException(ExitCode.SLIDER_CLIENT_ERROR_OTHER,
        "Failed to get cluster description from slider", e);
  }

  if (clusterDescription == null) {
    LOG.info("Slider ClusterDescription not available");
    return ExitCode.SLIDER_CLIENT_ERROR_OTHER; // ClusterDescription should always be present.
  }

  // Process the Cluster Status returned by slider.
  appStatusBuilder.setOriginalConfigurationPath(clusterDescription.originConfigurationPath);
  appStatusBuilder.setGeneratedConfigurationPath(clusterDescription.generatedConfigurationPath);
  appStatusBuilder.setAppStartTime(clusterDescription.createTime);

  // Finish populating AMInfo
  appStatusBuilder.maybeCreateAndGetAmInfo()
      .setAmWebUrl(clusterDescription.getInfo(StatusKeys.INFO_AM_WEB_URL));
  appStatusBuilder.maybeCreateAndGetAmInfo()
      .setHostname(clusterDescription.getInfo(StatusKeys.INFO_AM_HOSTNAME));
  appStatusBuilder.maybeCreateAndGetAmInfo()
      .setContainerId(clusterDescription.getInfo(StatusKeys.INFO_AM_CONTAINER_ID));

  if (clusterDescription.statistics == null) {
    throw new LlapStatusCliException(ExitCode.SLIDER_CLIENT_ERROR_OTHER,
        "Failed to get statistics"); // Error since statistics should always exist.
  }
  Map<String, Integer> llapStats = clusterDescription.statistics.get(LLAP_KEY);
  if (llapStats == null) {
    throw new LlapStatusCliException(ExitCode.SLIDER_CLIENT_ERROR_OTHER,
        "Failed to get statistics for LLAP"); // Error since LLAP should always exist.
  }
  // Look the counters up as boxed values: a missing entry must surface as a regular
  // CLI error rather than as a NullPointerException from auto-unboxing.
  Integer desiredContainers = llapStats.get(StatusKeys.STATISTICS_CONTAINERS_DESIRED);
  Integer liveContainers = llapStats.get(StatusKeys.STATISTICS_CONTAINERS_LIVE);
  if (desiredContainers == null || liveContainers == null) {
    throw new LlapStatusCliException(ExitCode.SLIDER_CLIENT_ERROR_OTHER,
        "Failed to get container statistics for LLAP");
  }
  appStatusBuilder.setDesiredInstances(desiredContainers);
  appStatusBuilder.setLiveInstances(liveContainers);
  // TODO HIVE-13454 Use some information from here such as containers.start.failed
  // and containers.failed.recently to provide an estimate of whether this app is healthy or not.

  // Register every live LLAP container known to slider as a running instance.
  if (clusterDescription.status != null) {
    Object liveObject = clusterDescription.status.get(ClusterDescriptionKeys.KEY_CLUSTER_LIVE);
    if (liveObject != null) {
      // The nested map layout (component -> container id -> params) is what the code
      // below relies on; the cast itself cannot be checked at runtime.
      @SuppressWarnings("unchecked")
      Map<String, Map<String, Map<String, Object>>> liveEntity =
          (Map<String, Map<String, Map<String, Object>>>) liveObject;
      Map<String, Map<String, Object>> llapEntity = liveEntity.get(LLAP_KEY);
      if (llapEntity != null) { // Not a problem. Nothing has come up yet.
        for (Map.Entry<String, Map<String, Object>> containerEntry : llapEntity.entrySet()) {
          String containerIdString = containerEntry.getKey();
          Map<String, Object> containerParams = containerEntry.getValue();
          String host = (String) containerParams.get("host");
          appStatusBuilder.addNewRunningLlapInstance(new LlapInstance(host, containerIdString));
        }
      }
    }
  }
  return ExitCode.SUCCESS;
}
/**
 * Populates information based on the slider diagnostics call (container-level
 * diagnostics). Must be invoked after populating status from slider status.
 *
 * @param appName name of the Slider application
 * @param sliderClient client used to issue the diagnostics call
 * @param appStatusBuilder the status being assembled
 * @return {@link ExitCode#SUCCESS} once the diagnostics have been processed, or
 *         {@link ExitCode#SLIDER_CLIENT_ERROR_OTHER} when slider returned no diagnostics
 * @throws LlapStatusCliException with {@link ExitCode#SLIDER_CLIENT_ERROR_OTHER} when the
 *         diagnostics call itself fails
 */
private ExitCode populateAppStatusFromSliderDiagnostics(String appName,
    SliderClient sliderClient, AppStatusBuilder appStatusBuilder)
    throws LlapStatusCliException {

  ApplicationDiagnostics containerDiagnostics;
  try {
    ActionDiagnosticArgs diagnosticArgs = new ActionDiagnosticArgs();
    diagnosticArgs.containers = true;
    diagnosticArgs.name = appName;
    containerDiagnostics = sliderClient.actionDiagnosticContainers(diagnosticArgs);
  } catch (YarnException | IOException | URISyntaxException e) {
    throw new LlapStatusCliException(ExitCode.SLIDER_CLIENT_ERROR_OTHER,
        "Failed to get container diagnostics from slider", e);
  }

  if (containerDiagnostics != null) {
    // The app is treated as still running here (appComplete == false).
    processAppDiagnostics(appStatusBuilder, containerDiagnostics, false);
    return ExitCode.SUCCESS;
  }

  LOG.info("Slider container diagnostics not available");
  return ExitCode.SLIDER_CLIENT_ERROR_OTHER;
}
/**
 * Populate additional information for containers from the LLAP registry. Must be invoked
 * after Slider status. Also after slider-diagnostics.
 *
 * <p>Instances present both in the registry and among the running instances already held
 * by the builder are enriched with their ports and URLs and kept; the resulting count
 * decides between the LAUNCHING, RUNNING_PARTIAL and RUNNING_ALL states.
 *
 * @param appStatusBuilder the status being assembled
 * @param watchTimeoutMs timeout passed to the registry when fetching instances
 * @return an ExitCode. An ExitCode other than ExitCode.SUCCESS implies future progress not possible
 * @throws LlapStatusCliException with {@link ExitCode#LLAP_REGISTRY_ERROR} when the registry
 *         client cannot be created or queried
 */
private ExitCode populateAppStatusFromLlapRegistry(
    AppStatusBuilder appStatusBuilder, long watchTimeoutMs) throws LlapStatusCliException {

  // The registry client is created once and then reused.
  if (llapRegistry == null) {
    try {
      llapRegistry = LlapRegistryService.getClient(llapRegistryConf);
    } catch (Exception e) {
      throw new LlapStatusCliException(ExitCode.LLAP_REGISTRY_ERROR,
          "Failed to create llap registry client", e);
    }
  }

  Collection<ServiceInstance> registeredInstances;
  try {
    registeredInstances = llapRegistry.getInstances(watchTimeoutMs).getAll();
  } catch (Exception e) {
    throw new LlapStatusCliException(ExitCode.LLAP_REGISTRY_ERROR,
        "Failed to get instances from llap registry", e);
  }

  if (registeredInstances == null || registeredInstances.isEmpty()) {
    // Nothing has registered yet: report zero live instances and keep waiting.
    LOG.debug("No information found in the LLAP registry");
    appStatusBuilder.setLiveInstances(0);
    appStatusBuilder.setState(LlapStatusHelpers.State.LAUNCHING);
    appStatusBuilder.clearRunningLlapInstances();
    return ExitCode.SUCCESS;
  }

  // Tracks instances known by both slider and llap.
  List<LlapInstance> confirmedInstances = new LinkedList<>();
  // Container ids present in the registry only.
  List<String> registryOnlyContainerIds = new LinkedList<>();

  for (ServiceInstance registered : registeredInstances) {
    String containerId = registered.getProperties().get(
        HiveConf.ConfVars.LLAP_DAEMON_CONTAINER_ID.varname);
    LlapInstance knownInstance =
        appStatusBuilder.removeAndGetRunningLlapInstanceForContainer(containerId);
    if (knownInstance == null) {
      // This likely indicates that an instance has recently restarted
      // (the old instance has not been unregistered), and the new instance has not
      // registered yet. It is not added back, since its services are not up yet.
      registryOnlyContainerIds.add(containerId);
      continue;
    }
    knownInstance.setMgmtPort(registered.getManagementPort());
    knownInstance.setRpcPort(registered.getRpcPort());
    knownInstance.setShufflePort(registered.getShufflePort());
    knownInstance.setWebUrl(registered.getServicesAddress());
    knownInstance.setStatusUrl(registered.getServicesAddress() + "/status");
    confirmedInstances.add(knownInstance);
  }

  int confirmedCount = confirmedInstances.size();
  appStatusBuilder.setLiveInstances(confirmedCount);
  appStatusBuilder.setLaunchingInstances(registryOnlyContainerIds.size());

  if (confirmedCount >= appStatusBuilder.getDesiredInstances()) {
    appStatusBuilder.setState(LlapStatusHelpers.State.RUNNING_ALL);
    if (confirmedCount > appStatusBuilder.getDesiredInstances()) {
      LOG.warn("Found more entries in LLAP registry, as compared to desired entries");
    }
  } else if (confirmedCount > 0) {
    appStatusBuilder.setState(LlapStatusHelpers.State.RUNNING_PARTIAL);
  } else {
    appStatusBuilder.setState(LlapStatusHelpers.State.LAUNCHING);
  }

  // At this point, everything that can be consumed from AppStatusBuilder has been consumed.
  // Debug only
  if (!appStatusBuilder.allRunningInstances().isEmpty()) {
    // Containers likely to come up soon.
    LOG.debug("Potential instances starting up: {}", appStatusBuilder.allRunningInstances());
  }
  if (!registryOnlyContainerIds.isEmpty()) {
    // Old containers which are likely shutting down, or new containers which
    // launched between slider-status/slider-diagnostics. Skip for this iteration.
    LOG.debug("Instances likely to shutdown soon: {}", registryOnlyContainerIds);
  }

  appStatusBuilder.clearAndAddPreviouslyKnownRunningInstances(confirmedInstances);
  return ExitCode.SUCCESS;
}
/**
 * Folds application diagnostics into the status builder: records the final message and
 * sorts the reported containers into running and completed LLAP instances.
 *
 * @param appStatusBuilder the status being assembled
 * @param appDiagnostics diagnostics obtained from YARN or from slider
 * @param appComplete whether the application has already completed; when {@code true}
 *        every container is recorded as completed, regardless of its reported state
 */
private static void processAppDiagnostics(AppStatusBuilder appStatusBuilder,
    ApplicationDiagnostics appDiagnostics, boolean appComplete) {
  // For a running app this should be empty.
  String finalMessage = appDiagnostics.getFinalMessage();
  Collection<ContainerInformation> reportedContainers = appDiagnostics.getContainers();
  appStatusBuilder.setDiagnostics(finalMessage);

  if (reportedContainers == null) {
    LOG.debug("ContainerInfos is null");
    return;
  }

  for (ContainerInformation container : reportedContainers) {
    boolean live = container.getState() == StateValues.STATE_LIVE;
    if (live && !appComplete) {
      // NOTE(review): the lookup goes through the "Completed" accessor although the
      // instance ends up registered as running - confirm this is the intended source.
      LlapInstance runningInstance =
          appStatusBuilder.removeAndGetCompletedLlapInstanceForContainer(
              container.getContainerId());
      if (runningInstance == null) {
        // New launch. Not available during slider status, but available now.
        runningInstance = new LlapInstance(container.getHost(), container.getContainerId());
      }
      runningInstance.setLogUrl(container.getLogLink());
      appStatusBuilder.addNewRunningLlapInstance(runningInstance);
    } else if (container.getState() == StateValues.STATE_STOPPED || appComplete) {
      LlapInstance completedInstance =
          new LlapInstance(container.getHost(), container.getContainerId());
      completedInstance.setLogUrl(container.getLogLink());
      // The YARN exit status is only recorded for a completed app, and only when valid.
      if (appComplete && container.getExitCode() != ContainerExitStatus.INVALID) {
        completedInstance.setYarnContainerExitStatus(container.getExitCode());
      }
      completedInstance.setDiagnostics(container.getDiagnostics());
      appStatusBuilder.addNewCompleteLlapInstance(completedInstance);
    } else {
      LOG.warn("Unexpected containerstate={}, for container={}",
          container.getState(), container);
    }
  }
}
/**
 * Builds a multi-line description of completed containers, one line per container, with
 * the container id and the location of its logs.
 *
 * @param completedInstances the completed instances; may be {@code null} or empty
 * @return the description, or an empty string when there is nothing to report
 */
private static String constructCompletedContainerDiagnostics(List<LlapInstance> completedInstances) {
  if (completedInstances == null || completedInstances.isEmpty()) {
    return "";
  }

  StringBuilder sb = new StringBuilder();
  // TODO HIVE-15865 Ideally sort these by completion time, once that is available.
  for (LlapInstance instance : completedInstances) {
    if (sb.length() > 0) {
      sb.append("\n");
    }
    // The exit status is only recorded for containers of a completed app, so it may be
    // absent here (assuming the getter returns a boxed value - TODO confirm). Reading it
    // into a boxed local avoids a NullPointerException from auto-unboxing in that case.
    Integer exitStatus = instance.getYarnContainerExitStatus();
    boolean killedForMemory = exitStatus != null
        && (exitStatus.intValue() == ContainerExitStatus.KILLED_EXCEEDED_PMEM
            || exitStatus.intValue() == ContainerExitStatus.KILLED_EXCEEDED_VMEM);
    if (killedForMemory) {
      sb.append("\tKILLED container (by YARN for exceeding memory limits): ");
    } else {
      // TODO HIVE-15865 Handle additional reasons like OS launch failed (Slider needs to give this info)
      sb.append("\tFAILED container: ");
    }
    sb.append(" ").append(instance.getContainerId());
    sb.append(", Logs at: ").append(instance.getLogUrl());
  }
  return sb.toString();
}
/**
 * Helper method to construct a diagnostic message from a complete
 * AppStatusBuilder.
 *
 * <p>The message depends on the builder's state; for the LAUNCHING (once an application id
 * is known), RUNNING_PARTIAL and COMPLETE states the completed-container details are
 * appended, and any diagnostics stored on the builder are appended last.
 *
 * @param appStatusBuilder the fully populated status
 * @return the human readable diagnostic message
 */
private static String constructDiagnostics(
    AppStatusBuilder appStatusBuilder) {
  StringBuilder sb = new StringBuilder();

  switch (appStatusBuilder.getState()) {
    case APP_NOT_FOUND:
      sb.append("LLAP status unknown. Awaiting app launch");
      break;
    case LAUNCHING:
      // This is a catch all state - when containers have not started yet, or LLAP has not started yet.
      if (StringUtils.isNotBlank(appStatusBuilder.getAmInfo().getAppId())) {
        sb.append("LLAP Starting up with AppId=")
            .append(appStatusBuilder.getAmInfo().getAppId()).append(".");
        if (appStatusBuilder.getDesiredInstances() != null) {
          sb.append(" Started 0/").append(appStatusBuilder.getDesiredInstances())
              .append(" instances");
        }
        appendCompletedContainerDiagnostics(sb, appStatusBuilder);
      } else {
        sb.append("Awaiting LLAP startup");
      }
      break;
    case RUNNING_PARTIAL:
      // Same layout as the LAUNCHING message: "<id>. Started <live>/<desired> instances".
      sb.append("LLAP Starting up with ApplicationId=")
          .append(appStatusBuilder.getAmInfo().getAppId()).append(".");
      sb.append(" Started ").append(appStatusBuilder.getLiveInstances())
          .append("/").append(appStatusBuilder.getDesiredInstances())
          .append(" instances");
      appendCompletedContainerDiagnostics(sb, appStatusBuilder);
      // TODO HIVE-15865: Include information about pending requests, and last allocation time
      // once Slider provides this information.
      break;
    case RUNNING_ALL:
      sb.append("LLAP Application running with ApplicationId=")
          .append(appStatusBuilder.getAmInfo().getAppId());
      break;
    case COMPLETE:
      sb.append("LLAP Application already complete. ApplicationId=")
          .append(appStatusBuilder.getAmInfo().getAppId());
      appendCompletedContainerDiagnostics(sb, appStatusBuilder);
      break;
    case UNKNOWN:
      sb.append("LLAP status unknown");
      break;
    default:
      // No state-specific text for any other state.
      break;
  }

  if (StringUtils.isNotBlank(appStatusBuilder.getDiagnostics())) {
    sb.append("\n").append(appStatusBuilder.getDiagnostics());
  }

  return sb.toString();
}

/** Appends the completed-container details on a new line, when there are any. */
private static void appendCompletedContainerDiagnostics(StringBuilder sb,
    AppStatusBuilder appStatusBuilder) {
  String containerDiagnostics = constructCompletedContainerDiagnostics(
      appStatusBuilder.getCompletedInstances());
  if (StringUtils.isNotEmpty(containerDiagnostics)) {
    sb.append("\n").append(containerDiagnostics);
  }
}
/**
 * Exit codes reported by the LLAP status tool. Each constant carries the integer value
 * that is handed back to the caller of the tool.
 */
public enum ExitCode {
  SUCCESS(0),
  INCORRECT_USAGE(10),
  YARN_ERROR(20),
  SLIDER_CLIENT_ERROR_CREATE_FAILED(30),
  SLIDER_CLIENT_ERROR_OTHER(31),
  LLAP_REGISTRY_ERROR(40),
  LLAP_JSON_GENERATION_ERROR(50),
  // Error in the script itself - likely caused by an incompatible change, or new functionality / states added.
  INTERNAL_ERROR(100);

  /** Numeric value associated with this exit code. */
  private final int code;

  ExitCode(int code) {
    this.code = code;
  }

  /**
   * @return the numeric value of this exit code
   */
  public int getInt() {
    return this.code;
  }
}
/**
 * Checked exception raised by the status tool. It carries the {@link ExitCode} the tool
 * should terminate with; the exception message is always prefixed with the numeric exit
 * code, whether or not a cause is supplied.
 */
public static class LlapStatusCliException extends Exception {
  final LlapStatusServiceDriver.ExitCode exitCode;

  /**
   * @param exitCode exit code the tool should report
   * @param message description of the failure
   */
  public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message) {
    super(formatMessage(exitCode, message));
    this.exitCode = exitCode;
  }

  /**
   * @param exitCode exit code the tool should report
   * @param message description of the failure
   * @param cause underlying failure
   */
  public LlapStatusCliException(LlapStatusServiceDriver.ExitCode exitCode, String message,
      Throwable cause) {
    // Formatted the same way as in the two-argument constructor, so that messages look
    // alike regardless of whether a cause is attached.
    super(formatMessage(exitCode, message), cause);
    this.exitCode = exitCode;
  }

  /**
   * @return the exit code associated with this failure
   */
  public LlapStatusServiceDriver.ExitCode getExitCode() {
    return exitCode;
  }

  /** Builds the message in the form {@code <numeric exit code>: <message>}. */
  private static String formatMessage(LlapStatusServiceDriver.ExitCode exitCode,
      String message) {
    return exitCode.getInt() + ": " + message;
  }
}
/**
 * Reports a failure both to the log (with the stack trace) and to standard error
 * (message only).
 *
 * @param t the failure to report
 */
private static void logError(Throwable t) {
  String failureLine = "FAILED: " + t.getMessage();
  LOG.error(failureLine, t);
  System.err.println(failureLine);
}
public static void main(String[] args) {
LOG.info("LLAP status invoked with arguments = {}", Arrays.toString(args));
int ret = ExitCode.SUCCESS.getInt();
Clock clock = new SystemClock();
long startTime = clock.getTime();
long lastSummaryLogTime = -1;
LlapStatusServiceDriver statusServiceDriver = null;
LlapStatusOptions options = null;
try {
statusServiceDriver = new LlapStatusServiceDriver();
options = statusServiceDriver.parseOptions(args);
} catch (Throwable t) {
statusServiceDriver.close();
logError(t);
if (t instanceof LlapStatusCliException) {
LlapStatusCliException
ce = (LlapStatusCliException) t;
ret = ce.getExitCode().getInt();
} else {
ret = ExitCode.INTERNAL_ERROR.getInt();
}
}
if (ret != 0 || options == null) { // Failure / help
if (statusServiceDriver != null) {
statusServiceDriver.close();
}
System.exit(ret);
}
boolean firstAttempt = true;
final long refreshInterval = options.getRefreshIntervalMs();
final boolean watchMode = options.isWatchMode();
final long watchTimeout = options.getWatchTimeoutMs();
long numAttempts = watchTimeout / refreshInterval;
numAttempts = watchMode ? numAttempts : 1; // Break out of the loop fast if watchMode is disabled.
LlapStatusHelpers.State launchingState = null;
LlapStatusHelpers.State currentState = null;
boolean desiredStateAttained = false;
final float runningNodesThreshold = options.getRunningNodesThreshold();
try (OutputStream os = options.getOutputFile() == null ? System.out :
new BufferedOutputStream(new FileOutputStream(options.getOutputFile()));
PrintWriter pw = new PrintWriter(os)) {
LOG.info("Configured refresh interval: {}s. Watch timeout: {}s. Attempts remaining: {}." +
" Watch mode: {}. Running nodes threshold: {}.",
TimeUnit.SECONDS.convert(refreshInterval, TimeUnit.MILLISECONDS),
TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS),
numAttempts, watchMode, new DecimalFormat("#.###").format(runningNodesThreshold));
while (numAttempts > 0) {
try {
if (!firstAttempt) {
if (watchMode) {
try {
Thread.sleep(refreshInterval);
} catch (InterruptedException e) {
// ignore
}
} else {
// reported once, so break
break;
}
} else {
firstAttempt = false;
}
ret = statusServiceDriver.run(options, watchMode ? watchTimeout : 0);
currentState = statusServiceDriver.appStatusBuilder.getState();
try {
lastSummaryLogTime = LlapStatusServiceDriver
.maybeLogSummary(clock, lastSummaryLogTime, statusServiceDriver,
watchMode, watchTimeout, launchingState);
} catch (Exception e) {
LOG.warn("Failed to log summary", e);
}
if (ret == ExitCode.SUCCESS.getInt()) {
if (watchMode) {
// slider has started llap application, now if for some reason state changes to COMPLETE then fail fast
if (launchingState == null &&
(EnumSet.of(LlapStatusHelpers.State.LAUNCHING,
LlapStatusHelpers.State.RUNNING_PARTIAL,
LlapStatusHelpers.State.RUNNING_ALL)
.contains(currentState))) {
launchingState = currentState;
}
if (launchingState != null && currentState.equals(
LlapStatusHelpers.State.COMPLETE)) {
LOG.warn("Application stopped while launching. COMPLETE state reached while waiting for RUNNING state."
+ " Failing " + "fast..");
break;
}
if (!(currentState.equals(LlapStatusHelpers.State.RUNNING_PARTIAL) || currentState.equals(
LlapStatusHelpers.State.RUNNING_ALL))) {
if (LOG.isDebugEnabled()) {
LOG.debug(
"Current state: {}. Desired state: {}. {}/{} instances.",
currentState,
runningNodesThreshold == 1.0f ?
LlapStatusHelpers.State.RUNNING_ALL :
LlapStatusHelpers.State.RUNNING_PARTIAL,
statusServiceDriver.appStatusBuilder.getLiveInstances(),
statusServiceDriver.appStatusBuilder
.getDesiredInstances());
}
numAttempts--;
continue;
}
// we have reached RUNNING state, now check if running nodes threshold is met
final int liveInstances = statusServiceDriver.appStatusBuilder.getLiveInstances();
final int desiredInstances = statusServiceDriver.appStatusBuilder.getDesiredInstances();
if (desiredInstances > 0) {
final float ratio = (float) liveInstances / (float) desiredInstances;
if (ratio < runningNodesThreshold) {
if (LOG.isDebugEnabled()) {
LOG.debug(
"Waiting until running nodes threshold is reached. Current: {} Desired: {}." +
" {}/{} instances.",
new DecimalFormat("#.###").format(ratio),
new DecimalFormat("#.###")
.format(runningNodesThreshold),
statusServiceDriver.appStatusBuilder.getLiveInstances(),
statusServiceDriver.appStatusBuilder
.getDesiredInstances());
}
numAttempts--;
continue;
} else {
desiredStateAttained = true;
statusServiceDriver.appStatusBuilder.setRunningThresholdAchieved(true);
}
} else {
numAttempts--;
continue;
}
}
} else if (ret == ExitCode.YARN_ERROR.getInt() && watchMode) {
LOG.warn("Watch mode enabled and got YARN error. Retrying..");
numAttempts--;
continue;
} else if (ret == ExitCode.SLIDER_CLIENT_ERROR_CREATE_FAILED.getInt() && watchMode) {
LOG.warn("Watch mode enabled and slider client creation failed. Retrying..");
numAttempts--;
continue;
} else if (ret == ExitCode.SLIDER_CLIENT_ERROR_OTHER.getInt() && watchMode) {
LOG.warn("Watch mode enabled and got slider client error. Retrying..");
numAttempts--;
continue;
} else if (ret == ExitCode.LLAP_REGISTRY_ERROR.getInt() && watchMode) {
LOG.warn("Watch mode enabled and got LLAP registry error. Retrying..");
numAttempts--;
continue;
}
break;
} finally {
// TODO Remove this before commit.
}
}
// Log final state to CONSOLE_LOGGER
LlapStatusServiceDriver
.maybeLogSummary(clock, 0L, statusServiceDriver,
watchMode, watchTimeout, launchingState);
CONSOLE_LOGGER.info("\n\n\n");
// print current state before exiting
statusServiceDriver.outputJson(pw);
os.flush();
pw.flush();
if (numAttempts == 0 && watchMode && !desiredStateAttained) {
LOG.warn("Watch timeout {}s exhausted before desired state RUNNING is attained.",
TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS));
}
} catch (Throwable t) {
logError(t);
if (t instanceof LlapStatusCliException) {
LlapStatusCliException
ce = (LlapStatusCliException) t;
ret = ce.getExitCode().getInt();
} else {
ret = ExitCode.INTERNAL_ERROR.getInt();
}
} finally {
LOG.info("LLAP status finished");
statusServiceDriver.close();
}
if (LOG.isDebugEnabled()) {
LOG.debug("Completed processing - exiting with " + ret);
}
System.exit(ret);
}
/**
 * Writes a status summary to the console logger when the previous summary is older than
 * {@code LOG_SUMMARY_INTERVAL} (compared against {@code clock.getTime()}).
 *
 * <p>A banner is emitted ahead of the summary only when {@code lastSummaryLogTime} is
 * {@code -1}. In watch mode, when no launching state has been recorded yet and the
 * application state is COMPLETE, a fixed "awaiting launch" message is logged and the
 * completed instances held by the status builder are cleared; otherwise the text comes
 * from {@code constructDiagnostics}.
 *
 * @param clock source of the current time
 * @param lastSummaryLogTime time the previous summary was logged; {@code -1} requests the banner
 * @param statusServiceDriver driver whose {@code appStatusBuilder} supplies the state to report
 * @param watchMode whether the tool is running in watch mode
 * @param watchTimeout watch timeout in milliseconds (only used for the banner text)
 * @param launchingState first launching/running state observed so far, or {@code null}
 * @return the current time if a summary was logged, otherwise {@code lastSummaryLogTime} unchanged
 */
private static long maybeLogSummary(Clock clock, long lastSummaryLogTime,
    LlapStatusServiceDriver statusServiceDriver,
    boolean watchMode, long watchTimeout, LlapStatusHelpers.State launchingState) {
  final long now = clock.getTime();
  if (lastSummaryLogTime >= now - LOG_SUMMARY_INTERVAL) {
    // The previous summary is still recent enough; nothing to log.
    return lastSummaryLogTime;
  }

  final String separator =
      "--------------------------------------------------------------------------------";

  // The first known state was COMPLETE while watching: the app launch has not started yet.
  final boolean awaitingLaunch = launchingState == null
      && statusServiceDriver.appStatusBuilder.getState() == LlapStatusHelpers.State.COMPLETE
      && watchMode;

  final String summary;
  if (awaitingLaunch) {
    summary = "Awaiting LLAP launch";
    // Drop completed instances so information from the previous run is not reported.
    statusServiceDriver.appStatusBuilder.clearCompletedLlapInstances();
  } else {
    summary = constructDiagnostics(statusServiceDriver.appStatusBuilder);
  }

  if (lastSummaryLogTime == -1) {
    if (!watchMode) {
      CONSOLE_LOGGER.info("\nLLAPSTATUS");
    } else {
      CONSOLE_LOGGER.info("\nLLAPSTATUS WatchMode with timeout={} s",
          TimeUnit.SECONDS.convert(watchTimeout, TimeUnit.MILLISECONDS));
    }
    CONSOLE_LOGGER.info(separator);
  }

  CONSOLE_LOGGER.info(summary);
  CONSOLE_LOGGER.info(separator);
  return now;
}
/**
 * Stops the slider client and the LLAP registry, skipping whichever of them is {@code null}.
 *
 * <p>The registry is stopped from a {@code finally} block so that an exception thrown while
 * stopping the slider client cannot leave the registry running. Any such exception still
 * propagates to the caller once the registry has been stopped.
 */
private void close() {
  try {
    if (sliderClient != null) {
      sliderClient.stop();
    }
  } finally {
    // Always attempt this, even if stopping the slider client failed.
    if (llapRegistry != null) {
      llapRegistry.stop();
    }
  }
}
}