package com.alibaba.jstorm.yarn.appmaster;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.UndeclaredThrowableException;
import java.net.*;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.LinkedBlockingQueue;
import com.alibaba.jstorm.yarn.constants.JOYConstants;
import com.alibaba.jstorm.yarn.Log4jPropertyHelper;
import com.alibaba.jstorm.yarn.container.ExecutorLoader;
import com.alibaba.jstorm.yarn.context.JstormMasterContext;
import com.alibaba.jstorm.yarn.model.DSEntity;
import com.alibaba.jstorm.yarn.model.DSEvent;
import com.alibaba.jstorm.yarn.model.STARTType;
import com.alibaba.jstorm.yarn.registry.SlotPortsView;
import com.alibaba.jstorm.yarn.server.AMServer;
import com.alibaba.jstorm.yarn.utils.JstormYarnUtils;
import com.alibaba.jstorm.yarn.utils.PortScanner;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.registry.client.api.BindFlags;
import org.apache.hadoop.registry.client.api.RegistryOperations;
import org.apache.hadoop.registry.client.api.RegistryOperationsFactory;
import org.apache.hadoop.registry.client.binding.RegistryUtils;
import org.apache.hadoop.registry.client.types.Endpoint;
import org.apache.hadoop.registry.client.types.ServiceRecord;
import org.apache.hadoop.registry.client.types.yarn.PersistencePolicies;
import org.apache.hadoop.registry.client.types.yarn.YarnRegistryAttributes;
import org.apache.hadoop.registry.server.integration.RMRegistryOperationsService;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.ExitUtil;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.ContainerManagementProtocol;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.*;
import org.apache.hadoop.yarn.api.records.URL;
import org.apache.hadoop.yarn.api.records.timeline.TimelineEntity;
import org.apache.hadoop.yarn.api.records.timeline.TimelineEvent;
import org.apache.hadoop.yarn.api.records.timeline.TimelinePutResponse;
import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest;
import org.apache.hadoop.yarn.client.api.TimelineClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.client.api.async.NMClientAsync;
import org.apache.hadoop.yarn.client.api.async.impl.NMClientAsyncImpl;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.log4j.LogManager;
import com.google.common.annotations.VisibleForTesting;
import static com.alibaba.jstorm.yarn.constants.JOYConstants.*;
/**
* Created by fengjian on 16/4/7.
* Application master
*/
public class JstormMaster {
private static final Log LOG = LogFactory.getLog(JstormMaster.class);
public JstormMasterContext jstormMasterContext = new JstormMasterContext();
private Configuration conf;
// Handle to communicate with the Resource Manager
public AMRMClientAsync amRMClient;
// In both secure and non-secure modes, this points to the job-submitter.
@VisibleForTesting
UserGroupInformation appSubmitterUgi;
// Handle to communicate with the Node Manager
private NMClientAsync nmClientAsync;
// Listen to process the response from the Node Manager
private NMCallbackHandler containerListener;
// Launch threads
private List<Thread> launchThreads = new ArrayList<Thread>();
// Timeline Client
@VisibleForTesting
TimelineClient timelineClient;
private PortScanner portScanner;
/**
* The YARN registry service
*/
@SuppressWarnings("FieldAccessedSynchronizedAndUnsynchronized")
public RegistryOperations registryOperations;
public void killApplicationMaster() {
jstormMasterContext.done = true;
}
/**
* @param args Command line args
*/
public static void main(String[] args) {
boolean result = false;
try {
JstormMaster appMaster = new JstormMaster();
LOG.info("Initializing Jstorm Master!");
boolean doRun = appMaster.init(args);
if (!doRun) {
System.exit(JOYConstants.EXIT_SUCCESS);
}
appMaster.run();
// LRS won't finish at all
result = appMaster.finish();
} catch (Throwable t) {
LOG.fatal("Error running JstormMaster", t);
LogManager.shutdown();
ExitUtil.terminate(JOYConstants.EXIT_FAIL1, t);
}
if (result) {
LOG.info("Application Master completed successfully. exiting");
System.exit(JOYConstants.EXIT_SUCCESS);
} else {
LOG.info("Application Master failed. exiting");
System.exit(JOYConstants.EXIT_FAIL2);
}
}
public JstormMaster() {
// Set up the configuration
conf = new YarnConfiguration();
Path jstormyarnConfPath = new Path("jstorm-yarn.xml");
conf.addResource(jstormyarnConfPath);
}
/**
* Parse command line options
*
* @param args Command line args
* @return Whether init successful and run should be invoked
* @throws ParseException
* @throws IOException
*/
public boolean init(String[] args) throws ParseException, IOException {
Options opts = new Options();
opts.addOption(JOYConstants.APP_ATTEMPT_ID, true,
"App Attempt ID. Not to be used unless for testing purposes");
opts.addOption(JOYConstants.SHELL_SCRIPT, true,
"Environment for shell script. Specified as env_key=env_val pairs");
opts.addOption(JOYConstants.CONTAINER_MEMORY, true,
"Amount of memory in MB to be requested to run the shell command");
opts.addOption(JOYConstants.CONTAINER_VCORES, true,
"Amount of virtual cores to be requested to run the shell command");
opts.addOption(JOYConstants.NUM_CONTAINERS, true,
"No. of containers on which the shell command needs to be executed");
opts.addOption(JOYConstants.PRIORITY, true, "Application Priority. Default 0");
opts.addOption(JOYConstants.DEBUG, false, "Dump out debug information");
opts.addOption(JOYConstants.HELP, false, "Print usage");
if (args.length == 0) {
printUsage(opts);
throw new IllegalArgumentException(
"No args specified for application master to initialize");
}
try {
CommandLine cliParser = new GnuParser().parse(opts, args);
JstormYarnUtils.checkAndSetMasterOptions(cliParser, jstormMasterContext, this.conf);
} catch (Exception e) {
LOG.error(e);
}
return true;
}
/**
* Helper function to print usage
*
* @param opts Parsed command line options
*/
private void printUsage(Options opts) {
new HelpFormatter().printHelp("com.alibaba.jstorm.yarn.appmaster.JstormMaster", opts);
}
/**
* Build up the port scanner. This may include setting a port range.
*/
private void buildPortScanner() {
portScanner = new PortScanner();
portScanner.setPortRange(JOYConstants.PORT_RANGE);
}
/**
* Main run function for the application master
*
* @throws YarnException
* @throws IOException
*/
@SuppressWarnings({"unchecked"})
public void run() throws Exception {
LOG.info("Starting JstormMaster");
Credentials credentials =
UserGroupInformation.getCurrentUser().getCredentials();
DataOutputBuffer dob = new DataOutputBuffer();
credentials.writeTokenStorageToStream(dob);
// Now remove the AM->RM token so that containers cannot access it.
Iterator<Token<?>> iter = credentials.getAllTokens().iterator();
LOG.info("Executing with tokens:");
while (iter.hasNext()) {
Token<?> token = iter.next();
LOG.info(token);
if (token.getKind().equals(AMRMTokenIdentifier.KIND_NAME)) {
iter.remove();
}
}
jstormMasterContext.allTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
// Create appSubmitterUgi and add original tokens to it
String appSubmitterUserName =
System.getenv(ApplicationConstants.Environment.USER.name());
appSubmitterUgi =
UserGroupInformation.createRemoteUser(appSubmitterUserName);
appSubmitterUgi.addCredentials(credentials);
AMRMClientAsync.CallbackHandler allocListener = new RMCallbackHandler();
amRMClient = AMRMClientAsync.createAMRMClientAsync(JOYConstants.AM_RM_CLIENT_INTERVAL, allocListener);
jstormMasterContext.amRMClient = amRMClient;
amRMClient.init(conf);
amRMClient.start();
containerListener = createNMCallbackHandler();
nmClientAsync = new NMClientAsyncImpl(containerListener);
nmClientAsync.init(conf);
nmClientAsync.start();
startTimelineClient(conf);
if (timelineClient != null) {
publishApplicationAttemptEvent(timelineClient, jstormMasterContext.appAttemptID.toString(),
DSEvent.DS_APP_ATTEMPT_START, jstormMasterContext.domainId, appSubmitterUgi);
}
// Register self with ResourceManager
// This will start heartbeating to the RM
jstormMasterContext.appMasterHostname = NetUtils.getHostname();
//get available port
buildPortScanner();
jstormMasterContext.appMasterThriftPort = portScanner.getAvailablePort();
//since appMasterRpcPort not used yet, set appMasterRpcPort to appMasterThriftPort
jstormMasterContext.appMasterRpcPort = jstormMasterContext.appMasterThriftPort;
RegisterApplicationMasterResponse response = amRMClient
.registerApplicationMaster(jstormMasterContext.appMasterHostname, jstormMasterContext.appMasterRpcPort,
jstormMasterContext.appMasterTrackingUrl);
// Dump out information about cluster capability as seen by the
// resource manager
jstormMasterContext.maxMemory = response.getMaximumResourceCapability().getMemory();
LOG.info("Max mem capability of resources in this cluster " + jstormMasterContext.maxMemory);
jstormMasterContext.maxVcores = response.getMaximumResourceCapability().getVirtualCores();
LOG.info("Max vcores capability of resources in this cluster " + jstormMasterContext.maxVcores);
// A resource ask cannot exceed the max.
if (jstormMasterContext.containerMemory > jstormMasterContext.maxMemory) {
LOG.info("Container memory specified above max threshold of cluster."
+ " Using max value." + ", specified=" + jstormMasterContext.containerMemory + ", max="
+ jstormMasterContext.maxMemory);
jstormMasterContext.containerMemory = jstormMasterContext.maxMemory;
}
if (jstormMasterContext.containerVirtualCores > jstormMasterContext.maxVcores) {
LOG.info("Container virtual cores specified above max threshold of cluster."
+ " Using max value." + ", specified=" + jstormMasterContext.containerVirtualCores + ", max="
+ jstormMasterContext.maxVcores);
jstormMasterContext.containerVirtualCores = jstormMasterContext.maxVcores;
}
List<Container> previousAMRunningContainers =
response.getContainersFromPreviousAttempts();
LOG.info(jstormMasterContext.appAttemptID + " received " + previousAMRunningContainers.size()
+ " previous attempts' running containers on AM registration.");
jstormMasterContext.numAllocatedContainers.addAndGet(previousAMRunningContainers.size());
//Setup RegistryOperations
registryOperations = RegistryOperationsFactory.createInstance(JOYConstants.YARN_REGISTRY, conf);
setupInitialRegistryPaths();
registryOperations.start();
//add previous AM containers to supervisor and nimbus container list
for (Container container : previousAMRunningContainers) {
String containerPath = RegistryUtils.componentPath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName,
container.getId().getApplicationAttemptId().getApplicationId().toString(), container.getId().toString());
ServiceRecord sr = null;
try {
if (!registryOperations.exists(containerPath)) {
String contianerHost = container.getNodeId().getHost();
registryOperations.mknode(containerPath, true);
sr = new ServiceRecord();
sr.set(JOYConstants.HOST, contianerHost);
sr.set(YarnRegistryAttributes.YARN_ID, container.getId().toString());
sr.description = JOYConstants.CONTAINER;
sr.set(YarnRegistryAttributes.YARN_PERSISTENCE,
PersistencePolicies.CONTAINER);
registryOperations.bind(containerPath, sr, BindFlags.OVERWRITE);
}
} catch (IOException e) {
e.printStackTrace();
}
if (container.getPriority().getPriority() == 0)
jstormMasterContext.supervisorContainers.add(container);
else if (container.getPriority().getPriority() == 1) {
jstormMasterContext.nimbusContainers.add(container);
}
}
jstormMasterContext.requestBlockingQueue = new LinkedBlockingQueue<ContainerRequest>();
jstormMasterContext.service_user_name = RegistryUtils.currentUser();
jstormMasterContext.instanceName = conf.get(JOYConstants.INSTANCE_NAME_KEY);
this.jstormMasterContext.user = conf.get(JOYConstants.JSTORM_YARN_USER);
this.jstormMasterContext.password = conf.get(JOYConstants.JSTORM_YARN_PASSWORD);
this.jstormMasterContext.oldPassword = conf.get(JOYConstants.JSTORM_YARN_OLD_PASSWORD);
LOG.info("find available port for am rpc server which is : " + jstormMasterContext.appMasterThriftPort);
String appPath = RegistryUtils.servicePath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName, jstormMasterContext.appAttemptID.getApplicationId().toString());
String instancePath = RegistryUtils.serviceclassPath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName);
LOG.info("Registering application " + jstormMasterContext.appAttemptID.getApplicationId().toString());
ServiceRecord application = setupServiceRecord();
jstormMasterContext.nimbusDataDirPrefix = conf.get(JOYConstants.INSTANCE_DATA_DIR_KEY);
LOG.info("generate instancePath on zk , path is:" + instancePath);
if (registryOperations.exists(instancePath)) {
ServiceRecord previousRegister = registryOperations.resolve(instancePath);
application.set(JOYConstants.NIMBUS_HOST, previousRegister.get(JOYConstants.NIMBUS_HOST, JOYConstants.EMPTY));
application.set(JOYConstants.NIMBUS_CONTAINER, previousRegister.get(JOYConstants.NIMBUS_CONTAINER, JOYConstants.EMPTY));
application.set(JOYConstants.NIMBUS_LOCAL_DIR, previousRegister.get(JOYConstants.NIMBUS_LOCAL_DIR, JOYConstants.EMPTY));
jstormMasterContext.previousNimbusHost = previousRegister.get(JOYConstants.NIMBUS_HOST, "");
Date now = new Date();
Map<String, ServiceRecord> apps = RegistryUtils.listServiceRecords(registryOperations, instancePath);
for (String subAppPath : apps.keySet()) {
LOG.info("existApp:" + subAppPath);
ServiceRecord subApp = apps.get(subAppPath);
Long lastHeatBeatTime = 0l;
try {
lastHeatBeatTime = Long.parseLong(subApp.get(JOYConstants.APP_HEARTBEAT_TIME));
} catch (Exception e) {
LOG.error(e);
}
if (now.getTime() - lastHeatBeatTime > 5 * JOYConstants.HEARTBEAT_TIME_INTERVAL
|| lastHeatBeatTime > now.getTime() || subAppPath.trim().equals(appPath.trim())) {
LOG.info("application " + subAppPath + " not response , delete it!");
registryOperations.delete(subAppPath, true);
}
}
}
if (!jstormMasterContext.done) {
jstormMasterContext.config = conf;
registryOperations.mknode(appPath, true);
registryOperations.bind(instancePath, application, BindFlags.OVERWRITE);
ServiceRecord previousRegister = registryOperations.resolve(instancePath);
LOG.info("previousRegister:" + previousRegister.toString());
LOG.info("register path: " + instancePath);
AMServer as = new AMServer(jstormMasterContext.appMasterThriftPort);
as.Start(this);
}
}
private ServiceRecord setupServiceRecord() {
ServiceRecord application = new ServiceRecord();
application.set(YarnRegistryAttributes.YARN_ID, jstormMasterContext.appAttemptID.getApplicationId().toString());
application.description = JOYConstants.AM;
application.set(YarnRegistryAttributes.YARN_PERSISTENCE,
PersistencePolicies.PERMANENT);
Map<String, String> addresses = new HashMap<String, String>();
addresses.put(JOYConstants.HOST, jstormMasterContext.appMasterHostname);
addresses.put(JOYConstants.PORT, String.valueOf(jstormMasterContext.appMasterThriftPort));
Endpoint endpoint = new Endpoint(JOYConstants.HTTP, JOYConstants.HOST_PORT, JOYConstants.RPC, addresses);
application.addExternalEndpoint(endpoint);
return application;
}
void startTimelineClient(final Configuration conf)
throws YarnException, IOException, InterruptedException {
try {
appSubmitterUgi.doAs(new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws Exception {
if (conf.getBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED,
YarnConfiguration.DEFAULT_TIMELINE_SERVICE_ENABLED)) {
// Creating the Timeline Client
timelineClient = TimelineClient.createTimelineClient();
timelineClient.init(conf);
timelineClient.start();
} else {
timelineClient = null;
LOG.warn("Timeline service is not enabled");
}
return null;
}
});
} catch (UndeclaredThrowableException e) {
throw new YarnException(e.getCause());
}
}
NMCallbackHandler createNMCallbackHandler() {
return new NMCallbackHandler(this);
}
protected boolean finish() {
// wait for completion.
String appPath;
while (!jstormMasterContext.done
) {
try {
Thread.sleep(JOYConstants.HEARTBEAT_TIME_INTERVAL);
appPath = RegistryUtils.servicePath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName, jstormMasterContext.appAttemptID.getApplicationId().toString());
ServiceRecord app = new ServiceRecord();
Date now = new Date();
app.set(JOYConstants.APP_HEARTBEAT_TIME, String.valueOf(now.getTime()));
registryOperations.bind(appPath, app, BindFlags.OVERWRITE);
} catch (Exception ex) {
LOG.error(ex);
}
}
if (timelineClient != null) {
publishApplicationAttemptEvent(timelineClient, jstormMasterContext.appAttemptID.toString(),
DSEvent.DS_APP_ATTEMPT_END, jstormMasterContext.domainId, appSubmitterUgi);
}
appPath = RegistryUtils.servicePath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName, jstormMasterContext.appAttemptID.getApplicationId().toString());
try {
registryOperations.delete(appPath, true);
LOG.info("unRegister application' appPath:" + appPath);
} catch (IOException e) {
LOG.error("Failed to unRegister application's Registry", e);
}
// Join all launched threads
for (Thread launchThread : launchThreads) {
try {
launchThread.join(JOYConstants.JOIN_THREAD_TIMEOUT);
} catch (InterruptedException e) {
LOG.info("Exception thrown in thread join: " + e.getMessage());
e.printStackTrace();
}
}
// When the application completes, it should stop all running containers
LOG.info("Application completed. Stopping running containers");
nmClientAsync.stop();
// When the application completes, it should send a finish application
// signal to the RM
LOG.info("Application completed. Signalling finish to RM");
FinalApplicationStatus appStatus;
String appMessage = null;
boolean success = true;
if (jstormMasterContext.numFailedContainers.get() == 0 &&
jstormMasterContext.numCompletedContainers.get() == jstormMasterContext.numTotalContainers) {
appStatus = FinalApplicationStatus.SUCCEEDED;
} else {
appStatus = FinalApplicationStatus.FAILED;
appMessage = "Diagnostics." + ", total=" + jstormMasterContext.numTotalContainers
+ ", completed=" + jstormMasterContext.numCompletedContainers.get() + ", allocated="
+ jstormMasterContext.numAllocatedContainers.get() + ", failed="
+ jstormMasterContext.numFailedContainers.get();
LOG.info(appMessage);
success = false;
}
try {
amRMClient.unregisterApplicationMaster(appStatus, appMessage, null);
} catch (YarnException ex) {
LOG.error("Failed to unregister application", ex);
} catch (IOException e) {
LOG.error("Failed to unregister application", e);
}
amRMClient.stop();
// Stop Timeline Client
if (timelineClient != null) {
timelineClient.stop();
}
return success;
}
private class RMCallbackHandler implements AMRMClientAsync.CallbackHandler {
@SuppressWarnings("unchecked")
@Override
public void onContainersCompleted(List<ContainerStatus> completedContainers) {
LOG.info("Got response from RM for container ask, completedCnt="
+ completedContainers.size());
for (ContainerStatus containerStatus : completedContainers) {
LOG.info(jstormMasterContext.appAttemptID + " got container status for containerID="
+ containerStatus.getContainerId() + ", state="
+ containerStatus.getState() + ", exitStatus="
+ containerStatus.getExitStatus() + ", diagnostics="
+ containerStatus.getDiagnostics());
// non complete containers should not be here
assert (containerStatus.getState() == ContainerState.COMPLETE);
Map<Long, Container> nimbusMap = new HashMap<Long, Container>();
for (Container container : jstormMasterContext.nimbusContainers) {
nimbusMap.put(container.getId().getContainerId(), container);
}
Map<Long, Container> supervisorMap = new HashMap<Long, Container>();
for (Container container : jstormMasterContext.supervisorContainers) {
supervisorMap.put(container.getId().getContainerId(), container);
}
Long containerId = containerStatus.getContainerId().getContainerId();
// increment counters for completed/failed containers
int exitStatus = containerStatus.getExitStatus();
if (JOYConstants.EXIT_SUCCESS != exitStatus) {
// container failed
if (ContainerExitStatus.ABORTED != exitStatus) {
jstormMasterContext.numCompletedContainers.incrementAndGet();
jstormMasterContext.numFailedContainers.incrementAndGet();
} else {
// container was killed by framework, possibly preempted
// we should re-try as the container was lost for some reason
jstormMasterContext.numAllocatedContainers.decrementAndGet();
jstormMasterContext.numRequestedContainers.decrementAndGet();
}
if (nimbusMap.containsKey(containerId)) {
jstormMasterContext.nimbusContainers.remove(nimbusMap.get(containerId));
} else if (supervisorMap.containsKey(containerId)) {
jstormMasterContext.supervisorContainers.remove(supervisorMap.get(containerId));
}
} else {
//if container over and wasn't killed by framework ,then resend ContainerRequest and launch it again
jstormMasterContext.numCompletedContainers.incrementAndGet();
LOG.info("process in this Container completed by itself, should restart." + ", containerId="
+ containerStatus.getContainerId());
ContainerRequest containerAsk = null;
if (nimbusMap.containsKey(containerId)) {
Container nimbusContainer = nimbusMap.get(containerId);
containerAsk = setupContainerAskForRM(nimbusContainer.getResource().getMemory(), nimbusContainer.getResource().getVirtualCores(), nimbusContainer.getPriority().getPriority(), nimbusContainer.getNodeId().getHost());
LOG.info("restart nimbus container" + ", containerId="
+ containerStatus.getContainerId());
} else if (supervisorMap.containsKey(containerId)) {
Container supervisorContainer = supervisorMap.get(containerId);
containerAsk = setupContainerAskForRM(supervisorContainer.getResource().getMemory(), supervisorContainer.getResource().getVirtualCores(), supervisorContainer.getPriority().getPriority(), supervisorContainer.getNodeId().getHost());
LOG.info("restart supervisor container" + ", containerId="
+ containerStatus.getContainerId());
} else {
LOG.info("restart failed. cant find this container in exist queue" + ", containerId="
+ containerStatus.getContainerId());
}
if (containerAsk != null) {
amRMClient.addContainerRequest(containerAsk);
try {
jstormMasterContext.requestBlockingQueue.put(containerAsk);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
String containerPath = RegistryUtils.componentPath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName, jstormMasterContext.appAttemptID.getApplicationId().toString(), containerStatus.getContainerId().toString());
try {
if (registryOperations.exists(containerPath)) {
registryOperations.delete(containerPath, true);
}
} catch (Exception ex) {
LOG.error("error to delete registry of container when container complete", ex);
}
if (timelineClient != null) {
publishContainerEndEvent(
timelineClient, containerStatus, jstormMasterContext.domainId, appSubmitterUgi);
}
}
}
@Override
public void onContainersAllocated(List<Container> allocatedContainers) {
LOG.info("Got response from RM for container ask, allocatedCnt="
+ allocatedContainers.size());
jstormMasterContext.numAllocatedContainers.addAndGet(allocatedContainers.size());
for (Container allocatedContainer : allocatedContainers) {
LOG.info("Launching shell command on a new container."
+ ", containerId=" + allocatedContainer.getId()
+ ", containerNode=" + allocatedContainer.getNodeId().getHost()
+ ":" + allocatedContainer.getNodeId().getPort()
+ ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress()
+ ", containerResourceMemory"
+ allocatedContainer.getResource().getMemory()
+ ", containerResourceVirtualCores"
+ allocatedContainer.getResource().getVirtualCores());
// check priority to assign start type . this priority was assigned by JstormAMHandler
STARTType startType;
//todo: register every supervisor containers host
if (allocatedContainer.getPriority().getPriority() == 0) {
String supervisorHost = allocatedContainer.getNodeId().getHost();
startType = STARTType.SUPERVISOR;
String containerPath = RegistryUtils.componentPath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName,
allocatedContainer.getId().getApplicationAttemptId().getApplicationId().toString(), allocatedContainer.getId().toString());
ServiceRecord sr = null;
try {
if (!registryOperations.exists(containerPath)) {
registryOperations.mknode(containerPath, true);
sr = new ServiceRecord();
sr.set(JOYConstants.HOST, supervisorHost);
sr.set(YarnRegistryAttributes.YARN_ID, allocatedContainer.getId().toString());
sr.description = JOYConstants.CONTAINER;
sr.set(YarnRegistryAttributes.YARN_PERSISTENCE,
PersistencePolicies.CONTAINER);
registryOperations.bind(containerPath, sr, BindFlags.OVERWRITE);
}
} catch (IOException e) {
e.printStackTrace();
}
} else {
startType = STARTType.NIMBUS;
// set nimbusHost
jstormMasterContext.nimbusHost = allocatedContainer.getNodeId().getHost();
String path = RegistryUtils.serviceclassPath(
JOYConstants.APP_TYPE, jstormMasterContext.instanceName);
// when nimbus restart or failed,we need reload nimbus data in previous nimbus container
// so when nimbus container allocated we register nimbus's host, directory and containerId, pull previous nimbus
// data from previous nimbus host if necessary.
ServiceRecord serviceRecord = setupServiceRecord();
jstormMasterContext.previousNimbusHost = JOYConstants.EMPTY;
try {
ServiceRecord sr = registryOperations.resolve(path);
jstormMasterContext.previousNimbusHost = sr.get(JOYConstants.NIMBUS_HOST, JOYConstants.EMPTY);
LOG.info("previousNimbusHost is :" + jstormMasterContext.previousNimbusHost + "; nimbusHost is :" + jstormMasterContext.nimbusHost);
// nimbus location register, then we can restart nimbus with no work loss
serviceRecord.set(JOYConstants.NIMBUS_HOST, jstormMasterContext.nimbusHost);
serviceRecord.set(JOYConstants.NIMBUS_LOCAL_DIR, jstormMasterContext.nimbusDataDirPrefix);
serviceRecord.set(JOYConstants.NIMBUS_CONTAINER, allocatedContainer.getId().toString());
registryOperations.bind(path, serviceRecord, BindFlags.OVERWRITE);
} catch (Exception ex) {
LOG.error(ex);
}
LOG.info("allocated nimbus container , nimbus host is :" + jstormMasterContext.nimbusHost);
}
LaunchContainerRunnable runnableLaunchContainer =
new LaunchContainerRunnable(allocatedContainer, containerListener, startType);
Thread launchThread = new Thread(runnableLaunchContainer);
// launch and start the container on a separate thread to keep
// the main thread unblocked
// as all containers may not be allocated at one go.
launchThreads.add(launchThread);
launchThread.start();
// need to remove container request when allocated,
// otherwise RM will continues allocate container over needs
if (!jstormMasterContext.requestBlockingQueue.isEmpty()) {
try {
amRMClient.removeContainerRequest(jstormMasterContext.requestBlockingQueue.take());
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
@Override
public void onShutdownRequest() {
jstormMasterContext.done = true;
}
@Override
public void onNodesUpdated(List<NodeReport> updatedNodes) {
}
@Override
public float getProgress() {
// set progress to deliver to RM on next heartbeat
// float progress = (float) numCompletedContainers.get()
// / numTotalContainers;
// return progress;
// always be 50%
return 0.5f;
}
@Override
public void onError(Throwable e) {
jstormMasterContext.done = true;
amRMClient.stop();
}
}
@VisibleForTesting
static class NMCallbackHandler
implements NMClientAsync.CallbackHandler {
private ConcurrentMap<ContainerId, Container> containers =
new ConcurrentHashMap<ContainerId, Container>();
private final JstormMaster applicationMaster;
public NMCallbackHandler(JstormMaster applicationMaster) {
this.applicationMaster = applicationMaster;
}
public void addContainer(ContainerId containerId, Container container) {
containers.putIfAbsent(containerId, container);
}
@Override
public void onContainerStopped(ContainerId containerId) {
if (LOG.isDebugEnabled()) {
LOG.debug("Succeeded to stop Container " + containerId);
}
containers.remove(containerId);
}
@Override
public void onContainerStatusReceived(ContainerId containerId,
ContainerStatus containerStatus) {
if (LOG.isDebugEnabled()) {
LOG.debug("Container Status: id=" + containerId + ", status=" +
containerStatus);
}
}
@Override
public void onContainerStarted(ContainerId containerId,
Map<String, ByteBuffer> allServiceResponse) {
if (LOG.isDebugEnabled()) {
LOG.debug("Succeeded to start Container " + containerId);
}
Container container = containers.get(containerId);
if (container != null) {
applicationMaster.nmClientAsync.getContainerStatusAsync(containerId, container.getNodeId());
}
if (applicationMaster.timelineClient != null) {
JstormMaster.publishContainerStartEvent(
applicationMaster.timelineClient, container,
applicationMaster.jstormMasterContext.domainId, applicationMaster.appSubmitterUgi);
}
}
@Override
public void onStartContainerError(ContainerId containerId, Throwable t) {
LOG.error("Failed to start Container " + containerId);
containers.remove(containerId);
applicationMaster.jstormMasterContext.numCompletedContainers.incrementAndGet();
applicationMaster.jstormMasterContext.numFailedContainers.incrementAndGet();
}
@Override
public void onGetContainerStatusError(
ContainerId containerId, Throwable t) {
LOG.error("Failed to query the status of Container " + containerId);
}
@Override
public void onStopContainerError(ContainerId containerId, Throwable t) {
LOG.error("Failed to stop Container " + containerId);
containers.remove(containerId);
}
}
/**
* Thread to connect to the {@link ContainerManagementProtocol} and launch the container
* that will execute the shell command.
*/
private class LaunchContainerRunnable implements Runnable {
// Allocated container
Container container;
NMCallbackHandler containerListener;
STARTType startType;
/**
* @param lcontainer Allocated container
* @param containerListener Callback handler of the container
*/
public LaunchContainerRunnable(
Container lcontainer, NMCallbackHandler containerListener, STARTType startType) {
this.container = lcontainer;
this.containerListener = containerListener;
this.startType = startType;
}
@Override
/**
* Connects to CM, sets up container launch context
* for shell command and eventually dispatches the container
* start request to the CM.
*/
public void run() {
LOG.info("Setting up container launch container for containerid="
+ container.getId());
// Set the local resources
Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
// The container for the eventual shell commands needs its own local resources too.
if (!jstormMasterContext.scriptPath.isEmpty()) {
Path renamedScriptPath;
if (Shell.WINDOWS) {
renamedScriptPath = new Path(jstormMasterContext.scriptPath + ".bat");
} else {
renamedScriptPath = new Path(jstormMasterContext.scriptPath + ".sh");
}
try {
// rename the script file based on the underlying OS syntax.
renameScriptFile(renamedScriptPath);
} catch (Exception e) {
LOG.error(
"Not able to add suffix (.bat/.sh) to the shell script filename",
e);
// we cannot continue launching the container. so release it.
jstormMasterContext.numCompletedContainers.incrementAndGet();
jstormMasterContext.numFailedContainers.incrementAndGet();
return;
}
URL yarnUrl;
try {
yarnUrl = ConverterUtils.getYarnUrlFromURI(
new URI(renamedScriptPath.toString()));
} catch (URISyntaxException e) {
LOG.error("Error when trying to use shell script path specified"
+ " in env, path=" + renamedScriptPath, e);
return;
}
URL jarUrl;
try {
jarUrl = ConverterUtils.getYarnUrlFromURI(
new URI(jstormMasterContext.appMasterJarPath.toString()));
} catch (URISyntaxException e) {
LOG.error("Error when trying to use shell script path specified"
+ " in env, path=" + jstormMasterContext.appMasterJarPath, e);
return;
}
try {
FileSystem fileSystem = FileSystem.get(conf);
FileStatus appMasterJarPathStatus = fileSystem.getFileStatus(new Path(jstormMasterContext.appMasterJarPath));
jstormMasterContext.jarPathLen = appMasterJarPathStatus.getLen();
jstormMasterContext.jarTimestamp = appMasterJarPathStatus.getModificationTime();
FileStatus scriptStatus = fileSystem.getFileStatus(renamedScriptPath);
jstormMasterContext.shellScriptPathLen = scriptStatus.getLen();
jstormMasterContext.shellScriptPathTimestamp = scriptStatus.getModificationTime();
LOG.info("jar len:" + jstormMasterContext.jarPathLen + " jar timespan:" + jstormMasterContext.jarTimestamp);
} catch (IOException e) {
LOG.error("get hdfs filestatus"
+ " in env, path=" + jstormMasterContext.appMasterJarPath, e);
}
LocalResource shellRsrc = LocalResource.newInstance(yarnUrl,
LocalResourceType.FILE, LocalResourceVisibility.APPLICATION,
jstormMasterContext.shellScriptPathLen, jstormMasterContext.shellScriptPathTimestamp);
localResources.put(JOYConstants.ExecShellStringPath, shellRsrc);
LocalResource jarRsrc = LocalResource.newInstance(jarUrl,
LocalResourceType.FILE, LocalResourceVisibility.APPLICATION,
jstormMasterContext.jarPathLen, jstormMasterContext.jarTimestamp);
localResources.put(JOYConstants.appMasterJarPath, jarRsrc);
LOG.info(shellRsrc.getResource().getFile());
LOG.info(jarRsrc.getResource().getFile());
jstormMasterContext.shellCommand = Shell.WINDOWS ? JOYConstants.windows_command : JOYConstants.linux_bash_command;
}
// Set the necessary command to execute on the allocated container
Vector<CharSequence> vargs = new Vector<CharSequence>(9);
// Set executable command
vargs.add(jstormMasterContext.shellCommand);
// Set shell script path
if (!jstormMasterContext.scriptPath.isEmpty()) {
vargs.add(Shell.WINDOWS ? JOYConstants.ExecBatScripStringtPath
: JOYConstants.ExecShellStringPath);
}
String startTypeStr = JOYConstants.SUPERVISOR;
// start type specified to be excute by shell script to start jstorm process
if (startType == STARTType.NIMBUS) {
startTypeStr = JOYConstants.NIMBUS;
vargs.add(JOYConstants.NIMBUS);
//put containerId in nimbus containers queue
try {
jstormMasterContext.nimbusContainers.put(this.container);
} catch (InterruptedException e) {
e.printStackTrace();
}
} else {
vargs.add(JOYConstants.SUPERVISOR);
try {
jstormMasterContext.supervisorContainers.put(this.container);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
// pass instanceName for multiple instance deploy
jstormMasterContext.nimbusDataDirPrefix = conf.get(JOYConstants.INSTANCE_DATA_DIR_KEY);
String localDir = jstormMasterContext.nimbusDataDirPrefix + container.getId().toString() + JOYConstants.BACKLASH
+ jstormMasterContext.instanceName;
vargs.add(localDir);
vargs.add(jstormMasterContext.deployPath);
//get superviorhost's free port
SlotPortsView slotPortsView = new SlotPortsView(jstormMasterContext.instanceName, container.getId(), registryOperations);
slotPortsView.setMinPort(conf.getInt(JOYConstants.SUPERVISOR_MIN_PORT_KEY, JOYConstants.PORT_RANGE_MIN));
slotPortsView.setMaxPort(conf.getInt(JOYConstants.SUPERVISOR_MAX_PORT_KEY, JOYConstants.PORT_RANGE_MAX));
String slotPortsStr = JOYConstants.EMPTY;
try {
slotPortsStr = slotPortsView.getSupervisorSlotPorts(container.getResource().getMemory(),
container.getResource().getVirtualCores(), container.getNodeId().getHost());
vargs.add(slotPortsStr);
} catch (Exception ex) {
LOG.error("failed get slot ports , container " + container.toString() + "launch fail", ex);
return;
}
String logviewPort = JOYConstants.DEFAULT_LOGVIEW_PORT;
String nimbusThriftPort = JOYConstants.DEFAULT_NIMBUS_THRIFT_PORT;
try {
logviewPort = slotPortsView.getSupervisorSlotPorts(JOYConstants.DEFAULT_SUPERVISOR_MEMORY,
JOYConstants.DEFAULT_SUPERVISOR_VCORES, container.getNodeId().getHost());
nimbusThriftPort = slotPortsView.getSupervisorSlotPorts(JOYConstants.DEFAULT_SUPERVISOR_MEMORY,
JOYConstants.DEFAULT_SUPERVISOR_VCORES, container.getNodeId().getHost());
} catch (Exception e) {
e.printStackTrace();
}
String hadoopHome = conf.get(JOYConstants.HADOOP_HOME_KEY);
String javaHome = conf.get(JOYConstants.JAVA_HOME_KEY);
String pythonHome = conf.get(JOYConstants.PYTHON_HOME_KEY);
vargs.add(hadoopHome);
vargs.add(javaHome);//$6
vargs.add(pythonHome);//$7
String deployDst = conf.get(JOYConstants.INSTANCE_DEPLOY_DEST_KEY);
if (deployDst == null) {
deployDst = jstormMasterContext.nimbusDataDirPrefix;
}
String dstPath = deployDst + container.getId().toString();
vargs.add(dstPath);//$8
// Set args for the shell command if any
vargs.add(jstormMasterContext.shellArgs);
// Add log redirect params
vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");
vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");
// Get final commmand
StringBuilder command = new StringBuilder();
for (CharSequence str : vargs) {
command.append(str).append(" ");
}
List<String> commands = new ArrayList<String>();
Map<String, String> envs = System.getenv();
String exectorCommand = ExecutorLoader.loadCommand(jstormMasterContext.instanceName, jstormMasterContext.shellCommand, startTypeStr,
this.container.getId().toString(), localDir, jstormMasterContext.deployPath, hadoopHome, javaHome, pythonHome, dstPath, slotPortsStr, jstormMasterContext.shellArgs,
envs.get(JOYConstants.CLASS_PATH), JOYConstants.ExecShellStringPath, jstormMasterContext.appAttemptID.getApplicationId().toString(), logviewPort, nimbusThriftPort);
exectorCommand = exectorCommand + " 1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"
+ " 2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr";
commands.add(exectorCommand.toString());
LOG.info("a container command is :" + exectorCommand.toString());
ContainerLaunchContext ctx = ContainerLaunchContext.newInstance(
localResources, jstormMasterContext.shellEnv, commands, null, jstormMasterContext.allTokens.duplicate(), null);
containerListener.addContainer(container.getId(), container);
nmClientAsync.startContainerAsync(container, ctx);
}
}
private void renameScriptFile(final Path renamedScriptPath)
throws IOException, InterruptedException {
appSubmitterUgi.doAs(new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws IOException {
FileSystem fs = renamedScriptPath.getFileSystem(conf);
fs.rename(new Path(jstormMasterContext.scriptPath), renamedScriptPath);
return null;
}
});
LOG.info("User " + appSubmitterUgi.getUserName()
+ " added suffix(.sh/.bat) to script file as " + renamedScriptPath);
}
/**
* Setup the request that will be sent to the RM for the container ask.
*
* @return the setup ResourceRequest to be sent to RM
*/
public ContainerRequest setupContainerAskForRM(int containerMemory, int containerVirtualCores, int priority, String host) {
// setup requirements for hosts
// using * as any host will do for the jstorm app
// set the priority for the request
Priority pri = Priority.newInstance(priority);
// Set up resource type requirements
// For now, memory and CPU are supported so we set memory and cpu requirements
Resource capability = Resource.newInstance(containerMemory,
containerVirtualCores);
ContainerRequest request = new ContainerRequest(capability, null, null,
pri);
LOG.info("By Thrift Server Requested container ask: " + request.toString());
return request;
}
/**
* Setup the request that will be sent to the RM for the container ask.
*
* @return the setup ResourceRequest to be sent to RM
*/
public ContainerRequest setupContainerAskForRM(int containerMemory, int containerVirtualCores, int priority, String[] racks, String[] hosts) {
Priority pri = Priority.newInstance(priority);
Resource capability = Resource.newInstance(containerMemory,
containerVirtualCores);
ContainerRequest request = new ContainerRequest(capability, hosts, racks,
pri, false);
LOG.info("By Thrift Server Requested container ask: " + request.toString());
return request;
}
private static void publishContainerStartEvent(
final TimelineClient timelineClient, Container container, String domainId,
UserGroupInformation ugi) {
final TimelineEntity entity = new TimelineEntity();
entity.setEntityId(container.getId().toString());
entity.setEntityType(DSEntity.DS_CONTAINER.toString());
entity.setDomainId(domainId);
entity.addPrimaryFilter(JOYConstants.USER, ugi.getShortUserName());
TimelineEvent event = new TimelineEvent();
event.setTimestamp(System.currentTimeMillis());
event.setEventType(DSEvent.DS_CONTAINER_START.toString());
event.addEventInfo(JOYConstants.NODE, container.getNodeId().toString());
event.addEventInfo(JOYConstants.RESOURCES, container.getResource().toString());
entity.addEvent(event);
try {
ugi.doAs(new PrivilegedExceptionAction<TimelinePutResponse>() {
@Override
public TimelinePutResponse run() throws Exception {
return timelineClient.putEntities(entity);
}
});
} catch (Exception e) {
LOG.error("Container start event could not be published for "
+ container.getId().toString(),
e instanceof UndeclaredThrowableException ? e.getCause() : e);
}
}
private static void publishContainerEndEvent(
final TimelineClient timelineClient, ContainerStatus container,
String domainId, UserGroupInformation ugi) {
final TimelineEntity entity = new TimelineEntity();
entity.setEntityId(container.getContainerId().toString());
entity.setEntityType(DSEntity.DS_CONTAINER.toString());
entity.setDomainId(domainId);
entity.addPrimaryFilter(JOYConstants.USER, ugi.getShortUserName());
TimelineEvent event = new TimelineEvent();
event.setTimestamp(System.currentTimeMillis());
event.setEventType(DSEvent.DS_CONTAINER_END.toString());
event.addEventInfo(JOYConstants.STATE, container.getState().name());
event.addEventInfo(JOYConstants.EXIT_STATE, container.getExitStatus());
entity.addEvent(event);
try {
timelineClient.putEntities(entity);
} catch (YarnException e) {
LOG.error("Container end event could not be published for "
+ container.getContainerId().toString(), e);
} catch (IOException e) {
LOG.error("Container end event could not be published for "
+ container.getContainerId().toString(), e);
}
}
private static void publishApplicationAttemptEvent(
final TimelineClient timelineClient, String appAttemptId,
DSEvent appEvent, String domainId, UserGroupInformation ugi) {
final TimelineEntity entity = new TimelineEntity();
entity.setEntityId(appAttemptId);
entity.setEntityType(DSEntity.DS_APP_ATTEMPT.toString());
entity.setDomainId(domainId);
entity.addPrimaryFilter(JOYConstants.USER, ugi.getShortUserName());
TimelineEvent event = new TimelineEvent();
event.setEventType(appEvent.toString());
event.setTimestamp(System.currentTimeMillis());
entity.addEvent(event);
try {
timelineClient.putEntities(entity);
} catch (YarnException e) {
LOG.error("App Attempt "
+ (appEvent.equals(DSEvent.DS_APP_ATTEMPT_START) ? JOYConstants.START : JOYConstants.END)
+ " event could not be published for "
+ appAttemptId.toString(), e);
} catch (IOException e) {
LOG.error("App Attempt "
+ (appEvent.equals(DSEvent.DS_APP_ATTEMPT_START) ? JOYConstants.START : JOYConstants.END)
+ " event could not be published for "
+ appAttemptId.toString(), e);
}
}
/**
* TODO: purge this once RM is doing the work
*
* @throws IOException
*/
protected void setupInitialRegistryPaths() throws IOException {
if (registryOperations instanceof RMRegistryOperationsService) {
RMRegistryOperationsService rmRegOperations =
(RMRegistryOperationsService) registryOperations;
rmRegOperations.initUserRegistryAsync(jstormMasterContext.service_user_name);
}
}
}