/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.yarn;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.BootstrapTools;
import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.heartbeat.HeartbeatServices;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.instance.InstanceID;
import org.apache.flink.runtime.metrics.MetricRegistry;
import org.apache.flink.runtime.resourcemanager.JobLeaderIdService;
import org.apache.flink.runtime.resourcemanager.ResourceManager;
import org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration;
import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
import org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.RpcService;
import org.apache.hadoop.yarn.api.ApplicationConstants;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.NMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.concurrent.duration.FiniteDuration;
import org.apache.flink.util.ExceptionUtils;
import java.util.Map;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.TimeUnit;
/**
* The YARN implementation of the resource manager. Used when the system is started
* via the YARN resource framework.
*/
public class YarnResourceManager extends ResourceManager<ResourceID> implements AMRMClientAsync.CallbackHandler {

    protected final Logger LOG = LoggerFactory.getLogger(getClass());

    /** The default registration timeout for task executor in seconds. */
    private static final int DEFAULT_TASK_MANAGER_REGISTRATION_DURATION = 300;

    /** The heartbeat interval while the resource master is waiting for containers. */
    private static final int FAST_YARN_HEARTBEAT_INTERVAL_MS = 500;

    /** The default heartbeat interval during regular operation. */
    private static final int DEFAULT_YARN_HEARTBEAT_INTERVAL_MS = 5000;

    /** The default memory of task executor to allocate (in MB). */
    private static final int DEFAULT_TSK_EXECUTOR_MEMORY_SIZE = 1024;

    /**
     * Environment variable name of the final container id used by the YarnResourceManager.
     * Container ID generation may vary across Hadoop versions.
     */
    static final String ENV_FLINK_CONTAINER_ID = "_FLINK_CONTAINER_ID";

    /**
     * Environment variable name of the hostname given by YARN.
     * In the task executor we use the hostnames given by YARN consistently throughout akka.
     */
    static final String ENV_FLINK_NODE_ID = "_FLINK_NODE_ID";

    /** The process environment variables. */
    private final Map<String, String> env;

    /** Regular heartbeat interval between this resource manager and the YARN ResourceManager. */
    private final int yarnHeartbeatIntervalMillis;

    private final Configuration flinkConfig;

    private final YarnConfiguration yarnConfig;

    /** Priorities already handed out by {@link #generatePriority(ResourceProfile)}, keyed by resource profile. */
    private final Map<ResourceProfile, Integer> resourcePriorities = new HashMap<>();

    /** Client to communicate with the Resource Manager (YARN's master). */
    private AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient;

    /** Client to communicate with the Node manager and launch TaskExecutor processes. */
    private NMClient nodeManagerClient;

    /** The number of containers requested, but not yet granted. */
    private int numPendingContainerRequests;

    /**
     * Creates the YARN resource manager.
     *
     * <p>Besides forwarding the common services to the base class, the constructor reads the
     * heartbeat delay (configured in seconds) from the Flink configuration, converts it to
     * milliseconds and warns if it is not smaller than YARN's application master expiry interval.
     *
     * @param flinkConfig The Flink configuration
     * @param env The process environment variables
     */
    public YarnResourceManager(
            RpcService rpcService,
            String resourceManagerEndpointId,
            ResourceID resourceId,
            Configuration flinkConfig,
            Map<String, String> env,
            ResourceManagerConfiguration resourceManagerConfiguration,
            HighAvailabilityServices highAvailabilityServices,
            HeartbeatServices heartbeatServices,
            SlotManager slotManager,
            MetricRegistry metricRegistry,
            JobLeaderIdService jobLeaderIdService,
            FatalErrorHandler fatalErrorHandler) {
        super(
            rpcService,
            resourceManagerEndpointId,
            resourceId,
            resourceManagerConfiguration,
            highAvailabilityServices,
            heartbeatServices,
            slotManager,
            metricRegistry,
            jobLeaderIdService,
            fatalErrorHandler);

        this.flinkConfig = flinkConfig;
        this.yarnConfig = new YarnConfiguration();
        this.env = env;

        // the configuration value is in seconds, the YARN client expects milliseconds
        final int yarnHeartbeatIntervalMS = flinkConfig.getInteger(
            ConfigConstants.YARN_HEARTBEAT_DELAY_SECONDS, DEFAULT_YARN_HEARTBEAT_INTERVAL_MS / 1000) * 1000;

        final long yarnExpiryIntervalMS = yarnConfig.getLong(
            YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS,
            YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS);

        if (yarnHeartbeatIntervalMS >= yarnExpiryIntervalMS) {
            LOG.warn("The heartbeat interval of the Flink Application master ({}) is greater " +
                    "than YARN's expiry interval ({}). The application is likely to be killed by YARN.",
                yarnHeartbeatIntervalMS, yarnExpiryIntervalMS);
        }

        yarnHeartbeatIntervalMillis = yarnHeartbeatIntervalMS;
        numPendingContainerRequests = 0;
    }

    /**
     * Starts the clients towards the YARN ResourceManager and the NodeManagers and registers
     * this application master with YARN.
     *
     * <p>A failed registration is logged but does not abort the initialization.
     */
    @Override
    protected void initialize() throws ResourceManagerException {
        resourceManagerClient = AMRMClientAsync.createAMRMClientAsync(yarnHeartbeatIntervalMillis, this);
        resourceManagerClient.init(yarnConfig);
        resourceManagerClient.start();

        try {
            //TODO: change akka address to tcp host and port, the getAddress() interface should return a standard tcp address
            Tuple2<String, Integer> hostPort = parseHostPort(getAddress());
            //TODO: the third parameter should be the webmonitor address
            resourceManagerClient.registerApplicationMaster(hostPort.f0, hostPort.f1, getAddress());
        } catch (Exception e) {
            // Registration stays best-effort as before, but a failure here is a real problem
            // and must not be hidden at INFO level.
            LOG.error("registerApplicationMaster fail", e);
        }

        // create the client to communicate with the node managers
        nodeManagerClient = NMClient.createNMClient();
        nodeManagerClient.init(yarnConfig);
        nodeManagerClient.start();
        nodeManagerClient.cleanupRunningContainersOnStop(true);
    }

    /**
     * Stops both YARN clients and then shuts down the base resource manager.
     *
     * <p>Every component is stopped even if an earlier one fails. The first failure is
     * rethrown at the end, with later failures attached as suppressed exceptions.
     *
     * @throws Exception if stopping any of the components failed
     */
    @Override
    public void shutDown() throws Exception {
        Throwable firstException = null;

        if (resourceManagerClient != null) {
            try {
                resourceManagerClient.stop();
            } catch (Throwable t) {
                firstException = firstOrSuppressed(t, firstException);
            }
        }

        if (nodeManagerClient != null) {
            try {
                nodeManagerClient.stop();
            } catch (Throwable t) {
                firstException = firstOrSuppressed(t, firstException);
            }
        }

        // The base class must always be shut down, also when stopping a YARN client failed;
        // otherwise its resources would be leaked.
        try {
            super.shutDown();
        } catch (Throwable t) {
            firstException = firstOrSuppressed(t, firstException);
        }

        if (firstException != null) {
            ExceptionUtils.rethrowException(firstException, "Error while shutting down YARN resource manager");
        }
    }

    /**
     * Unregisters the application master from YARN with the final status translated by
     * {@link #getYarnStatus(ApplicationStatus)}. Failures are logged and not propagated.
     *
     * @param finalStatus The Flink application status to report
     * @param optionalDiagnostics Diagnostics message passed on to YARN
     */
    @Override
    protected void shutDownApplication(ApplicationStatus finalStatus, String optionalDiagnostics) {
        // first, de-register from YARN
        FinalApplicationStatus yarnStatus = getYarnStatus(finalStatus);
        LOG.info("Unregistering application from the YARN Resource Manager");
        try {
            resourceManagerClient.unregisterApplicationMaster(yarnStatus, optionalDiagnostics, "");
        } catch (Throwable t) {
            LOG.error("Could not unregister the application master.", t);
        }
    }

    /**
     * Requests a new YARN container sized according to the given resource profile.
     *
     * <p>A negative memory value falls back to {@link #DEFAULT_TSK_EXECUTOR_MEMORY_SIZE} and
     * fewer than one CPU core is rounded up to one virtual core.
     *
     * @param resourceProfile The resource profile of the worker to start
     */
    @Override
    public void startNewWorker(ResourceProfile resourceProfile) {
        // Priority for worker containers - priorities are intra-application
        //TODO: set priority according to the resource allocated
        Priority priority = Priority.newInstance(generatePriority(resourceProfile));

        int mem = resourceProfile.getMemoryInMB() < 0
            ? DEFAULT_TSK_EXECUTOR_MEMORY_SIZE
            : (int) resourceProfile.getMemoryInMB();
        int vcore = resourceProfile.getCpuCores() < 1
            ? 1
            : (int) resourceProfile.getCpuCores();

        Resource capability = Resource.newInstance(mem, vcore);
        requestYarnContainer(capability, priority);
    }

    @Override
    public void stopWorker(InstanceID instanceId) {
        // TODO: Implement to stop the worker
    }

    /** Returns the given resource id unchanged; it is used directly as the worker handle. */
    @Override
    protected ResourceID workerStarted(ResourceID resourceID) {
        return resourceID;
    }

    // ------------------------------------------------------------------------
    //  AMRMClientAsync CallbackHandler methods
    // ------------------------------------------------------------------------

    @Override
    public float getProgress() {
        // Temporarily need not record the total size of asked and allocated containers
        return 1;
    }

    /**
     * Closes the task manager connection of every completed container that reports a
     * negative exit status, using the container id as resource id.
     */
    @Override
    public void onContainersCompleted(List<ContainerStatus> list) {
        for (ContainerStatus container : list) {
            if (container.getExitStatus() < 0) {
                closeTaskManagerConnection(
                    new ResourceID(container.getContainerId().toString()),
                    new Exception(container.getDiagnostics()));
            }
        }
    }

    /**
     * Launches a TaskExecutor in each newly allocated container.
     *
     * <p>If launching fails, the container is released and a replacement with the same
     * resource and priority is requested. Once no requests are pending any more, the
     * heartbeat interval is reset to the regular value.
     */
    @Override
    public void onContainersAllocated(List<Container> containers) {
        for (Container container : containers) {
            numPendingContainerRequests = Math.max(0, numPendingContainerRequests - 1);
            LOG.info("Received new container: {} - Remaining pending container requests: {}",
                container.getId(), numPendingContainerRequests);

            try {
                // Context information used to start a TaskExecutor Java process
                ContainerLaunchContext taskExecutorLaunchContext = createTaskExecutorLaunchContext(
                    container.getResource(),
                    container.getId().toString(),
                    container.getNodeId().getHost());
                nodeManagerClient.startContainer(container, taskExecutorLaunchContext);
            } catch (Throwable t) {
                // failed to launch the container, will release the failed one and ask for a new one
                LOG.error("Could not start TaskManager in container {},", container, t);
                resourceManagerClient.releaseAssignedContainer(container.getId());
                requestYarnContainer(container.getResource(), container.getPriority());
            }
        }

        if (numPendingContainerRequests <= 0) {
            resourceManagerClient.setHeartbeatInterval(yarnHeartbeatIntervalMillis);
        }
    }

    @Override
    public void onShutdownRequest() {
        try {
            shutDown();
        } catch (Exception e) {
            LOG.warn("Fail to shutdown the YARN resource manager.", e);
        }
    }

    @Override
    public void onNodesUpdated(List<NodeReport> list) {
        // We are not interested in node updates
    }

    @Override
    public void onError(Throwable error) {
        onFatalErrorAsync(error);
    }

    // ------------------------------------------------------------------------
    //  Utility methods
    // ------------------------------------------------------------------------

    /**
     * Converts a Flink application status enum to a YARN application status enum.
     *
     * @param status The Flink application status.
     * @return The corresponding YARN application status.
     */
    private FinalApplicationStatus getYarnStatus(ApplicationStatus status) {
        if (status == null) {
            return FinalApplicationStatus.UNDEFINED;
        }

        switch (status) {
            case SUCCEEDED:
                return FinalApplicationStatus.SUCCEEDED;
            case FAILED:
                return FinalApplicationStatus.FAILED;
            case CANCELED:
                return FinalApplicationStatus.KILLED;
            default:
                return FinalApplicationStatus.UNDEFINED;
        }
    }

    /**
     * Parses the host and port from an akka address such as
     * {@code akka.tcp://flink@100.81.153.180:49712/user/$a}.
     *
     * @param address The akka address; must contain {@code @host:port}
     * @return A tuple of host and port
     */
    private static Tuple2<String, Integer> parseHostPort(String address) {
        String[] hostPort = address.split("@")[1].split(":");
        String host = hostPort[0];
        String port = hostPort[1].split("/")[0];
        return new Tuple2<>(host, Integer.valueOf(port));
    }

    /**
     * Adds a container request for the given resource and priority, switches to the fast
     * heartbeat interval and increments the pending request counter.
     */
    private void requestYarnContainer(Resource resource, Priority priority) {
        resourceManagerClient.addContainerRequest(
            new AMRMClient.ContainerRequest(resource, null, null, priority));

        // make sure we transmit the request fast and receive fast news of granted allocations
        resourceManagerClient.setHeartbeatInterval(FAST_YARN_HEARTBEAT_INTERVAL_MS);

        numPendingContainerRequests++;
        LOG.info("Requesting new TaskManager container pending requests: {}",
            numPendingContainerRequests);
    }

    /**
     * Builds the launch context used to start a TaskExecutor process in a container.
     *
     * <p>The container id and host are exported to the process through the
     * {@link #ENV_FLINK_CONTAINER_ID} and {@link #ENV_FLINK_NODE_ID} environment variables.
     *
     * @param resource The resource granted to the container
     * @param containerId The id of the container
     * @param host The host the container runs on
     * @return The launch context for the TaskExecutor
     * @throws Exception if the launch context could not be created
     */
    private ContainerLaunchContext createTaskExecutorLaunchContext(Resource resource, String containerId, String host)
            throws Exception {
        // init the ContainerLaunchContext
        final String currDir = env.get(ApplicationConstants.Environment.PWD.key());

        // NOTE(review): the trailing 1 is presumably the number of slots per task executor - confirm
        final ContaineredTaskManagerParameters taskManagerParameters =
            ContaineredTaskManagerParameters.create(flinkConfig, resource.getMemory(), 1);

        LOG.info("TaskExecutor{} will be started with container size {} MB, JVM heap size {} MB, " +
                "JVM direct memory limit {} MB",
            containerId,
            taskManagerParameters.taskManagerTotalMemoryMB(),
            taskManagerParameters.taskManagerHeapSizeMB(),
            taskManagerParameters.taskManagerDirectMemoryLimitMB());

        int timeout = flinkConfig.getInteger(ConfigConstants.TASK_MANAGER_MAX_REGISTRATION_DURATION,
            DEFAULT_TASK_MANAGER_REGISTRATION_DURATION);
        FiniteDuration teRegistrationTimeout = new FiniteDuration(timeout, TimeUnit.SECONDS);

        final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(
            flinkConfig, "", 0, 1, teRegistrationTimeout);
        LOG.debug("TaskManager configuration: {}", taskManagerConfig);

        ContainerLaunchContext taskExecutorLaunchContext = Utils.createTaskExecutorContext(
            flinkConfig, yarnConfig, env,
            taskManagerParameters, taskManagerConfig,
            currDir, YarnTaskExecutorRunner.class, LOG);

        // set special environment variables to uniquely identify this container and its host
        Map<String, String> containerEnv = taskExecutorLaunchContext.getEnvironment();
        containerEnv.put(ENV_FLINK_CONTAINER_ID, containerId);
        containerEnv.put(ENV_FLINK_NODE_ID, host);

        return taskExecutorLaunchContext;
    }

    /**
     * Generate priority by given resource profile.
     * Priority is only used for distinguishing request of different resource.
     *
     * <p>The first request for a profile is assigned the next unused number (the current size
     * of the mapping); later requests for an equal profile get the same number again.
     *
     * @param resourceProfile The resource profile of a request
     * @return The priority of this resource profile.
     */
    private int generatePriority(ResourceProfile resourceProfile) {
        Integer knownPriority = resourcePriorities.get(resourceProfile);
        if (knownPriority != null) {
            return knownPriority;
        }

        int priority = resourcePriorities.size();
        resourcePriorities.put(resourceProfile, priority);
        return priority;
    }

    /**
     * Combines a newly caught exception with a previously caught one.
     *
     * @param newException The exception that was just caught
     * @param previous The first exception caught so far, or {@code null} if there is none
     * @return {@code newException} if there is no previous exception; otherwise
     *         {@code previous} with {@code newException} added as suppressed
     */
    private static Throwable firstOrSuppressed(Throwable newException, Throwable previous) {
        if (previous == null) {
            return newException;
        }
        previous.addSuppressed(newException);
        return previous;
    }
}