/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.yarn;
import akka.actor.ActorRef;
import akka.actor.Props;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.clusterframework.FlinkResourceManager;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
import org.apache.flink.runtime.clusterframework.messages.StopCluster;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.util.Preconditions;
import org.apache.flink.yarn.messages.ContainersAllocated;
import org.apache.flink.yarn.messages.ContainersComplete;
import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.NMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.slf4j.Logger;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Objects.requireNonNull;
/**
* Specialized Flink Resource Manager implementation for YARN clusters. It is started as the
* YARN ApplicationMaster and implements the YARN-specific logic for container requests and failure
* monitoring.
*/
public class YarnFlinkResourceManager extends FlinkResourceManager<RegisteredYarnWorkerNode> {
/** The heartbeat interval while the resource master is waiting for containers */
private static final int FAST_YARN_HEARTBEAT_INTERVAL_MS = 500;
/** The default heartbeat interval during regular operation */
private static final int DEFAULT_YARN_HEARTBEAT_INTERVAL_MS = 5000;
/** Environment variable name of the final container id used by the Flink ResourceManager.
* Container ID generation may vary across Hadoop versions. */
static final String ENV_FLINK_CONTAINER_ID = "_FLINK_CONTAINER_ID";
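// A sketch of the intended consumption on the TaskManager side (hypothetical
// code, not part of this class): the launched process reads the variable back
// and uses it as its ResourceID when registering with the resource manager.
//
//   String containerId = System.getenv(YarnFlinkResourceManager.ENV_FLINK_CONTAINER_ID);
//   ResourceID taskManagerResourceId = new ResourceID(containerId);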
/** The containers where a TaskManager is starting and we are waiting for it to register */
private final Map<ResourceID, YarnContainerInLaunch> containersInLaunch;
/** Containers we have released, where we are waiting for an acknowledgement that
* they are released */
private final Map<ContainerId, Container> containersBeingReturned;
/** The YARN / Hadoop configuration object */
private final YarnConfiguration yarnConfig;
/** The TaskManager container parameters (like container memory size) */
private final ContaineredTaskManagerParameters taskManagerParameters;
/** Context information used to start a TaskManager Java process */
private final ContainerLaunchContext taskManagerLaunchContext;
/** Host name for the container running this process */
private final String applicationMasterHostName;
/** Web interface URL, may be null */
private final String webInterfaceURL;
/** Default heartbeat interval between this actor and the YARN ResourceManager */
private final int yarnHeartbeatIntervalMillis;
/** Maximum number of failed TaskManager containers tolerated before stopping the application. -1 means unlimited. */
private final int maxFailedContainers;
/** Callback handler for the asynchronous resourceManagerClient */
private YarnResourceManagerCallbackHandler resourceManagerCallbackHandler;
/** Client to communicate with the Resource Manager (YARN's master) */
private AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient;
/** Client to communicate with the Node manager and launch TaskManager processes */
private NMClient nodeManagerClient;
/** The number of containers requested, but not yet granted */
private int numPendingContainerRequests;
/** The number of failed containers since the master became active */
private int failedContainersSoFar;
/** A reference to the reflector to look up previous session containers. */
private RegisterApplicationMasterResponseReflector applicationMasterResponseReflector =
new RegisterApplicationMasterResponseReflector(LOG);
public YarnFlinkResourceManager(
Configuration flinkConfig,
YarnConfiguration yarnConfig,
LeaderRetrievalService leaderRetrievalService,
String applicationMasterHostName,
String webInterfaceURL,
ContaineredTaskManagerParameters taskManagerParameters,
ContainerLaunchContext taskManagerLaunchContext,
int yarnHeartbeatIntervalMillis,
int maxFailedContainers,
int numInitialTaskManagers) {
this(
flinkConfig,
yarnConfig,
leaderRetrievalService,
applicationMasterHostName,
webInterfaceURL,
taskManagerParameters,
taskManagerLaunchContext,
yarnHeartbeatIntervalMillis,
maxFailedContainers,
numInitialTaskManagers,
new YarnResourceManagerCallbackHandler());
}
public YarnFlinkResourceManager(
Configuration flinkConfig,
YarnConfiguration yarnConfig,
LeaderRetrievalService leaderRetrievalService,
String applicationMasterHostName,
String webInterfaceURL,
ContaineredTaskManagerParameters taskManagerParameters,
ContainerLaunchContext taskManagerLaunchContext,
int yarnHeartbeatIntervalMillis,
int maxFailedContainers,
int numInitialTaskManagers,
YarnResourceManagerCallbackHandler callbackHandler) {
this(
flinkConfig,
yarnConfig,
leaderRetrievalService,
applicationMasterHostName,
webInterfaceURL,
taskManagerParameters,
taskManagerLaunchContext,
yarnHeartbeatIntervalMillis,
maxFailedContainers,
numInitialTaskManagers,
callbackHandler,
AMRMClientAsync.createAMRMClientAsync(yarnHeartbeatIntervalMillis, callbackHandler),
NMClient.createNMClient());
}
public YarnFlinkResourceManager(
Configuration flinkConfig,
YarnConfiguration yarnConfig,
LeaderRetrievalService leaderRetrievalService,
String applicationMasterHostName,
String webInterfaceURL,
ContaineredTaskManagerParameters taskManagerParameters,
ContainerLaunchContext taskManagerLaunchContext,
int yarnHeartbeatIntervalMillis,
int maxFailedContainers,
int numInitialTaskManagers,
YarnResourceManagerCallbackHandler callbackHandler,
AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient,
NMClient nodeManagerClient) {
super(numInitialTaskManagers, flinkConfig, leaderRetrievalService);
this.yarnConfig = requireNonNull(yarnConfig);
this.taskManagerParameters = requireNonNull(taskManagerParameters);
this.taskManagerLaunchContext = requireNonNull(taskManagerLaunchContext);
this.applicationMasterHostName = requireNonNull(applicationMasterHostName);
this.webInterfaceURL = webInterfaceURL;
this.yarnHeartbeatIntervalMillis = yarnHeartbeatIntervalMillis;
this.maxFailedContainers = maxFailedContainers;
this.resourceManagerCallbackHandler = Preconditions.checkNotNull(callbackHandler);
this.resourceManagerClient = Preconditions.checkNotNull(resourceManagerClient);
this.nodeManagerClient = Preconditions.checkNotNull(nodeManagerClient);
this.containersInLaunch = new HashMap<>();
this.containersBeingReturned = new HashMap<>();
}
// ------------------------------------------------------------------------
// Actor messages
// ------------------------------------------------------------------------
@Override
protected void handleMessage(Object message) {
// check for YARN specific actor messages first
if (message instanceof ContainersAllocated) {
containersAllocated(((ContainersAllocated) message).containers());
} else if (message instanceof ContainersComplete) {
containersComplete(((ContainersComplete) message).containers());
} else {
// message handled by the generic resource master code
super.handleMessage(message);
}
}
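// For illustration only: the asynchronous YARN callback handler is expected to
// wrap ResourceManager events into the actor messages handled above, roughly as
// follows (a sketch; it assumes the handler holds an ActorRef 'resourceMaster'
// pointing to this actor):
//
//   public void onContainersAllocated(List<Container> containers) {
//       resourceMaster.tell(new ContainersAllocated(containers), ActorRef.noSender());
//   }
//
//   public void onContainersCompleted(List<ContainerStatus> statuses) {
//       resourceMaster.tell(new ContainersComplete(statuses), ActorRef.noSender());
//   }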
// ------------------------------------------------------------------------
// YARN specific behavior
// ------------------------------------------------------------------------
@Override
protected void initialize() throws Exception {
LOG.info("Initializing YARN resource master");
resourceManagerCallbackHandler.initialize(self());
resourceManagerClient.init(yarnConfig);
resourceManagerClient.start();
// create the client to communicate with the node managers
nodeManagerClient.init(yarnConfig);
nodeManagerClient.start();
nodeManagerClient.cleanupRunningContainersOnStop(true);
// register with Resource Manager
LOG.info("Registering Application Master with tracking url {}", webInterfaceURL);
scala.Option<Object> portOption = AkkaUtils.getAddress(getContext().system()).port();
int actorSystemPort = portOption.isDefined() ? (int) portOption.get() : -1;
RegisterApplicationMasterResponse response = resourceManagerClient.registerApplicationMaster(
applicationMasterHostName, actorSystemPort, webInterfaceURL);
// if this application master starts as part of an ApplicationMaster/JobManager recovery,
// then some worker containers are most likely still alive and we can re-obtain them
List<Container> containersFromPreviousAttempts =
applicationMasterResponseReflector.getContainersFromPreviousAttempts(response);
if (!containersFromPreviousAttempts.isEmpty()) {
LOG.info("Retrieved {} TaskManagers from previous attempt", containersFromPreviousAttempts.size());
final long now = System.currentTimeMillis();
for (Container c : containersFromPreviousAttempts) {
YarnContainerInLaunch containerInLaunch = new YarnContainerInLaunch(c, now);
containersInLaunch.put(containerInLaunch.getResourceID(), containerInLaunch);
}
// adjust the progress indicator
updateProgress();
}
}
@Override
protected void shutdownApplication(ApplicationStatus finalStatus, String optionalDiagnostics) {
// first, de-register from YARN
FinalApplicationStatus yarnStatus = getYarnStatus(finalStatus);
LOG.info("Unregistering application from the YARN Resource Manager");
try {
resourceManagerClient.unregisterApplicationMaster(yarnStatus, optionalDiagnostics, "");
} catch (Throwable t) {
LOG.error("Could not unregister the application master.", t);
}
// now shut down all our components
try {
resourceManagerClient.stop();
} catch (Throwable t) {
LOG.error("Could not cleanly shut down the Asynchronous Resource Manager Client", t);
}
try {
nodeManagerClient.stop();
} catch (Throwable t) {
LOG.error("Could not cleanly shut down the Node Manager Client", t);
}
}
@Override
protected void fatalError(String message, Throwable error) {
// we do not unregister, but cause a hard fail of this process, to have it
// restarted by YARN
LOG.error("FATAL ERROR IN YARN APPLICATION MASTER: " + message, error);
LOG.error("Shutting down process");
// kill this process, this will make YARN restart the process
System.exit(EXIT_CODE_FATAL_ERROR);
}
@Override
protected void requestNewWorkers(int numWorkers) {
final long mem = taskManagerParameters.taskManagerTotalMemoryMB();
final int containerMemorySizeMB;
if (mem <= Integer.MAX_VALUE) {
containerMemorySizeMB = (int) mem;
} else {
containerMemorySizeMB = Integer.MAX_VALUE;
LOG.error("Decreasing container size from {} MB to {} MB (integer value overflow)",
mem, containerMemorySizeMB);
}
for (int i = 0; i < numWorkers; i++) {
numPendingContainerRequests++;
LOG.info("Requesting new TaskManager container with {} megabytes memory. Pending requests: {}",
containerMemorySizeMB, numPendingContainerRequests);
// Priority for worker containers - priorities are intra-application
Priority priority = Priority.newInstance(0);
// Resource requirements for worker containers
int taskManagerSlots = taskManagerParameters.numSlots();
int vcores = config.getInteger(ConfigConstants.YARN_VCORES, Math.max(taskManagerSlots, 1));
Resource capability = Resource.newInstance(containerMemorySizeMB, vcores);
resourceManagerClient.addContainerRequest(
new AMRMClient.ContainerRequest(capability, null, null, priority));
}
// make sure we transmit the request fast and receive fast news of granted allocations
resourceManagerClient.setHeartbeatInterval(FAST_YARN_HEARTBEAT_INTERVAL_MS);
}
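// Worked example for the request above (values assumed for illustration): with
// taskManagerTotalMemoryMB() = 1024, numSlots() = 2 and no explicit
// ConfigConstants.YARN_VCORES setting, each loop iteration adds a request for
// Resource.newInstance(1024, 2) at priority 0.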
@Override
protected void releasePendingWorker(ResourceID id) {
YarnContainerInLaunch container = containersInLaunch.remove(id);
if (container != null) {
releaseYarnContainer(container.container());
} else {
LOG.error("Cannot find container {} to release. Ignoring request.", id);
}
}
@Override
protected void releaseStartedWorker(RegisteredYarnWorkerNode worker) {
releaseYarnContainer(worker.yarnContainer());
}
private void releaseYarnContainer(Container container) {
LOG.info("Releasing YARN container {}", container.getId());
containersBeingReturned.put(container.getId(), container);
// release the container on the node manager
try {
nodeManagerClient.stopContainer(container.getId(), container.getNodeId());
} catch (Throwable t) {
// we only log this error; since the ResourceManager also gets the release
// notification, the container should eventually be cleaned up
LOG.error("Error while calling YARN Node Manager to release container", t);
}
// tell the master that the container is no longer needed
resourceManagerClient.releaseAssignedContainer(container.getId());
}
@Override
protected RegisteredYarnWorkerNode workerStarted(ResourceID resourceID) {
YarnContainerInLaunch inLaunch = containersInLaunch.remove(resourceID);
if (inLaunch == null) {
// Container was not in state "being launched", this can indicate that the TaskManager
// in this container was already registered or that the container was not started
// by this resource manager. Simply ignore this resourceID.
return null;
} else {
return new RegisteredYarnWorkerNode(inLaunch.container());
}
}
@Override
protected Collection<RegisteredYarnWorkerNode> reacceptRegisteredWorkers(Collection<ResourceID> toConsolidate) {
// we check for each task manager if we recognize its container
List<RegisteredYarnWorkerNode> accepted = new ArrayList<>();
for (ResourceID resourceID : toConsolidate) {
YarnContainerInLaunch yci = containersInLaunch.remove(resourceID);
if (yci != null) {
LOG.info("YARN container consolidation recognizes Resource {} ", resourceID);
accepted.add(new RegisteredYarnWorkerNode(yci.container()));
}
else {
if (isStarted(resourceID)) {
LOG.info("TaskManager {} has already been registered at the resource manager.", resourceID);
} else {
LOG.info("YARN container consolidation does not recognize TaskManager {}",
resourceID);
}
}
}
return accepted;
}
@Override
protected int getNumWorkerRequestsPending() {
return numPendingContainerRequests;
}
@Override
protected int getNumWorkersPendingRegistration() {
return containersInLaunch.size();
}
// ------------------------------------------------------------------------
// Callbacks from the YARN Resource Manager
// ------------------------------------------------------------------------
private void containersAllocated(List<Container> containers) {
final int numRequired = getDesignatedWorkerPoolSize();
final int numRegistered = getNumberOfStartedTaskManagers();
for (Container container : containers) {
numPendingContainerRequests = Math.max(0, numPendingContainerRequests - 1);
LOG.info("Received new container: {} - Remaining pending container requests: {}",
container.getId(), numPendingContainerRequests);
// decide whether to return the container, or whether to start a TaskManager
if (numRegistered + containersInLaunch.size() < numRequired) {
// start a TaskManager
final YarnContainerInLaunch containerInLaunch = new YarnContainerInLaunch(container);
final ResourceID resourceID = containerInLaunch.getResourceID();
containersInLaunch.put(resourceID, containerInLaunch);
String message = "Launching TaskManager in container " + containerInLaunch
+ " on host " + container.getNodeId().getHost();
LOG.info(message);
sendInfoMessage(message);
try {
// set a special environment variable to uniquely identify this container
taskManagerLaunchContext.getEnvironment()
.put(ENV_FLINK_CONTAINER_ID, resourceID.getResourceIdString());
nodeManagerClient.startContainer(container, taskManagerLaunchContext);
}
catch (Throwable t) {
// failed to launch the container
containersInLaunch.remove(resourceID);
// return container, a new one will be requested eventually
LOG.error("Could not start TaskManager in container " + containerInLaunch, t);
containersBeingReturned.put(container.getId(), container);
resourceManagerClient.releaseAssignedContainer(container.getId());
}
} else {
// return excessive container
LOG.info("Returning excess container {}", container.getId());
containersBeingReturned.put(container.getId(), container);
resourceManagerClient.releaseAssignedContainer(container.getId());
}
}
updateProgress();
// if we are waiting for no further containers, we can go to the
// regular heartbeat interval
if (numPendingContainerRequests <= 0) {
resourceManagerClient.setHeartbeatInterval(yarnHeartbeatIntervalMillis);
}
// make sure we re-check the status of workers / containers one more time at least,
// in case some containers did not come up properly
triggerCheckWorkers();
}
/**
* Invoked when the ResourceManager informs of completed containers.
* Called via an actor message by the callback from the ResourceManager client.
*
* @param containers The containers that have completed.
*/
private void containersComplete(List<ContainerStatus> containers) {
// the list contains both failed containers, as well as containers that
// were gracefully returned by this application master
for (ContainerStatus status : containers) {
final ResourceID id = new ResourceID(status.getContainerId().toString());
// check if this is a failed container or a completed container
if (containersBeingReturned.remove(status.getContainerId()) != null) {
// regular completed container that we released
LOG.info("Container {} completed successfully with diagnostics: {}",
id, status.getDiagnostics());
} else {
// failed container, either at startup, or running
final String exitStatus;
switch (status.getExitStatus()) {
case -103:
exitStatus = "Vmem limit exceeded (-103)";
break;
case -104:
exitStatus = "Pmem limit exceeded (-104)";
break;
default:
exitStatus = String.valueOf(status.getExitStatus());
}
final YarnContainerInLaunch launched = containersInLaunch.remove(id);
if (launched != null) {
LOG.info("Container {} failed, with a TaskManager in launch or registration. " +
"Exit status: {}", id, exitStatus);
// we will trigger re-acquiring new containers at the end
} else {
// failed registered worker
LOG.info("Container {} failed. Exit status: {}", id, exitStatus);
// notify the generic logic, which notifies the JobManager, etc.
notifyWorkerFailed(id, "Container " + id + " failed. Exit status: " + exitStatus);
}
// general failure logging
failedContainersSoFar++;
String diagMessage = String.format("Diagnostics for container %s in state %s : " +
"exitStatus=%s diagnostics=%s",
id, status.getState(), exitStatus, status.getDiagnostics());
sendInfoMessage(diagMessage);
LOG.info(diagMessage);
LOG.info("Total number of failed containers so far: " + failedContainersSoFar);
// maxFailedContainers == -1 means an infinite number of container failures is tolerated.
if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
String msg = "Stopping YARN session because the number of failed containers ("
+ failedContainersSoFar + ") exceeded the maximum failed containers ("
+ maxFailedContainers + "). This number is controlled by the '"
+ ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. "
+ "By default its the number of requested containers.";
LOG.error(msg);
self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)),
ActorRef.noSender());
// no need to do anything else
return;
}
}
}
updateProgress();
// in case failed containers were among the finished containers, make
// sure we re-examine and request new ones
triggerCheckWorkers();
}
// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
* Extracts a unique ResourceID from the Yarn Container.
* @param container The Yarn container
* @return The ResourceID for the container
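*
* <p>For example, a container with the id {@code container_1408561355048_0001_01_000002}
* yields a ResourceID whose string representation is exactly that id.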
*/
static ResourceID extractResourceID(Container container) {
return new ResourceID(container.getId().toString());
}
private void updateProgress() {
final int required = getDesignatedWorkerPoolSize();
final int available = getNumberOfStartedTaskManagers() + containersInLaunch.size();
final float progress = (required <= 0) ? 1.0f : available / (float) required;
if (resourceManagerCallbackHandler != null) {
resourceManagerCallbackHandler.setCurrentProgress(progress);
}
}
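// Worked example for the progress computation above: with a designated pool of
// 8 workers, 6 started TaskManagers and 1 container in launch, the reported
// progress is (6 + 1) / 8 = 0.875.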
/**
* Converts a Flink application status enum to a YARN application status enum.
* @param status The Flink application status.
* @return The corresponding YARN application status.
*/
private FinalApplicationStatus getYarnStatus(ApplicationStatus status) {
if (status == null) {
return FinalApplicationStatus.UNDEFINED;
}
else {
switch (status) {
case SUCCEEDED:
return FinalApplicationStatus.SUCCEEDED;
case FAILED:
return FinalApplicationStatus.FAILED;
case CANCELED:
return FinalApplicationStatus.KILLED;
default:
return FinalApplicationStatus.UNDEFINED;
}
}
}
/**
* Looks up the getContainersFromPreviousAttempts method on RegisterApplicationMasterResponse
* once and caches it. This avoids repeating the reflective lookup on subsequent calls.
*/
private static class RegisterApplicationMasterResponseReflector {
private Logger logger;
private Method method;
public RegisterApplicationMasterResponseReflector(Logger log) {
this.logger = log;
try {
method = RegisterApplicationMasterResponse.class
.getMethod("getContainersFromPreviousAttempts");
} catch (NoSuchMethodException e) {
// that happens in earlier Hadoop versions
logger.info("Cannot reconnect to previously allocated containers. " +
"This YARN version does not support 'getContainersFromPreviousAttempts()'");
}
}
/**
* Checks if a YARN application still has registered containers. If the application master
* registered at the ResourceManager for the first time, this list will be empty. If the
* application master registered a repeated time (after a failure and recovery), this list
* will contain the containers that were previously allocated.
*
* @param response The response object from the registration at the ResourceManager.
* @return A list with containers from the previous application attempt.
*/
private List<Container> getContainersFromPreviousAttempts(RegisterApplicationMasterResponse response) {
if (method != null && response != null) {
try {
@SuppressWarnings("unchecked")
List<Container> list = (List<Container>) method.invoke(response);
if (list != null && !list.isEmpty()) {
return list;
}
} catch (Throwable t) {
logger.error("Error invoking 'getContainersFromPreviousAttempts()'", t);
}
}
return Collections.emptyList();
}
}
// ------------------------------------------------------------------------
// Actor props factory
// ------------------------------------------------------------------------
/**
* Creates the props needed to instantiate this actor.
*
* Rather than extracting and validating parameters in the constructor, this factory method takes
* care of that. That way, errors occur synchronously, rather than being swallowed in a
* failed asynchronous attempt to start the actor.
* @param actorClass
* The actor class, to allow overriding this actor with subclasses for testing.
* @param flinkConfig
* The Flink configuration object.
* @param yarnConfig
* The YARN configuration object.
* @param leaderRetrievalService
* The leader retrieval service used to find the leading JobManager.
* @param applicationMasterHostName
* The hostname where this application master actor runs.
* @param webFrontendURL
* The URL of the tracking web frontend.
* @param taskManagerParameters
* The parameters for launching TaskManager containers.
* @param taskManagerLaunchContext
* The parameters for launching the TaskManager processes in the TaskManager containers.
* @param numInitialTaskManagers
* The initial number of TaskManagers to allocate.
* @param log
* The logger to log to.
*
* @return The Props object to instantiate the YarnFlinkResourceManager actor.
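*
* <p>A minimal usage sketch (the configuration objects, launch context, and actor
* system are assumed to exist under these names):
* <pre>{@code
* Props props = YarnFlinkResourceManager.createActorProps(
*     YarnFlinkResourceManager.class, flinkConfig, yarnConfig, leaderRetrievalService,
*     "am-host", "http://am-host:8081", taskManagerParameters, taskManagerLaunchContext,
*     numInitialTaskManagers, LOG);
* ActorRef resourceManager = actorSystem.actorOf(props);
* }</pre>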
*/
public static Props createActorProps(Class<? extends YarnFlinkResourceManager> actorClass,
Configuration flinkConfig,
YarnConfiguration yarnConfig,
LeaderRetrievalService leaderRetrievalService,
String applicationMasterHostName,
String webFrontendURL,
ContaineredTaskManagerParameters taskManagerParameters,
ContainerLaunchContext taskManagerLaunchContext,
int numInitialTaskManagers,
Logger log)
{
final int yarnHeartbeatIntervalMS = flinkConfig.getInteger(
ConfigConstants.YARN_HEARTBEAT_DELAY_SECONDS, DEFAULT_YARN_HEARTBEAT_INTERVAL_MS / 1000) * 1000;
final long yarnExpiryIntervalMS = yarnConfig.getLong(
YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS,
YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS);
if (yarnHeartbeatIntervalMS >= yarnExpiryIntervalMS) {
log.warn("The heartbeat interval of the Flink Application master ({}) is greater " +
"than YARN's expiry interval ({}). The application is likely to be killed by YARN.",
yarnHeartbeatIntervalMS, yarnExpiryIntervalMS);
}
final int maxFailedContainers = flinkConfig.getInteger(
ConfigConstants.YARN_MAX_FAILED_CONTAINERS, numInitialTaskManagers);
if (maxFailedContainers >= 0) {
log.info("YARN application tolerates {} failed TaskManager containers before giving up",
maxFailedContainers);
}
return Props.create(actorClass,
flinkConfig,
yarnConfig,
leaderRetrievalService,
applicationMasterHostName,
webFrontendURL,
taskManagerParameters,
taskManagerLaunchContext,
yarnHeartbeatIntervalMS,
maxFailedContainers,
numInitialTaskManagers);
}
}