/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.yarn;

import akka.actor.ActorRef;
import akka.actor.Props;

import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.clusterframework.FlinkResourceManager;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters;
import org.apache.flink.runtime.clusterframework.messages.StopCluster;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.util.Preconditions;
import org.apache.flink.yarn.messages.ContainersAllocated;
import org.apache.flink.yarn.messages.ContainersComplete;

import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.AMRMClient;
import org.apache.hadoop.yarn.client.api.NMClient;
import org.apache.hadoop.yarn.client.api.async.AMRMClientAsync;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

import org.slf4j.Logger;

import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static java.util.Objects.requireNonNull;

/**
 * Specialized Flink Resource Manager implementation for YARN clusters. It is started as the
 * YARN ApplicationMaster and implements the YARN-specific logic for container requests and
 * failure monitoring.
 */
public class YarnFlinkResourceManager extends FlinkResourceManager<RegisteredYarnWorkerNode> {

    /** The heartbeat interval while the resource master is waiting for containers. */
    private static final int FAST_YARN_HEARTBEAT_INTERVAL_MS = 500;

    /** The default heartbeat interval during regular operation. */
    private static final int DEFAULT_YARN_HEARTBEAT_INTERVAL_MS = 5000;

    /** Environment variable name of the final container id used by the Flink ResourceManager.
     * Container ID generation may vary across Hadoop versions. */
    static final String ENV_FLINK_CONTAINER_ID = "_FLINK_CONTAINER_ID";
    /** The containers where a TaskManager is starting and we are waiting for it to register. */
    private final Map<ResourceID, YarnContainerInLaunch> containersInLaunch;

    /** Containers we have released, where we are waiting for an acknowledgement that
     * they are released. */
    private final Map<ContainerId, Container> containersBeingReturned;

    /** The YARN / Hadoop configuration object. */
    private final YarnConfiguration yarnConfig;

    /** The TaskManager container parameters (like container memory size). */
    private final ContaineredTaskManagerParameters taskManagerParameters;

    /** Context information used to start a TaskManager Java process. */
    private final ContainerLaunchContext taskManagerLaunchContext;

    /** Host name for the container running this process. */
    private final String applicationMasterHostName;

    /** Web interface URL, may be null. */
    private final String webInterfaceURL;

    /** Default heartbeat interval between this actor and the YARN ResourceManager. */
    private final int yarnHeartbeatIntervalMillis;

    /** Number of failed TaskManager containers before stopping the application. -1 means infinite. */
    private final int maxFailedContainers;

    /** Callback handler for the asynchronous resourceManagerClient. */
    private YarnResourceManagerCallbackHandler resourceManagerCallbackHandler;

    /** Client to communicate with the Resource Manager (YARN's master). */
    private AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient;

    /** Client to communicate with the Node Manager and launch TaskManager processes. */
    private NMClient nodeManagerClient;

    /** The number of containers requested, but not yet granted. */
    private int numPendingContainerRequests;

    /** The number of failed containers since the master became active. */
    private int failedContainersSoFar;

    /** A reference to the reflector to look up previous session containers. */
    private RegisterApplicationMasterResponseReflector applicationMasterResponseReflector =
        new RegisterApplicationMasterResponseReflector(LOG);
    public YarnFlinkResourceManager(
            Configuration flinkConfig,
            YarnConfiguration yarnConfig,
            LeaderRetrievalService leaderRetrievalService,
            String applicationMasterHostName,
            String webInterfaceURL,
            ContaineredTaskManagerParameters taskManagerParameters,
            ContainerLaunchContext taskManagerLaunchContext,
            int yarnHeartbeatIntervalMillis,
            int maxFailedContainers,
            int numInitialTaskManagers) {

        this(
            flinkConfig,
            yarnConfig,
            leaderRetrievalService,
            applicationMasterHostName,
            webInterfaceURL,
            taskManagerParameters,
            taskManagerLaunchContext,
            yarnHeartbeatIntervalMillis,
            maxFailedContainers,
            numInitialTaskManagers,
            new YarnResourceManagerCallbackHandler());
    }

    public YarnFlinkResourceManager(
            Configuration flinkConfig,
            YarnConfiguration yarnConfig,
            LeaderRetrievalService leaderRetrievalService,
            String applicationMasterHostName,
            String webInterfaceURL,
            ContaineredTaskManagerParameters taskManagerParameters,
            ContainerLaunchContext taskManagerLaunchContext,
            int yarnHeartbeatIntervalMillis,
            int maxFailedContainers,
            int numInitialTaskManagers,
            YarnResourceManagerCallbackHandler callbackHandler) {

        this(
            flinkConfig,
            yarnConfig,
            leaderRetrievalService,
            applicationMasterHostName,
            webInterfaceURL,
            taskManagerParameters,
            taskManagerLaunchContext,
            yarnHeartbeatIntervalMillis,
            maxFailedContainers,
            numInitialTaskManagers,
            callbackHandler,
            AMRMClientAsync.createAMRMClientAsync(yarnHeartbeatIntervalMillis, callbackHandler),
            NMClient.createNMClient());
    }

    public YarnFlinkResourceManager(
            Configuration flinkConfig,
            YarnConfiguration yarnConfig,
            LeaderRetrievalService leaderRetrievalService,
            String applicationMasterHostName,
            String webInterfaceURL,
            ContaineredTaskManagerParameters taskManagerParameters,
            ContainerLaunchContext taskManagerLaunchContext,
            int yarnHeartbeatIntervalMillis,
            int maxFailedContainers,
            int numInitialTaskManagers,
            YarnResourceManagerCallbackHandler callbackHandler,
            AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient,
            NMClient nodeManagerClient) {

        super(numInitialTaskManagers, flinkConfig, leaderRetrievalService);

        this.yarnConfig = requireNonNull(yarnConfig);
        this.taskManagerParameters = requireNonNull(taskManagerParameters);
        this.taskManagerLaunchContext = requireNonNull(taskManagerLaunchContext);
        this.applicationMasterHostName = requireNonNull(applicationMasterHostName);
        this.webInterfaceURL = webInterfaceURL;
        this.yarnHeartbeatIntervalMillis = yarnHeartbeatIntervalMillis;
        this.maxFailedContainers = maxFailedContainers;
        this.resourceManagerCallbackHandler = Preconditions.checkNotNull(callbackHandler);
        this.resourceManagerClient = Preconditions.checkNotNull(resourceManagerClient);
        this.nodeManagerClient = Preconditions.checkNotNull(nodeManagerClient);

        this.containersInLaunch = new HashMap<>();
        this.containersBeingReturned = new HashMap<>();
    }

    // ------------------------------------------------------------------------
    //  Actor messages
    // ------------------------------------------------------------------------

    @Override
    protected void handleMessage(Object message) {
        // check for YARN specific actor messages first
        if (message instanceof ContainersAllocated) {
            containersAllocated(((ContainersAllocated) message).containers());
        }
        else if (message instanceof ContainersComplete) {
            containersComplete(((ContainersComplete) message).containers());
        }
        else {
            // message handled by the generic resource master code
            super.handleMessage(message);
        }
    }
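    // Context for the dispatch above: the AMRMClientAsync callback handler runs on
    // a YARN heartbeat thread, not inside this actor. Since the handler is given
    // this actor's reference via resourceManagerCallbackHandler.initialize(self())
    // (see initialize() below), it can forward YARN events as actor messages,
    // conceptually like this (a hedged sketch, not the actual
    // YarnResourceManagerCallbackHandler code):
    //
    //     public void onContainersAllocated(List<Container> containers) {
    //         resourceManagerActor.tell(new ContainersAllocated(containers), ActorRef.noSender());
    //     }
    //
    // That way, all mutation of containersInLaunch / containersBeingReturned stays
    // confined to the actor's single-threaded message loop.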
    // ------------------------------------------------------------------------
    //  YARN specific behavior
    // ------------------------------------------------------------------------

    @Override
    protected void initialize() throws Exception {
        LOG.info("Initializing YARN resource master");

        resourceManagerCallbackHandler.initialize(self());

        resourceManagerClient.init(yarnConfig);
        resourceManagerClient.start();

        // create the client to communicate with the node managers
        nodeManagerClient.init(yarnConfig);
        nodeManagerClient.start();
        nodeManagerClient.cleanupRunningContainersOnStop(true);

        // register with Resource Manager
        LOG.info("Registering Application Master with tracking url {}", webInterfaceURL);

        scala.Option<Object> portOption = AkkaUtils.getAddress(getContext().system()).port();
        int actorSystemPort = portOption.isDefined() ? (int) portOption.get() : -1;

        RegisterApplicationMasterResponse response = resourceManagerClient.registerApplicationMaster(
            applicationMasterHostName, actorSystemPort, webInterfaceURL);

        // if this application master starts as part of an ApplicationMaster/JobManager recovery,
        // then some worker containers are most likely still alive and we can re-obtain them
        List<Container> containersFromPreviousAttempts =
            applicationMasterResponseReflector.getContainersFromPreviousAttempts(response);

        if (!containersFromPreviousAttempts.isEmpty()) {
            LOG.info("Retrieved {} TaskManagers from previous attempt", containersFromPreviousAttempts.size());

            final long now = System.currentTimeMillis();
            for (Container c : containersFromPreviousAttempts) {
                YarnContainerInLaunch containerInLaunch = new YarnContainerInLaunch(c, now);
                containersInLaunch.put(containerInLaunch.getResourceID(), containerInLaunch);
            }

            // adjust the progress indicator
            updateProgress();
        }
    }

    @Override
    protected void shutdownApplication(ApplicationStatus finalStatus, String optionalDiagnostics) {
        // first, de-register from YARN
        FinalApplicationStatus yarnStatus = getYarnStatus(finalStatus);
        LOG.info("Unregistering application from the YARN Resource Manager");
        try {
            resourceManagerClient.unregisterApplicationMaster(yarnStatus, optionalDiagnostics, "");
        } catch (Throwable t) {
            LOG.error("Could not unregister the application master.", t);
        }

        // now shut down all our components
        try {
            resourceManagerClient.stop();
        } catch (Throwable t) {
            LOG.error("Could not cleanly shut down the Asynchronous Resource Manager Client", t);
        }
        try {
            nodeManagerClient.stop();
        } catch (Throwable t) {
            LOG.error("Could not cleanly shut down the Node Manager Client", t);
        }
    }

    @Override
    protected void fatalError(String message, Throwable error) {
        // we do not unregister, but cause a hard fail of this process, to have it
        // restarted by YARN
        LOG.error("FATAL ERROR IN YARN APPLICATION MASTER: " + message, error);
        LOG.error("Shutting down process");

        // kill this process, this will make YARN restart the process
        System.exit(EXIT_CODE_FATAL_ERROR);
    }

    @Override
    protected void requestNewWorkers(int numWorkers) {
        final long mem = taskManagerParameters.taskManagerTotalMemoryMB();
        final int containerMemorySizeMB;

        if (mem <= Integer.MAX_VALUE) {
            containerMemorySizeMB = (int) mem;
        } else {
            containerMemorySizeMB = Integer.MAX_VALUE;
            LOG.error("Decreasing container size from {} MB to {} MB (integer value overflow)",
                mem, containerMemorySizeMB);
        }

        for (int i = 0; i < numWorkers; i++) {
            numPendingContainerRequests++;
            LOG.info("Requesting new TaskManager container with {} megabytes memory. Pending requests: {}",
                containerMemorySizeMB, numPendingContainerRequests);

            // Priority for worker containers - priorities are intra-application
            Priority priority = Priority.newInstance(0);

            // Resource requirements for worker containers
            int taskManagerSlots = taskManagerParameters.numSlots();
            int vcores = config.getInteger(ConfigConstants.YARN_VCORES, Math.max(taskManagerSlots, 1));
            Resource capability = Resource.newInstance(containerMemorySizeMB, vcores);

            resourceManagerClient.addContainerRequest(
                new AMRMClient.ContainerRequest(capability, null, null, priority));
        }

        // make sure we transmit the request fast and receive fast news of granted allocations
        resourceManagerClient.setHeartbeatInterval(FAST_YARN_HEARTBEAT_INTERVAL_MS);
    }
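    // Worked sizing example for requestNewWorkers() (all numbers assumed for
    // illustration): with taskManagerTotalMemoryMB() = 1024, numSlots() = 2 and
    // 'yarn.containers.vcores' left unset, each loop iteration above requests
    // Resource.newInstance(1024, 2), i.e. one 1 GB / 2 vcore container per worker.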
    @Override
    protected void releasePendingWorker(ResourceID id) {
        YarnContainerInLaunch container = containersInLaunch.remove(id);
        if (container != null) {
            releaseYarnContainer(container.container());
        } else {
            LOG.error("Cannot find container {} to release. Ignoring request.", id);
        }
    }

    @Override
    protected void releaseStartedWorker(RegisteredYarnWorkerNode worker) {
        releaseYarnContainer(worker.yarnContainer());
    }

    private void releaseYarnContainer(Container container) {
        LOG.info("Releasing YARN container {}", container.getId());

        containersBeingReturned.put(container.getId(), container);

        // release the container on the node manager
        try {
            nodeManagerClient.stopContainer(container.getId(), container.getNodeId());
        } catch (Throwable t) {
            // we only log this error. since the ResourceManager also gets the release
            // notification, the container should be eventually cleaned up
            LOG.error("Error while calling YARN Node Manager to release container", t);
        }

        // tell the master that the container is no longer needed
        resourceManagerClient.releaseAssignedContainer(container.getId());
    }

    @Override
    protected RegisteredYarnWorkerNode workerStarted(ResourceID resourceID) {
        YarnContainerInLaunch inLaunch = containersInLaunch.remove(resourceID);
        if (inLaunch == null) {
            // Container was not in state "being launched", this can indicate that the TaskManager
            // in this container was already registered or that the container was not started
            // by this resource manager. Simply ignore this resourceID.
            return null;
        } else {
            return new RegisteredYarnWorkerNode(inLaunch.container());
        }
    }
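    // Note on where the ResourceID in workerStarted() comes from: when launching a
    // container, containersAllocated() below writes the container's ID into the
    // launch environment under ENV_FLINK_CONTAINER_ID. The TaskManager process is
    // expected to read that variable on startup and report the ID when it
    // registers, which is what makes the containersInLaunch lookup above possible.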
    @Override
    protected Collection<RegisteredYarnWorkerNode> reacceptRegisteredWorkers(Collection<ResourceID> toConsolidate) {

        // we check for each task manager if we recognize its container
        List<RegisteredYarnWorkerNode> accepted = new ArrayList<>();
        for (ResourceID resourceID : toConsolidate) {
            YarnContainerInLaunch yci = containersInLaunch.remove(resourceID);
            if (yci != null) {
                LOG.info("YARN container consolidation recognizes Resource {}", resourceID);
                accepted.add(new RegisteredYarnWorkerNode(yci.container()));
            }
            else {
                if (isStarted(resourceID)) {
                    LOG.info("TaskManager {} has already been registered at the resource manager.", resourceID);
                }
                else {
                    LOG.info("YARN container consolidation does not recognize TaskManager {}", resourceID);
                }
            }
        }

        return accepted;
    }

    @Override
    protected int getNumWorkerRequestsPending() {
        return numPendingContainerRequests;
    }

    @Override
    protected int getNumWorkersPendingRegistration() {
        return containersInLaunch.size();
    }

    // ------------------------------------------------------------------------
    //  Callbacks from the YARN Resource Manager
    // ------------------------------------------------------------------------

    private void containersAllocated(List<Container> containers) {
        final int numRequired = getDesignatedWorkerPoolSize();
        final int numRegistered = getNumberOfStartedTaskManagers();

        for (Container container : containers) {
            numPendingContainerRequests = Math.max(0, numPendingContainerRequests - 1);
            LOG.info("Received new container: {} - Remaining pending container requests: {}",
                container.getId(), numPendingContainerRequests);

            // decide whether to return the container, or whether to start a TaskManager
            if (numRegistered + containersInLaunch.size() < numRequired) {

                // start a TaskManager
                final YarnContainerInLaunch containerInLaunch = new YarnContainerInLaunch(container);
                final ResourceID resourceID = containerInLaunch.getResourceID();
                containersInLaunch.put(resourceID, containerInLaunch);

                String message = "Launching TaskManager in container " + containerInLaunch
                    + " on host " + container.getNodeId().getHost();
                LOG.info(message);
                sendInfoMessage(message);

                try {
                    // set a special environment variable to uniquely identify this container
                    taskManagerLaunchContext.getEnvironment()
                        .put(ENV_FLINK_CONTAINER_ID, resourceID.getResourceIdString());
                    nodeManagerClient.startContainer(container, taskManagerLaunchContext);
                }
                catch (Throwable t) {
                    // failed to launch the container
                    containersInLaunch.remove(resourceID);

                    // return container, a new one will be requested eventually
                    LOG.error("Could not start TaskManager in container " + containerInLaunch, t);
                    containersBeingReturned.put(container.getId(), container);
                    resourceManagerClient.releaseAssignedContainer(container.getId());
                }
            } else {
                // return excessive container
                LOG.info("Returning excess container {}", container.getId());
                containersBeingReturned.put(container.getId(), container);
                resourceManagerClient.releaseAssignedContainer(container.getId());
            }
        }

        updateProgress();

        // if we are waiting for no further containers, we can go to the
        // regular heartbeat interval
        if (numPendingContainerRequests <= 0) {
            resourceManagerClient.setHeartbeatInterval(yarnHeartbeatIntervalMillis);
        }

        // make sure we re-check the status of workers / containers one more time at least,
        // in case some containers did not come up properly
        triggerCheckWorkers();
    }
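    // Worked example for the keep-or-return decision in containersAllocated()
    // (numbers assumed): with numRequired = 10, numRegistered = 7 and two
    // containers already in launch, the first allocated container is kept
    // (7 + 2 < 10); because containersInLaunch grows with each accepted container,
    // any further container in the same callback is returned as excess.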
    /**
     * Invoked when the ResourceManager informs of completed containers.
     * Called via an actor message by the callback from the ResourceManager client.
     *
     * @param containers The containers that have completed.
     */
    private void containersComplete(List<ContainerStatus> containers) {
        // the list contains both failed containers, as well as containers that
        // were gracefully returned by this application master
        for (ContainerStatus status : containers) {
            final ResourceID id = new ResourceID(status.getContainerId().toString());

            // check if this is a failed container or a completed container
            if (containersBeingReturned.remove(status.getContainerId()) != null) {
                // regular completed container that we released
                LOG.info("Container {} completed successfully with diagnostics: {}",
                    id, status.getDiagnostics());
            } else {
                // failed container, either at startup, or running
                final String exitStatus;
                switch (status.getExitStatus()) {
                    case -103:
                        exitStatus = "Vmem limit exceeded (-103)";
                        break;
                    case -104:
                        exitStatus = "Pmem limit exceeded (-104)";
                        break;
                    default:
                        exitStatus = String.valueOf(status.getExitStatus());
                }

                final YarnContainerInLaunch launched = containersInLaunch.remove(id);
                if (launched != null) {
                    LOG.info("Container {} failed, with a TaskManager in launch or registration. " +
                        "Exit status: {}", id, exitStatus);
                    // we will trigger re-acquiring new containers at the end
                } else {
                    // failed registered worker
                    LOG.info("Container {} failed. Exit status: {}", id, exitStatus);

                    // notify the generic logic, which notifies the JobManager, etc.
                    notifyWorkerFailed(id, "Container " + id + " failed. Exit status: " + exitStatus);
                }

                // general failure logging
                failedContainersSoFar++;

                String diagMessage = String.format("Diagnostics for container %s in state %s : " +
                        "exitStatus=%s diagnostics=%s",
                    id, status.getState(), exitStatus, status.getDiagnostics());
                sendInfoMessage(diagMessage);

                LOG.info(diagMessage);
                LOG.info("Total number of failed containers so far: " + failedContainersSoFar);

                // maxFailedContainers == -1 is infinite number of retries.
                if (maxFailedContainers >= 0 && failedContainersSoFar > maxFailedContainers) {
                    String msg = "Stopping YARN session because the number of failed containers ("
                        + failedContainersSoFar + ") exceeded the maximum failed containers ("
                        + maxFailedContainers + "). This number is controlled by the '"
                        + ConfigConstants.YARN_MAX_FAILED_CONTAINERS + "' configuration setting. "
                        + "By default it is the number of requested containers.";

                    LOG.error(msg);
                    self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, msg)),
                        ActorRef.noSender());

                    // no need to do anything else
                    return;
                }
            }
        }

        updateProgress();

        // in case failed containers were among the finished containers, make
        // sure we re-examine and request new ones
        triggerCheckWorkers();
    }

    // ------------------------------------------------------------------------
    //  Utilities
    // ------------------------------------------------------------------------

    /**
     * Extracts a unique ResourceID from the YARN container.
     *
     * @param container The YARN container.
     * @return The ResourceID for the container.
     */
    static ResourceID extractResourceID(Container container) {
        return new ResourceID(container.getId().toString());
    }

    private void updateProgress() {
        final int required = getDesignatedWorkerPoolSize();
        final int available = getNumberOfStartedTaskManagers() + containersInLaunch.size();
        final float progress = (required <= 0) ? 1.0f : available / (float) required;

        if (resourceManagerCallbackHandler != null) {
            resourceManagerCallbackHandler.setCurrentProgress(progress);
        }
    }
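    // Worked example for the progress fraction in updateProgress() (numbers
    // assumed): with a designated pool size of 4, two started TaskManagers and one
    // container in launch, progress = (2 + 1) / 4.0f = 0.75. A pool size of zero
    // or less short-circuits to 1.0, which also avoids dividing by zero.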
    /**
     * Converts a Flink application status enum to a YARN application status enum.
     *
     * @param status The Flink application status.
     * @return The corresponding YARN application status.
     */
    private FinalApplicationStatus getYarnStatus(ApplicationStatus status) {
        if (status == null) {
            return FinalApplicationStatus.UNDEFINED;
        }
        else {
            switch (status) {
                case SUCCEEDED:
                    return FinalApplicationStatus.SUCCEEDED;
                case FAILED:
                    return FinalApplicationStatus.FAILED;
                case CANCELED:
                    return FinalApplicationStatus.KILLED;
                default:
                    return FinalApplicationStatus.UNDEFINED;
            }
        }
    }

    /**
     * Looks up the getContainersFromPreviousAttempts method on RegisterApplicationMasterResponse
     * once and caches it. This saves the reflective lookup on subsequent calls.
     */
    private static class RegisterApplicationMasterResponseReflector {

        private Logger logger;
        private Method method;

        public RegisterApplicationMasterResponseReflector(Logger log) {
            this.logger = log;

            try {
                method = RegisterApplicationMasterResponse.class
                    .getMethod("getContainersFromPreviousAttempts");
            } catch (NoSuchMethodException e) {
                // that happens in earlier Hadoop versions
                logger.info("Cannot reconnect to previously allocated containers. " +
                    "This YARN version does not support 'getContainersFromPreviousAttempts()'");
            }
        }

        /**
         * Checks if a YARN application still has registered containers. If the application master
         * registered at the ResourceManager for the first time, this list will be empty. If the
         * application master registered a repeated time (after a failure and recovery), this list
         * will contain the containers that were previously allocated.
         *
         * @param response The response object from the registration at the ResourceManager.
         * @return A list with containers from the previous application attempt.
         */
        private List<Container> getContainersFromPreviousAttempts(RegisterApplicationMasterResponse response) {
            if (method != null && response != null) {
                try {
                    @SuppressWarnings("unchecked")
                    List<Container> list = (List<Container>) method.invoke(response);
                    if (list != null && !list.isEmpty()) {
                        return list;
                    }
                } catch (Throwable t) {
                    logger.error("Error invoking 'getContainersFromPreviousAttempts()'", t);
                }
            }
            return Collections.emptyList();
        }
    }

    // ------------------------------------------------------------------------
    //  Actor props factory
    // ------------------------------------------------------------------------

    /**
     * Creates the props needed to instantiate this actor.
     *
     * <p>Rather than extracting and validating parameters in the constructor, this factory method
     * takes care of that. That way, errors occur synchronously, and are not swallowed simply in a
     * failed asynchronous attempt to start the actor.
     *
     * @param actorClass
     *             The actor class, to allow overriding this actor with subclasses for testing.
     * @param flinkConfig
     *             The Flink configuration object.
     * @param yarnConfig
     *             The YARN configuration object.
     * @param leaderRetrievalService
     *             The service to retrieve the leading JobManager.
     * @param applicationMasterHostName
     *             The hostname where this application master actor runs.
     * @param webFrontendURL
     *             The URL of the tracking web frontend.
     * @param taskManagerParameters
     *             The parameters for launching TaskManager containers.
     * @param taskManagerLaunchContext
     *             The parameters for launching the TaskManager processes in the TaskManager containers.
     * @param numInitialTaskManagers
     *             The initial number of TaskManagers to allocate.
     * @param log
     *             The logger to log to.
     *
     * @return The Props object to instantiate the YarnFlinkResourceManager actor.
     */
    public static Props createActorProps(
            Class<? extends YarnFlinkResourceManager> actorClass,
            Configuration flinkConfig,
            YarnConfiguration yarnConfig,
            LeaderRetrievalService leaderRetrievalService,
            String applicationMasterHostName,
            String webFrontendURL,
            ContaineredTaskManagerParameters taskManagerParameters,
            ContainerLaunchContext taskManagerLaunchContext,
            int numInitialTaskManagers,
            Logger log) {

        final int yarnHeartbeatIntervalMS = flinkConfig.getInteger(
            ConfigConstants.YARN_HEARTBEAT_DELAY_SECONDS, DEFAULT_YARN_HEARTBEAT_INTERVAL_MS / 1000) * 1000;

        final long yarnExpiryIntervalMS = yarnConfig.getLong(
            YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS,
            YarnConfiguration.DEFAULT_RM_AM_EXPIRY_INTERVAL_MS);

        if (yarnHeartbeatIntervalMS >= yarnExpiryIntervalMS) {
            log.warn("The heartbeat interval of the Flink Application master ({}) is greater " +
                "than or equal to YARN's expiry interval ({}). The application is likely to be killed by YARN.",
                yarnHeartbeatIntervalMS, yarnExpiryIntervalMS);
        }

        final int maxFailedContainers = flinkConfig.getInteger(
            ConfigConstants.YARN_MAX_FAILED_CONTAINERS, numInitialTaskManagers);
        if (maxFailedContainers >= 0) {
            log.info("YARN application tolerates {} failed TaskManager containers before giving up",
                maxFailedContainers);
        }

        return Props.create(
            actorClass,
            flinkConfig,
            yarnConfig,
            leaderRetrievalService,
            applicationMasterHostName,
            webFrontendURL,
            taskManagerParameters,
            taskManagerLaunchContext,
            yarnHeartbeatIntervalMS,
            maxFailedContainers,
            numInitialTaskManagers);
    }
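    // Hypothetical usage sketch for createActorProps() (variable names, the actor
    // system and the actor name are assumed for illustration, not taken from an
    // actual Flink entry point):
    //
    //     Props props = YarnFlinkResourceManager.createActorProps(
    //         YarnFlinkResourceManager.class, flinkConfig, yarnConfig,
    //         leaderRetrievalService, amHostName, webFrontendURL,
    //         taskManagerParameters, taskManagerLaunchContext,
    //         numInitialTaskManagers, log);
    //     ActorRef resourceMaster = actorSystem.actorOf(props, "Yarn_Resource_Master");
    //
    // Because the configuration values are read and validated here, a bad setting
    // fails this synchronous call instead of being swallowed in an asynchronous
    // actor start.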
}