FlinkResourceManager.java example

Explorer
flink-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.clusterframework;

import akka.actor.ActorRef;
import akka.actor.ActorSelection;
import akka.actor.ActorSystem;
import akka.actor.Props;
import akka.dispatch.OnComplete;
import akka.pattern.Patterns;
import akka.util.Timeout;

import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.akka.FlinkUntypedActor;
import org.apache.flink.runtime.clusterframework.messages.CheckAndAllocateContainers;
import org.apache.flink.runtime.clusterframework.messages.FatalErrorOccurred;
import org.apache.flink.runtime.clusterframework.messages.InfoMessage;
import org.apache.flink.runtime.clusterframework.messages.NewLeaderAvailable;
import org.apache.flink.runtime.clusterframework.messages.NotifyResourceStarted;
import org.apache.flink.runtime.clusterframework.messages.RegisterInfoMessageListener;
import org.apache.flink.runtime.clusterframework.messages.RegisterInfoMessageListenerSuccessful;
import org.apache.flink.runtime.clusterframework.messages.RegisterResourceManager;
import org.apache.flink.runtime.clusterframework.messages.RegisterResourceManagerSuccessful;
import org.apache.flink.runtime.clusterframework.messages.RemoveResource;
import org.apache.flink.runtime.clusterframework.messages.ResourceRemoved;
import org.apache.flink.runtime.clusterframework.messages.SetWorkerPoolSize;
import org.apache.flink.runtime.clusterframework.messages.StopCluster;
import org.apache.flink.runtime.clusterframework.messages.StopClusterSuccessful;
import org.apache.flink.runtime.clusterframework.messages.TriggerRegistrationAtJobManager;
import org.apache.flink.runtime.clusterframework.messages.UnRegisterInfoMessageListener;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.ResourceIDRetrievable;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.JobManagerMessages.LeaderSessionMessage;

import org.apache.flink.util.Preconditions;

import scala.concurrent.Future;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeUnit;

import static java.util.Objects.requireNonNull;

/**
 *
 * <h1>Worker allocation steps</h1>
 *
 * <ol>
 *     <li>The resource manager decides to request more workers. This can happen in order
 *         to fill the initial pool, or as a result of the JobManager requesting more workers.</li>
 *
 *     <li>The resource master calls {@link #requestNewWorkers(int)}, which triggers requests
 *         for more containers. After that, the {@link #getNumWorkerRequestsPending()}
 *         should reflect the pending requests.</li>
 *
 *     <li>The concrete framework may acquire containers and then trigger to start TaskManagers
 *         in those containers. That should be reflected in {@link #getNumWorkersPendingRegistration()}.</li>
 *
 *     <li>At some point, the TaskManager processes will have started and send a registration
 *         message to the JobManager. The JobManager will perform
 *         a lookup with the ResourceManager to check if it really started this TaskManager.
 *         The method {@link #workerStarted(ResourceID)} will be called
 *         to inform about a registered worker.</li>
 * </ol>
 *
 */
public abstract class FlinkResourceManager<WorkerType extends ResourceIDRetrievable> extends FlinkUntypedActor {

	/** The exit code with which the process is stopped in case of a fatal error */
	protected static final int EXIT_CODE_FATAL_ERROR = -13;

	/** The default name of the resource manager actor */
	public static final String RESOURCE_MANAGER_NAME = "resourcemanager";

	// ------------------------------------------------------------------------

	/** The Flink configuration object */
	protected final Configuration config;

	/** The timeout for actor messages sent to the JobManager / TaskManagers */
	private final FiniteDuration messageTimeout;

	/** The service to find the right leader JobManager (to support high availability) */
	private final LeaderRetrievalService leaderRetriever;

	/** Map which contains the workers from which we know that they have been successfully started
	 * in a container. This notification is sent by the JM when a TM tries to register at it. */
	private final Map<ResourceID, WorkerType> startedWorkers;

	/** List of listeners for info messages */
	private final Set<ActorRef> infoMessageListeners;

	/** The JobManager that the framework master manages resources for */
	private ActorRef jobManager;

	/** Our JobManager's leader session */
	private UUID leaderSessionID;

	/** The size of the worker pool that the resource master strives to maintain */
	private int designatedPoolSize;

	// ------------------------------------------------------------------------

	/**
	 * Creates a AbstractFrameworkMaster actor.
	 *
	 * @param flinkConfig The Flink configuration object.
	 */
	protected FlinkResourceManager(
			int numInitialTaskManagers,
			Configuration flinkConfig,
			LeaderRetrievalService leaderRetriever) {
		this.config = requireNonNull(flinkConfig);
		this.leaderRetriever = requireNonNull(leaderRetriever);
		this.startedWorkers = new HashMap<>();

		FiniteDuration lt;
		try {
			lt = AkkaUtils.getLookupTimeout(config);
		}
		catch (Exception e) {
			lt = new FiniteDuration(
				Duration.apply(ConfigConstants.DEFAULT_AKKA_LOOKUP_TIMEOUT).toMillis(),
				TimeUnit.MILLISECONDS);
		}
		this.messageTimeout = lt;
		this.designatedPoolSize = numInitialTaskManagers;
		this.infoMessageListeners = new HashSet<>();
	}

	// ------------------------------------------------------------------------
	//  Actor Behavior
	// ------------------------------------------------------------------------

	@Override
	public void preStart() {
		try {
			// we start our leader retrieval service to make sure we get informed
			// about JobManager leader changes
			leaderRetriever.start(new LeaderRetrievalListener() {

				@Override
				public void notifyLeaderAddress(String leaderAddress, UUID leaderSessionID) {
					self().tell(
						new NewLeaderAvailable(leaderAddress, leaderSessionID),
						ActorRef.noSender());
				}

				@Override
				public void handleError(Exception e) {
					self().tell(
						new FatalErrorOccurred("Leader retrieval service failed", e),
						ActorRef.noSender());
				}
			});

			// framework specific initialization
			initialize();

		}
		catch (Throwable t) {
			self().tell(
				new FatalErrorOccurred("Error during startup of ResourceManager actor", t),
				ActorRef.noSender());
		}
	}

	@Override
	public void postStop() {
		try {
			leaderRetriever.stop();
		}
		catch (Throwable t) {
			LOG.error("Could not cleanly shut down leader retrieval service", t);
		}
	}

	/**
	 *
	 * This method receives the actor messages after they have been filtered for
	 * a match with the leader session.
	 *
	 * @param message The incoming actor message.
	 */
	@Override
	protected void handleMessage(Object message) {
		try {
			// --- messages about worker allocation and pool sizes

			if (message instanceof CheckAndAllocateContainers) {
				checkWorkersPool();
			}
			else if (message instanceof SetWorkerPoolSize) {
				SetWorkerPoolSize msg = (SetWorkerPoolSize) message;
				adjustDesignatedNumberOfWorkers(msg.numberOfWorkers());
			}
			else if (message instanceof RemoveResource) {
				RemoveResource msg = (RemoveResource) message;
				removeRegisteredResource(msg.resourceId());
			}

			// --- lookup of registered resources

			else if (message instanceof NotifyResourceStarted) {
				NotifyResourceStarted msg = (NotifyResourceStarted) message;
				handleResourceStarted(sender(), msg.getResourceID());
			}

			// --- messages about JobManager leader status and registration

			else if (message instanceof NewLeaderAvailable) {
				NewLeaderAvailable msg = (NewLeaderAvailable) message;
				newJobManagerLeaderAvailable(msg.leaderAddress(), msg.leaderSessionId());
			}
			else if (message instanceof TriggerRegistrationAtJobManager) {
				TriggerRegistrationAtJobManager msg = (TriggerRegistrationAtJobManager) message;
				triggerConnectingToJobManager(msg.jobManagerAddress());
			}
			else if (message instanceof RegisterResourceManagerSuccessful) {
				RegisterResourceManagerSuccessful msg = (RegisterResourceManagerSuccessful) message;
				jobManagerLeaderConnected(msg.jobManager(), msg.currentlyRegisteredTaskManagers());
			}

			// --- end of application

			else if (message instanceof StopCluster) {
				StopCluster msg = (StopCluster) message;
				shutdownCluster(msg.finalStatus(), msg.message());
				sender().tell(decorateMessage(StopClusterSuccessful.getInstance()), ActorRef.noSender());
			}

			// --- miscellaneous messages

			else if (message instanceof RegisterInfoMessageListener) {
				if (jobManager != null) {
					infoMessageListeners.add(sender());
					sender().tell(decorateMessage(
						RegisterInfoMessageListenerSuccessful.get()),
						// answer as the JobManager
						jobManager);
				}
			}

			else if (message instanceof UnRegisterInfoMessageListener) {
				infoMessageListeners.remove(sender());
			}

			else if (message instanceof FatalErrorOccurred) {
				FatalErrorOccurred fatalErrorOccurred = (FatalErrorOccurred) message;
				fatalError(fatalErrorOccurred.message(), fatalErrorOccurred.error());
			}

			// --- unknown messages

			else {
				LOG.error("Discarding unknown message: {}", message);
			}
		}
		catch (Throwable t) {
			// fatal error, needs master recovery
			fatalError("Error processing actor message", t);
		}
	}

	@Override
	protected final UUID getLeaderSessionID() {
		return leaderSessionID;
	}

	// ------------------------------------------------------------------------
	//  Status
	// ------------------------------------------------------------------------

	/**
	 * Gets the current designated worker pool size, meaning the number of workers
	 * that the resource master strives to maintain. The actual number of workers
	 * may be lower (if worker requests are still pending) or higher (if workers have
	 * not yet been released).
	 *
	 * @return The designated worker pool size.
	 */
	public int getDesignatedWorkerPoolSize() {
		return designatedPoolSize;
	}

	/**
	 * Gets the number of currently started TaskManagers.
	 *
	 * @return The number of currently started TaskManagers.
	 */
	public int getNumberOfStartedTaskManagers() {
		return startedWorkers.size();
	}

	/**
	 * Gets the currently registered resources.
	 * @return
	 */
	public Collection<WorkerType> getStartedTaskManagers() {
		return startedWorkers.values();
	}

	/**
	 * Gets the started worker for a given resource ID, if one is available.
	 *
	 * @param resourceId The resource ID for the worker.
	 * @return True if already registered, otherwise false
	 */
	public boolean isStarted(ResourceID resourceId) {
		return startedWorkers.containsKey(resourceId);
	}

	/**
	 * Gets an iterable for all currently started TaskManagers.
	 *
	 * @return All currently started TaskManagers.
	 */
	public Collection<WorkerType> allStartedWorkers() {
		return startedWorkers.values();
	}

	/**
	 * Tells the ResourceManager that a TaskManager had been started in a container with the given
	 * resource id.
	 *
	 * @param jobManager The sender (JobManager) of the message
	 * @param resourceID The resource id of the started TaskManager
	 */
	private void handleResourceStarted(ActorRef jobManager, ResourceID resourceID) {
		if (resourceID != null) {
			// check if resourceID is already registered (TaskManager may send duplicate register messages)
			WorkerType oldWorker = startedWorkers.get(resourceID);
			if (oldWorker != null) {
				LOG.debug("Notification that TaskManager {} had been started was sent before.", resourceID);
			} else {
				WorkerType newWorker = workerStarted(resourceID);

				if (newWorker != null) {
					startedWorkers.put(resourceID, newWorker);
					LOG.info("TaskManager {} has started.", resourceID);
				} else {
					LOG.info("TaskManager {} has not been started by this resource manager.", resourceID);
				}
			}
		}

		// Acknowledge the resource registration
		jobManager.tell(decorateMessage(Acknowledge.get()), self());
	}

	/**
	 * Releases the given resource. Note that this does not automatically shrink
	 * the designated worker pool size.
	 *
	 * @param resourceId The TaskManager's resource id.
	 */
	private void removeRegisteredResource(ResourceID resourceId) {

		WorkerType worker = startedWorkers.remove(resourceId);
		if (worker != null) {
			releaseStartedWorker(worker);
		} else {
			LOG.warn("Resource {} could not be released", resourceId);
		}
	}


	// ------------------------------------------------------------------------
	//  Registration and consolidation with JobManager Leader
	// ------------------------------------------------------------------------

	/**
	 * Called as soon as we discover (via leader election) that a JobManager lost leadership
	 * or a different one gained leadership.
	 *
	 * @param leaderAddress The address (Akka URL) of the new leader. Null if there is currently no leader.
	 * @param leaderSessionID The unique session ID marking the leadership session.
	 */
	private void newJobManagerLeaderAvailable(String leaderAddress, UUID leaderSessionID) {
		LOG.debug("Received new leading JobManager {}. Connecting.", leaderAddress);

		// disconnect from the current leader (no-op if no leader yet)
		jobManagerLostLeadership();

		// a null leader session id means that only a leader disconnect
		// happened, without a new leader yet
		if (leaderSessionID != null && leaderAddress != null) {
			// the leaderSessionID implicitly filters out success and failure messages
			// that come after leadership changed again
			this.leaderSessionID = leaderSessionID;
			triggerConnectingToJobManager(leaderAddress);
		}
	}

	/**
	 * Causes the resource manager to announce itself at the new leader JobManager and
	 * obtains its connection information and currently known TaskManagers.
	 *
	 * @param leaderAddress The akka actor URL of the new leader JobManager.
	 */
	protected void triggerConnectingToJobManager(String leaderAddress) {

		LOG.info("Trying to associate with JobManager leader " + leaderAddress);

		final Object registerMessage = decorateMessage(new RegisterResourceManager(self()));
		final Object retryMessage = decorateMessage(new TriggerRegistrationAtJobManager(leaderAddress));

		// send the registration message to the JobManager
		ActorSelection jobManagerSel = context().actorSelection(leaderAddress);
		Future<Object> future = Patterns.ask(jobManagerSel, registerMessage, new Timeout(messageTimeout));

		future.onComplete(new OnComplete<Object>() {

			@Override
			public void onComplete(Throwable failure, Object msg) {
				// only process if we haven't been connected in the meantime
				if (jobManager == null) {
					if (msg != null) {
						if (msg instanceof LeaderSessionMessage &&
							((LeaderSessionMessage) msg).message() instanceof RegisterResourceManagerSuccessful) {
							self().tell(msg, ActorRef.noSender());
						} else {
							LOG.error("Invalid response type to registration at JobManager: {}", msg);
							self().tell(retryMessage, ActorRef.noSender());
						}
					} else {
						// no success
						LOG.error("Resource manager could not register at JobManager", failure);
						self().tell(retryMessage, ActorRef.noSender());
					}
				}
			}

		}, context().dispatcher());
	}

	/**
	 * This method disassociates from the current leader JobManager.
	 */
	private void jobManagerLostLeadership() {
		if (jobManager != null) {
			LOG.info("Associated JobManager {} lost leader status", jobManager);

			jobManager = null;
			leaderSessionID = null;

			infoMessageListeners.clear();
		}
	}

	/**
	 * Callback when we're informed about a new leading JobManager.
	 * @param newJobManagerLeader The ActorRef of the new jobManager
	 * @param workers The existing workers the JobManager has registered.
	 */
	private void jobManagerLeaderConnected(
						ActorRef newJobManagerLeader,
						Collection<ResourceID> workers) {

		if (jobManager == null) {
			LOG.info("Resource Manager associating with leading JobManager {} - leader session {}",
						newJobManagerLeader, leaderSessionID);

			jobManager = newJobManagerLeader;

			if (workers.size() > 0) {
				LOG.info("Received TaskManagers that were registered at the leader JobManager. " +
						"Trying to consolidate.");

				// keep track of which TaskManagers are not handled
				Set<ResourceID> toHandle = new HashSet<>(workers.size());
				toHandle.addAll(workers);

				try {
					// ask the framework to tell us which ones we should keep for now
					Collection<WorkerType> consolidated = reacceptRegisteredWorkers(workers);
					LOG.info("Consolidated {} TaskManagers", consolidated.size());

					// put the consolidated TaskManagers into our bookkeeping
					for (WorkerType worker : consolidated) {
						ResourceID resourceID = worker.getResourceID();
						startedWorkers.put(resourceID, worker);
						toHandle.remove(resourceID);
					}
				}
				catch (Throwable t) {
					LOG.error("Error during consolidation of known TaskManagers", t);
					// the framework should release the remaining unclear resources
					for (ResourceID id : toHandle) {
						releasePendingWorker(id);
					}
				}

			}

			// trigger initial check for requesting new workers
			checkWorkersPool();

		} else {
			String msg = "Attempting to associate with new JobManager leader " + newJobManagerLeader
				+ " without previously disassociating from current leader " + jobManager;
			fatalError(msg, new Exception(msg));
		}
	}

	// ------------------------------------------------------------------------
	//  ClusterClient Shutdown
	// ------------------------------------------------------------------------

	private void shutdownCluster(ApplicationStatus status, String diagnostics) {
		LOG.info("Shutting down cluster with status {} : {}", status, diagnostics);

		shutdownApplication(status, diagnostics);
	}

	// ------------------------------------------------------------------------
	//  Worker pool size management
	// ------------------------------------------------------------------------

	/**
	 * This method causes the resource framework master to <b>synchronously</b>re-examine
	 * the set of available and pending workers containers, and allocate containers
	 * if needed.
	 *
	 * This method does not automatically release workers, because it is not visible to
	 * this resource master which workers can be released. Instead, the JobManager must
	 * explicitly release individual workers.
	 */
	private void checkWorkersPool() {
		int numWorkersPending = getNumWorkerRequestsPending();
		int numWorkersPendingRegistration = getNumWorkersPendingRegistration();

		// sanity checks
		Preconditions.checkState(numWorkersPending >= 0,
			"Number of pending workers should never be below 0.");
		Preconditions.checkState(numWorkersPendingRegistration >= 0,
			"Number of pending workers pending registration should never be below 0.");

		// see how many workers we want, and whether we have enough
		int allAvailableAndPending = startedWorkers.size() +
			numWorkersPending + numWorkersPendingRegistration;

		int missing = designatedPoolSize - allAvailableAndPending;

		if (missing > 0) {
			requestNewWorkers(missing);
		}
	}

	/**
	 * Sets the designated worker pool size. If this size is larger than the current pool
	 * size, then the resource manager will try to acquire more TaskManagers.
	 *
	 * @param num The number of workers in the pool.
	 */
	private void adjustDesignatedNumberOfWorkers(int num) {
		if (num >= 0) {
			LOG.info("Adjusting designated worker pool size to {}", num);
			designatedPoolSize = num;
			checkWorkersPool();
		} else {
			LOG.warn("Ignoring invalid designated worker pool size: " + num);
		}
	}

	// ------------------------------------------------------------------------
	//  Callbacks
	// ------------------------------------------------------------------------

	/**
	 * This method causes the resource framework master to <b>asynchronously</b>re-examine
	 * the set of available and pending workers containers, and release or allocate
	 * containers if needed. The method sends an actor message which will trigger the
	 * re-examination.
	 */
	public void triggerCheckWorkers() {
		self().tell(
			decorateMessage(
				CheckAndAllocateContainers.get()),
			ActorRef.noSender());
	}

	/**
	 * This method should be called by the framework once it detects that a currently registered
	 * worker has failed.
	 *
	 * @param resourceID Id of the worker that has failed.
	 * @param message An informational message that explains why the worker failed.
	 */
	public void notifyWorkerFailed(ResourceID resourceID, String message) {
		WorkerType worker = startedWorkers.remove(resourceID);
		if (worker != null) {
			jobManager.tell(
				decorateMessage(
					new ResourceRemoved(resourceID, message)),
				self());
		}
	}

	// ------------------------------------------------------------------------
	//  Framework specific behavior
	// ------------------------------------------------------------------------

	/**
	 * Initializes the framework specific components.
	 *
	 * @throws Exception Exceptions during initialization cause the resource manager to fail.
	 *                   If the framework is able to recover this resource manager, it will be
	 *                   restarted.
	 */
	protected abstract void initialize() throws Exception;

	/**
	 * The framework specific code for shutting down the application. This should report the
	 * application's final status and shut down the resource manager cleanly.
	 *
	 * This method also needs to make sure all pending containers that are not registered
	 * yet are returned.
	 *
	 * @param finalStatus The application status to report.
	 * @param optionalDiagnostics An optional diagnostics message.
	 */
	protected abstract void shutdownApplication(ApplicationStatus finalStatus, String optionalDiagnostics);

	/**
	 * Notifies the resource master of a fatal error.
	 *
	 * <p><b>IMPORTANT:</b> This should not cleanly shut down this master, but exit it in
	 * such a way that a high-availability setting would restart this or fail over
	 * to another master.
	 */
	protected abstract void fatalError(String message, Throwable error);

	/**
	 * Requests to allocate a certain number of new workers.
	 *
	 * @param numWorkers The number of workers to allocate.
	 */
	protected abstract void requestNewWorkers(int numWorkers);

	/**
	 * Trigger a release of a pending worker.
	 * @param resourceID The worker resource id
	 */
	protected abstract void releasePendingWorker(ResourceID resourceID);

	/**
	 * Trigger a release of a started worker.
	 * @param resourceID The worker resource id
	 */
	protected abstract void releaseStartedWorker(WorkerType resourceID);

	/**
	 * Callback when a worker was started.
	 * @param resourceID The worker resource id
	 */
	protected abstract WorkerType workerStarted(ResourceID resourceID);

	/**
	 * This method is called when the resource manager starts after a failure and reconnects to
	 * the leader JobManager, who still has some workers registered. The method is used to consolidate
	 * the view between resource manager and JobManager. The resource manager gets the list of TaskManagers
	 * that the JobManager considers available and should return a list or nodes that the
	 * resource manager considers available.
	 *
	 * After that, the JobManager is informed of loss of all TaskManagers that are not part of the
	 * returned list.
	 *
	 * It is possible that the resource manager initially confirms some TaskManagers to be alive, even
	 * through they are in an uncertain status, if it later sends necessary failure notifications
	 * via calling {@link #notifyWorkerFailed(ResourceID, String)}.
	 *
	 * @param registered The list of TaskManagers that the JobManager knows.
	 * @return The subset of TaskManagers that the resource manager can confirm to be alive.
	 */
	protected abstract Collection<WorkerType> reacceptRegisteredWorkers(Collection<ResourceID> registered);

	/**
	 * Gets the number of requested workers that have not yet been granted.
	 *
	 * @return The number pending worker requests. Must never be smaller than 0.
	 */
	protected abstract int getNumWorkerRequestsPending();

	/**
	 * Gets the number of containers that have been started, but where the TaskManager
	 * has not yet registered at the job manager.
	 *
	 * @return The number of started containers pending TaskManager registration.
	 * Must never be smaller than 0.
	 */
	protected abstract int getNumWorkersPendingRegistration();

	// ------------------------------------------------------------------------
	//  Info messaging
	// ------------------------------------------------------------------------

	protected void sendInfoMessage(String message) {
		for (ActorRef listener : infoMessageListeners) {
			listener.tell(decorateMessage(new InfoMessage(message)), self());
		}
	}


	// ------------------------------------------------------------------------
	//  Startup
	// ------------------------------------------------------------------------

	/**
	 * Starts the resource manager actors.
	 * @param configuration The configuration for the resource manager
	 * @param actorSystem The actor system to start the resource manager in
	 * @param leaderRetriever The leader retriever service to intialize the resource manager
	 * @param resourceManagerClass The class of the ResourceManager to be started
	 * @return ActorRef of the resource manager
	 */
	public static ActorRef startResourceManagerActors(
			Configuration configuration,
			ActorSystem actorSystem,
			LeaderRetrievalService leaderRetriever,
			Class<? extends FlinkResourceManager<?>> resourceManagerClass) {

		return startResourceManagerActors(
			configuration, actorSystem, leaderRetriever, resourceManagerClass,
			RESOURCE_MANAGER_NAME + "-" + UUID.randomUUID());
	}

	/**
	 * Starts the resource manager actors.
	 * @param configuration The configuration for the resource manager
	 * @param actorSystem The actor system to start the resource manager in
	 * @param leaderRetriever The leader retriever service to intialize the resource manager
	 * @param resourceManagerClass The class of the ResourceManager to be started
	 * @param resourceManagerActorName The name of the resource manager actor.
	 * @return ActorRef of the resource manager
	 */
	public static ActorRef startResourceManagerActors(
			Configuration configuration,
			ActorSystem actorSystem,
			LeaderRetrievalService leaderRetriever,
			Class<? extends FlinkResourceManager<?>> resourceManagerClass,
			String resourceManagerActorName) {

		Props resourceMasterProps = getResourceManagerProps(
			resourceManagerClass,
			configuration,
			leaderRetriever);

		return actorSystem.actorOf(resourceMasterProps, resourceManagerActorName);
	}

	public static Props getResourceManagerProps(
		Class<? extends FlinkResourceManager> resourceManagerClass,
		Configuration configuration,
		LeaderRetrievalService leaderRetrievalService) {

		return Props.create(resourceManagerClass, configuration, leaderRetrievalService);
	}
}