/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.clusterframework;
import akka.actor.ActorRef;
import akka.actor.ActorSelection;
import akka.actor.ActorSystem;
import akka.actor.Props;
import akka.dispatch.OnComplete;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.akka.FlinkUntypedActor;
import org.apache.flink.runtime.clusterframework.messages.CheckAndAllocateContainers;
import org.apache.flink.runtime.clusterframework.messages.FatalErrorOccurred;
import org.apache.flink.runtime.clusterframework.messages.InfoMessage;
import org.apache.flink.runtime.clusterframework.messages.NewLeaderAvailable;
import org.apache.flink.runtime.clusterframework.messages.NotifyResourceStarted;
import org.apache.flink.runtime.clusterframework.messages.RegisterInfoMessageListener;
import org.apache.flink.runtime.clusterframework.messages.RegisterInfoMessageListenerSuccessful;
import org.apache.flink.runtime.clusterframework.messages.RegisterResourceManager;
import org.apache.flink.runtime.clusterframework.messages.RegisterResourceManagerSuccessful;
import org.apache.flink.runtime.clusterframework.messages.RemoveResource;
import org.apache.flink.runtime.clusterframework.messages.ResourceRemoved;
import org.apache.flink.runtime.clusterframework.messages.SetWorkerPoolSize;
import org.apache.flink.runtime.clusterframework.messages.StopCluster;
import org.apache.flink.runtime.clusterframework.messages.StopClusterSuccessful;
import org.apache.flink.runtime.clusterframework.messages.TriggerRegistrationAtJobManager;
import org.apache.flink.runtime.clusterframework.messages.UnRegisterInfoMessageListener;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.ResourceIDRetrievable;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.JobManagerMessages.LeaderSessionMessage;
import org.apache.flink.util.Preconditions;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import static java.util.Objects.requireNonNull;
/**
 * Abstract base class for resource managers that acquire and release worker
 * (TaskManager) containers for a Flink cluster and keep the leading JobManager
 * informed about the available resources.
 *
 * <h1>Worker allocation steps</h1>
*
* <ol>
* <li>The resource manager decides to request more workers. This can happen in order
* to fill the initial pool, or as a result of the JobManager requesting more workers.</li>
*
* <li>The resource master calls {@link #requestNewWorkers(int)}, which triggers requests
* for more containers. After that, the {@link #getNumWorkerRequestsPending()}
* should reflect the pending requests.</li>
*
* <li>The concrete framework may acquire containers and then trigger to start TaskManagers
* in those containers. That should be reflected in {@link #getNumWorkersPendingRegistration()}.</li>
*
* <li>At some point, the TaskManager processes will have started and send a registration
* message to the JobManager. The JobManager will perform
* a lookup with the ResourceManager to check if it really started this TaskManager.
* The method {@link #workerStarted(ResourceID)} will be called
* to inform about a registered worker.</li>
* </ol>
*
*/
public abstract class FlinkResourceManager<WorkerType extends ResourceIDRetrievable> extends FlinkUntypedActor {

	/** The exit code with which the process is stopped in case of a fatal error. */
	protected static final int EXIT_CODE_FATAL_ERROR = -13;

	/** The default name of the resource manager actor. */
	public static final String RESOURCE_MANAGER_NAME = "resourcemanager";

	// ------------------------------------------------------------------------

	/** The Flink configuration object. */
	protected final Configuration config;

	/** The timeout for actor messages sent to the JobManager / TaskManagers. */
	private final FiniteDuration messageTimeout;

	/** The service to find the right leader JobManager (to support high availability). */
	private final LeaderRetrievalService leaderRetriever;

	/** Map which contains the workers from which we know that they have been successfully started
	 * in a container. This notification is sent by the JM when a TM tries to register at it. */
	private final Map<ResourceID, WorkerType> startedWorkers;

	/** List of listeners for info messages. Cleared on every leader change, because the
	 * listeners registered against a specific JobManager leader. */
	private final Set<ActorRef> infoMessageListeners;

	/** The JobManager that the framework master manages resources for.
	 * Null while no leader is known or while re-registration is in progress. */
	private ActorRef jobManager;

	/** Our JobManager's leader session. Null while no leader is known. */
	private UUID leaderSessionID;

	/** The size of the worker pool that the resource master strives to maintain. */
	private int designatedPoolSize;

	// ------------------------------------------------------------------------

	/**
	 * Creates a FlinkResourceManager actor.
	 *
	 * @param numInitialTaskManagers The number of TaskManagers to allocate initially;
	 *                               becomes the initial designated worker pool size.
	 * @param flinkConfig The Flink configuration object.
	 * @param leaderRetriever The service used to be notified about JobManager leader changes.
	 */
	protected FlinkResourceManager(
			int numInitialTaskManagers,
			Configuration flinkConfig,
			LeaderRetrievalService leaderRetriever) {
		this.config = requireNonNull(flinkConfig);
		this.leaderRetriever = requireNonNull(leaderRetriever);
		this.startedWorkers = new HashMap<>();

		// fall back to the default lookup timeout if the configured value cannot be parsed
		FiniteDuration lt;
		try {
			lt = AkkaUtils.getLookupTimeout(config);
		}
		catch (Exception e) {
			lt = new FiniteDuration(
				Duration.apply(ConfigConstants.DEFAULT_AKKA_LOOKUP_TIMEOUT).toMillis(),
				TimeUnit.MILLISECONDS);
		}
		this.messageTimeout = lt;
		this.designatedPoolSize = numInitialTaskManagers;
		this.infoMessageListeners = new HashSet<>();
	}

	// ------------------------------------------------------------------------
	//  Actor Behavior
	// ------------------------------------------------------------------------

	@Override
	public void preStart() {
		try {
			// we start our leader retrieval service to make sure we get informed
			// about JobManager leader changes
			leaderRetriever.start(new LeaderRetrievalListener() {

				@Override
				public void notifyLeaderAddress(String leaderAddress, UUID leaderSessionID) {
					// forward into the actor mailbox; never mutate actor state from this callback thread
					self().tell(
						new NewLeaderAvailable(leaderAddress, leaderSessionID),
						ActorRef.noSender());
				}

				@Override
				public void handleError(Exception e) {
					self().tell(
						new FatalErrorOccurred("Leader retrieval service failed", e),
						ActorRef.noSender());
				}
			});

			// framework specific initialization
			initialize();
		}
		catch (Throwable t) {
			// route startup failures through the regular fatal-error handling
			self().tell(
				new FatalErrorOccurred("Error during startup of ResourceManager actor", t),
				ActorRef.noSender());
		}
	}

	@Override
	public void postStop() {
		try {
			leaderRetriever.stop();
		}
		catch (Throwable t) {
			// best effort: never let cleanup failures escape actor shutdown
			LOG.error("Could not cleanly shut down leader retrieval service", t);
		}
	}

	/**
	 * This method receives the actor messages after they have been filtered for
	 * a match with the leader session.
	 *
	 * @param message The incoming actor message.
	 */
	@Override
	protected void handleMessage(Object message) {
		try {
			// --- messages about worker allocation and pool sizes

			if (message instanceof CheckAndAllocateContainers) {
				checkWorkersPool();
			}
			else if (message instanceof SetWorkerPoolSize) {
				SetWorkerPoolSize msg = (SetWorkerPoolSize) message;
				adjustDesignatedNumberOfWorkers(msg.numberOfWorkers());
			}
			else if (message instanceof RemoveResource) {
				RemoveResource msg = (RemoveResource) message;
				removeRegisteredResource(msg.resourceId());
			}

			// --- lookup of registered resources

			else if (message instanceof NotifyResourceStarted) {
				NotifyResourceStarted msg = (NotifyResourceStarted) message;
				handleResourceStarted(sender(), msg.getResourceID());
			}

			// --- messages about JobManager leader status and registration

			else if (message instanceof NewLeaderAvailable) {
				NewLeaderAvailable msg = (NewLeaderAvailable) message;
				newJobManagerLeaderAvailable(msg.leaderAddress(), msg.leaderSessionId());
			}
			else if (message instanceof TriggerRegistrationAtJobManager) {
				TriggerRegistrationAtJobManager msg = (TriggerRegistrationAtJobManager) message;
				triggerConnectingToJobManager(msg.jobManagerAddress());
			}
			else if (message instanceof RegisterResourceManagerSuccessful) {
				RegisterResourceManagerSuccessful msg = (RegisterResourceManagerSuccessful) message;
				jobManagerLeaderConnected(msg.jobManager(), msg.currentlyRegisteredTaskManagers());
			}

			// --- end of application

			else if (message instanceof StopCluster) {
				StopCluster msg = (StopCluster) message;
				shutdownCluster(msg.finalStatus(), msg.message());
				sender().tell(decorateMessage(StopClusterSuccessful.getInstance()), ActorRef.noSender());
			}

			// --- miscellaneous messages

			else if (message instanceof RegisterInfoMessageListener) {
				// listeners are only accepted once we are associated with a leader,
				// because the acknowledgement is sent on behalf of the JobManager
				if (jobManager != null) {
					infoMessageListeners.add(sender());
					sender().tell(decorateMessage(
						RegisterInfoMessageListenerSuccessful.get()),
						// answer as the JobManager
						jobManager);
				}
			}
			else if (message instanceof UnRegisterInfoMessageListener) {
				infoMessageListeners.remove(sender());
			}
			else if (message instanceof FatalErrorOccurred) {
				FatalErrorOccurred fatalErrorOccurred = (FatalErrorOccurred) message;
				fatalError(fatalErrorOccurred.message(), fatalErrorOccurred.error());
			}

			// --- unknown messages

			else {
				LOG.error("Discarding unknown message: {}", message);
			}
		}
		catch (Throwable t) {
			// fatal error, needs master recovery
			fatalError("Error processing actor message", t);
		}
	}

	@Override
	protected final UUID getLeaderSessionID() {
		return leaderSessionID;
	}

	// ------------------------------------------------------------------------
	//  Status
	// ------------------------------------------------------------------------

	/**
	 * Gets the current designated worker pool size, meaning the number of workers
	 * that the resource master strives to maintain. The actual number of workers
	 * may be lower (if worker requests are still pending) or higher (if workers have
	 * not yet been released).
	 *
	 * @return The designated worker pool size.
	 */
	public int getDesignatedWorkerPoolSize() {
		return designatedPoolSize;
	}

	/**
	 * Gets the number of currently started TaskManagers.
	 *
	 * @return The number of currently started TaskManagers.
	 */
	public int getNumberOfStartedTaskManagers() {
		return startedWorkers.size();
	}

	/**
	 * Gets the currently registered resources.
	 *
	 * @return A collection of the workers that have been confirmed as started.
	 */
	public Collection<WorkerType> getStartedTaskManagers() {
		return startedWorkers.values();
	}

	/**
	 * Checks whether a started worker is known for the given resource ID.
	 *
	 * @param resourceId The resource ID for the worker.
	 * @return True if a started worker is registered under that ID, otherwise false.
	 */
	public boolean isStarted(ResourceID resourceId) {
		return startedWorkers.containsKey(resourceId);
	}

	/**
	 * Gets an iterable for all currently started TaskManagers.
	 *
	 * @return All currently started TaskManagers.
	 */
	public Collection<WorkerType> allStartedWorkers() {
		return startedWorkers.values();
	}

	/**
	 * Tells the ResourceManager that a TaskManager had been started in a container with the given
	 * resource id. Always acknowledges back to the sender, even for duplicate or unknown resources.
	 *
	 * @param sender The sender (JobManager) of the message.
	 * @param resourceID The resource id of the started TaskManager.
	 */
	private void handleResourceStarted(ActorRef sender, ResourceID resourceID) {
		if (resourceID != null) {
			// check if resourceID is already registered (TaskManager may send duplicate register messages)
			WorkerType oldWorker = startedWorkers.get(resourceID);
			if (oldWorker != null) {
				LOG.debug("Notification that TaskManager {} had been started was sent before.", resourceID);
			} else {
				WorkerType newWorker = workerStarted(resourceID);

				if (newWorker != null) {
					startedWorkers.put(resourceID, newWorker);
					LOG.info("TaskManager {} has started.", resourceID);
				} else {
					LOG.info("TaskManager {} has not been started by this resource manager.", resourceID);
				}
			}
		}

		// Acknowledge the resource registration
		sender.tell(decorateMessage(Acknowledge.get()), self());
	}

	/**
	 * Releases the given resource. Note that this does not automatically shrink
	 * the designated worker pool size.
	 *
	 * @param resourceId The TaskManager's resource id.
	 */
	private void removeRegisteredResource(ResourceID resourceId) {
		WorkerType worker = startedWorkers.remove(resourceId);
		if (worker != null) {
			releaseStartedWorker(worker);
		} else {
			LOG.warn("Resource {} could not be released", resourceId);
		}
	}

	// ------------------------------------------------------------------------
	//  Registration and consolidation with JobManager Leader
	// ------------------------------------------------------------------------

	/**
	 * Called as soon as we discover (via leader election) that a JobManager lost leadership
	 * or a different one gained leadership.
	 *
	 * @param leaderAddress The address (Akka URL) of the new leader. Null if there is currently no leader.
	 * @param leaderSessionID The unique session ID marking the leadership session.
	 */
	private void newJobManagerLeaderAvailable(String leaderAddress, UUID leaderSessionID) {
		LOG.debug("Received new leading JobManager {}. Connecting.", leaderAddress);

		// disconnect from the current leader (no-op if no leader yet)
		jobManagerLostLeadership();

		// a null leader session id means that only a leader disconnect
		// happened, without a new leader yet
		if (leaderSessionID != null && leaderAddress != null) {
			// the leaderSessionID implicitly filters out success and failure messages
			// that come after leadership changed again
			this.leaderSessionID = leaderSessionID;
			triggerConnectingToJobManager(leaderAddress);
		}
	}

	/**
	 * Causes the resource manager to announce itself at the new leader JobManager and
	 * obtains its connection information and currently known TaskManagers.
	 *
	 * @param leaderAddress The akka actor URL of the new leader JobManager.
	 */
	protected void triggerConnectingToJobManager(String leaderAddress) {
		LOG.info("Trying to associate with JobManager leader " + leaderAddress);

		final Object registerMessage = decorateMessage(new RegisterResourceManager(self()));
		final Object retryMessage = decorateMessage(new TriggerRegistrationAtJobManager(leaderAddress));

		// send the registration message to the JobManager
		ActorSelection jobManagerSel = context().actorSelection(leaderAddress);
		Future<Object> future = Patterns.ask(jobManagerSel, registerMessage, new Timeout(messageTimeout));

		future.onComplete(new OnComplete<Object>() {

			@Override
			public void onComplete(Throwable failure, Object msg) {
				// only process if we haven't been connected in the meantime
				if (jobManager == null) {
					if (msg != null) {
						if (msg instanceof LeaderSessionMessage &&
							((LeaderSessionMessage) msg).message() instanceof RegisterResourceManagerSuccessful) {
							// re-inject into the mailbox so the success is handled on the actor thread
							self().tell(msg, ActorRef.noSender());
						} else {
							LOG.error("Invalid response type to registration at JobManager: {}", msg);
							self().tell(retryMessage, ActorRef.noSender());
						}
					} else {
						// no success
						LOG.error("Resource manager could not register at JobManager", failure);
						self().tell(retryMessage, ActorRef.noSender());
					}
				}
			}
		}, context().dispatcher());
	}

	/**
	 * This method disassociates from the current leader JobManager.
	 */
	private void jobManagerLostLeadership() {
		if (jobManager != null) {
			LOG.info("Associated JobManager {} lost leader status", jobManager);

			jobManager = null;
			leaderSessionID = null;
			// listeners registered against the old leader; they must re-register
			infoMessageListeners.clear();
		}
	}

	/**
	 * Callback when we're informed about a new leading JobManager.
	 *
	 * @param newJobManagerLeader The ActorRef of the new jobManager.
	 * @param workers The existing workers the JobManager has registered.
	 */
	private void jobManagerLeaderConnected(
				ActorRef newJobManagerLeader,
				Collection<ResourceID> workers) {

		if (jobManager == null) {
			LOG.info("Resource Manager associating with leading JobManager {} - leader session {}",
				newJobManagerLeader, leaderSessionID);

			jobManager = newJobManagerLeader;

			if (workers.size() > 0) {
				LOG.info("Received TaskManagers that were registered at the leader JobManager. " +
					"Trying to consolidate.");

				// keep track of which TaskManagers are not handled
				Set<ResourceID> toHandle = new HashSet<>(workers.size());
				toHandle.addAll(workers);

				try {
					// ask the framework to tell us which ones we should keep for now
					Collection<WorkerType> consolidated = reacceptRegisteredWorkers(workers);
					LOG.info("Consolidated {} TaskManagers", consolidated.size());

					// put the consolidated TaskManagers into our bookkeeping
					for (WorkerType worker : consolidated) {
						ResourceID resourceID = worker.getResourceID();
						startedWorkers.put(resourceID, worker);
						toHandle.remove(resourceID);
					}
				}
				catch (Throwable t) {
					LOG.error("Error during consolidation of known TaskManagers", t);
					// the framework should release the remaining unclear resources
					for (ResourceID id : toHandle) {
						releasePendingWorker(id);
					}
				}
			}

			// trigger initial check for requesting new workers
			checkWorkersPool();
		} else {
			String msg = "Attempting to associate with new JobManager leader " + newJobManagerLeader
				+ " without previously disassociating from current leader " + jobManager;
			fatalError(msg, new Exception(msg));
		}
	}

	// ------------------------------------------------------------------------
	//  ClusterClient Shutdown
	// ------------------------------------------------------------------------

	private void shutdownCluster(ApplicationStatus status, String diagnostics) {
		LOG.info("Shutting down cluster with status {} : {}", status, diagnostics);

		// framework specific shutdown (reports final status, returns containers)
		shutdownApplication(status, diagnostics);
	}

	// ------------------------------------------------------------------------
	//  Worker pool size management
	// ------------------------------------------------------------------------

	/**
	 * This method causes the resource framework master to <b>synchronously</b> re-examine
	 * the set of available and pending workers containers, and allocate containers
	 * if needed.
	 *
	 * This method does not automatically release workers, because it is not visible to
	 * this resource master which workers can be released. Instead, the JobManager must
	 * explicitly release individual workers.
	 */
	private void checkWorkersPool() {
		int numWorkersPending = getNumWorkerRequestsPending();
		int numWorkersPendingRegistration = getNumWorkersPendingRegistration();

		// sanity checks
		Preconditions.checkState(numWorkersPending >= 0,
			"Number of pending workers should never be below 0.");
		Preconditions.checkState(numWorkersPendingRegistration >= 0,
			"Number of pending workers pending registration should never be below 0.");

		// see how many workers we want, and whether we have enough
		int allAvailableAndPending = startedWorkers.size() +
			numWorkersPending + numWorkersPendingRegistration;

		int missing = designatedPoolSize - allAvailableAndPending;

		if (missing > 0) {
			requestNewWorkers(missing);
		}
	}

	/**
	 * Sets the designated worker pool size. If this size is larger than the current pool
	 * size, then the resource manager will try to acquire more TaskManagers.
	 *
	 * @param num The number of workers in the pool.
	 */
	private void adjustDesignatedNumberOfWorkers(int num) {
		if (num >= 0) {
			LOG.info("Adjusting designated worker pool size to {}", num);
			designatedPoolSize = num;
			checkWorkersPool();
		} else {
			LOG.warn("Ignoring invalid designated worker pool size: " + num);
		}
	}

	// ------------------------------------------------------------------------
	//  Callbacks
	// ------------------------------------------------------------------------

	/**
	 * This method causes the resource framework master to <b>asynchronously</b> re-examine
	 * the set of available and pending workers containers, and release or allocate
	 * containers if needed. The method sends an actor message which will trigger the
	 * re-examination.
	 */
	public void triggerCheckWorkers() {
		self().tell(
			decorateMessage(
				CheckAndAllocateContainers.get()),
			ActorRef.noSender());
	}

	/**
	 * This method should be called by the framework once it detects that a currently registered
	 * worker has failed.
	 *
	 * @param resourceID Id of the worker that has failed.
	 * @param message An informational message that explains why the worker failed.
	 */
	public void notifyWorkerFailed(ResourceID resourceID, String message) {
		WorkerType worker = startedWorkers.remove(resourceID);
		if (worker != null) {
			if (jobManager != null) {
				jobManager.tell(
					decorateMessage(
						new ResourceRemoved(resourceID, message)),
					self());
			} else {
				// no JobManager leader is currently associated (e.g. during a leader change);
				// a newly connecting leader re-consolidates its worker view via
				// reacceptRegisteredWorkers(), so we only log the failure here
				LOG.warn("Worker {} failed, but no JobManager is currently associated: {}",
					resourceID, message);
			}
		}
	}

	// ------------------------------------------------------------------------
	//  Framework specific behavior
	// ------------------------------------------------------------------------

	/**
	 * Initializes the framework specific components.
	 *
	 * @throws Exception Exceptions during initialization cause the resource manager to fail.
	 *                   If the framework is able to recover this resource manager, it will be
	 *                   restarted.
	 */
	protected abstract void initialize() throws Exception;

	/**
	 * The framework specific code for shutting down the application. This should report the
	 * application's final status and shut down the resource manager cleanly.
	 *
	 * This method also needs to make sure all pending containers that are not registered
	 * yet are returned.
	 *
	 * @param finalStatus The application status to report.
	 * @param optionalDiagnostics An optional diagnostics message.
	 */
	protected abstract void shutdownApplication(ApplicationStatus finalStatus, String optionalDiagnostics);

	/**
	 * Notifies the resource master of a fatal error.
	 *
	 * <p><b>IMPORTANT:</b> This should not cleanly shut down this master, but exit it in
	 * such a way that a high-availability setting would restart this or fail over
	 * to another master.
	 */
	protected abstract void fatalError(String message, Throwable error);

	/**
	 * Requests to allocate a certain number of new workers.
	 *
	 * @param numWorkers The number of workers to allocate.
	 */
	protected abstract void requestNewWorkers(int numWorkers);

	/**
	 * Trigger a release of a pending worker (one that was requested but has not registered yet).
	 *
	 * @param resourceID The worker resource id.
	 */
	protected abstract void releasePendingWorker(ResourceID resourceID);

	/**
	 * Trigger a release of a started worker.
	 *
	 * @param worker The worker to release.
	 */
	protected abstract void releaseStartedWorker(WorkerType worker);

	/**
	 * Callback when a worker was started.
	 *
	 * @param resourceID The worker resource id
	 * @return The worker that was started under the given resource id, or null if this
	 *         resource manager did not start a worker with that id.
	 */
	protected abstract WorkerType workerStarted(ResourceID resourceID);

	/**
	 * This method is called when the resource manager starts after a failure and reconnects to
	 * the leader JobManager, who still has some workers registered. The method is used to consolidate
	 * the view between resource manager and JobManager. The resource manager gets the list of TaskManagers
	 * that the JobManager considers available and should return a list or nodes that the
	 * resource manager considers available.
	 *
	 * After that, the JobManager is informed of loss of all TaskManagers that are not part of the
	 * returned list.
	 *
	 * It is possible that the resource manager initially confirms some TaskManagers to be alive, even
	 * through they are in an uncertain status, if it later sends necessary failure notifications
	 * via calling {@link #notifyWorkerFailed(ResourceID, String)}.
	 *
	 * @param registered The list of TaskManagers that the JobManager knows.
	 * @return The subset of TaskManagers that the resource manager can confirm to be alive.
	 */
	protected abstract Collection<WorkerType> reacceptRegisteredWorkers(Collection<ResourceID> registered);

	/**
	 * Gets the number of requested workers that have not yet been granted.
	 *
	 * @return The number pending worker requests. Must never be smaller than 0.
	 */
	protected abstract int getNumWorkerRequestsPending();

	/**
	 * Gets the number of containers that have been started, but where the TaskManager
	 * has not yet registered at the job manager.
	 *
	 * @return The number of started containers pending TaskManager registration.
	 *         Must never be smaller than 0.
	 */
	protected abstract int getNumWorkersPendingRegistration();

	// ------------------------------------------------------------------------
	//  Info messaging
	// ------------------------------------------------------------------------

	protected void sendInfoMessage(String message) {
		for (ActorRef listener : infoMessageListeners) {
			listener.tell(decorateMessage(new InfoMessage(message)), self());
		}
	}

	// ------------------------------------------------------------------------
	//  Startup
	// ------------------------------------------------------------------------

	/**
	 * Starts the resource manager actors under a randomized default name.
	 *
	 * @param configuration The configuration for the resource manager
	 * @param actorSystem The actor system to start the resource manager in
	 * @param leaderRetriever The leader retriever service to initialize the resource manager
	 * @param resourceManagerClass The class of the ResourceManager to be started
	 * @return ActorRef of the resource manager
	 */
	public static ActorRef startResourceManagerActors(
			Configuration configuration,
			ActorSystem actorSystem,
			LeaderRetrievalService leaderRetriever,
			Class<? extends FlinkResourceManager<?>> resourceManagerClass) {

		// randomize the actor name to avoid clashes when several managers run in one actor system
		return startResourceManagerActors(
			configuration, actorSystem, leaderRetriever, resourceManagerClass,
			RESOURCE_MANAGER_NAME + "-" + UUID.randomUUID());
	}

	/**
	 * Starts the resource manager actors.
	 *
	 * @param configuration The configuration for the resource manager
	 * @param actorSystem The actor system to start the resource manager in
	 * @param leaderRetriever The leader retriever service to initialize the resource manager
	 * @param resourceManagerClass The class of the ResourceManager to be started
	 * @param resourceManagerActorName The name of the resource manager actor.
	 * @return ActorRef of the resource manager
	 */
	public static ActorRef startResourceManagerActors(
			Configuration configuration,
			ActorSystem actorSystem,
			LeaderRetrievalService leaderRetriever,
			Class<? extends FlinkResourceManager<?>> resourceManagerClass,
			String resourceManagerActorName) {

		Props resourceMasterProps = getResourceManagerProps(
			resourceManagerClass,
			configuration,
			leaderRetriever);

		return actorSystem.actorOf(resourceMasterProps, resourceManagerActorName);
	}

	/**
	 * Creates the Akka {@link Props} used to instantiate the given resource manager class
	 * with the expected (Configuration, LeaderRetrievalService) constructor arguments.
	 *
	 * @param resourceManagerClass The concrete resource manager class to instantiate.
	 * @param configuration The configuration passed to the resource manager constructor.
	 * @param leaderRetrievalService The leader retrieval service passed to the constructor.
	 * @return The Props for the resource manager actor.
	 */
	public static Props getResourceManagerProps(
			Class<? extends FlinkResourceManager> resourceManagerClass,
			Configuration configuration,
			LeaderRetrievalService leaderRetrievalService) {

		return Props.create(resourceManagerClass, configuration, leaderRetrievalService);
	}
}