/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.mesos.runtime.clusterframework;
import akka.actor.ActorRef;
import akka.actor.Props;
import com.netflix.fenzo.TaskRequest;
import com.netflix.fenzo.TaskScheduler;
import com.netflix.fenzo.VirtualMachineLease;
import com.netflix.fenzo.functions.Action1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.IllegalConfigurationException;
import org.apache.flink.mesos.runtime.clusterframework.store.MesosWorkerStore;
import org.apache.flink.mesos.scheduler.ConnectionMonitor;
import org.apache.flink.mesos.scheduler.LaunchableTask;
import org.apache.flink.mesos.scheduler.LaunchCoordinator;
import org.apache.flink.mesos.scheduler.ReconciliationCoordinator;
import org.apache.flink.mesos.scheduler.SchedulerProxy;
import org.apache.flink.mesos.scheduler.TaskMonitor;
import org.apache.flink.mesos.scheduler.TaskSchedulerBuilder;
import org.apache.flink.mesos.scheduler.Tasks;
import org.apache.flink.mesos.scheduler.messages.AcceptOffers;
import org.apache.flink.mesos.scheduler.messages.Disconnected;
import org.apache.flink.mesos.scheduler.messages.Error;
import org.apache.flink.mesos.scheduler.messages.OfferRescinded;
import org.apache.flink.mesos.scheduler.messages.ReRegistered;
import org.apache.flink.mesos.scheduler.messages.Registered;
import org.apache.flink.mesos.scheduler.messages.ResourceOffers;
import org.apache.flink.mesos.scheduler.messages.StatusUpdate;
import org.apache.flink.mesos.util.MesosArtifactResolver;
import org.apache.flink.mesos.util.MesosConfiguration;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.FlinkResourceManager;
import org.apache.flink.runtime.clusterframework.ContainerSpecification;
import org.apache.flink.runtime.clusterframework.messages.FatalErrorOccurred;
import org.apache.flink.runtime.clusterframework.messages.StopCluster;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.mesos.Protos;
import org.apache.mesos.Protos.FrameworkInfo;
import org.apache.mesos.SchedulerDriver;
import org.slf4j.Logger;
import scala.Option;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import static java.util.Objects.requireNonNull;
/**
* Flink Resource Manager for Apache Mesos.
*/
public class MesosFlinkResourceManager extends FlinkResourceManager<RegisteredMesosWorkerNode> {
/** The Mesos configuration (master and framework info). */
private final MesosConfiguration mesosConfig;
/** The TaskManager container parameters (like container memory size). */
private final MesosTaskManagerParameters taskManagerParameters;
/** Container specification for launching a TM. */
private final ContainerSpecification taskManagerContainerSpec;
/** Resolver for HTTP artifacts. */
private final MesosArtifactResolver artifactResolver;
/** Number of failed Mesos tasks before stopping the application. -1 means infinite. */
private final int maxFailedTasks;
/** Callback handler for the asynchronous Mesos scheduler; created in {@code initialize()}. */
private SchedulerProxy schedulerCallbackHandler;
/** Mesos scheduler driver; created and started in {@code initialize()}. */
private SchedulerDriver schedulerDriver;
/** Child actor that is told about registration, re-registration and disconnection events. */
private ActorRef connectionMonitor;
/** Child actor that receives task goal state updates and task status updates. */
private ActorRef taskRouter;
/** Child actor that receives resource offers and launch/assign/unassign requests. */
private ActorRef launchCoordinator;
/** Child actor that receives reconciliation requests and task status updates. */
private ActorRef reconciliationCoordinator;
/** Persistent store holding the framework ID and the worker records. */
private final MesosWorkerStore workerStore;
// planning state related to workers - package private for unit test purposes
/** Workers that were requested but for which no offer has been accepted yet, by resource ID. */
final Map<ResourceID, MesosWorkerStore.Worker> workersInNew;
/** Workers whose launch was accepted but which have not been registered yet, by resource ID. */
final Map<ResourceID, MesosWorkerStore.Worker> workersInLaunch;
/** Workers that were released and whose termination notice is still outstanding, by resource ID. */
final Map<ResourceID, MesosWorkerStore.Worker> workersBeingReturned;
/** The number of failed tasks since the master became active. */
private int failedTasksSoFar;
/**
 * Creates the resource manager with its collaborators and empty planning state.
 *
 * <p>No Mesos interaction happens here; the scheduler driver and the supporting
 * actors are created later in {@code initialize()}.
 *
 * @param flinkConfig The Flink configuration, handed to the base class.
 * @param mesosConfig The Mesos configuration; must not be null.
 * @param workerStore The persistent worker store; must not be null.
 * @param leaderRetrievalService The leader retrieval service, handed to the base class.
 * @param taskManagerParameters The TaskManager container parameters; must not be null.
 * @param taskManagerContainerSpec The TaskManager container specification; must not be null.
 * @param artifactResolver The artifact resolver; must not be null.
 * @param maxFailedTasks The number of tolerated task failures; a negative value disables the limit.
 * @param numInitialTaskManagers The initial number of TaskManagers, handed to the base class.
 */
public MesosFlinkResourceManager(
        Configuration flinkConfig,
        MesosConfiguration mesosConfig,
        MesosWorkerStore workerStore,
        LeaderRetrievalService leaderRetrievalService,
        MesosTaskManagerParameters taskManagerParameters,
        ContainerSpecification taskManagerContainerSpec,
        MesosArtifactResolver artifactResolver,
        int maxFailedTasks,
        int numInitialTaskManagers) {

    super(numInitialTaskManagers, flinkConfig, leaderRetrievalService);

    // mandatory collaborators (fail fast on null)
    this.mesosConfig = requireNonNull(mesosConfig);
    this.workerStore = requireNonNull(workerStore);
    this.artifactResolver = requireNonNull(artifactResolver);
    this.taskManagerParameters = requireNonNull(taskManagerParameters);
    this.taskManagerContainerSpec = requireNonNull(taskManagerContainerSpec);

    // failure tolerance
    this.maxFailedTasks = maxFailedTasks;

    // planning state starts out empty; recovery may populate it during initialization
    this.workersInNew = new HashMap<>();
    this.workersInLaunch = new HashMap<>();
    this.workersBeingReturned = new HashMap<>();
}
// ------------------------------------------------------------------------
// Mesos-specific behavior
// ------------------------------------------------------------------------
/**
 * Starts the worker store, creates the Mesos scheduler driver and the supporting
 * actors, recovers persisted workers and finally starts the driver.
 *
 * <p>If the worker store already holds a framework ID, that ID is placed into the
 * framework info so that the driver re-registers instead of registering anew.
 *
 * @throws Exception if the worker store or the recovery of workers fails.
 */
@Override
protected void initialize() throws Exception {
    LOG.info("Initializing Mesos resource master");

    workerStore.start();

    // callback handler that is handed to the scheduler driver below
    schedulerCallbackHandler = new SchedulerProxy(self());

    // framework info used to register with Mesos
    final FrameworkInfo.Builder frameworkInfoBuilder = mesosConfig.frameworkInfo()
        .clone()
        .setCheckpoint(true);

    final Option<Protos.FrameworkID> storedFrameworkID = workerStore.getFrameworkID();
    if (storedFrameworkID.isDefined()) {
        LOG.info("Recovery scenario: re-registering using framework ID {}.", storedFrameworkID.get().getValue());
        frameworkInfoBuilder.setId(storedFrameworkID.get());
    } else {
        LOG.info("Registering as new framework.");
    }

    final MesosConfiguration effectiveMesosConfig = mesosConfig.withFrameworkInfo(frameworkInfoBuilder);
    MesosConfiguration.logMesosConfig(LOG, effectiveMesosConfig);
    schedulerDriver = effectiveMesosConfig.createDriver(schedulerCallbackHandler, false);

    // supporting actors; several of them are constructed with the driver created above
    connectionMonitor = createConnectionMonitor();
    launchCoordinator = createLaunchCoordinator();
    reconciliationCoordinator = createReconciliationCoordinator();
    taskRouter = createTaskRouter();

    // recovery talks to the task router and the launch coordinator, so it runs after their creation
    recoverWorkers();

    connectionMonitor.tell(new ConnectionMonitor.Start(), self());
    schedulerDriver.start();
}
/**
 * Creates the connection monitor as a child actor named "connectionMonitor".
 * Protected so that subclasses can substitute the actor.
 *
 * @return the reference to the created actor.
 */
protected ActorRef createConnectionMonitor() {
    final Props monitorProps = ConnectionMonitor.createActorProps(ConnectionMonitor.class, config);
    return context().actorOf(monitorProps, "connectionMonitor");
}
/**
 * Creates the task router as a child actor named "tasks".
 * The props are built with the current scheduler driver, so the driver must exist already.
 *
 * @return the reference to the created actor.
 */
protected ActorRef createTaskRouter() {
    final Props routerProps =
        Tasks.createActorProps(Tasks.class, config, schedulerDriver, TaskMonitor.class);
    return context().actorOf(routerProps, "tasks");
}
/**
 * Creates the launch coordinator as a child actor named "launchCoordinator".
 * The props are built with this actor's reference, the current scheduler driver
 * and a freshly created optimizer builder.
 *
 * @return the reference to the created actor.
 */
protected ActorRef createLaunchCoordinator() {
    final Props coordinatorProps = LaunchCoordinator.createActorProps(
        LaunchCoordinator.class, self(), config, schedulerDriver, createOptimizer());
    return context().actorOf(coordinatorProps, "launchCoordinator");
}
/**
 * Creates the reconciliation coordinator as a child actor named "reconciliationCoordinator".
 * The props are built with the current scheduler driver, so the driver must exist already.
 *
 * @return the reference to the created actor.
 */
protected ActorRef createReconciliationCoordinator() {
    final Props coordinatorProps = ReconciliationCoordinator.createActorProps(
        ReconciliationCoordinator.class, config, schedulerDriver);
    return context().actorOf(coordinatorProps, "reconciliationCoordinator");
}
/**
 * Logs that the resource master is stopping and delegates to the base class.
 * Neither the scheduler driver nor the worker store is stopped here; that only
 * happens in {@code shutdownApplication}.
 */
@Override
public void postStop() {
LOG.info("Stopping Mesos resource master");
super.postStop();
}
// ------------------------------------------------------------------------
// Actor messages
// ------------------------------------------------------------------------
/**
 * Dispatches an incoming actor message.
 *
 * <p>Mesos-specific messages (connection, offers, tasks) are handled here or forwarded
 * to the responsible supporting actor; every other message is passed on to the
 * generic resource master logic of the base class.
 *
 * @param message The received message.
 */
@Override
protected void handleMessage(Object message) {
    // --- messages about Mesos connection
    if (message instanceof Registered) {
        registered((Registered) message);
        return;
    }
    if (message instanceof ReRegistered) {
        reregistered((ReRegistered) message);
        return;
    }
    if (message instanceof Disconnected) {
        disconnected((Disconnected) message);
        return;
    }
    if (message instanceof Error) {
        error(((Error) message).message());
        return;
    }

    // --- messages about offers
    if (message instanceof ResourceOffers || message instanceof OfferRescinded) {
        launchCoordinator.tell(message, self());
        return;
    }
    if (message instanceof AcceptOffers) {
        acceptOffers((AcceptOffers) message);
        return;
    }

    // --- messages about tasks
    if (message instanceof StatusUpdate) {
        taskStatusUpdated((StatusUpdate) message);
        return;
    }
    if (message instanceof ReconciliationCoordinator.Reconcile) {
        // a reconciliation request from a task
        reconciliationCoordinator.tell(message, self());
        return;
    }
    if (message instanceof TaskMonitor.TaskTerminated) {
        // a termination message from a task
        final TaskMonitor.TaskTerminated terminated = (TaskMonitor.TaskTerminated) message;
        taskTerminated(terminated.taskID(), terminated.status());
        return;
    }

    // not Mesos-specific: handled by the generic resource master code
    super.handleMessage(message);
}
/**
 * Called to shut down the cluster (not a failover situation).
 *
 * <p>Stops the scheduler driver and the worker store and then stops this actor.
 * Failures of the first two steps are logged as warnings and do not prevent the
 * remaining steps. Neither parameter is used by this implementation.
 *
 * @param finalStatus The application status to report.
 * @param optionalDiagnostics An optional diagnostics message.
 */
@Override
protected void shutdownApplication(ApplicationStatus finalStatus, String optionalDiagnostics) {
    LOG.info("Shutting down and unregistering as a Mesos framework.");

    // step 1: unregister the framework, which implicitly removes all tasks.
    try {
        schedulerDriver.stop(false);
    } catch (Exception driverFailure) {
        LOG.warn("unable to unregister the framework", driverFailure);
    }

    // step 2: stop the worker state store
    try {
        workerStore.stop(true);
    } catch (Exception storeFailure) {
        LOG.warn("unable to stop the worker state store", storeFailure);
    }

    // step 3: stop this actor
    context().stop(self());
}
/**
 * Logs the fatal error and terminates the JVM with {@code EXIT_CODE_FATAL_ERROR}.
 *
 * <p>The framework is deliberately not unregistered: the hard process exit is meant
 * to make an external supervisor (the dispatcher) restart the process.
 *
 * @param message A description of the error.
 * @param error The cause, logged together with the message.
 */
@Override
protected void fatalError(String message, Throwable error) {
    final String logLine = "FATAL ERROR IN MESOS APPLICATION MASTER: " + message;
    LOG.error(logLine, error);
    LOG.error("Shutting down process");

    // no unregistration here - killing the process lets the dispatcher restart it
    System.exit(EXIT_CODE_FATAL_ERROR);
}
// ------------------------------------------------------------------------
// Worker Management
// ------------------------------------------------------------------------
/**
 * Recover framework/worker information persisted by a prior incarnation of the RM.
 *
 * <p>Each recovered worker is put back into the planning map that matches its
 * persisted state, and the task router is told about its goal state. Workers in
 * state New are handed to the launch coordinator for launching; workers in state
 * Launched are reported to it as prior host assignments.
 *
 * @throws Exception if the workers cannot be read from the worker store.
 */
private void recoverWorkers() throws Exception {
    // if this application master starts as part of an ApplicationMaster/JobManager recovery,
    // then some worker tasks are most likely still alive and we can re-obtain them
    final List<MesosWorkerStore.Worker> recovered = workerStore.recoverWorkers();
    if (recovered.isEmpty()) {
        return;
    }

    LOG.info("Retrieved {} TaskManagers from previous attempt", recovered.size());

    final List<Tuple2<TaskRequest,String>> priorAssignments = new ArrayList<>(recovered.size());
    final List<LaunchableTask> pendingLaunches = new ArrayList<>(recovered.size());

    for (final MesosWorkerStore.Worker worker : recovered) {
        final LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID());
        final ResourceID resourceID = extractResourceID(worker.taskID());

        switch (worker.state()) {
            case New:
                workersInNew.put(resourceID, worker);
                pendingLaunches.add(launchable);
                break;
            case Launched:
                workersInLaunch.put(resourceID, worker);
                priorAssignments.add(new Tuple2<>(launchable.taskRequest(), worker.hostname().get()));
                break;
            case Released:
                workersBeingReturned.put(resourceID, worker);
                break;
        }

        taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)), self());
    }

    // tell the launch coordinator about prior assignments
    if (!priorAssignments.isEmpty()) {
        launchCoordinator.tell(new LaunchCoordinator.Assign(priorAssignments), self());
    }

    // tell the launch coordinator to launch any new tasks
    if (!pendingLaunches.isEmpty()) {
        launchCoordinator.tell(new LaunchCoordinator.Launch(pendingLaunches), self());
    }
}
/**
 * Plan for some additional workers to be launched.
 *
 * <p>Every new worker is first written to the worker store and recorded in the
 * planning state. Only after all workers have been recorded are the task router
 * and the launch coordinator notified. Any failure is escalated as a fatal error.
 *
 * @param numWorkers The number of workers to allocate.
 */
@Override
protected void requestNewWorkers(int numWorkers) {
    try {
        final List<TaskMonitor.TaskGoalStateUpdated> goalStateUpdates = new ArrayList<>(numWorkers);
        final List<LaunchableTask> launchables = new ArrayList<>(numWorkers);

        // generate new workers into persistent state and launch associated actors
        int remaining = numWorkers;
        while (remaining > 0) {
            final MesosWorkerStore.Worker newWorker =
                MesosWorkerStore.Worker.newWorker(workerStore.newTaskID());
            workerStore.putWorker(newWorker);
            workersInNew.put(extractResourceID(newWorker.taskID()), newWorker);

            final LaunchableMesosWorker launchable = createLaunchableMesosWorker(newWorker.taskID());
            LOG.info("Scheduling Mesos task {} with ({} MB, {} cpus).",
                launchable.taskID().getValue(),
                launchable.taskRequest().getMemory(),
                launchable.taskRequest().getCPUs());

            goalStateUpdates.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(newWorker)));
            launchables.add(launchable);
            remaining--;
        }

        // tell the task router about the new plans
        for (TaskMonitor.TaskGoalStateUpdated goalStateUpdate : goalStateUpdates) {
            taskRouter.tell(goalStateUpdate, self());
        }

        // tell the launch coordinator to launch the new tasks
        if (!launchables.isEmpty()) {
            launchCoordinator.tell(new LaunchCoordinator.Launch(launchables), self());
        }
    } catch (Exception ex) {
        fatalError("unable to request new workers", ex);
    }
}
/**
 * Accept offers as advised by the launch coordinator.
 *
 * <p>Acceptance is routed through the RM to update the persistent state before
 * forwarding the message to Mesos: every task of a LAUNCH operation is moved from
 * the "new" planning state to the "in launch" state and written to the worker store,
 * the task router is told about the new goal states, and only then is the
 * acceptance sent to the scheduler driver.
 *
 * <p>A launch for a task that is not known in state New is an inconsistency between
 * the launch coordinator and this resource manager. It is reported explicitly
 * (independent of whether JVM assertions are enabled) and, like any other failure
 * in this method, escalated as a fatal error.
 *
 * @param msg The offers to accept together with the operations to perform on them.
 */
private void acceptOffers(AcceptOffers msg) {
    try {
        List<TaskMonitor.TaskGoalStateUpdated> toMonitor = new ArrayList<>(msg.operations().size());

        // transition the persistent state of some tasks to Launched
        for (Protos.Offer.Operation op : msg.operations()) {
            if (op.getType() != Protos.Offer.Operation.Type.LAUNCH) {
                continue;
            }
            for (Protos.TaskInfo info : op.getLaunch().getTaskInfosList()) {
                MesosWorkerStore.Worker worker = workersInNew.remove(extractResourceID(info.getTaskId()));
                if (worker == null) {
                    // an 'assert' would be skipped without -ea and would bypass the catch below with -ea
                    throw new IllegalStateException("Cannot launch Mesos task "
                        + info.getTaskId().getValue() + " because no worker in state New is known for it.");
                }

                worker = worker.launchWorker(info.getSlaveId(), msg.hostname());
                workerStore.putWorker(worker);
                workersInLaunch.put(extractResourceID(worker.taskID()), worker);

                LOG.info("Launching Mesos task {} on host {}.",
                    worker.taskID().getValue(), worker.hostname().get());

                toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)));
            }
        }

        // tell the task router about the new plans
        for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) {
            taskRouter.tell(update, self());
        }

        // send the acceptance message to Mesos
        schedulerDriver.acceptOffers(msg.offerIds(), msg.operations(), msg.filters());
    }
    catch (Exception ex) {
        fatalError("unable to accept offers", ex);
    }
}
/**
 * Handle a task status change.
 *
 * <p>The update is passed to the task router and to the reconciliation coordinator,
 * and is then acknowledged towards the scheduler driver.
 *
 * @param message The status update received from Mesos.
 */
private void taskStatusUpdated(StatusUpdate message) {
    final ActorRef sender = self();

    taskRouter.tell(message, sender);
    reconciliationCoordinator.tell(message, sender);

    schedulerDriver.acknowledgeStatusUpdate(message.status());
}
/**
 * Accept the given started worker into the internal state.
 *
 * <p>The worker is removed from the set of workers pending registration.
 *
 * @param resourceID The worker resource id
 * @return A registered worker node record, or {@code null} if the worker was not
 *         pending registration.
 */
@Override
protected RegisteredMesosWorkerNode workerStarted(ResourceID resourceID) {
    final MesosWorkerStore.Worker launched = workersInLaunch.remove(resourceID);

    // A worker that was not in state "being launched" can indicate that the TaskManager
    // in this worker was already registered or that the container was not started
    // by this resource manager. Such a resourceID is simply ignored (null result).
    return launched != null ? new RegisteredMesosWorkerNode(launched) : null;
}
/**
 * Accept the given registered workers into the internal state.
 *
 * <p>Only workers that are currently pending registration are accepted (and removed
 * from that set); all others are merely logged.
 *
 * @param toConsolidate The worker IDs known previously to the JobManager.
 * @return A collection of registered worker node records.
 */
@Override
protected Collection<RegisteredMesosWorkerNode> reacceptRegisteredWorkers(Collection<ResourceID> toConsolidate) {
    final List<RegisteredMesosWorkerNode> recognized = new ArrayList<>(toConsolidate.size());

    // we check for each task manager if we recognize its Mesos task ID
    for (ResourceID candidate : toConsolidate) {
        final MesosWorkerStore.Worker pending = workersInLaunch.remove(candidate);

        if (pending != null) {
            LOG.info("Mesos worker consolidation recognizes TaskManager {}.", candidate);
            recognized.add(new RegisteredMesosWorkerNode(pending));
            continue;
        }

        // not pending registration: either registered before or unknown to us
        if (isStarted(candidate)) {
            LOG.info("TaskManager {} has already been registered at the resource manager.", candidate);
        } else {
            LOG.info("Mesos worker consolidation does not recognize TaskManager {}.", candidate);
        }
    }

    return recognized;
}
/**
 * Release the given pending worker.
 *
 * <p>Only workers that are pending registration can be released this way; a request
 * for any other ID is logged as an error and ignored.
 *
 * @param id The resource ID of the worker to release.
 */
@Override
protected void releasePendingWorker(ResourceID id) {
    final MesosWorkerStore.Worker pending = workersInLaunch.remove(id);
    if (pending == null) {
        LOG.error("Cannot find worker {} to release. Ignoring request.", id);
        return;
    }
    releaseWorker(pending);
}
/**
 * Release the given started worker.
 *
 * @param worker The registered worker node whose underlying Mesos task is released.
 */
@Override
protected void releaseStartedWorker(RegisteredMesosWorkerNode worker) {
    final MesosWorkerStore.Worker task = worker.task();
    releaseWorker(task);
}
/**
 * Plan for the removal of the given worker.
 *
 * <p>The released state is written to the worker store, the worker is recorded as
 * being returned, and the task router is told about the new goal state. If the
 * worker has a hostname, the launch coordinator is told to unassign the task from
 * that host. Any failure is escalated as a fatal error.
 *
 * @param worker The worker to release.
 */
private void releaseWorker(MesosWorkerStore.Worker worker) {
    try {
        LOG.info("Releasing worker {}", worker.taskID());

        // update persistent state of worker to Released
        final MesosWorkerStore.Worker released = worker.releaseWorker();
        workerStore.putWorker(released);
        workersBeingReturned.put(extractResourceID(released.taskID()), released);

        final TaskMonitor.TaskGoalStateUpdated goalStateUpdate =
            new TaskMonitor.TaskGoalStateUpdated(extractGoalState(released));
        taskRouter.tell(goalStateUpdate, self());

        if (released.hostname().isDefined()) {
            // tell the launch coordinator that the task is being unassigned from the host, for planning purposes
            launchCoordinator.tell(
                new LaunchCoordinator.Unassign(released.taskID(), released.hostname().get()), self());
        }
    } catch (Exception ex) {
        fatalError("unable to release worker", ex);
    }
}
/**
 * Gets the number of requested workers for which no launch has been accepted yet.
 *
 * @return the number of workers currently in state New.
 */
@Override
protected int getNumWorkerRequestsPending() {
return workersInNew.size();
}
/**
 * Gets the number of workers that were launched but have not been registered yet.
 *
 * @return the number of workers currently in launch.
 */
@Override
protected int getNumWorkersPendingRegistration() {
return workersInLaunch.size();
}
// ------------------------------------------------------------------------
// Callbacks from the Mesos Master
// ------------------------------------------------------------------------
/**
 * Called when connected to Mesos as a new framework.
 *
 * <p>The connection monitor is informed first. The assigned framework ID is then
 * written to the worker store; if that fails, a fatal error is raised and the
 * remaining supporting actors are not informed.
 *
 * @param message The registration message carrying the assigned framework ID.
 */
private void registered(Registered message) {
    final ActorRef sender = self();
    connectionMonitor.tell(message, sender);

    try {
        workerStore.setFrameworkID(Option.apply(message.frameworkId()));
    } catch (Exception ex) {
        fatalError("unable to store the assigned framework ID", ex);
        return;
    }

    launchCoordinator.tell(message, sender);
    reconciliationCoordinator.tell(message, sender);
    taskRouter.tell(message, sender);
}
/**
 * Called when reconnected to Mesos following a failover event.
 * The message is passed on to all four supporting actors.
 *
 * @param message The re-registration message.
 */
private void reregistered(ReRegistered message) {
    final ActorRef sender = self();

    connectionMonitor.tell(message, sender);
    launchCoordinator.tell(message, sender);
    reconciliationCoordinator.tell(message, sender);
    taskRouter.tell(message, sender);
}
/**
 * Called when disconnected from Mesos.
 * The message is passed on to all four supporting actors.
 *
 * @param message The disconnection message.
 */
private void disconnected(Disconnected message) {
    final ActorRef sender = self();

    connectionMonitor.tell(message, sender);
    launchCoordinator.tell(message, sender);
    reconciliationCoordinator.tell(message, sender);
    taskRouter.tell(message, sender);
}
/**
 * Called when an error is reported by the scheduler callback.
 *
 * <p>The error text is wrapped into an exception and sent to this actor itself
 * as a {@code FatalErrorOccurred} message.
 *
 * @param message The error text reported by Mesos.
 */
private void error(String message) {
    final FatalErrorOccurred fatalError =
        new FatalErrorOccurred("Connection to Mesos failed", new Exception(message));
    self().tell(fatalError, self());
}
/**
 * Invoked when a Mesos task reaches a terminal status.
 *
 * <p>This callback occurs for failed containers and for released containers alike.
 * The worker is removed from the worker store first; unknown workers are ignored.
 * A worker that was being returned counts as a regular termination. Any other
 * worker counts as a failure: the failure counter is increased, and once it exceeds
 * the configured maximum, the cluster is asked to stop. In all other cases a check
 * of the workers is triggered at the end so that replacements can be requested.
 *
 * @param taskID The ID of the terminated Mesos task.
 * @param status The terminal status reported for the task.
 */
private void taskTerminated(Protos.TaskID taskID, Protos.TaskStatus status) {
    final ResourceID id = extractResourceID(taskID);

    // drop the worker from the persistent state
    final boolean existed;
    try {
        existed = workerStore.removeWorker(taskID);
    } catch (Exception ex) {
        fatalError("unable to remove worker", ex);
        return;
    }

    if (!existed) {
        LOG.info("Received a termination notice for an unrecognized worker: {}", id);
        return;
    }

    // regular finished worker that we released
    if (workersBeingReturned.remove(id) != null) {
        LOG.info("Worker {} finished successfully with diagnostics: {}",
            id, status.getMessage());
        triggerCheckWorkers();
        return;
    }

    // from here on: failed worker, either at startup, or running
    final boolean failedBeforeRegistration = workersInLaunch.remove(id) != null;
    if (failedBeforeRegistration) {
        LOG.info("Mesos task {} failed, with a TaskManager in launch or registration. " +
            "State: {} Reason: {} ({})", id, status.getState(), status.getReason(), status.getMessage());
        // re-acquiring new workers is triggered at the end
    } else {
        LOG.info("Mesos task {} failed, with a registered TaskManager. " +
            "State: {} Reason: {} ({})", id, status.getState(), status.getReason(), status.getMessage());

        // notify the generic logic, which notifies the JobManager, etc.
        notifyWorkerFailed(id, "Mesos task " + id + " failed. State: " + status.getState());
    }

    // general failure logging
    failedTasksSoFar++;

    final String diagnostics = String.format("Diagnostics for task %s in state %s : " +
            "reason=%s message=%s",
        id, status.getState(), status.getReason(), status.getMessage());
    sendInfoMessage(diagnostics);
    LOG.info(diagnostics);
    LOG.info("Total number of failed tasks so far: {}", failedTasksSoFar);

    // maxFailedTasks == -1 is infinite number of retries.
    final boolean failureLimitExceeded = maxFailedTasks >= 0 && failedTasksSoFar > maxFailedTasks;
    if (failureLimitExceeded) {
        final String stopReason = "Stopping Mesos session because the number of failed tasks ("
            + failedTasksSoFar + ") exceeded the maximum failed tasks ("
            + maxFailedTasks + "). This number is controlled by the '"
            + ConfigConstants.MESOS_MAX_FAILED_TASKS + "' configuration setting. "
            + "By default its the number of requested tasks.";
        LOG.error(stopReason);
        self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, stopReason)),
            ActorRef.noSender());

        // the cluster is being stopped, no need to request replacements
        return;
    }

    // a container failed, make sure we re-examine and request new ones
    triggerCheckWorkers();
}
// ------------------------------------------------------------------------
// Utilities
// ------------------------------------------------------------------------
/**
 * Creates the launchable description of a worker for the given task ID, using the
 * artifact resolver, TaskManager parameters, container specification and Mesos
 * configuration of this resource manager.
 *
 * @param taskID The Mesos task ID of the worker.
 * @return The launchable worker.
 */
private LaunchableMesosWorker createLaunchableMesosWorker(Protos.TaskID taskID) {
    return new LaunchableMesosWorker(
        artifactResolver, taskManagerParameters, taskManagerContainerSpec, taskID, mesosConfig);
}
/**
 * Extracts a unique ResourceID from the Mesos task.
 * The resource ID is built from the value of the task ID.
 *
 * @param taskId the Mesos TaskID
 * @return The ResourceID for the container
 */
static ResourceID extractResourceID(Protos.TaskID taskId) {
    final String taskIdValue = taskId.getValue();
    return new ResourceID(taskIdValue);
}
/**
 * Extracts the Mesos task goal state from the worker information.
 *
 * @param worker the persistent worker information.
 * @return goal state information for the {@link TaskMonitor}.
 * @throws IllegalArgumentException if the worker is in a state other than
 *         New, Launched or Released.
 */
static TaskMonitor.TaskGoalState extractGoalState(MesosWorkerStore.Worker worker) {
    final TaskMonitor.TaskGoalState goalState;
    switch (worker.state()) {
        case New:
            goalState = new TaskMonitor.New(worker.taskID());
            break;
        case Launched:
            goalState = new TaskMonitor.Launched(worker.taskID(), worker.slaveID().get());
            break;
        case Released:
            goalState = new TaskMonitor.Released(worker.taskID(), worker.slaveID().get());
            break;
        default:
            throw new IllegalArgumentException("unsupported worker state");
    }
    return goalState;
}
/**
 * Creates the Fenzo optimizer (builder).
 * The builder is an indirection to facilitate unit testing of the Launch Coordinator.
 *
 * @return a builder that delegates to a Fenzo {@code TaskScheduler.Builder}.
 */
private static TaskSchedulerBuilder createOptimizer() {
    return new TaskSchedulerBuilder() {

        /** The Fenzo builder that does the actual work. */
        private final TaskScheduler.Builder fenzoBuilder = new TaskScheduler.Builder();

        @Override
        public TaskSchedulerBuilder withLeaseRejectAction(Action1<VirtualMachineLease> action) {
            fenzoBuilder.withLeaseRejectAction(action);
            return this;
        }

        @Override
        public TaskScheduler build() {
            return fenzoBuilder.build();
        }
    };
}
/**
 * Creates the props needed to instantiate this actor.
 *
 * <p>Rather than extracting and validating parameters in the constructor, this factory method takes
 * care of that. That way, errors occur synchronously, and are not swallowed simply in a
 * failed asynchronous attempt to start the actor.
 *
 * <p>The number of initial tasks is read from {@code ConfigConstants.MESOS_INITIAL_TASKS}
 * (default 0). The number of tolerated failed tasks is read from
 * {@code ConfigConstants.MESOS_MAX_FAILED_TASKS} and defaults to the number of initial tasks.
 *
 * @param actorClass
 *             The actor class, to allow overriding this actor with subclasses for testing.
 * @param flinkConfig
 *             The Flink configuration object.
 * @param mesosConfig
 *             The Mesos configuration.
 * @param workerStore
 *             The persistent worker store.
 * @param leaderRetrievalService
 *             The leader retrieval service handed to the actor.
 * @param taskManagerParameters
 *             The parameters for launching TaskManager containers.
 * @param taskManagerContainerSpec
 *             The container specification.
 * @param artifactResolver
 *             The artifact resolver to locate artifacts
 * @param log
 *             The logger to log to.
 *
 * @return The Props object to instantiate the MesosFlinkResourceManager actor.
 * @throws IllegalConfigurationException if the configured number of initial tasks is negative.
 */
public static Props createActorProps(Class<? extends MesosFlinkResourceManager> actorClass,
        Configuration flinkConfig,
        MesosConfiguration mesosConfig,
        MesosWorkerStore workerStore,
        LeaderRetrievalService leaderRetrievalService,
        MesosTaskManagerParameters taskManagerParameters,
        ContainerSpecification taskManagerContainerSpec,
        MesosArtifactResolver artifactResolver,
        Logger log)
{
    // number of TaskManagers to start with; negative values are rejected
    final int numInitialTaskManagers = flinkConfig.getInteger(
        ConfigConstants.MESOS_INITIAL_TASKS, 0);
    if (numInitialTaskManagers < 0) {
        throw new IllegalConfigurationException("Invalid value for " +
            ConfigConstants.MESOS_INITIAL_TASKS + ", which must be at least zero.");
    }
    log.info("Mesos framework to allocate {} initial tasks",
        numInitialTaskManagers);

    // failure tolerance; a negative value is passed through without a log line
    final int maxFailedTasks = flinkConfig.getInteger(
        ConfigConstants.MESOS_MAX_FAILED_TASKS, numInitialTaskManagers);
    if (maxFailedTasks >= 0) {
        log.info("Mesos framework tolerates {} failed tasks before giving up",
            maxFailedTasks);
    }

    // the argument order must match the constructor of the actor class
    return Props.create(actorClass,
        flinkConfig,
        mesosConfig,
        workerStore,
        leaderRetrievalService,
        taskManagerParameters,
        taskManagerContainerSpec,
        artifactResolver,
        maxFailedTasks,
        numInitialTaskManagers);
}
}