MesosFlinkResourceManager.java example

Explorer
flink-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.mesos.runtime.clusterframework;

import akka.actor.ActorRef;
import akka.actor.Props;
import com.netflix.fenzo.TaskRequest;
import com.netflix.fenzo.TaskScheduler;
import com.netflix.fenzo.VirtualMachineLease;
import com.netflix.fenzo.functions.Action1;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.IllegalConfigurationException;
import org.apache.flink.mesos.runtime.clusterframework.store.MesosWorkerStore;
import org.apache.flink.mesos.scheduler.ConnectionMonitor;
import org.apache.flink.mesos.scheduler.LaunchableTask;
import org.apache.flink.mesos.scheduler.LaunchCoordinator;
import org.apache.flink.mesos.scheduler.ReconciliationCoordinator;
import org.apache.flink.mesos.scheduler.SchedulerProxy;
import org.apache.flink.mesos.scheduler.TaskMonitor;
import org.apache.flink.mesos.scheduler.TaskSchedulerBuilder;
import org.apache.flink.mesos.scheduler.Tasks;
import org.apache.flink.mesos.scheduler.messages.AcceptOffers;
import org.apache.flink.mesos.scheduler.messages.Disconnected;
import org.apache.flink.mesos.scheduler.messages.Error;
import org.apache.flink.mesos.scheduler.messages.OfferRescinded;
import org.apache.flink.mesos.scheduler.messages.ReRegistered;
import org.apache.flink.mesos.scheduler.messages.Registered;
import org.apache.flink.mesos.scheduler.messages.ResourceOffers;
import org.apache.flink.mesos.scheduler.messages.StatusUpdate;
import org.apache.flink.mesos.util.MesosArtifactResolver;
import org.apache.flink.mesos.util.MesosConfiguration;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.FlinkResourceManager;
import org.apache.flink.runtime.clusterframework.ContainerSpecification;
import org.apache.flink.runtime.clusterframework.messages.FatalErrorOccurred;
import org.apache.flink.runtime.clusterframework.messages.StopCluster;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.mesos.Protos;
import org.apache.mesos.Protos.FrameworkInfo;
import org.apache.mesos.SchedulerDriver;
import org.slf4j.Logger;
import scala.Option;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static java.util.Objects.requireNonNull;

/**
 * Flink Resource Manager for Apache Mesos.
 */
public class MesosFlinkResourceManager extends FlinkResourceManager<RegisteredMesosWorkerNode> {

	/** The Mesos configuration (master and framework info). */
	private final MesosConfiguration mesosConfig;

	/** The TaskManager container parameters (like container memory size). */
	private final MesosTaskManagerParameters taskManagerParameters;

	/** Container specification for launching a TM. */
	private final ContainerSpecification taskManagerContainerSpec;

	/** Resolver for HTTP artifacts. */
	private final MesosArtifactResolver artifactResolver;

	/** Number of failed Mesos tasks before stopping the application. -1 means infinite. */
	private final int maxFailedTasks;

	/** Callback handler for the asynchronous Mesos scheduler; created in {@code initialize()}. */
	private SchedulerProxy schedulerCallbackHandler;

	/** Mesos scheduler driver; created in {@code initialize()} from the Mesos configuration. */
	private SchedulerDriver schedulerDriver;

	/** Child actor created by {@code createConnectionMonitor()}; receives the (re-)registration and disconnect messages. */
	private ActorRef connectionMonitor;

	/** Child actor created by {@code createTaskRouter()}; receives task goal state changes and status updates. */
	private ActorRef taskRouter;

	/** Child actor created by {@code createLaunchCoordinator()}; receives offers and launch/assign/unassign requests. */
	private ActorRef launchCoordinator;

	/** Child actor created by {@code createReconciliationCoordinator()}; receives reconcile requests and status updates. */
	private ActorRef reconciliationCoordinator;

	/** Store holding the framework ID and the worker records across incarnations of this resource manager. */
	private final MesosWorkerStore workerStore;

	/*
	 * Planning state related to workers, keyed by resource ID.
	 * Package private for unit test purposes.
	 */

	/** Workers that were requested but not yet launched through an accepted offer. */
	final Map<ResourceID, MesosWorkerStore.Worker> workersInNew;

	/** Workers that were launched but whose TaskManager has not been accepted as started yet. */
	final Map<ResourceID, MesosWorkerStore.Worker> workersInLaunch;

	/** Workers that were released and whose Mesos task has not been reported as terminated yet. */
	final Map<ResourceID, MesosWorkerStore.Worker> workersBeingReturned;

	/** The number of failed tasks since the master became active. */
	private int failedTasksSoFar;

	/**
	 * Creates the resource manager actor state. No connection to Mesos is made here;
	 * that happens in {@code initialize()}.
	 *
	 * @param flinkConfig The Flink configuration, passed on to the generic resource manager.
	 * @param mesosConfig The Mesos configuration (master and framework info); must not be null.
	 * @param workerStore The store for the framework ID and worker records; must not be null.
	 * @param leaderRetrievalService The leader retrieval service, passed on to the generic resource manager.
	 * @param taskManagerParameters The TaskManager container parameters; must not be null.
	 * @param taskManagerContainerSpec The container specification for launching a TM; must not be null.
	 * @param artifactResolver The resolver for HTTP artifacts; must not be null.
	 * @param maxFailedTasks Number of failed Mesos tasks before stopping the application. -1 means infinite.
	 * @param numInitialTaskManagers The number of initial TaskManagers, passed on to the generic resource manager.
	 * @throws NullPointerException if one of the required arguments is null; the message names the argument.
	 */
	public MesosFlinkResourceManager(
		Configuration flinkConfig,
		MesosConfiguration mesosConfig,
		MesosWorkerStore workerStore,
		LeaderRetrievalService leaderRetrievalService,
		MesosTaskManagerParameters taskManagerParameters,
		ContainerSpecification taskManagerContainerSpec,
		MesosArtifactResolver artifactResolver,
		int maxFailedTasks,
		int numInitialTaskManagers) {

		super(numInitialTaskManagers, flinkConfig, leaderRetrievalService);

		// name the offending argument so that a failed actor creation is easy to diagnose
		this.mesosConfig = requireNonNull(mesosConfig, "mesosConfig");

		this.workerStore = requireNonNull(workerStore, "workerStore");
		this.artifactResolver = requireNonNull(artifactResolver, "artifactResolver");

		this.taskManagerParameters = requireNonNull(taskManagerParameters, "taskManagerParameters");
		this.taskManagerContainerSpec = requireNonNull(taskManagerContainerSpec, "taskManagerContainerSpec");
		this.maxFailedTasks = maxFailedTasks;

		this.workersInNew = new HashMap<>();
		this.workersInLaunch = new HashMap<>();
		this.workersBeingReturned = new HashMap<>();
	}

	// ------------------------------------------------------------------------
	//  Mesos-specific behavior
	// ------------------------------------------------------------------------

	/**
	 * Starts the worker store, creates the scheduler driver and the supporting actors,
	 * recovers the workers known to the store and finally starts the driver.
	 *
	 * @throws Exception if starting the store, creating the driver or recovering the workers fails.
	 */
	@Override
	protected void initialize() throws Exception {
		LOG.info("Initializing Mesos resource master");

		workerStore.start();

		// callback handler handed to the scheduler driver, built around this actor's reference
		schedulerCallbackHandler = new SchedulerProxy(self());

		// framework info used for the registration, with checkpointing switched on
		final FrameworkInfo.Builder frameworkInfo = mesosConfig.frameworkInfo()
			.clone()
			.setCheckpoint(true);

		// a framework ID found in the store means that a prior incarnation registered already
		final Option<Protos.FrameworkID> storedFrameworkId = workerStore.getFrameworkID();
		if (storedFrameworkId.isDefined()) {
			LOG.info("Recovery scenario: re-registering using framework ID {}.", storedFrameworkId.get().getValue());
			frameworkInfo.setId(storedFrameworkId.get());
		} else {
			LOG.info("Registering as new framework.");
		}

		final MesosConfiguration effectiveMesosConfig = mesosConfig.withFrameworkInfo(frameworkInfo);
		MesosConfiguration.logMesosConfig(LOG, effectiveMesosConfig);
		schedulerDriver = effectiveMesosConfig.createDriver(schedulerCallbackHandler, false);

		// the supporting actors are created after the driver, since some of them are given the driver
		connectionMonitor = createConnectionMonitor();
		launchCoordinator = createLaunchCoordinator();
		reconciliationCoordinator = createReconciliationCoordinator();
		taskRouter = createTaskRouter();

		recoverWorkers();

		connectionMonitor.tell(new ConnectionMonitor.Start(), self());
		schedulerDriver.start();
	}

	/**
	 * Creates the connection monitor as a child actor named "connectionMonitor".
	 * Protected so that subclasses can substitute another actor.
	 *
	 * @return the reference to the created actor.
	 */
	protected ActorRef createConnectionMonitor() {
		final Props monitorProps = ConnectionMonitor.createActorProps(ConnectionMonitor.class, config);
		return context().actorOf(monitorProps, "connectionMonitor");
	}

	/**
	 * Creates the task router as a child actor named "tasks".
	 * Protected so that subclasses can substitute another actor.
	 *
	 * @return the reference to the created actor.
	 */
	protected ActorRef createTaskRouter() {
		final Props routerProps =
			Tasks.createActorProps(Tasks.class, config, schedulerDriver, TaskMonitor.class);
		return context().actorOf(routerProps, "tasks");
	}

	/**
	 * Creates the launch coordinator as a child actor named "launchCoordinator".
	 * Protected so that subclasses can substitute another actor.
	 *
	 * @return the reference to the created actor.
	 */
	protected ActorRef createLaunchCoordinator() {
		final Props coordinatorProps = LaunchCoordinator.createActorProps(
			LaunchCoordinator.class, self(), config, schedulerDriver, createOptimizer());
		return context().actorOf(coordinatorProps, "launchCoordinator");
	}

	/**
	 * Creates the reconciliation coordinator as a child actor named "reconciliationCoordinator".
	 * Protected so that subclasses can substitute another actor.
	 *
	 * @return the reference to the created actor.
	 */
	protected ActorRef createReconciliationCoordinator() {
		final Props coordinatorProps = ReconciliationCoordinator.createActorProps(
			ReconciliationCoordinator.class, config, schedulerDriver);
		return context().actorOf(coordinatorProps, "reconciliationCoordinator");
	}

	/**
	 * Logs that the resource master is stopping and then delegates to the superclass.
	 * Nothing Mesos-specific is released here.
	 */
	@Override
	public void postStop() {
		LOG.info("Stopping Mesos resource master");
		super.postStop();
	}

	// ------------------------------------------------------------------------
	//  Actor messages
	// ------------------------------------------------------------------------

	/**
	 * Dispatches an incoming actor message.
	 *
	 * <p>Mesos-specific messages are either handled by this class or forwarded to one of the
	 * supporting actors; every other message is passed to the generic resource manager logic
	 * of the superclass.
	 *
	 * <p>Note that {@code Error} refers to the imported Mesos scheduler message class,
	 * not to {@code java.lang.Error}.
	 *
	 * @param message The message to handle.
	 */
	@Override
	protected void handleMessage(Object message) {

		// check for Mesos-specific actor messages first

		// --- messages about Mesos connection
		if (message instanceof Registered) {
			registered((Registered) message);
		} else if (message instanceof ReRegistered) {
			reregistered((ReRegistered) message);
		} else if (message instanceof Disconnected) {
			disconnected((Disconnected) message);
		} else if (message instanceof Error) {
			error(((Error) message).message());

		// --- messages about offers
		} else if (message instanceof ResourceOffers || message instanceof OfferRescinded) {
			// offers (and their rescission) are evaluated by the launch coordinator
			launchCoordinator.tell(message, self());
		} else if (message instanceof AcceptOffers) {
			// handled here so that the worker state is updated before Mesos is told
			acceptOffers((AcceptOffers) message);

		// --- messages about tasks
		} else if (message instanceof StatusUpdate) {
			taskStatusUpdated((StatusUpdate) message);
		} else if (message instanceof ReconciliationCoordinator.Reconcile) {
			// a reconciliation request from a task
			reconciliationCoordinator.tell(message, self());
		} else if (message instanceof TaskMonitor.TaskTerminated) {
			// a termination message from a task
			TaskMonitor.TaskTerminated msg = (TaskMonitor.TaskTerminated) message;
			taskTerminated(msg.taskID(), msg.status());

		} else  {
			// message handled by the generic resource master code
			super.handleMessage(message);
		}
	}

	/**
	 * Shuts the cluster down for good (this is not a failover situation).
	 *
	 * <p>Stops the scheduler driver, stops the worker store and finally stops this actor.
	 * A failure in one of the first two steps is logged as a warning and does not prevent the
	 * following steps. Neither of the two parameters is used by this implementation.
	 *
	 * @param finalStatus The application status to report.
	 * @param optionalDiagnostics An optional diagnostics message.
	 */
	@Override
	protected void shutdownApplication(ApplicationStatus finalStatus, String optionalDiagnostics) {
		LOG.info("Shutting down and unregistering as a Mesos framework.");

		// step 1: unregister the framework, which implicitly removes all tasks
		try {
			schedulerDriver.stop(false);
		} catch (Exception e) {
			LOG.warn("unable to unregister the framework", e);
		}

		// step 2: stop the worker store
		// NOTE(review): 'true' presumably requests cleanup of the stored state - confirm against MesosWorkerStore
		try {
			workerStore.stop(true);
		} catch (Exception e) {
			LOG.warn("unable to stop the worker state store", e);
		}

		// step 3: stop this actor
		context().stop(self());
	}

	/**
	 * Handles an unrecoverable error by logging it and terminating the JVM.
	 *
	 * <p>The framework is deliberately not unregistered: the process is hard-failed so that an
	 * external supervisor (the dispatcher) restarts it.
	 *
	 * @param message A description of what failed.
	 * @param error The cause of the failure.
	 */
	@Override
	protected void fatalError(String message, Throwable error) {
		final String logLine = "FATAL ERROR IN MESOS APPLICATION MASTER: " + message;
		LOG.error(logLine, error);
		LOG.error("Shutting down process");

		// exiting the process is what triggers the restart by the dispatcher
		System.exit(EXIT_CODE_FATAL_ERROR);
	}

	// ------------------------------------------------------------------------
	//  Worker Management
	// ------------------------------------------------------------------------

	/**
	 * Restores the worker planning state from the records persisted by a prior incarnation of the RM.
	 *
	 * <p>Each recovered worker is put into the map matching its state and announced to the task
	 * router. Workers that were already launched are reported to the launch coordinator as prior
	 * assignments; workers that were never launched are handed to it for launching.
	 *
	 * @throws Exception if the workers cannot be read from the worker store.
	 */
	private void recoverWorkers() throws Exception {
		// when starting as part of an ApplicationMaster/JobManager recovery, some worker
		// tasks are most likely still alive and can be re-obtained
		final List<MesosWorkerStore.Worker> recovered = workerStore.recoverWorkers();
		if (recovered.isEmpty()) {
			return;
		}

		LOG.info("Retrieved {} TaskManagers from previous attempt", recovered.size());

		final List<Tuple2<TaskRequest,String>> priorAssignments = new ArrayList<>(recovered.size());
		final List<LaunchableTask> pendingLaunches = new ArrayList<>(recovered.size());

		for (final MesosWorkerStore.Worker worker : recovered) {
			final LaunchableMesosWorker launchable = createLaunchableMesosWorker(worker.taskID());
			final ResourceID resourceID = extractResourceID(worker.taskID());

			switch (worker.state()) {
				case New:
					workersInNew.put(resourceID, worker);
					pendingLaunches.add(launchable);
					break;
				case Launched:
					workersInLaunch.put(resourceID, worker);
					priorAssignments.add(new Tuple2<>(launchable.taskRequest(), worker.hostname().get()));
					break;
				case Released:
					workersBeingReturned.put(resourceID, worker);
					break;
			}

			// the task router learns about every recovered worker, whatever its state
			taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)), self());
		}

		// prior assignments first, then the tasks that still need to be launched
		if (!priorAssignments.isEmpty()) {
			launchCoordinator.tell(new LaunchCoordinator.Assign(priorAssignments), self());
		}
		if (!pendingLaunches.isEmpty()) {
			launchCoordinator.tell(new LaunchCoordinator.Launch(pendingLaunches), self());
		}
	}

	/**
	 * Plans the launch of additional workers.
	 *
	 * <p>Every new worker is first written to the worker store and recorded in
	 * {@code workersInNew}. Only after all workers have been recorded are the task router and
	 * the launch coordinator informed. Any failure is treated as a fatal error.
	 *
	 * @param numWorkers The number of workers to allocate.
	 */
	@Override
	protected void requestNewWorkers(int numWorkers) {
		try {
			final List<TaskMonitor.TaskGoalStateUpdated> goalUpdates = new ArrayList<>(numWorkers);
			final List<LaunchableTask> launchables = new ArrayList<>(numWorkers);

			// record the new workers in the persistent state before anyone is told about them
			for (int remaining = numWorkers; remaining > 0; remaining--) {
				final MesosWorkerStore.Worker planned =
					MesosWorkerStore.Worker.newWorker(workerStore.newTaskID());
				workerStore.putWorker(planned);
				workersInNew.put(extractResourceID(planned.taskID()), planned);

				final LaunchableMesosWorker launchable = createLaunchableMesosWorker(planned.taskID());

				LOG.info("Scheduling Mesos task {} with ({} MB, {} cpus).",
					launchable.taskID().getValue(), launchable.taskRequest().getMemory(), launchable.taskRequest().getCPUs());

				goalUpdates.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(planned)));
				launchables.add(launchable);
			}

			// announce the new plans to the task router
			for (TaskMonitor.TaskGoalStateUpdated goalUpdate : goalUpdates) {
				taskRouter.tell(goalUpdate, self());
			}

			// hand the new tasks to the launch coordinator
			if (!launchables.isEmpty()) {
				launchCoordinator.tell(new LaunchCoordinator.Launch(launchables), self());
			}
		} catch (Exception e) {
			fatalError("unable to request new workers", e);
		}
	}

	/**
	 * Accept offers as advised by the launch coordinator.
	 *
	 * <p>Acceptance is routed through the RM to update the persistent state before
	 * forwarding the message to Mesos: every task of a LAUNCH operation is moved from
	 * {@code workersInNew} to {@code workersInLaunch} and written to the worker store.
	 *
	 * <p>A task to be launched that has no worker in {@code workersInNew} is an inconsistency
	 * of the planning state. It is reported as a fatal error (like any other failure in this
	 * method), independently of whether JVM assertions are enabled.
	 *
	 * @param msg The offers to accept, with the operations to perform on them.
	 */
	private void acceptOffers(AcceptOffers msg) {

		try {
			List<TaskMonitor.TaskGoalStateUpdated> toMonitor = new ArrayList<>(msg.operations().size());

			// transition the persistent state of some tasks to Launched
			for (Protos.Offer.Operation op : msg.operations()) {
				if (op.getType() != Protos.Offer.Operation.Type.LAUNCH) {
					continue;
				}
				for (Protos.TaskInfo info : op.getLaunch().getTaskInfosList()) {
					MesosWorkerStore.Worker worker = workersInNew.remove(extractResourceID(info.getTaskId()));
					if (worker == null) {
						// explicit check instead of an assert: asserts are skipped unless -ea is set,
						// and an AssertionError would not be caught by the handler below
						throw new IllegalStateException("Cannot launch Mesos task "
							+ info.getTaskId().getValue() + " because no worker in state New is known for it.");
					}

					worker = worker.launchWorker(info.getSlaveId(), msg.hostname());
					workerStore.putWorker(worker);
					workersInLaunch.put(extractResourceID(worker.taskID()), worker);

					LOG.info("Launching Mesos task {} on host {}.",
						worker.taskID().getValue(), worker.hostname().get());

					toMonitor.add(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(worker)));
				}
			}

			// tell the task router about the new plans
			for (TaskMonitor.TaskGoalStateUpdated update : toMonitor) {
				taskRouter.tell(update, self());
			}

			// send the acceptance message to Mesos
			schedulerDriver.acceptOffers(msg.offerIds(), msg.operations(), msg.filters());
		}
		catch(Exception ex) {
			fatalError("unable to accept offers", ex);
		}
	}

	/**
	 * Handles a task status change.
	 *
	 * <p>The update is passed to the task router and to the reconciliation coordinator,
	 * and is then acknowledged through the scheduler driver.
	 *
	 * @param message The status update.
	 */
	private void taskStatusUpdated(StatusUpdate message) {
		final ActorRef[] recipients = {taskRouter, reconciliationCoordinator};
		for (ActorRef recipient : recipients) {
			recipient.tell(message, self());
		}
		schedulerDriver.acknowledgeStatusUpdate(message.status());
	}

	/**
	 * Accepts the given started worker into the internal state.
	 *
	 * <p>A worker that is not in state "being launched" is ignored. This can indicate that the
	 * TaskManager in this worker was already registered, or that the container was not started
	 * by this resource manager.
	 *
	 * @param resourceID The worker resource id
	 * @return A registered worker node record, or {@code null} if the worker was not being launched.
	 */
	@Override
	protected RegisteredMesosWorkerNode workerStarted(ResourceID resourceID) {
		final MesosWorkerStore.Worker launched = workersInLaunch.remove(resourceID);
		return launched != null ? new RegisteredMesosWorkerNode(launched) : null;
	}

	/**
	 * Accepts the given registered workers into the internal state.
	 *
	 * <p>Only workers found in {@code workersInLaunch} are accepted (and removed from that map);
	 * for all others an informational message is logged.
	 *
	 * @param toConsolidate The worker IDs known previously to the JobManager.
	 * @return A collection of registered worker node records.
	 */
	@Override
	protected Collection<RegisteredMesosWorkerNode> reacceptRegisteredWorkers(Collection<ResourceID> toConsolidate) {
		final List<RegisteredMesosWorkerNode> recognized = new ArrayList<>(toConsolidate.size());

		// check for each task manager whether its Mesos task ID is known
		for (ResourceID resourceID : toConsolidate) {
			final MesosWorkerStore.Worker pending = workersInLaunch.remove(resourceID);
			if (pending != null) {
				LOG.info("Mesos worker consolidation recognizes TaskManager {}.", resourceID);
				recognized.add(new RegisteredMesosWorkerNode(pending));
				continue;
			}

			// not pending: either registered before, or unknown to this resource manager
			if (isStarted(resourceID)) {
				LOG.info("TaskManager {} has already been registered at the resource manager.", resourceID);
			} else {
				LOG.info("Mesos worker consolidation does not recognize TaskManager {}.", resourceID);
			}
		}
		return recognized;
	}

	/**
	 * Releases the given pending worker, i.e. a worker that is currently in {@code workersInLaunch}.
	 * A request for an unknown worker is logged as an error and otherwise ignored.
	 *
	 * @param id The resource ID of the worker to release.
	 */
	@Override
	protected void releasePendingWorker(ResourceID id) {
		final MesosWorkerStore.Worker pending = workersInLaunch.remove(id);
		if (pending == null) {
			LOG.error("Cannot find worker {} to release. Ignoring request.", id);
			return;
		}
		releaseWorker(pending);
	}

	/**
	 * Release the given started worker by planning the removal of its underlying task.
	 *
	 * @param worker The registered worker node to release.
	 */
	@Override
	protected void releaseStartedWorker(RegisteredMesosWorkerNode worker) {
		releaseWorker(worker.task());
	}

	/**
	 * Plan for the removal of the given worker.
	 *
	 * <p>The worker is transitioned to the Released state in the worker store, recorded in
	 * {@code workersBeingReturned} and announced to the task router. If the worker has a
	 * hostname, the launch coordinator is told that the task is unassigned from that host.
	 * Any failure is treated as a fatal error.
	 *
	 * @param worker The worker to release.
	 */
	private void releaseWorker(MesosWorkerStore.Worker worker) {
		try {
			// log the plain task ID value, as the other task-related log statements do
			LOG.info("Releasing worker {}", worker.taskID().getValue());

			// update persistent state of worker to Released
			final MesosWorkerStore.Worker released = worker.releaseWorker();
			workerStore.putWorker(released);
			workersBeingReturned.put(extractResourceID(released.taskID()), released);
			taskRouter.tell(new TaskMonitor.TaskGoalStateUpdated(extractGoalState(released)), self());

			if (released.hostname().isDefined()) {
				// tell the launch coordinator that the task is being unassigned from the host, for planning purposes
				launchCoordinator.tell(new LaunchCoordinator.Unassign(released.taskID(), released.hostname().get()), self());
			}
		}
		catch (Exception ex) {
			fatalError("unable to release worker", ex);
		}
	}

	/**
	 * Gets the number of requested workers that have not been launched yet.
	 *
	 * @return the size of {@code workersInNew}.
	 */
	@Override
	protected int getNumWorkerRequestsPending() {
		return workersInNew.size();
	}

	/**
	 * Gets the number of launched workers that have not been accepted as started yet.
	 *
	 * @return the size of {@code workersInLaunch}.
	 */
	@Override
	protected int getNumWorkersPendingRegistration() {
		return workersInLaunch.size();
	}

	// ------------------------------------------------------------------------
	//  Callbacks from the Mesos Master
	// ------------------------------------------------------------------------

	/**
	 * Called when connected to Mesos as a new framework.
	 *
	 * <p>The connection monitor is informed first. The assigned framework ID is then written
	 * to the worker store; if that fails, a fatal error is raised and the remaining actors
	 * are not informed.
	 *
	 * @param message The registration message carrying the assigned framework ID.
	 */
	private void registered(Registered message) {
		connectionMonitor.tell(message, self());

		try {
			workerStore.setFrameworkID(Option.apply(message.frameworkId()));
		} catch (Exception e) {
			fatalError("unable to store the assigned framework ID", e);
			return;
		}

		// the framework ID is stored; now let the remaining actors know
		final ActorRef[] recipients = {launchCoordinator, reconciliationCoordinator, taskRouter};
		for (ActorRef recipient : recipients) {
			recipient.tell(message, self());
		}
	}

	/**
	 * Called when reconnected to Mesos following a failover event.
	 * The message is passed on to all supporting actors.
	 *
	 * @param message The re-registration message.
	 */
	private void reregistered(ReRegistered message) {
		final ActorRef[] recipients =
			{connectionMonitor, launchCoordinator, reconciliationCoordinator, taskRouter};
		for (ActorRef recipient : recipients) {
			recipient.tell(message, self());
		}
	}

	/**
	 * Called when disconnected from Mesos.
	 * The message is passed on to all supporting actors.
	 *
	 * @param message The disconnect message.
	 */
	private void disconnected(Disconnected message) {
		final ActorRef[] recipients =
			{connectionMonitor, launchCoordinator, reconciliationCoordinator, taskRouter};
		for (ActorRef recipient : recipients) {
			recipient.tell(message, self());
		}
	}

	/**
	 * Called when an error is reported by the scheduler callback.
	 * The error is wrapped into a {@code FatalErrorOccurred} message that this actor sends to itself.
	 *
	 * @param message The error message reported by Mesos.
	 */
	private void error(String message) {
		final Exception cause = new Exception(message);
		final FatalErrorOccurred fatal = new FatalErrorOccurred("Connection to Mesos failed", cause);
		self().tell(fatal, self());
	}

	/**
	 * Invoked when a Mesos task reaches a terminal status.
	 *
	 * <p>This callback occurs for failed containers and for released containers alike. The
	 * worker is removed from the worker store in both cases. A released worker is simply
	 * logged. Any other worker counts as failed: the failure counter is incremented, and the
	 * cluster is stopped once the counter exceeds {@code maxFailedTasks} (unless that limit
	 * is negative). In all other cases a check of the workers is triggered, so that
	 * replacements can be requested.
	 *
	 * @param taskID The ID of the terminated task.
	 * @param status The terminal status reported for the task.
	 */
	private void taskTerminated(Protos.TaskID taskID, Protos.TaskStatus status) {
		final ResourceID id = extractResourceID(taskID);

		boolean knownToStore;
		try {
			knownToStore = workerStore.removeWorker(taskID);
		} catch (Exception e) {
			fatalError("unable to remove worker", e);
			return;
		}

		if (!knownToStore) {
			LOG.info("Received a termination notice for an unrecognized worker: {}", id);
			return;
		}

		// released worker: a regular termination that was asked for
		final boolean wasReleased = workersBeingReturned.remove(id) != null;
		if (wasReleased) {
			LOG.info("Worker {} finished successfully with diagnostics: {}",
				id, status.getMessage());
			triggerCheckWorkers();
			return;
		}

		// failed worker, either at startup, or running
		final boolean wasInLaunch = workersInLaunch.remove(id) != null;
		if (wasInLaunch) {
			// new workers are re-acquired by the worker check at the end
			LOG.info("Mesos task {} failed, with a TaskManager in launch or registration. " +
				"State: {} Reason: {} ({})", id, status.getState(), status.getReason(), status.getMessage());
		} else {
			LOG.info("Mesos task {} failed, with a registered TaskManager. " +
				"State: {} Reason: {} ({})", id, status.getState(), status.getReason(), status.getMessage());

			// notify the generic logic, which notifies the JobManager, etc.
			notifyWorkerFailed(id, "Mesos task " + id + " failed.  State: " + status.getState());
		}

		// general failure bookkeeping
		failedTasksSoFar++;

		final String diagnostics = String.format("Diagnostics for task %s in state %s : " +
				"reason=%s message=%s",
			id, status.getState(), status.getReason(), status.getMessage());
		sendInfoMessage(diagnostics);

		LOG.info(diagnostics);
		LOG.info("Total number of failed tasks so far: {}", failedTasksSoFar);

		// a negative limit means an infinite number of retries
		final boolean limitExceeded = maxFailedTasks >= 0 && failedTasksSoFar > maxFailedTasks;
		if (limitExceeded) {
			final String stopReason = "Stopping Mesos session because the number of failed tasks ("
				+ failedTasksSoFar + ") exceeded the maximum failed tasks ("
				+ maxFailedTasks + "). This number is controlled by the '"
				+ ConfigConstants.MESOS_MAX_FAILED_TASKS + "' configuration setting. "
				+ "By default its the number of requested tasks.";

			LOG.error(stopReason);
			self().tell(decorateMessage(new StopCluster(ApplicationStatus.FAILED, stopReason)),
				ActorRef.noSender());

			// the cluster is being stopped, no replacement workers are needed
			return;
		}

		// re-examine the workers and request new ones for the failed one
		triggerCheckWorkers();
	}

	// ------------------------------------------------------------------------
	//  Utilities
	// ------------------------------------------------------------------------

	/**
	 * Creates the launchable description of a worker from the TaskManager settings of this
	 * resource manager.
	 *
	 * @param taskID The Mesos task ID of the worker.
	 * @return the launchable worker for the given task ID.
	 */
	private LaunchableMesosWorker createLaunchableMesosWorker(Protos.TaskID taskID) {
		return new LaunchableMesosWorker(
			artifactResolver, taskManagerParameters, taskManagerContainerSpec, taskID, mesosConfig);
	}

	/**
	 * Derives the ResourceID of a worker from its Mesos task: the resource ID wraps the
	 * string value of the task ID.
	 *
	 * @param taskId the Mesos TaskID
	 * @return The ResourceID for the container
	 */
	static ResourceID extractResourceID(Protos.TaskID taskId) {
		final String taskIdValue = taskId.getValue();
		return new ResourceID(taskIdValue);
	}

	/**
	 * Translates the persistent worker information into the matching Mesos task goal state.
	 *
	 * @param worker the persistent worker information.
	 * @return goal state information for the {@link TaskMonitor}.
	 * @throws IllegalArgumentException if the worker is in a state other than New, Launched or Released.
	 */
	static TaskMonitor.TaskGoalState extractGoalState(MesosWorkerStore.Worker worker) {
		final TaskMonitor.TaskGoalState goalState;
		switch (worker.state()) {
			case New:
				goalState = new TaskMonitor.New(worker.taskID());
				break;
			case Launched:
				goalState = new TaskMonitor.Launched(worker.taskID(), worker.slaveID().get());
				break;
			case Released:
				goalState = new TaskMonitor.Released(worker.taskID(), worker.slaveID().get());
				break;
			default:
				throw new IllegalArgumentException("unsupported worker state");
		}
		return goalState;
	}

	/**
	 * Creates the Fenzo optimizer (builder).
	 *
	 * <p>The returned builder simply delegates to a Fenzo {@code TaskScheduler.Builder}. It is
	 * an indirection to facilitate unit testing of the Launch Coordinator.
	 *
	 * @return a new builder for the Fenzo task scheduler.
	 */
	private static TaskSchedulerBuilder createOptimizer() {
		return new TaskSchedulerBuilder() {
			/** The Fenzo builder that does the actual work. */
			private final TaskScheduler.Builder delegate = new TaskScheduler.Builder();

			@Override
			public TaskSchedulerBuilder withLeaseRejectAction(Action1<VirtualMachineLease> action) {
				delegate.withLeaseRejectAction(action);
				return this;
			}

			@Override
			public TaskScheduler build() {
				return delegate.build();
			}
		};
	}

	/**
	 * Creates the props needed to instantiate this actor.
	 *
	 * <p>Rather than extracting and validating parameters in the constructor, this factory method
	 * takes care of that. That way, errors occur synchronously, and are not swallowed simply in a
	 * failed asynchronous attempt to start the actor.
	 *
	 * <p>The number of initial tasks is read from {@code ConfigConstants.MESOS_INITIAL_TASKS}
	 * (default 0). The number of tolerated failed tasks is read from
	 * {@code ConfigConstants.MESOS_MAX_FAILED_TASKS} and defaults to the number of initial tasks.
	 *
	 * @param actorClass
	 *             The actor class, to allow overriding this actor with subclasses for testing.
	 * @param flinkConfig
	 *             The Flink configuration object.
	 * @param mesosConfig
	 *             The Mesos configuration (master and framework info).
	 * @param workerStore
	 *             The store for the framework ID and worker records.
	 * @param leaderRetrievalService
	 *             The leader retrieval service handed to the actor.
	 * @param taskManagerParameters
	 *             The parameters for launching TaskManager containers.
	 * @param taskManagerContainerSpec
	 *             The container specification.
	 * @param artifactResolver
	 *             The artifact resolver to locate artifacts
	 * @param log
	 *             The logger to log to.
	 *
	 * @return The Props object to instantiate the MesosFlinkResourceManager actor.
	 * @throws IllegalConfigurationException if the configured number of initial tasks is negative.
	 */
	public static Props createActorProps(Class<? extends MesosFlinkResourceManager> actorClass,
			Configuration flinkConfig,
			MesosConfiguration mesosConfig,
			MesosWorkerStore workerStore,
			LeaderRetrievalService leaderRetrievalService,
			MesosTaskManagerParameters taskManagerParameters,
			ContainerSpecification taskManagerContainerSpec,
			MesosArtifactResolver artifactResolver,
			Logger log)
	{
		// number of initial tasks: must not be negative
		final int numInitialTaskManagers = flinkConfig.getInteger(
			ConfigConstants.MESOS_INITIAL_TASKS, 0);
		if (numInitialTaskManagers < 0) {
			throw new IllegalConfigurationException("Invalid value for " +
				ConfigConstants.MESOS_INITIAL_TASKS + ", which must be at least zero.");
		}
		log.info("Mesos framework to allocate {} initial tasks",
			numInitialTaskManagers);

		// number of tolerated failures: a negative value is accepted and only left unlogged
		final int maxFailedTasks = flinkConfig.getInteger(
			ConfigConstants.MESOS_MAX_FAILED_TASKS, numInitialTaskManagers);
		final boolean failureLimitSet = maxFailedTasks >= 0;
		if (failureLimitSet) {
			log.info("Mesos framework tolerates {} failed tasks before giving up",
				maxFailedTasks);
		}

		// the argument order has to match the constructor of the actor class
		return Props.create(actorClass,
			flinkConfig,
			mesosConfig,
			workerStore,
			leaderRetrievalService,
			taskManagerParameters,
			taskManagerContainerSpec,
			artifactResolver,
			maxFailedTasks,
			numInitialTaskManagers);
	}
}