/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.resourcemanager.slotmanager;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.clusterframework.types.TaskManagerSlot;
import org.apache.flink.runtime.clusterframework.types.SlotID;
import org.apache.flink.runtime.concurrent.BiFunction;
import org.apache.flink.runtime.concurrent.CompletableFuture;
import org.apache.flink.runtime.concurrent.Future;
import org.apache.flink.runtime.concurrent.ScheduledExecutor;
import org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture;
import org.apache.flink.runtime.instance.InstanceID;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.resourcemanager.SlotRequest;
import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
import org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection;
import org.apache.flink.runtime.taskexecutor.SlotReport;
import org.apache.flink.runtime.taskexecutor.SlotStatus;
import org.apache.flink.runtime.taskexecutor.TaskExecutorGateway;
import org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException;
import org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException;
import org.apache.flink.util.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import java.util.concurrent.CancellationException;
import java.util.concurrent.Executor;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
/**
* The slot manager is responsible for maintaining a view on all registered task manager slots,
* their allocation and all pending slot requests. Whenever a new slot is registered or and
* allocated slot is freed, then it tries to fulfill another pending slot request. Whenever there
* are not enough slots available the slot manager will notify the resource manager about it via
* {@link ResourceManagerActions#allocateResource(ResourceProfile)}.
*
* In order to free resources and avoid resource leaks, idling task managers (task managers whose
* slots are currently not used) and pending slot requests time out triggering their release and
* failure, respectively.
*/
public class SlotManager implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(SlotManager.class);
/** Scheduled executor for timeouts */
private final ScheduledExecutor scheduledExecutor;
/** Timeout for slot requests to the task manager */
private final Time taskManagerRequestTimeout;
/** Timeout after which an allocation is discarded */
private final Time slotRequestTimeout;
/** Timeout after which an unused TaskManager is released */
private final Time taskManagerTimeout;
/** Map for all registered slots */
private final HashMap<SlotID, TaskManagerSlot> slots;
/** Index of all currently free slots */
private final LinkedHashMap<SlotID, TaskManagerSlot> freeSlots;
/** All currently registered task managers */
private final HashMap<InstanceID, TaskManagerRegistration> taskManagerRegistrations;
/** Map of fulfilled and active allocations for request deduplication purposes */
private final HashMap<AllocationID, SlotID> fulfilledSlotRequests;
/** Map of pending/unfulfilled slot allocation requests */
private final HashMap<AllocationID, PendingSlotRequest> pendingSlotRequests;
/** Leader id of the containing component */
private UUID leaderId;
/** Executor for future callbacks which have to be "synchronized" */
private Executor mainThreadExecutor;
/** Callbacks for resource (de-)allocations */
private ResourceManagerActions resourceManagerActions;
private ScheduledFuture<?> taskManagerTimeoutCheck;
private ScheduledFuture<?> slotRequestTimeoutCheck;
/** True iff the component has been started */
private boolean started;
public SlotManager(
ScheduledExecutor scheduledExecutor,
Time taskManagerRequestTimeout,
Time slotRequestTimeout,
Time taskManagerTimeout) {
this.scheduledExecutor = Preconditions.checkNotNull(scheduledExecutor);
this.taskManagerRequestTimeout = Preconditions.checkNotNull(taskManagerRequestTimeout);
this.slotRequestTimeout = Preconditions.checkNotNull(slotRequestTimeout);
this.taskManagerTimeout = Preconditions.checkNotNull(taskManagerTimeout);
slots = new HashMap<>(16);
freeSlots = new LinkedHashMap<>(16);
taskManagerRegistrations = new HashMap<>(4);
fulfilledSlotRequests = new HashMap<>(16);
pendingSlotRequests = new HashMap<>(16);
leaderId = null;
resourceManagerActions = null;
mainThreadExecutor = null;
taskManagerTimeoutCheck = null;
slotRequestTimeoutCheck = null;
started = false;
}
// ---------------------------------------------------------------------------------------------
// Component lifecycle methods
// ---------------------------------------------------------------------------------------------
/**
* Starts the slot manager with the given leader id and resource manager actions.
*
* @param newLeaderId to use for communication with the task managers
* @param newResourceManagerActions to use for resource (de-)allocations
*/
public void start(UUID newLeaderId, Executor newMainThreadExecutor, ResourceManagerActions newResourceManagerActions) {
LOG.info("Starting the SlotManager.");
leaderId = Preconditions.checkNotNull(newLeaderId);
mainThreadExecutor = Preconditions.checkNotNull(newMainThreadExecutor);
resourceManagerActions = Preconditions.checkNotNull(newResourceManagerActions);
started = true;
taskManagerTimeoutCheck = scheduledExecutor.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
mainThreadExecutor.execute(new Runnable() {
@Override
public void run() {
checkTaskManagerTimeouts();
}
});
}
}, 0L, taskManagerTimeout.toMilliseconds(), TimeUnit.MILLISECONDS);
slotRequestTimeoutCheck = scheduledExecutor.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
mainThreadExecutor.execute(new Runnable() {
@Override
public void run() {
checkSlotRequestTimeouts();
}
});
}
}, 0L, slotRequestTimeout.toMilliseconds(), TimeUnit.MILLISECONDS);
}
/**
* Suspends the component. This clears the internal state of the slot manager.
*/
public void suspend() {
LOG.info("Suspending the SlotManager.");
// stop the timeout checks for the TaskManagers and the SlotRequests
taskManagerTimeoutCheck.cancel(false);
slotRequestTimeoutCheck.cancel(false);
taskManagerTimeoutCheck = null;
slotRequestTimeoutCheck = null;
for (PendingSlotRequest pendingSlotRequest : pendingSlotRequests.values()) {
cancelPendingSlotRequest(pendingSlotRequest);
}
pendingSlotRequests.clear();
ArrayList<InstanceID> registeredTaskManagers = new ArrayList<>(taskManagerRegistrations.keySet());
for (InstanceID registeredTaskManager : registeredTaskManagers) {
unregisterTaskManager(registeredTaskManager);
}
leaderId = null;
resourceManagerActions = null;
started = false;
}
/**
* Closes the slot manager.
*
* @throws Exception if the close operation fails
*/
@Override
public void close() throws Exception {
LOG.info("Closing the SlotManager.");
suspend();
}
// ---------------------------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------------------------
/**
* Requests a slot with the respective resource profile.
*
* @param slotRequest specifying the requested slot specs
* @return true if the slot request was registered; false if the request is a duplicate
* @throws SlotManagerException if the slot request failed (e.g. not enough resources left)
*/
public boolean registerSlotRequest(SlotRequest slotRequest) throws SlotManagerException {
checkInit();
if (checkDuplicateRequest(slotRequest.getAllocationId())) {
LOG.debug("Ignoring a duplicate slot request with allocation id {}.", slotRequest.getAllocationId());
return false;
} else {
PendingSlotRequest pendingSlotRequest = new PendingSlotRequest(slotRequest);
pendingSlotRequests.put(slotRequest.getAllocationId(), pendingSlotRequest);
try {
internalRequestSlot(pendingSlotRequest);
} catch (ResourceManagerException e) {
// requesting the slot failed --> remove pending slot request
pendingSlotRequests.remove(slotRequest.getAllocationId());
throw new SlotManagerException("Could not fulfill slot request " + slotRequest.getAllocationId() + '.', e);
}
return true;
}
}
/**
* Cancels and removes a pending slot request with the given allocation id. If there is no such
* pending request, then nothing is done.
*
* @param allocationId identifying the pending slot request
* @return True if a pending slot request was found; otherwise false
*/
public boolean unregisterSlotRequest(AllocationID allocationId) {
checkInit();
PendingSlotRequest pendingSlotRequest = pendingSlotRequests.remove(allocationId);
if (null != pendingSlotRequest) {
cancelPendingSlotRequest(pendingSlotRequest);
return true;
} else {
LOG.debug("No pending slot request with allocation id {} found.", allocationId);
return false;
}
}
/**
* Registers a new task manager at the slot manager. This will make the task managers slots
* known and, thus, available for allocation.
*
* @param taskExecutorConnection for the new task manager
* @param initialSlotReport for the new task manager
*/
public void registerTaskManager(final TaskExecutorConnection taskExecutorConnection, SlotReport initialSlotReport) {
checkInit();
LOG.info("Register TaskManager {} at the SlotManager.", taskExecutorConnection.getInstanceID());
// we identify task managers by their instance id
if (taskManagerRegistrations.containsKey(taskExecutorConnection.getInstanceID())) {
reportSlotStatus(taskExecutorConnection.getInstanceID(), initialSlotReport);
} else {
// first register the TaskManager
ArrayList<SlotID> reportedSlots = new ArrayList<>();
for (SlotStatus slotStatus : initialSlotReport) {
reportedSlots.add(slotStatus.getSlotID());
}
TaskManagerRegistration taskManagerRegistration = new TaskManagerRegistration(taskExecutorConnection, reportedSlots);
taskManagerRegistrations.put(taskExecutorConnection.getInstanceID(), taskManagerRegistration);
// next register the new slots
for (SlotStatus slotStatus : initialSlotReport) {
registerSlot(
slotStatus.getSlotID(),
slotStatus.getAllocationID(),
slotStatus.getResourceProfile(),
taskExecutorConnection);
}
// determine if the task manager is idle or not
boolean idle = !anySlotUsed(taskManagerRegistration.getSlots());
if (idle) {
taskManagerRegistration.markIdle();
} else {
taskManagerRegistration.markUsed();
}
}
}
/**
* Unregisters the task manager identified by the given instance id and its associated slots
* from the slot manager.
*
* @param instanceId identifying the task manager to unregister
* @return True if there existed a registered task manager with the given instance id
*/
public boolean unregisterTaskManager(InstanceID instanceId) {
checkInit();
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.remove(instanceId);
if (null != taskManagerRegistration) {
internalUnregisterTaskManager(taskManagerRegistration);
return true;
} else {
LOG.debug("There is no task manager registered with instance ID {}. Ignoring this message.", instanceId);
return false;
}
}
/**
* Reports the current slot allocations for a task manager identified by the given instance id.
*
* @param instanceId identifying the task manager for which to report the slot status
* @param slotReport containing the status for all of its slots
* @return true if the slot status has been updated successfully, otherwise false
*/
public boolean reportSlotStatus(InstanceID instanceId, SlotReport slotReport) {
checkInit();
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(instanceId);
if (null != taskManagerRegistration) {
boolean idle = true;
for (SlotStatus slotStatus : slotReport) {
// We assume that the slots of a TaskManager don't change over its lifetime and they are registered
// once when the TaskManager is registered
if (taskManagerRegistration.containsSlot(slotStatus.getSlotID()) && updateSlot(slotStatus.getSlotID(), slotStatus.getAllocationID())) {
TaskManagerSlot slot = slots.get(slotStatus.getSlotID());
idle &= slot.isFree();
} else {
// sanity check to guarantee that slots of a TaskManager don't change
throw new IllegalStateException("Reported a slot status for slot " + slotStatus.getSlotID() +
" which has not been registered.");
}
}
if (idle) {
// no slot of this task manager is being used --> mark this task manager to be idle which allows it to
// time out
taskManagerRegistration.markIdle();
} else {
taskManagerRegistration.markUsed();
}
return true;
} else {
LOG.debug("Received slot report for unknown task manager with instance id {}. Ignoring this report.", instanceId);
return false;
}
}
/**
* Free the given slot from the given allocation. If the slot is still allocated by the given
* allocation id, then the slot will be marked as free and will be subject to new slot requests.
*
* @param slotId identifying the slot to free
* @param allocationId with which the slot is presumably allocated
*/
public void freeSlot(SlotID slotId, AllocationID allocationId) {
checkInit();
TaskManagerSlot slot = slots.get(slotId);
if (null != slot) {
if (slot.isAllocated()) {
if (Objects.equals(allocationId, slot.getAllocationId())) {
// free the slot
slot.setAllocationId(null);
fulfilledSlotRequests.remove(allocationId);
if (slot.isFree()) {
handleFreeSlot(slot);
}
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(slot.getInstanceId());
if (null != taskManagerRegistration) {
if (anySlotUsed(taskManagerRegistration.getSlots())) {
taskManagerRegistration.markUsed();
} else {
taskManagerRegistration.markIdle();
}
}
} else {
LOG.debug("Received request to free slot {} with expected allocation id {}, " +
"but actual allocation id {} differs. Ignoring the request.", slotId, allocationId, slot.getAllocationId());
}
} else {
LOG.debug("Slot {} has not been allocated.", allocationId);
}
} else {
LOG.debug("Trying to free a slot {} which has not been registered. Ignoring this message.", slotId);
}
}
// ---------------------------------------------------------------------------------------------
// Behaviour methods
// ---------------------------------------------------------------------------------------------
/**
* Finds a matching slot request for a given resource profile. If there is no such request,
* the method returns null.
*
* Note: If you want to change the behaviour of the slot manager wrt slot allocation and
* request fulfillment, then you should override this method.
*
* @param slotResourceProfile defining the resources of an available slot
* @return A matching slot request which can be deployed in a slot with the given resource
* profile. Null if there is no such slot request pending.
*/
protected PendingSlotRequest findMatchingRequest(ResourceProfile slotResourceProfile) {
for (PendingSlotRequest pendingSlotRequest : pendingSlotRequests.values()) {
if (!pendingSlotRequest.isAssigned() && slotResourceProfile.isMatching(pendingSlotRequest.getResourceProfile())) {
return pendingSlotRequest;
}
}
return null;
}
/**
* Finds a matching slot for a given resource profile. A matching slot has at least as many
* resources available as the given resource profile. If there is no such slot available, then
* the method returns null.
*
* Note: If you want to change the behaviour of the slot manager wrt slot allocation and
* request fulfillment, then you should override this method.
*
* @param requestResourceProfile specifying the resource requirements for the a slot request
* @return A matching slot which fulfills the given resource profile. Null if there is no such
* slot available.
*/
protected TaskManagerSlot findMatchingSlot(ResourceProfile requestResourceProfile) {
Iterator<Map.Entry<SlotID, TaskManagerSlot>> iterator = freeSlots.entrySet().iterator();
while (iterator.hasNext()) {
TaskManagerSlot taskManagerSlot = iterator.next().getValue();
// sanity check
Preconditions.checkState(taskManagerSlot.isFree());
if (taskManagerSlot.getResourceProfile().isMatching(requestResourceProfile)) {
iterator.remove();
return taskManagerSlot;
}
}
return null;
}
// ---------------------------------------------------------------------------------------------
// Internal slot operations
// ---------------------------------------------------------------------------------------------
/**
* Registers a slot for the given task manager at the slot manager. The slot is identified by
* the given slot id. The given resource profile defines the available resources for the slot.
* The task manager connection can be used to communicate with the task manager.
*
* @param slotId identifying the slot on the task manager
* @param allocationId which is currently deployed in the slot
* @param resourceProfile of the slot
* @param taskManagerConnection to communicate with the remote task manager
*/
private void registerSlot(
SlotID slotId,
AllocationID allocationId,
ResourceProfile resourceProfile,
TaskExecutorConnection taskManagerConnection) {
if (slots.containsKey(slotId)) {
// remove the old slot first
removeSlot(slotId);
}
TaskManagerSlot slot = new TaskManagerSlot(
slotId,
resourceProfile,
taskManagerConnection,
allocationId);
slots.put(slotId, slot);
if (slot.isFree()) {
handleFreeSlot(slot);
}
if (slot.isAllocated()) {
fulfilledSlotRequests.put(slot.getAllocationId(), slotId);
}
}
/**
* Updates a slot with the given allocation id.
*
* @param slotId to update
* @param allocationId specifying the current allocation of the slot
* @return True if the slot could be updated; otherwise false
*/
private boolean updateSlot(SlotID slotId, AllocationID allocationId) {
TaskManagerSlot slot = slots.get(slotId);
if (null != slot) {
// we assume the given allocation id to be the ground truth (coming from the TM)
slot.setAllocationId(allocationId);
if (null != allocationId) {
if (slot.hasPendingSlotRequest()){
// we have a pending slot request --> check whether we have to reject it
PendingSlotRequest pendingSlotRequest = slot.getAssignedSlotRequest();
if (Objects.equals(pendingSlotRequest.getAllocationId(), allocationId)) {
// we can cancel the slot request because it has been fulfilled
cancelPendingSlotRequest(pendingSlotRequest);
// remove the pending slot request, since it has been completed
pendingSlotRequests.remove(pendingSlotRequest.getAllocationId());
} else {
// this will try to find a new slot for the request
rejectPendingSlotRequest(
pendingSlotRequest,
new Exception("Task manager reported slot " + slotId + " being already allocated."));
}
slot.setAssignedSlotRequest(null);
}
fulfilledSlotRequests.put(allocationId, slotId);
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(slot.getInstanceId());
if (null != taskManagerRegistration) {
// mark this TaskManager to be used to exempt it from timing out
taskManagerRegistration.markUsed();
}
}
return true;
} else {
LOG.debug("Trying to update unknown slot with slot id {}.", slotId);
return false;
}
}
/**
* Tries to allocate a slot for the given slot request. If there is no slot available, the
* resource manager is informed to allocate more resources and a timeout for the request is
* registered.
*
* @param pendingSlotRequest to allocate a slot for
* @throws ResourceManagerException if the resource manager cannot allocate more resource
*/
private void internalRequestSlot(PendingSlotRequest pendingSlotRequest) throws ResourceManagerException {
TaskManagerSlot taskManagerSlot = findMatchingSlot(pendingSlotRequest.getResourceProfile());
if (taskManagerSlot != null) {
allocateSlot(taskManagerSlot, pendingSlotRequest);
} else {
resourceManagerActions.allocateResource(pendingSlotRequest.getResourceProfile());
}
}
/**
* Allocates the given slot for the given slot request. This entails sending a registration
* message to the task manager and treating failures.
*
* @param taskManagerSlot to allocate for the given slot request
* @param pendingSlotRequest to allocate the given slot for
*/
private void allocateSlot(TaskManagerSlot taskManagerSlot, PendingSlotRequest pendingSlotRequest) {
TaskExecutorConnection taskExecutorConnection = taskManagerSlot.getTaskManagerConnection();
TaskExecutorGateway gateway = taskExecutorConnection.getTaskExecutorGateway();
final CompletableFuture<Acknowledge> completableFuture = new FlinkCompletableFuture<>();
final AllocationID allocationId = pendingSlotRequest.getAllocationId();
final SlotID slotId = taskManagerSlot.getSlotId();
taskManagerSlot.setAssignedSlotRequest(pendingSlotRequest);
pendingSlotRequest.setRequestFuture(completableFuture);
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(taskManagerSlot.getInstanceId());
if (taskManagerRegistration != null) {
// mark the task manager to be used since we have a pending slot request assigned ot one of its slots
taskManagerRegistration.markUsed();
} else {
throw new IllegalStateException("Could not find a registered task manager for instance id " +
taskManagerSlot.getInstanceId() + '.');
}
// RPC call to the task manager
Future<Acknowledge> requestFuture = gateway.requestSlot(
slotId,
pendingSlotRequest.getJobId(),
allocationId,
pendingSlotRequest.getTargetAddress(),
leaderId,
taskManagerRequestTimeout);
requestFuture.handle(new BiFunction<Acknowledge, Throwable, Void>() {
@Override
public Void apply(Acknowledge acknowledge, Throwable throwable) {
if (acknowledge != null) {
completableFuture.complete(acknowledge);
} else {
completableFuture.completeExceptionally(throwable);
}
return null;
}
});
completableFuture.handleAsync(new BiFunction<Acknowledge, Throwable, Void>() {
@Override
public Void apply(Acknowledge acknowledge, Throwable throwable) {
if (acknowledge != null) {
updateSlot(slotId, allocationId);
} else {
if (throwable instanceof SlotOccupiedException) {
SlotOccupiedException exception = (SlotOccupiedException) throwable;
updateSlot(slotId, exception.getAllocationId());
} else {
removeSlotRequestFromSlot(slotId, allocationId);
}
if (!(throwable instanceof CancellationException)) {
handleFailedSlotRequest(slotId, allocationId, throwable);
} else {
LOG.debug("Slot allocation request {} has been cancelled.", allocationId, throwable);
}
}
return null;
}
}, mainThreadExecutor);
}
/**
* Handles a free slot. It first tries to find a pending slot request which can be fulfilled.
* If there is no such request, then it will add the slot to the set of free slots.
*
* @param freeSlot to find a new slot request for
*/
private void handleFreeSlot(TaskManagerSlot freeSlot) {
PendingSlotRequest pendingSlotRequest = findMatchingRequest(freeSlot.getResourceProfile());
if (null != pendingSlotRequest) {
allocateSlot(freeSlot, pendingSlotRequest);
} else {
freeSlots.put(freeSlot.getSlotId(), freeSlot);
}
}
/**
* Removes the given set of slots from the slot manager.
*
* @param slotsToRemove identifying the slots to remove from the slot manager
*/
private void removeSlots(Iterable<SlotID> slotsToRemove) {
for (SlotID slotId : slotsToRemove) {
removeSlot(slotId);
}
}
/**
* Removes the given slot from the slot manager.
*
* @param slotId identifying the slot to remove
*/
private void removeSlot(SlotID slotId) {
TaskManagerSlot slot = slots.remove(slotId);
if (null != slot) {
freeSlots.remove(slotId);
if (slot.hasPendingSlotRequest()) {
// reject the pending slot request --> triggering a new allocation attempt
rejectPendingSlotRequest(
slot.getAssignedSlotRequest(),
new Exception("The assigned slot " + slot.getSlotId() + " was removed."));
}
AllocationID oldAllocationId = slot.getAllocationId();
fulfilledSlotRequests.remove(oldAllocationId);
} else {
LOG.debug("There was no slot registered with slot id {}.", slotId);
}
}
// ---------------------------------------------------------------------------------------------
// Internal request handling methods
// ---------------------------------------------------------------------------------------------
/**
* Removes a pending slot request identified by the given allocation id from a slot identified
* by the given slot id.
*
* @param slotId identifying the slot
* @param allocationId identifying the presumable assigned pending slot request
*/
private void removeSlotRequestFromSlot(SlotID slotId, AllocationID allocationId) {
TaskManagerSlot taskManagerSlot = slots.get(slotId);
if (null != taskManagerSlot) {
if (taskManagerSlot.hasPendingSlotRequest() && Objects.equals(allocationId, taskManagerSlot.getAssignedSlotRequest().getAllocationId())) {
taskManagerSlot.setAssignedSlotRequest(null);
}
if (taskManagerSlot.isFree()) {
handleFreeSlot(taskManagerSlot);
}
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(taskManagerSlot.getInstanceId());
if (null != taskManagerRegistration && !anySlotUsed(taskManagerRegistration.getSlots())) {
taskManagerRegistration.markIdle();
}
} else {
LOG.debug("There was no slot with {} registered. Probably this slot has been already freed.", slotId);
}
}
/**
* Handles a failed slot request. The slot manager tries to find a new slot fulfilling
* the resource requirements for the failed slot request.
*
* @param slotId identifying the slot which was assigned to the slot request before
* @param allocationId identifying the failed slot request
* @param cause of the failure
*/
private void handleFailedSlotRequest(SlotID slotId, AllocationID allocationId, Throwable cause) {
PendingSlotRequest pendingSlotRequest = pendingSlotRequests.get(allocationId);
LOG.debug("Slot request with allocation id {} failed for slot {}.", allocationId, slotId, cause);
if (null != pendingSlotRequest) {
pendingSlotRequest.setRequestFuture(null);
try {
internalRequestSlot(pendingSlotRequest);
} catch (ResourceManagerException e) {
pendingSlotRequests.remove(allocationId);
resourceManagerActions.notifyAllocationFailure(
pendingSlotRequest.getJobId(),
allocationId,
e);
}
} else {
LOG.debug("There was not pending slot request with allocation id {}. Probably the request has been fulfilled or cancelled.", allocationId);
}
}
/**
* Rejects the pending slot request by failing the request future with a
* {@link SlotAllocationException}.
*
* @param pendingSlotRequest to reject
* @param cause of the rejection
*/
private void rejectPendingSlotRequest(PendingSlotRequest pendingSlotRequest, Exception cause) {
CompletableFuture<Acknowledge> request = pendingSlotRequest.getRequestFuture();
if (null != request) {
request.completeExceptionally(new SlotAllocationException(cause));
} else {
LOG.debug("Cannot reject pending slot request {}, since no request has been sent.", pendingSlotRequest.getAllocationId());
}
}
/**
* Cancels the given slot request.
*
* @param pendingSlotRequest to cancel
*/
private void cancelPendingSlotRequest(PendingSlotRequest pendingSlotRequest) {
CompletableFuture<Acknowledge> request = pendingSlotRequest.getRequestFuture();
if (null != request) {
request.cancel(false);
}
}
// ---------------------------------------------------------------------------------------------
// Internal timeout methods
// ---------------------------------------------------------------------------------------------
private void checkTaskManagerTimeouts() {
if (!taskManagerRegistrations.isEmpty()) {
long currentTime = System.currentTimeMillis();
Iterator<Map.Entry<InstanceID, TaskManagerRegistration>> taskManagerRegistrationIterator = taskManagerRegistrations.entrySet().iterator();
while (taskManagerRegistrationIterator.hasNext()) {
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrationIterator.next().getValue();
if (anySlotUsed(taskManagerRegistration.getSlots())) {
taskManagerRegistration.markUsed();
} else if (currentTime - taskManagerRegistration.getIdleSince() >= taskManagerTimeout.toMilliseconds()) {
taskManagerRegistrationIterator.remove();
internalUnregisterTaskManager(taskManagerRegistration);
resourceManagerActions.releaseResource(taskManagerRegistration.getInstanceId());
}
}
}
}
private void checkSlotRequestTimeouts() {
if (!pendingSlotRequests.isEmpty()) {
long currentTime = System.currentTimeMillis();
Iterator<Map.Entry<AllocationID, PendingSlotRequest>> slotRequestIterator = pendingSlotRequests.entrySet().iterator();
while (slotRequestIterator.hasNext()) {
PendingSlotRequest slotRequest = slotRequestIterator.next().getValue();
if (currentTime - slotRequest.getCreationTimestamp() >= slotRequestTimeout.toMilliseconds()) {
slotRequestIterator.remove();
if (slotRequest.isAssigned()) {
cancelPendingSlotRequest(slotRequest);
}
resourceManagerActions.notifyAllocationFailure(
slotRequest.getJobId(),
slotRequest.getAllocationId(),
new TimeoutException("The allocation could not be fulfilled in time."));
}
}
}
}
// ---------------------------------------------------------------------------------------------
// Internal utility methods
// ---------------------------------------------------------------------------------------------
private void internalUnregisterTaskManager(TaskManagerRegistration taskManagerRegistration) {
Preconditions.checkNotNull(taskManagerRegistration);
removeSlots(taskManagerRegistration.getSlots());
}
private boolean checkDuplicateRequest(AllocationID allocationId) {
return pendingSlotRequests.containsKey(allocationId) || fulfilledSlotRequests.containsKey(allocationId);
}
private boolean anySlotUsed(Iterable<SlotID> slotsToCheck) {
if (null != slotsToCheck) {
for (SlotID slotId : slotsToCheck) {
TaskManagerSlot taskManagerSlot = slots.get(slotId);
if (null != taskManagerSlot) {
if (taskManagerSlot.isAllocated()) {
return true;
}
}
}
}
return false;
}
private void checkInit() {
Preconditions.checkState(started, "The slot manager has not been started.");
}
// ---------------------------------------------------------------------------------------------
// Testing methods
// ---------------------------------------------------------------------------------------------
@VisibleForTesting
TaskManagerSlot getSlot(SlotID slotId) {
return slots.get(slotId);
}
@VisibleForTesting
int getNumberRegisteredSlots() {
return slots.size();
}
@VisibleForTesting
PendingSlotRequest getSlotRequest(AllocationID allocationId) {
return pendingSlotRequests.get(allocationId);
}
@VisibleForTesting
boolean isTaskManagerIdle(InstanceID instanceId) {
TaskManagerRegistration taskManagerRegistration = taskManagerRegistrations.get(instanceId);
if (null != taskManagerRegistration) {
return taskManagerRegistration.isIdle();
} else {
return false;
}
}
}