/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.mutable.MutableObject;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.authorize.AccessControlList;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.QueueACL;
import org.apache.hadoop.yarn.api.records.QueueInfo;
import org.apache.hadoop.yarn.api.records.QueueState;
import org.apache.hadoop.yarn.api.records.QueueUserACLInfo;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.ResourceRequest;
import org.apache.hadoop.yarn.factories.RecordFactory;
import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider;
import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager;
import org.apache.hadoop.yarn.security.AccessType;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerAppUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
import org.apache.hadoop.yarn.server.utils.BuilderUtils;
import org.apache.hadoop.yarn.server.utils.Lock;
import org.apache.hadoop.yarn.server.utils.Lock.NoLock;
import org.apache.hadoop.yarn.util.resource.Resources;
import com.google.common.annotations.VisibleForTesting;
import java.util.concurrent.ConcurrentSkipListSet;
@Private
@Unstable
public class LeafQueue extends AbstractCSQueue {
private static final Log LOG = LogFactory.getLog(LeafQueue.class);
private float absoluteUsedCapacity = 0.0f;
private int userLimit;
private float userLimitFactor;
protected int maxApplications;
protected int maxApplicationsPerUser;
private float maxAMResourcePerQueuePercent;
private int nodeLocalityDelay;
Map<ApplicationId, FiCaSchedulerApp> activeApplications;
Set<ApplicationId> applicationsToActivate;
Set<ApplicationId> applicationsToDeactivate;
Set<FiCaSchedulerApp> activeApplicationsWithRequests;
Set<ApplicationId> pendingApplicationsWithRequests;
Map<ApplicationAttemptId, FiCaSchedulerApp> applicationAttemptMap =
new HashMap<ApplicationAttemptId, FiCaSchedulerApp>();
Set<FiCaSchedulerApp> pendingApplications;
private float minimumAllocationFactor;
private Map<String, User> users = new HashMap<String, User>();
private final RecordFactory recordFactory =
RecordFactoryProvider.getRecordFactory(null);
private CapacitySchedulerContext scheduler;
private final ActiveUsersManager activeUsersManager;
// cache last cluster resource to compute actual capacity
private Resource lastClusterResource = Resources.none();
// absolute capacity as a resource (based on cluster resource)
private Resource absoluteCapacityResource = Resources.none();
private final QueueResourceLimitsInfo queueResourceLimitsInfo =
new QueueResourceLimitsInfo();
private volatile ResourceLimits cachedResourceLimitsForHeadroom = null;
public LeafQueue(CapacitySchedulerContext cs,
String queueName, CSQueue parent, CSQueue old) throws IOException {
super(cs, queueName, parent, old);
this.scheduler = cs;
this.activeUsersManager = new ActiveUsersManager(metrics);
if(LOG.isDebugEnabled()) {
LOG.debug("LeafQueue:" + " name=" + queueName
+ ", fullname=" + getQueuePath());
}
Comparator<FiCaSchedulerApp> applicationComparator =
cs.getApplicationComparator();
this.pendingApplications =
new TreeSet<FiCaSchedulerApp>(applicationComparator);
this.activeApplications = new HashMap<>();
this.activeApplicationsWithRequests = new TreeSet<FiCaSchedulerApp>(
applicationComparator);
this.pendingApplicationsWithRequests = new HashSet<>();
applicationsToActivate = new ConcurrentSkipListSet<>();
applicationsToDeactivate = new ConcurrentSkipListSet<>();
setupQueueConfigs(cs.getClusterResource());
}
protected synchronized void setupQueueConfigs(Resource clusterResource)
throws IOException {
super.setupQueueConfigs(clusterResource);
this.lastClusterResource = clusterResource;
updateAbsoluteCapacityResource(clusterResource);
this.cachedResourceLimitsForHeadroom = new ResourceLimits(clusterResource);
// Initialize headroom info, also used for calculating application
// master resource limits. Since this happens during queue initialization
// and all queues may not be realized yet, we'll use (optimistic)
// absoluteMaxCapacity (it will be replaced with the more accurate
// absoluteMaxAvailCapacity during headroom/userlimit/allocation events)
setQueueResourceLimitsInfo(clusterResource);
CapacitySchedulerConfiguration conf = csContext.getConfiguration();
userLimit = conf.getUserLimit(getQueuePath());
userLimitFactor = conf.getUserLimitFactor(getQueuePath());
maxApplications = conf.getMaximumApplicationsPerQueue(getQueuePath());
if (maxApplications < 0) {
int maxSystemApps = conf.getMaximumSystemApplications();
maxApplications =
(int) (maxSystemApps * queueCapacities.getAbsoluteCapacity());
}
maxApplicationsPerUser = Math.min(maxApplications,
(int)(maxApplications * (userLimit / 100.0f) * userLimitFactor));
maxAMResourcePerQueuePercent =
conf.getMaximumApplicationMasterResourcePerQueuePercent(getQueuePath());
if (!SchedulerUtils.checkQueueLabelExpression(
this.accessibleLabels, this.defaultLabelExpression, null)) {
throw new IOException("Invalid default label expression of "
+ " queue="
+ getQueueName()
+ " doesn't have permission to access all labels "
+ "in default label expression. labelExpression of resource request="
+ (this.defaultLabelExpression == null ? ""
: this.defaultLabelExpression)
+ ". Queue labels="
+ (getAccessibleNodeLabels() == null ? "" : StringUtils.join(
getAccessibleNodeLabels().iterator(), ',')));
}
nodeLocalityDelay = conf.getNodeLocalityDelay();
// re-init this since max allocation could have changed
this.minimumAllocationFactor =
Resources.ratio(resourceCalculator,
Resources.subtract(maximumAllocation, minimumAllocation),
maximumAllocation);
StringBuilder aclsString = new StringBuilder();
for (Map.Entry<AccessType, AccessControlList> e : acls.entrySet()) {
aclsString.append(e.getKey() + ":" + e.getValue().getAclString());
}
StringBuilder labelStrBuilder = new StringBuilder();
if (accessibleLabels != null) {
for (String s : accessibleLabels) {
labelStrBuilder.append(s);
labelStrBuilder.append(",");
}
}
LOG.info("Initializing " + queueName + "\n" +
"capacity = " + queueCapacities.getCapacity() +
" [= (float) configuredCapacity / 100 ]" + "\n" +
"asboluteCapacity = " + queueCapacities.getAbsoluteCapacity() +
" [= parentAbsoluteCapacity * capacity ]" + "\n" +
"maxCapacity = " + queueCapacities.getMaximumCapacity() +
" [= configuredMaxCapacity ]" + "\n" +
"absoluteMaxCapacity = " + queueCapacities.getAbsoluteMaximumCapacity() +
" [= 1.0 maximumCapacity undefined, " +
"(parentAbsoluteMaxCapacity * maximumCapacity) / 100 otherwise ]" +
"\n" +
"userLimit = " + userLimit +
" [= configuredUserLimit ]" + "\n" +
"userLimitFactor = " + userLimitFactor +
" [= configuredUserLimitFactor ]" + "\n" +
"maxApplications = " + maxApplications +
" [= configuredMaximumSystemApplicationsPerQueue or" +
" (int)(configuredMaximumSystemApplications * absoluteCapacity)]" +
"\n" +
"maxApplicationsPerUser = " + maxApplicationsPerUser +
" [= (int)(maxApplications * (userLimit / 100.0f) * " +
"userLimitFactor) ]" + "\n" +
"usedCapacity = " + queueCapacities.getUsedCapacity() +
" [= usedResourcesMemory / " +
"(clusterResourceMemory * absoluteCapacity)]" + "\n" +
"absoluteUsedCapacity = " + absoluteUsedCapacity +
" [= usedResourcesMemory / clusterResourceMemory]" + "\n" +
"maxAMResourcePerQueuePercent = " + maxAMResourcePerQueuePercent +
" [= configuredMaximumAMResourcePercent ]" + "\n" +
"minimumAllocationFactor = " + minimumAllocationFactor +
" [= (float)(maximumAllocationMemory - minimumAllocationMemory) / " +
"maximumAllocationMemory ]" + "\n" +
"maximumAllocation = " + maximumAllocation +
" [= configuredMaxAllocation ]" + "\n" +
"numContainers = " + numContainers +
" [= currentNumContainers ]" + "\n" +
"state = " + state +
" [= configuredState ]" + "\n" +
"acls = " + aclsString +
" [= configuredAcls ]" + "\n" +
"nodeLocalityDelay = " + nodeLocalityDelay + "\n" +
"labels=" + labelStrBuilder.toString() + "\n" +
"nodeLocalityDelay = " + nodeLocalityDelay + "\n" +
"reservationsContinueLooking = " +
reservationsContinueLooking + "\n" +
"preemptionDisabled = " + getPreemptionDisabled() + "\n");
}
@Override
public String getQueuePath() {
return getParent().getQueuePath() + "." + getQueueName();
}
/**
* Used only by tests.
*/
@Private
public float getMinimumAllocationFactor() {
return minimumAllocationFactor;
}
/**
* Used only by tests.
*/
@Private
public float getMaxAMResourcePerQueuePercent() {
return maxAMResourcePerQueuePercent;
}
public int getMaxApplications() {
return maxApplications;
}
public synchronized int getMaxApplicationsPerUser() {
return maxApplicationsPerUser;
}
@Override
public ActiveUsersManager getActiveUsersManager() {
return activeUsersManager;
}
@Override
public List<CSQueue> getChildQueues() {
return null;
}
/**
* Set user limit - used only for testing.
* @param userLimit new user limit
*/
synchronized void setUserLimit(int userLimit) {
this.userLimit = userLimit;
}
/**
* Set user limit factor - used only for testing.
* @param userLimitFactor new user limit factor
*/
synchronized void setUserLimitFactor(float userLimitFactor) {
this.userLimitFactor = userLimitFactor;
}
@Override
public synchronized int getNumApplications() {
return getNumPendingApplications() + getNumActiveApplications();
}
public synchronized int getNumPendingApplications() {
return pendingApplications.size();
}
public synchronized int getNumActiveApplications() {
return activeApplications.size();
}
@Private
public synchronized int getNumApplications(String user) {
return getUser(user).getTotalApplications();
}
@Private
public synchronized int getNumPendingApplications(String user) {
return getUser(user).getPendingApplications();
}
@Private
public synchronized int getNumActiveApplications(String user) {
return getUser(user).getActiveApplications();
}
public synchronized int getNumContainers() {
return numContainers;
}
@Override
public synchronized QueueState getState() {
return state;
}
@Private
public synchronized int getUserLimit() {
return userLimit;
}
@Private
public synchronized float getUserLimitFactor() {
return userLimitFactor;
}
@Override
public synchronized QueueInfo getQueueInfo(
boolean includeChildQueues, boolean recursive) {
QueueInfo queueInfo = getQueueInfo();
return queueInfo;
}
@Override
public synchronized List<QueueUserACLInfo>
getQueueUserAclInfo(UserGroupInformation user) {
QueueUserACLInfo userAclInfo =
recordFactory.newRecordInstance(QueueUserACLInfo.class);
List<QueueACL> operations = new ArrayList<QueueACL>();
for (QueueACL operation : QueueACL.values()) {
if (hasAccess(operation, user)) {
operations.add(operation);
}
}
userAclInfo.setQueueName(getQueueName());
userAclInfo.setUserAcls(operations);
return Collections.singletonList(userAclInfo);
}
@Private
public int getNodeLocalityDelay() {
return nodeLocalityDelay;
}
public String toString() {
return queueName + ": " +
"capacity=" + queueCapacities.getCapacity() + ", " +
"absoluteCapacity=" + queueCapacities.getAbsoluteCapacity() + ", " +
"usedResources=" + queueUsage.getUsed() + ", " +
"usedCapacity=" + getUsedCapacity() + ", " +
"absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + ", " +
"numApps=" + getNumApplications() + ", " +
"numContainers=" + getNumContainers();
}
@VisibleForTesting
public synchronized void setNodeLabelManager(RMNodeLabelsManager mgr) {
this.labelManager = mgr;
}
@VisibleForTesting
public synchronized User getUser(String userName) {
User user = users.get(userName);
if (user == null) {
user = new User();
users.put(userName, user);
}
return user;
}
/**
* @return an ArrayList of UserInfo objects who are active in this queue
*/
public synchronized ArrayList<UserInfo> getUsers() {
ArrayList<UserInfo> usersToReturn = new ArrayList<UserInfo>();
for (Map.Entry<String, User> entry : users.entrySet()) {
User user = entry.getValue();
Resource usedRes = Resource.newInstance(0, 0);
for (String nl : getAccessibleLabelSet()) {
Resources.addTo(usedRes, user.getUsed(nl));
}
usersToReturn.add(new UserInfo(entry.getKey(), usedRes,
user.getActiveApplications(), user
.getPendingApplications(), Resources.clone(user
.getConsumedAMResources()), Resources.clone(user
.getUserResourceLimit())));
}
return usersToReturn;
}
/**
* Gets the labels which are accessible by this queue. If ANY label can be
* accessed, put all labels in the set.
* @return accessiglbe node labels
*/
protected final Set<String> getAccessibleLabelSet() {
Set<String> nodeLabels = new HashSet<String>();
if (this.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) {
nodeLabels.addAll(labelManager.getClusterNodeLabels());
} else {
nodeLabels.addAll(this.getAccessibleNodeLabels());
}
nodeLabels.add(RMNodeLabelsManager.NO_LABEL);
return nodeLabels;
}
@Override
public synchronized void reinitialize(
CSQueue newlyParsedQueue, Resource clusterResource)
throws IOException {
// Sanity check
if (!(newlyParsedQueue instanceof LeafQueue) ||
!newlyParsedQueue.getQueuePath().equals(getQueuePath())) {
throw new IOException("Trying to reinitialize " + getQueuePath() +
" from " + newlyParsedQueue.getQueuePath());
}
LeafQueue newlyParsedLeafQueue = (LeafQueue)newlyParsedQueue;
// don't allow the maximum allocation to be decreased in size
// since we have already told running AM's the size
Resource oldMax = getMaximumAllocation();
Resource newMax = newlyParsedLeafQueue.getMaximumAllocation();
if (newMax.getMemory() < oldMax.getMemory()
|| newMax.getVirtualCores() < oldMax.getVirtualCores()) {
throw new IOException(
"Trying to reinitialize "
+ getQueuePath()
+ " the maximum allocation size can not be decreased!"
+ " Current setting: " + oldMax
+ ", trying to set it to: " + newMax);
}
setupQueueConfigs(clusterResource);
// queue metrics are updated, more resource may be available
// activate the pending applications if possible
activateApplications();
}
@Override
public void submitApplicationAttempt(FiCaSchedulerApp application,
String userName) {
// Careful! Locking order is important!
synchronized (this) {
User user = getUser(userName);
// Add the attempt to our data-structures
addApplicationAttempt(application, user);
}
// We don't want to update metrics for move app
if (application.isPending()) {
metrics.submitAppAttempt(userName);
}
getParent().submitApplicationAttempt(application, userName);
}
@Override
public void submitApplication(ApplicationId applicationId, String userName,
String queue) throws AccessControlException {
// Careful! Locking order is important!
// Check queue ACLs
UserGroupInformation userUgi = UserGroupInformation.createRemoteUser(userName);
if (!hasAccess(QueueACL.SUBMIT_APPLICATIONS, userUgi)
&& !hasAccess(QueueACL.ADMINISTER_QUEUE, userUgi)) {
throw new AccessControlException("User " + userName + " cannot submit" +
" applications to queue " + getQueuePath());
}
User user = null;
synchronized (this) {
// Check if the queue is accepting jobs
if (getState() != QueueState.RUNNING) {
String msg = "Queue " + getQueuePath() +
" is STOPPED. Cannot accept submission of application: " + applicationId;
LOG.info(msg);
throw new AccessControlException(msg);
}
// Check submission limits for queues
if (getNumApplications() >= getMaxApplications()) {
String msg = "Queue " + getQueuePath() +
" already has " + getNumApplications() + " applications," +
" cannot accept submission of application: " + applicationId;
LOG.info(msg);
throw new AccessControlException(msg);
}
// Check submission limits for the user on this queue
user = getUser(userName);
if (user.getTotalApplications() >= getMaxApplicationsPerUser()) {
String msg = "Queue " + getQueuePath() +
" already has " + user.getTotalApplications() +
" applications from user " + userName +
" cannot accept submission of application: " + applicationId;
LOG.info(msg);
throw new AccessControlException(msg);
}
}
// Inform the parent queue
try {
getParent().submitApplication(applicationId, userName, queue);
} catch (AccessControlException ace) {
LOG.info("Failed to submit application to parent-queue: " +
getParent().getQueuePath(), ace);
throw ace;
}
}
public synchronized Resource getAMResourceLimit() {
/*
* The limit to the amount of resources which can be consumed by
* application masters for applications running in the queue
* is calculated by taking the greater of the max resources currently
* available to the queue (see absoluteMaxAvailCapacity) and the absolute
* resources guaranteed for the queue and multiplying it by the am
* resource percent.
*
* This is to allow a queue to grow its (proportional) application
* master resource use up to its max capacity when other queues are
* idle but to scale back down to it's guaranteed capacity as they
* become busy.
*
*/
Resource queueCurrentLimit;
synchronized (queueResourceLimitsInfo) {
queueCurrentLimit = queueResourceLimitsInfo.getQueueCurrentLimit();
}
Resource queueCap = Resources.max(resourceCalculator, lastClusterResource,
absoluteCapacityResource, queueCurrentLimit);
return Resources.multiplyAndNormalizeUp(
resourceCalculator,
queueCap,
maxAMResourcePerQueuePercent, minimumAllocation);
}
public synchronized Resource getUserAMResourceLimit() {
/*
* The user amresource limit is based on the same approach as the
* user limit (as it should represent a subset of that). This means that
* it uses the absolute queue capacity instead of the max and is modified
* by the userlimit and the userlimit factor as is the userlimit
*
*/
float effectiveUserLimit = Math.max(userLimit / 100.0f, 1.0f /
Math.max(getActiveUsersManager().getNumActiveUsers(), 1));
return Resources.multiplyAndNormalizeUp(
resourceCalculator,
absoluteCapacityResource,
maxAMResourcePerQueuePercent * effectiveUserLimit *
userLimitFactor, minimumAllocation);
}
private synchronized void activateApplications() {
//limit of allowed resource usage for application masters
Resource amLimit = getAMResourceLimit();
Resource userAMLimit = getUserAMResourceLimit();
for (Iterator<FiCaSchedulerApp> i=pendingApplications.iterator();
i.hasNext(); ) {
FiCaSchedulerApp application = i.next();
// Check am resource limit
Resource amIfStarted =
Resources.add(application.getAMResource(), queueUsage.getAMUsed());
if (LOG.isDebugEnabled()) {
LOG.debug("application AMResource " + application.getAMResource() +
" maxAMResourcePerQueuePercent " + maxAMResourcePerQueuePercent +
" amLimit " + amLimit +
" lastClusterResource " + lastClusterResource +
" amIfStarted " + amIfStarted);
}
if (!Resources.lessThanOrEqual(
resourceCalculator, lastClusterResource, amIfStarted, amLimit)) {
if (getNumActiveApplications() < 1) {
LOG.warn("maximum-am-resource-percent is insufficient to start a" +
" single application in queue, it is likely set too low." +
" skipping enforcement to allow at least one application to start");
} else {
LOG.info("not starting application as amIfStarted exceeds amLimit");
continue;
}
}
// Check user am resource limit
User user = getUser(application.getUser());
Resource userAmIfStarted =
Resources.add(application.getAMResource(),
user.getConsumedAMResources());
if (!Resources.lessThanOrEqual(
resourceCalculator, lastClusterResource, userAmIfStarted,
userAMLimit)) {
if (getNumActiveApplications() < 1) {
LOG.warn("maximum-am-resource-percent is insufficient to start a" +
" single application in queue for user, it is likely set too low." +
" skipping enforcement to allow at least one application to start");
} else {
LOG.info("not starting application as amIfStarted exceeds " +
"userAmLimit");
continue;
}
}
user.activateApplication();
activeApplications.put(application.getApplicationId(),application);
if(pendingApplicationsWithRequests.remove(application.getApplicationId())){
activeApplicationsWithRequests.add(application);
}
queueUsage.incAMUsed(application.getAMResource());
user.getResourceUsage().incAMUsed(application.getAMResource());
i.remove();
LOG.info("Application " + application.getApplicationId() +
" from user: " + application.getUser() +
" activated in queue: " + getQueueName());
}
}
private synchronized void addApplicationAttempt(FiCaSchedulerApp application,
User user) {
// Accept
user.submitApplication();
pendingApplications.add(application);
applicationAttemptMap.put(application.getApplicationAttemptId(), application);
// Activate applications
activateApplications();
LOG.info("Application added -" +
" appId: " + application.getApplicationId() +
" user: " + user + "," + " leaf-queue: " + getQueueName() +
" #user-pending-applications: " + user.getPendingApplications() +
" #user-active-applications: " + user.getActiveApplications() +
" #queue-pending-applications: " + getNumPendingApplications() +
" #queue-active-applications: " + getNumActiveApplications()
);
}
@Override
public void finishApplication(ApplicationId application, String user) {
// Inform the activeUsersManager
activeUsersManager.deactivateApplication(user, application);
// Inform the parent queue
getParent().finishApplication(application, user);
}
@Override
public void finishApplicationAttempt(FiCaSchedulerApp application, String queue) {
// Careful! Locking order is important!
synchronized (this) {
removeApplicationAttempt(application, getUser(application.getUser()));
}
getParent().finishApplicationAttempt(application, queue);
}
public synchronized void removeApplicationAttempt(
FiCaSchedulerApp application, User user) {
deactivateApplication(application.getApplicationId());
boolean wasActive = (activeApplications.remove(application.
getApplicationId()) != null);
if (!wasActive) {
pendingApplications.remove(application);
} else {
queueUsage.decAMUsed(application.getAMResource());
user.getResourceUsage().decAMUsed(application.getAMResource());
}
applicationAttemptMap.remove(application.getApplicationAttemptId());
user.finishApplication(wasActive);
if (user.getTotalApplications() == 0) {
users.remove(application.getUser());
}
// Check if we can activate more applications
activateApplications();
LOG.info("Application removed -" +
" appId: " + application.getApplicationId() +
" user: " + application.getUser() +
" queue: " + getQueueName() +
" #user-pending-applications: " + user.getPendingApplications() +
" #user-active-applications: " + user.getActiveApplications() +
" #queue-pending-applications: " + getNumPendingApplications() +
" #queue-active-applications: " + getNumActiveApplications()
);
}
private synchronized FiCaSchedulerApp getApplication(
ApplicationAttemptId applicationAttemptId) {
return applicationAttemptMap.get(applicationAttemptId);
}
private static final CSAssignment NULL_ASSIGNMENT =
new CSAssignment(Resources.createResource(0, 0), NodeType.NODE_LOCAL);
private static final CSAssignment SKIP_ASSIGNMENT = new CSAssignment(true);
private static Set<String> getRequestLabelSetByExpression(
String labelExpression) {
Set<String> labels = new HashSet<String>();
if (null == labelExpression) {
return labels;
}
for (String l : labelExpression.split("&&")) {
if (l.trim().isEmpty()) {
continue;
}
labels.add(l.trim());
}
return labels;
}
@Override
public void activateApplication(ApplicationId appId){
applicationsToActivate.add(appId);
}
@Override
public void deactivateApplication(ApplicationId appId){
applicationsToDeactivate.add(appId);
applicationsToActivate.remove(appId);
}
@Override
public synchronized CSAssignment assignContainers(Resource clusterResource,
FiCaSchedulerNode node, ResourceLimits currentResourceLimits) {
updateCurrentResourceLimits(currentResourceLimits, clusterResource);
if(LOG.isDebugEnabled()) {
LOG.debug("assignContainers: node=" + node.getNodeName()
+ " #applications=" + activeApplications.size());
}
// if our queue cannot access this node, just return
if (!SchedulerUtils.checkQueueAccessToNode(accessibleLabels,
node.getLabels())) {
return NULL_ASSIGNMENT;
}
// Check for reserved resources
RMContainer reservedContainer = node.getReservedContainer();
if (reservedContainer != null) {
FiCaSchedulerApp application =
getApplication(reservedContainer.getApplicationAttemptId());
synchronized (application) {
return assignReservedContainer(application, node, reservedContainer,
clusterResource);
}
}
Resource initAmountNeededUnreserve =
currentResourceLimits.getAmountNeededUnreserve();
List<ApplicationId> toRemove = new ArrayList<>();
for (ApplicationId appId : applicationsToDeactivate) {
toRemove.add(appId);
FiCaSchedulerApp app = activeApplications.get(appId);
if (app != null) {
activeApplicationsWithRequests.remove(app);
} else {
pendingApplicationsWithRequests.remove(appId);
}
}
applicationsToDeactivate.removeAll(toRemove);
toRemove.clear();
for (ApplicationId appId : applicationsToActivate) {
toRemove.add(appId);
FiCaSchedulerApp application = activeApplications.get(appId);
if (application != null) {
activeApplicationsWithRequests.add(application);
} else {
pendingApplicationsWithRequests.add(appId);
}
}
applicationsToActivate.removeAll(toRemove);
// Try to assign containers to applications in order
for (FiCaSchedulerApp application : activeApplicationsWithRequests) {
if(LOG.isDebugEnabled()) {
LOG.debug("pre-assignContainers for application "
+ application.getApplicationId());
application.showRequests();
}
synchronized (application) {
// Check if this resource is on the blacklist
if (SchedulerAppUtils.isBlacklisted(application, node, LOG)) {
continue;
}
// Schedule in priority order
for (Priority priority : application.getPriorities()) {
ResourceRequest anyRequest =
application.getResourceRequest(priority, ResourceRequest.ANY);
if (null == anyRequest) {
continue;
}
// Required resource
Resource required = anyRequest.getCapability();
// Do we need containers at this 'priority'?
if (application.getTotalRequiredResources(priority) <= 0) {
continue;
}
if (!this.reservationsContinueLooking) {
if (!shouldAllocOrReserveNewContainer(application, priority, required)) {
if (LOG.isDebugEnabled()) {
LOG.debug("doesn't need containers based on reservation algo!");
}
continue;
}
}
Set<String> requestedNodeLabels =
getRequestLabelSetByExpression(anyRequest
.getNodeLabelExpression());
// Compute user-limit & set headroom
// Note: We compute both user-limit & headroom with the highest
// priority request as the target.
// This works since we never assign lower priority requests
// before all higher priority ones are serviced.
Resource userLimit =
computeUserLimitAndSetHeadroom(application, clusterResource,
required, requestedNodeLabels);
currentResourceLimits.setAmountNeededUnreserve(
initAmountNeededUnreserve);
// Check queue max-capacity limit
if (!super.canAssignToThisQueue(clusterResource, node.getLabels(),
currentResourceLimits, required, application.getCurrentReservation())) {
return NULL_ASSIGNMENT;
}
// Check user limit
if (!assignToUser(clusterResource, application.getUser(), userLimit,
application, requestedNodeLabels, currentResourceLimits)) {
break;
}
// Inform the application it is about to get a scheduling opportunity
application.addSchedulingOpportunity(priority);
// Try to schedule
CSAssignment assignment =
assignContainersOnNode(clusterResource, node, application, priority,
null, currentResourceLimits);
// Did the application skip this node?
if (assignment.getSkipped()) {
// Don't count 'skipped nodes' as a scheduling opportunity!
application.subtractSchedulingOpportunity(priority);
continue;
}
// Did we schedule or reserve a container?
Resource assigned = assignment.getResource();
if (Resources.greaterThan(
resourceCalculator, clusterResource, assigned, Resources.none())) {
// Book-keeping
// Note: Update headroom to account for current allocation too...
allocateResource(clusterResource, application, assigned,
node.getLabels());
// Don't reset scheduling opportunities for non-local assignments
// otherwise the app will be delayed for each non-local assignment.
// This helps apps with many off-cluster requests schedule faster.
if (assignment.getType() != NodeType.OFF_SWITCH) {
if (LOG.isDebugEnabled()) {
LOG.debug("Resetting scheduling opportunities");
}
application.resetSchedulingOpportunities(priority);
}
// Done
return assignment;
} else {
// Do not assign out of order w.r.t priorities
break;
}
}
}
if(LOG.isDebugEnabled()) {
LOG.debug("post-assignContainers for application "
+ application.getApplicationId());
}
application.showRequests();
}
return NULL_ASSIGNMENT;
}
private synchronized CSAssignment assignReservedContainer(
FiCaSchedulerApp application, FiCaSchedulerNode node,
RMContainer rmContainer, Resource clusterResource) {
// Do we still need this reservation?
Priority priority = rmContainer.getReservedPriority();
if (application.getTotalRequiredResources(priority) == 0) {
// Release
return new CSAssignment(application, rmContainer);
}
// Try to assign if we have sufficient resources
assignContainersOnNode(clusterResource, node, application, priority,
rmContainer, new ResourceLimits(Resources.none()));
// Doesn't matter... since it's already charged for at time of reservation
// "re-reservation" is *free*
return new CSAssignment(Resources.none(), NodeType.NODE_LOCAL);
}
protected Resource getHeadroom(User user, Resource queueCurrentLimit,
Resource clusterResource, FiCaSchedulerApp application, Resource required) {
return getHeadroom(user, queueCurrentLimit, clusterResource,
computeUserLimit(application, clusterResource, required, user, null));
}
private Resource getHeadroom(User user, Resource currentResourceLimit,
Resource clusterResource, Resource userLimit) {
/**
* Headroom is:
* min(
* min(userLimit, queueMaxCap) - userConsumed,
* queueMaxCap - queueUsedResources
* )
*
* ( which can be expressed as,
* min (userLimit - userConsumed, queuMaxCap - userConsumed,
* queueMaxCap - queueUsedResources)
* )
*
* given that queueUsedResources >= userConsumed, this simplifies to
*
* >> min (userlimit - userConsumed, queueMaxCap - queueUsedResources) <<
*
*/
Resource headroom =
Resources.componentwiseMin(
Resources.subtract(userLimit, user.getUsed()),
Resources.subtract(currentResourceLimit, queueUsage.getUsed())
);
// Normalize it before return
headroom =
Resources.roundDown(resourceCalculator, headroom, minimumAllocation);
return headroom;
}
private void setQueueResourceLimitsInfo(
Resource clusterResource) {
synchronized (queueResourceLimitsInfo) {
queueResourceLimitsInfo.setQueueCurrentLimit(cachedResourceLimitsForHeadroom
.getLimit());
queueResourceLimitsInfo.setClusterResource(clusterResource);
}
}
@Lock({LeafQueue.class, FiCaSchedulerApp.class})
Resource computeUserLimitAndSetHeadroom(FiCaSchedulerApp application,
Resource clusterResource, Resource required, Set<String> requestedLabels) {
String user = application.getUser();
User queueUser = getUser(user);
// Compute user limit respect requested labels,
// TODO, need consider headroom respect labels also
Resource userLimit =
computeUserLimit(application, clusterResource, required,
queueUser, requestedLabels);
setQueueResourceLimitsInfo(clusterResource);
Resource headroom =
getHeadroom(queueUser, cachedResourceLimitsForHeadroom.getLimit(),
clusterResource, userLimit);
if (LOG.isDebugEnabled()) {
LOG.debug("Headroom calculation for user " + user + ": " +
" userLimit=" + userLimit +
" queueMaxAvailRes=" + cachedResourceLimitsForHeadroom.getLimit() +
" consumed=" + queueUser.getUsed() +
" headroom=" + headroom);
}
CapacityHeadroomProvider headroomProvider = new CapacityHeadroomProvider(
queueUser, this, application, required, queueResourceLimitsInfo);
application.setHeadroomProvider(headroomProvider);
metrics.setAvailableResourcesToUser(user, headroom);
return userLimit;
}
@Lock(NoLock.class)
private Resource computeUserLimit(FiCaSchedulerApp application,
Resource clusterResource, Resource required, User user,
Set<String> requestedLabels) {
// What is our current capacity?
// * It is equal to the max(required, queue-capacity) if
// we're running below capacity. The 'max' ensures that jobs in queues
// with miniscule capacity (< 1 slot) make progress
// * If we're running over capacity, then its
// (usedResources + required) (which extra resources we are allocating)
Resource queueCapacity = Resource.newInstance(0, 0);
if (requestedLabels != null && !requestedLabels.isEmpty()) {
// if we have multiple labels to request, we will choose to use the first
// label
String firstLabel = requestedLabels.iterator().next();
queueCapacity =
Resources
.max(resourceCalculator, clusterResource, queueCapacity,
Resources.multiplyAndNormalizeUp(resourceCalculator,
labelManager.getResourceByLabel(firstLabel,
clusterResource),
queueCapacities.getAbsoluteCapacity(firstLabel),
minimumAllocation));
} else {
// else there's no label on request, just to use absolute capacity as
// capacity for nodes without label
queueCapacity =
Resources.multiplyAndNormalizeUp(resourceCalculator, labelManager
.getResourceByLabel(CommonNodeLabelsManager.NO_LABEL, clusterResource),
queueCapacities.getAbsoluteCapacity(), minimumAllocation);
}
// Allow progress for queues with miniscule capacity
queueCapacity =
Resources.max(
resourceCalculator, clusterResource,
queueCapacity,
required);
Resource currentCapacity =
Resources.lessThan(resourceCalculator, clusterResource,
queueUsage.getUsed(), queueCapacity) ?
queueCapacity : Resources.add(queueUsage.getUsed(), required);
// Never allow a single user to take more than the
// queue's configured capacity * user-limit-factor.
// Also, the queue's configured capacity should be higher than
// queue-hard-limit * ulMin
final int activeUsers = activeUsersManager.getNumActiveUsers();
Resource limit =
Resources.roundUp(
resourceCalculator,
Resources.min(
resourceCalculator, clusterResource,
Resources.max(
resourceCalculator, clusterResource,
Resources.divideAndCeil(
resourceCalculator, currentCapacity, activeUsers),
Resources.divideAndCeil(
resourceCalculator,
Resources.multiplyAndRoundDown(
currentCapacity, userLimit),
100)
),
Resources.multiplyAndRoundDown(queueCapacity, userLimitFactor)
),
minimumAllocation);
if (LOG.isDebugEnabled()) {
String userName = application.getUser();
LOG.debug("User limit computation for " + userName +
" in queue " + getQueueName() +
" userLimit=" + userLimit +
" userLimitFactor=" + userLimitFactor +
" required: " + required +
" consumed: " + user.getUsed() +
" limit: " + limit +
" queueCapacity: " + queueCapacity +
" qconsumed: " + queueUsage.getUsed() +
" currentCapacity: " + currentCapacity +
" activeUsers: " + activeUsers +
" clusterCapacity: " + clusterResource
);
}
user.setUserResourceLimit(limit);
return limit;
}
@Private
protected synchronized boolean assignToUser(Resource clusterResource,
String userName, Resource limit, FiCaSchedulerApp application,
Set<String> requestLabels, ResourceLimits currentResoureLimits) {
User user = getUser(userName);
String label = CommonNodeLabelsManager.NO_LABEL;
if (requestLabels != null && !requestLabels.isEmpty()) {
label = requestLabels.iterator().next();
}
// Note: We aren't considering the current request since there is a fixed
// overhead of the AM, but it's a > check, not a >= check, so...
if (Resources
.greaterThan(resourceCalculator, clusterResource,
user.getUsed(label),
limit)) {
// if enabled, check to see if could we potentially use this node instead
// of a reserved node if the application has reserved containers
if (this.reservationsContinueLooking) {
if (Resources.lessThanOrEqual(
resourceCalculator,
clusterResource,
Resources.subtract(user.getUsed(), application.getCurrentReservation()),
limit)) {
if (LOG.isDebugEnabled()) {
LOG.debug("User " + userName + " in queue " + getQueueName()
+ " will exceed limit based on reservations - " + " consumed: "
+ user.getUsed() + " reserved: "
+ application.getCurrentReservation() + " limit: " + limit);
}
Resource amountNeededToUnreserve = Resources.subtract(user.getUsed(label), limit);
// we can only acquire a new container if we unreserve first since we ignored the
// user limit. Choose the max of user limit or what was previously set by max
// capacity.
currentResoureLimits.setAmountNeededUnreserve(Resources.max(resourceCalculator,
clusterResource, currentResoureLimits.getAmountNeededUnreserve(),
amountNeededToUnreserve));
return true;
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("User " + userName + " in queue " + getQueueName()
+ " will exceed limit - " + " consumed: "
+ user.getUsed() + " limit: " + limit);
}
return false;
}
return true;
}
boolean shouldAllocOrReserveNewContainer(FiCaSchedulerApp application,
Priority priority, Resource required) {
int requiredContainers = application.getTotalRequiredResources(priority);
int reservedContainers = application.getNumReservedContainers(priority);
int starvation = 0;
if (reservedContainers > 0) {
float nodeFactor =
Resources.ratio(
resourceCalculator, required, getMaximumAllocation()
);
// Use percentage of node required to bias against large containers...
// Protect against corner case where you need the whole node with
// Math.min(nodeFactor, minimumAllocationFactor)
starvation =
(int)((application.getReReservations(priority) / (float)reservedContainers) *
(1.0f - (Math.min(nodeFactor, getMinimumAllocationFactor())))
);
if (LOG.isDebugEnabled()) {
LOG.debug("needsContainers:" +
" app.#re-reserve=" + application.getReReservations(priority) +
" reserved=" + reservedContainers +
" nodeFactor=" + nodeFactor +
" minAllocFactor=" + getMinimumAllocationFactor() +
" starvation=" + starvation);
}
}
return (((starvation + requiredContainers) - reservedContainers) > 0);
}
private CSAssignment assignContainersOnNode(Resource clusterResource,
FiCaSchedulerNode node, FiCaSchedulerApp application, Priority priority,
RMContainer reservedContainer, ResourceLimits currentResoureLimits) {
Resource assigned = Resources.none();
NodeType requestType = null;
MutableObject allocatedContainer = new MutableObject();
// Data-local
ResourceRequest nodeLocalResourceRequest =
application.getResourceRequest(priority, node.getNodeName());
if (nodeLocalResourceRequest != null) {
requestType = NodeType.NODE_LOCAL;
assigned =
assignNodeLocalContainers(clusterResource, nodeLocalResourceRequest,
node, application, priority, reservedContainer,
allocatedContainer, currentResoureLimits);
if (Resources.greaterThan(resourceCalculator, clusterResource,
assigned, Resources.none())) {
//update locality statistics
if (allocatedContainer.getValue() != null) {
application.incNumAllocatedContainers(NodeType.NODE_LOCAL,
requestType);
}
return new CSAssignment(assigned, NodeType.NODE_LOCAL);
}
}
// Rack-local
ResourceRequest rackLocalResourceRequest =
application.getResourceRequest(priority, node.getRackName());
if (rackLocalResourceRequest != null) {
if (!rackLocalResourceRequest.getRelaxLocality()) {
return SKIP_ASSIGNMENT;
}
if (requestType != NodeType.NODE_LOCAL) {
requestType = NodeType.RACK_LOCAL;
}
assigned =
assignRackLocalContainers(clusterResource, rackLocalResourceRequest,
node, application, priority, reservedContainer,
allocatedContainer, currentResoureLimits);
if (Resources.greaterThan(resourceCalculator, clusterResource,
assigned, Resources.none())) {
//update locality statistics
if (allocatedContainer.getValue() != null) {
application.incNumAllocatedContainers(NodeType.RACK_LOCAL,
requestType);
}
return new CSAssignment(assigned, NodeType.RACK_LOCAL);
}
}
// Off-switch
ResourceRequest offSwitchResourceRequest =
application.getResourceRequest(priority, ResourceRequest.ANY);
if (offSwitchResourceRequest != null) {
if (!offSwitchResourceRequest.getRelaxLocality()) {
return SKIP_ASSIGNMENT;
}
if (requestType != NodeType.NODE_LOCAL
&& requestType != NodeType.RACK_LOCAL) {
requestType = NodeType.OFF_SWITCH;
}
assigned =
assignOffSwitchContainers(clusterResource, offSwitchResourceRequest,
node, application, priority, reservedContainer,
allocatedContainer, currentResoureLimits);
// update locality statistics
if (allocatedContainer.getValue() != null) {
application.incNumAllocatedContainers(NodeType.OFF_SWITCH, requestType);
}
return new CSAssignment(assigned, NodeType.OFF_SWITCH);
}
return SKIP_ASSIGNMENT;
}
@Private
protected boolean findNodeToUnreserve(Resource clusterResource,
FiCaSchedulerNode node, FiCaSchedulerApp application, Priority priority,
Resource minimumUnreservedResource) {
// need to unreserve some other container first
NodeId idToUnreserve =
application.getNodeIdToUnreserve(priority, minimumUnreservedResource,
resourceCalculator, clusterResource);
if (idToUnreserve == null) {
if (LOG.isDebugEnabled()) {
LOG.debug("checked to see if could unreserve for app but nothing "
+ "reserved that matches for this app");
}
return false;
}
FiCaSchedulerNode nodeToUnreserve = scheduler.getNode(idToUnreserve);
if (nodeToUnreserve == null) {
LOG.error("node to unreserve doesn't exist, nodeid: " + idToUnreserve);
return false;
}
if (LOG.isDebugEnabled()) {
LOG.debug("unreserving for app: " + application.getApplicationId()
+ " on nodeId: " + idToUnreserve
+ " in order to replace reserved application and place it on node: "
+ node.getNodeID() + " needing: " + minimumUnreservedResource);
}
// headroom
Resources.addTo(application.getHeadroom(), nodeToUnreserve
.getReservedContainer().getReservedResource());
// Make sure to not have completedContainers sort the queues here since
// we are already inside an iterator loop for the queues and this would
// cause an concurrent modification exception.
completedContainer(clusterResource, application, nodeToUnreserve,
nodeToUnreserve.getReservedContainer(),
SchedulerUtils.createAbnormalContainerStatus(nodeToUnreserve
.getReservedContainer().getContainerId(),
SchedulerUtils.UNRESERVED_CONTAINER),
RMContainerEventType.RELEASED, null, false);
return true;
}
private Resource assignNodeLocalContainers(Resource clusterResource,
ResourceRequest nodeLocalResourceRequest, FiCaSchedulerNode node,
FiCaSchedulerApp application, Priority priority,
RMContainer reservedContainer, MutableObject allocatedContainer,
ResourceLimits currentResoureLimits) {
if (canAssign(application, priority, node, NodeType.NODE_LOCAL,
reservedContainer)) {
return assignContainer(clusterResource, node, application, priority,
nodeLocalResourceRequest, NodeType.NODE_LOCAL, reservedContainer,
allocatedContainer, currentResoureLimits);
}
return Resources.none();
}
private Resource assignRackLocalContainers(Resource clusterResource,
ResourceRequest rackLocalResourceRequest, FiCaSchedulerNode node,
FiCaSchedulerApp application, Priority priority,
RMContainer reservedContainer, MutableObject allocatedContainer,
ResourceLimits currentResoureLimits) {
if (canAssign(application, priority, node, NodeType.RACK_LOCAL,
reservedContainer)) {
return assignContainer(clusterResource, node, application, priority,
rackLocalResourceRequest, NodeType.RACK_LOCAL, reservedContainer,
allocatedContainer, currentResoureLimits);
}
return Resources.none();
}
private Resource assignOffSwitchContainers(Resource clusterResource,
ResourceRequest offSwitchResourceRequest, FiCaSchedulerNode node,
FiCaSchedulerApp application, Priority priority,
RMContainer reservedContainer, MutableObject allocatedContainer,
ResourceLimits currentResoureLimits) {
if (canAssign(application, priority, node, NodeType.OFF_SWITCH,
reservedContainer)) {
return assignContainer(clusterResource, node, application, priority,
offSwitchResourceRequest, NodeType.OFF_SWITCH, reservedContainer,
allocatedContainer, currentResoureLimits);
}
return Resources.none();
}
boolean canAssign(FiCaSchedulerApp application, Priority priority,
FiCaSchedulerNode node, NodeType type, RMContainer reservedContainer) {
// Clearly we need containers for this application...
if (type == NodeType.OFF_SWITCH) {
if (reservedContainer != null) {
return true;
}
// 'Delay' off-switch
ResourceRequest offSwitchRequest =
application.getResourceRequest(priority, ResourceRequest.ANY);
long missedOpportunities = application.getSchedulingOpportunities(priority);
long requiredContainers = offSwitchRequest.getNumContainers();
float localityWaitFactor =
application.getLocalityWaitFactor(priority,
scheduler.getNumClusterNodes());
return ((requiredContainers * localityWaitFactor) < missedOpportunities);
}
// Check if we need containers on this rack
ResourceRequest rackLocalRequest =
application.getResourceRequest(priority, node.getRackName());
if (rackLocalRequest == null || rackLocalRequest.getNumContainers() <= 0) {
return false;
}
// If we are here, we do need containers on this rack for RACK_LOCAL req
if (type == NodeType.RACK_LOCAL) {
// 'Delay' rack-local just a little bit...
long missedOpportunities = application.getSchedulingOpportunities(priority);
return (
Math.min(scheduler.getNumClusterNodes(), getNodeLocalityDelay()) <
missedOpportunities
);
}
// Check if we need containers on this host
if (type == NodeType.NODE_LOCAL) {
// Now check if we need containers on this host...
ResourceRequest nodeLocalRequest =
application.getResourceRequest(priority, node.getNodeName());
if (nodeLocalRequest != null) {
return nodeLocalRequest.getNumContainers() > 0;
}
}
return false;
}
private Container getContainer(RMContainer rmContainer,
FiCaSchedulerApp application, FiCaSchedulerNode node,
Resource capability, Priority priority) {
return (rmContainer != null) ? rmContainer.getContainer() :
createContainer(application, node, capability, priority);
}
Container createContainer(FiCaSchedulerApp application, FiCaSchedulerNode node,
Resource capability, Priority priority) {
NodeId nodeId = node.getRMNode().getNodeID();
ContainerId containerId = BuilderUtils.newContainerId(application
.getApplicationAttemptId(), application.getNewContainerId());
// Create the container
Container container =
BuilderUtils.newContainer(containerId, nodeId, node.getRMNode()
.getHttpAddress(), capability, priority, null);
return container;
}
private Resource assignContainer(Resource clusterResource, FiCaSchedulerNode node,
FiCaSchedulerApp application, Priority priority,
ResourceRequest request, NodeType type, RMContainer rmContainer,
MutableObject createdContainer, ResourceLimits currentResoureLimits) {
if (LOG.isDebugEnabled()) {
LOG.debug("assignContainers: node=" + node.getNodeName()
+ " application=" + application.getApplicationId()
+ " priority=" + priority.getPriority()
+ " request=" + request + " type=" + type);
}
// check if the resource request can access the label
if (!SchedulerUtils.checkNodeLabelExpression(
node.getLabels(),
request.getNodeLabelExpression())) {
// this is a reserved container, but we cannot allocate it now according
// to label not match. This can be caused by node label changed
// We should un-reserve this container.
if (rmContainer != null) {
unreserve(application, priority, node, rmContainer);
}
return Resources.none();
}
Resource capability = request.getCapability();
Resource available = node.getAvailableResource();
Resource totalResource = node.getTotalResource();
if (!Resources.lessThanOrEqual(resourceCalculator, clusterResource,
capability, totalResource)) {
LOG.warn("Node : " + node.getNodeID()
+ " does not have sufficient resource for request : " + request
+ " node total capability : " + node.getTotalResource());
return Resources.none();
}
assert Resources.greaterThan(
resourceCalculator, clusterResource, available, Resources.none());
// Create the container if necessary
Container container =
getContainer(rmContainer, application, node, capability, priority);
// something went wrong getting/creating the container
if (container == null) {
LOG.warn("Couldn't get container for allocation!");
return Resources.none();
}
boolean shouldAllocOrReserveNewContainer = shouldAllocOrReserveNewContainer(
application, priority, capability);
// Can we allocate a container on this node?
int availableContainers =
resourceCalculator.computeAvailableContainers(available, capability);
boolean needToUnreserve = Resources.greaterThan(resourceCalculator,clusterResource,
currentResoureLimits.getAmountNeededUnreserve(), Resources.none());
if (availableContainers > 0) {
// Allocate...
// Did we previously reserve containers at this 'priority'?
if (rmContainer != null) {
unreserve(application, priority, node, rmContainer);
} else if (this.reservationsContinueLooking && node.getLabels().isEmpty()) {
// when reservationsContinueLooking is set, we may need to unreserve
// some containers to meet this queue, its parents', or the users' resource limits.
// TODO, need change here when we want to support continuous reservation
// looking for labeled partitions.
if (!shouldAllocOrReserveNewContainer || needToUnreserve) {
// If we shouldn't allocate/reserve new container then we should unreserve one the same
// size we are asking for since the currentResoureLimits.getAmountNeededUnreserve
// could be zero. If the limit was hit then use the amount we need to unreserve to be
// under the limit.
Resource amountToUnreserve = capability;
if (needToUnreserve) {
amountToUnreserve = currentResoureLimits.getAmountNeededUnreserve();
}
boolean containerUnreserved =
findNodeToUnreserve(clusterResource, node, application, priority,
amountToUnreserve);
// When (minimum-unreserved-resource > 0 OR we cannot allocate new/reserved
// container (That means we *have to* unreserve some resource to
// continue)). If we failed to unreserve some resource, we can't continue.
if (!containerUnreserved) {
return Resources.none();
}
}
}
// Inform the application
RMContainer allocatedContainer =
application.allocate(type, node, priority, request, container);
// Does the application need this resource?
if (allocatedContainer == null) {
return Resources.none();
}
// Inform the node
node.allocateContainer(allocatedContainer);
LOG.info("assignedContainer" +
" application attempt=" + application.getApplicationAttemptId() +
" container=" + container +
" queue=" + this +
" clusterResource=" + clusterResource);
createdContainer.setValue(allocatedContainer);
return container.getResource();
} else {
// if we are allowed to allocate but this node doesn't have space, reserve it or
// if this was an already a reserved container, reserve it again
if (shouldAllocOrReserveNewContainer || rmContainer != null) {
if (reservationsContinueLooking && rmContainer == null) {
// we could possibly ignoring queue capacity or user limits when
// reservationsContinueLooking is set. Make sure we didn't need to unreserve
// one.
if (needToUnreserve) {
if (LOG.isDebugEnabled()) {
LOG.debug("we needed to unreserve to be able to allocate");
}
return Resources.none();
}
}
// Reserve by 'charging' in advance...
reserve(application, priority, node, rmContainer, container);
LOG.info("Reserved container " +
" application=" + application.getApplicationId() +
" resource=" + request.getCapability() +
" queue=" + this.toString() +
" usedCapacity=" + getUsedCapacity() +
" absoluteUsedCapacity=" + getAbsoluteUsedCapacity() +
" used=" + queueUsage.getUsed() +
" cluster=" + clusterResource);
return request.getCapability();
}
return Resources.none();
}
}
private void reserve(FiCaSchedulerApp application, Priority priority,
FiCaSchedulerNode node, RMContainer rmContainer, Container container) {
// Update reserved metrics if this is the first reservation
if (rmContainer == null) {
getMetrics().reserveResource(
application.getUser(), container.getResource());
}
// Inform the application
rmContainer = application.reserve(node, priority, rmContainer, container);
// Update the node
node.reserveResource(application, priority, rmContainer);
}
private boolean unreserve(FiCaSchedulerApp application, Priority priority,
FiCaSchedulerNode node, RMContainer rmContainer) {
// Done with the reservation?
if (application.unreserve(node, priority)) {
node.unreserveResource(application);
// Update reserved metrics
getMetrics().unreserveResource(application.getUser(),
rmContainer.getContainer().getResource());
return true;
}
return false;
}
@Override
public void completedContainer(Resource clusterResource,
FiCaSchedulerApp application, FiCaSchedulerNode node, RMContainer rmContainer,
ContainerStatus containerStatus, RMContainerEventType event, CSQueue childQueue,
boolean sortQueues) {
if (application != null) {
boolean removed = false;
// Careful! Locking order is important!
synchronized (this) {
Container container = rmContainer.getContainer();
// Inform the application & the node
// Note: It's safe to assume that all state changes to RMContainer
// happen under scheduler's lock...
// So, this is, in effect, a transaction across application & node
if (rmContainer.getState() == RMContainerState.RESERVED) {
removed = unreserve(application, rmContainer.getReservedPriority(),
node, rmContainer);
} else {
removed =
application.containerCompleted(rmContainer, containerStatus, event);
node.releaseContainer(container);
}
// Book-keeping
if (removed) {
releaseResource(clusterResource, application,
container.getResource(), node.getLabels());
LOG.info("completedContainer" +
" container=" + container +
" queue=" + this +
" cluster=" + clusterResource);
}
}
if (removed) {
// Inform the parent queue _outside_ of the leaf-queue lock
getParent().completedContainer(clusterResource, application, node,
rmContainer, null, event, this, sortQueues);
}
}
}
synchronized void allocateResource(Resource clusterResource,
SchedulerApplicationAttempt application, Resource resource,
Set<String> nodeLabels) {
super.allocateResource(clusterResource, resource, nodeLabels);
// Update user metrics
String userName = application.getUser();
User user = getUser(userName);
user.assignContainer(resource, nodeLabels);
// Note this is a bit unconventional since it gets the object and modifies
// it here, rather then using set routine
Resources.subtractFrom(application.getHeadroom(), resource); // headroom
metrics.setAvailableResourcesToUser(userName, application.getHeadroom());
if (LOG.isDebugEnabled()) {
LOG.info(getQueueName() +
" user=" + userName +
" used=" + queueUsage.getUsed() + " numContainers=" + numContainers +
" headroom = " + application.getHeadroom() +
" user-resources=" + user.getUsed()
);
}
}
synchronized void releaseResource(Resource clusterResource,
FiCaSchedulerApp application, Resource resource, Set<String> nodeLabels) {
super.releaseResource(clusterResource, resource, nodeLabels);
// Update user metrics
String userName = application.getUser();
User user = getUser(userName);
user.releaseContainer(resource, nodeLabels);
metrics.setAvailableResourcesToUser(userName, application.getHeadroom());
LOG.info(getQueueName() +
" used=" + queueUsage.getUsed() + " numContainers=" + numContainers +
" user=" + userName + " user-resources=" + user.getUsed());
}
private void updateAbsoluteCapacityResource(Resource clusterResource) {
absoluteCapacityResource =
Resources.multiplyAndNormalizeUp(resourceCalculator, clusterResource,
queueCapacities.getAbsoluteCapacity(), minimumAllocation);
}
private void updateCurrentResourceLimits(
ResourceLimits currentResourceLimits, Resource clusterResource) {
// TODO: need consider non-empty node labels when resource limits supports
// node labels
// Even if ParentQueue will set limits respect child's max queue capacity,
// but when allocating reserved container, CapacityScheduler doesn't do
// this. So need cap limits by queue's max capacity here.
this.cachedResourceLimitsForHeadroom = new ResourceLimits(currentResourceLimits.getLimit());
Resource queueMaxResource =
Resources.multiplyAndNormalizeDown(resourceCalculator, labelManager
.getResourceByLabel(RMNodeLabelsManager.NO_LABEL, clusterResource),
queueCapacities
.getAbsoluteMaximumCapacity(RMNodeLabelsManager.NO_LABEL),
minimumAllocation);
this.cachedResourceLimitsForHeadroom.setLimit(Resources.min(resourceCalculator,
clusterResource, queueMaxResource, currentResourceLimits.getLimit()));
}
@Override
public synchronized void updateClusterResource(Resource clusterResource,
ResourceLimits currentResourceLimits) {
updateCurrentResourceLimits(currentResourceLimits, clusterResource);
lastClusterResource = clusterResource;
updateAbsoluteCapacityResource(clusterResource);
// Update headroom info based on new cluster resource value
// absoluteMaxCapacity now, will be replaced with absoluteMaxAvailCapacity
// during allocation
setQueueResourceLimitsInfo(clusterResource);
// Update metrics
CSQueueUtils.updateQueueStatistics(
resourceCalculator, this, getParent(), clusterResource,
minimumAllocation);
// queue metrics are updated, more resource may be available
// activate the pending applications if possible
activateApplications();
// Update application properties
for (FiCaSchedulerApp application : activeApplications.values()) {
synchronized (application) {
computeUserLimitAndSetHeadroom(application, clusterResource,
Resources.none(), null);
}
}
}
@VisibleForTesting
public static class User {
ResourceUsage userResourceUsage = new ResourceUsage();
volatile Resource userResourceLimit = Resource.newInstance(0, 0);
int pendingApplications = 0;
int activeApplications = 0;
public ResourceUsage getResourceUsage() {
return userResourceUsage;
}
public Resource getUsed() {
return userResourceUsage.getUsed();
}
public Resource getUsed(String label) {
return userResourceUsage.getUsed(label);
}
public int getPendingApplications() {
return pendingApplications;
}
public int getActiveApplications() {
return activeApplications;
}
public Resource getConsumedAMResources() {
return userResourceUsage.getAMUsed();
}
public int getTotalApplications() {
return getPendingApplications() + getActiveApplications();
}
public synchronized void submitApplication() {
++pendingApplications;
}
public synchronized void activateApplication() {
--pendingApplications;
++activeApplications;
}
public synchronized void finishApplication(boolean wasActive) {
if (wasActive) {
--activeApplications;
}
else {
--pendingApplications;
}
}
public void assignContainer(Resource resource,
Set<String> nodeLabels) {
if (nodeLabels == null || nodeLabels.isEmpty()) {
userResourceUsage.incUsed(resource);
} else {
for (String label : nodeLabels) {
userResourceUsage.incUsed(label, resource);
}
}
}
public void releaseContainer(Resource resource, Set<String> nodeLabels) {
if (nodeLabels == null || nodeLabels.isEmpty()) {
userResourceUsage.decUsed(resource);
} else {
for (String label : nodeLabels) {
userResourceUsage.decUsed(label, resource);
}
}
}
public Resource getUserResourceLimit() {
return userResourceLimit;
}
public void setUserResourceLimit(Resource userResourceLimit) {
this.userResourceLimit = userResourceLimit;
}
}
@Override
public void recoverContainer(Resource clusterResource,
SchedulerApplicationAttempt attempt, RMContainer rmContainer) {
if (rmContainer.getState().equals(RMContainerState.COMPLETED)) {
return;
}
// Careful! Locking order is important!
synchronized (this) {
FiCaSchedulerNode node =
scheduler.getNode(rmContainer.getContainer().getNodeId());
allocateResource(clusterResource, attempt, rmContainer.getContainer()
.getResource(), node.getLabels());
}
getParent().recoverContainer(clusterResource, attempt, rmContainer);
}
/**
* Obtain (read-only) collection of active applications.
*/
public Set<FiCaSchedulerApp> getApplications() {
// need to access the list of apps from the preemption monitor
Comparator<FiCaSchedulerApp> applicationComparator =
scheduler.getApplicationComparator();
Set<FiCaSchedulerApp> result = new TreeSet<FiCaSchedulerApp>(applicationComparator);
result.addAll(activeApplications.values());
return result;
}
// return a single Resource capturing the overal amount of pending resources
public synchronized Resource getTotalResourcePending() {
Resource ret = BuilderUtils.newResource(0, 0);
for (FiCaSchedulerApp f : activeApplications.values()) {
Resources.addTo(ret, f.getTotalPendingRequests());
}
return ret;
}
// Consider the headroom for each user in the queue.
// Total pending for the queue =
// sum for each user(min( (user's headroom), sum(user's pending requests) ))
// NOTE: Used for calculating pedning resources in the preemption monitor.
public synchronized Resource getTotalPendingResourcesConsideringUserLimit(
Resource resources) {
Map<String, Resource> userNameToHeadroom = new HashMap<String, Resource>();
Resource pendingConsideringUserLimit = Resource.newInstance(0, 0);
for (FiCaSchedulerApp app : activeApplications.values()) {
String userName = app.getUser();
if (!userNameToHeadroom.containsKey(userName)) {
User user = getUser(userName);
Resource headroom = Resources.subtract(
computeUserLimit(app, resources, minimumAllocation, user, null),
user.getUsed());
// Make sure none of the the components of headroom is negative.
headroom = Resources.componentwiseMax(headroom, Resources.none());
userNameToHeadroom.put(userName, headroom);
}
Resource minpendingConsideringUserLimit =
Resources.componentwiseMin(userNameToHeadroom.get(userName),
app.getTotalPendingRequests());
Resources.addTo(pendingConsideringUserLimit, minpendingConsideringUserLimit);
Resources.subtractFrom(userNameToHeadroom.get(userName),
minpendingConsideringUserLimit);
}
return pendingConsideringUserLimit;
}
@Override
public synchronized void collectSchedulerApplications(
Collection<ApplicationAttemptId> apps) {
for (FiCaSchedulerApp pendingApp : pendingApplications) {
apps.add(pendingApp.getApplicationAttemptId());
}
for (FiCaSchedulerApp app : activeApplications.values()) {
apps.add(app.getApplicationAttemptId());
}
}
@Override
public void attachContainer(Resource clusterResource,
FiCaSchedulerApp application, RMContainer rmContainer) {
if (application != null) {
FiCaSchedulerNode node =
scheduler.getNode(rmContainer.getContainer().getNodeId());
allocateResource(clusterResource, application, rmContainer.getContainer()
.getResource(), node.getLabels());
LOG.info("movedContainer" + " container=" + rmContainer.getContainer()
+ " resource=" + rmContainer.getContainer().getResource()
+ " queueMoveIn=" + this + " usedCapacity=" + getUsedCapacity()
+ " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
+ queueUsage.getUsed() + " cluster=" + clusterResource);
// Inform the parent queue
getParent().attachContainer(clusterResource, application, rmContainer);
}
}
@Override
public void detachContainer(Resource clusterResource,
FiCaSchedulerApp application, RMContainer rmContainer) {
if (application != null) {
FiCaSchedulerNode node =
scheduler.getNode(rmContainer.getContainer().getNodeId());
releaseResource(clusterResource, application, rmContainer.getContainer()
.getResource(), node.getLabels());
LOG.info("movedContainer" + " container=" + rmContainer.getContainer()
+ " resource=" + rmContainer.getContainer().getResource()
+ " queueMoveOut=" + this + " usedCapacity=" + getUsedCapacity()
+ " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
+ queueUsage.getUsed() + " cluster=" + clusterResource);
// Inform the parent queue
getParent().detachContainer(clusterResource, application, rmContainer);
}
}
public void setCapacity(float capacity) {
queueCapacities.setCapacity(capacity);
}
public void setAbsoluteCapacity(float absoluteCapacity) {
queueCapacities.setAbsoluteCapacity(absoluteCapacity);
}
public void setMaxApplications(int maxApplications) {
this.maxApplications = maxApplications;
}
/*
* Holds shared values used by all applications in
* the queue to calculate headroom on demand
*/
static class QueueResourceLimitsInfo {
private Resource queueCurrentLimit;
private Resource clusterResource;
public void setQueueCurrentLimit(Resource currentLimit) {
this.queueCurrentLimit = currentLimit;
}
public Resource getQueueCurrentLimit() {
return queueCurrentLimit;
}
public void setClusterResource(Resource clusterResource) {
this.clusterResource = clusterResource;
}
public Resource getClusterResource() {
return clusterResource;
}
}
}