/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity; import java.io.IOException; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience.Private; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.authorize.AccessControlList; import org.apache.hadoop.yarn.api.records.QueueACL; import org.apache.hadoop.yarn.api.records.QueueInfo; import org.apache.hadoop.yarn.api.records.QueueState; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.conf.YarnConfiguration; import org.apache.hadoop.yarn.factories.RecordFactory; import org.apache.hadoop.yarn.factory.providers.RecordFactoryProvider; import org.apache.hadoop.yarn.nodelabels.CommonNodeLabelsManager; import org.apache.hadoop.yarn.security.AccessType; import org.apache.hadoop.yarn.security.PrivilegedEntity; import org.apache.hadoop.yarn.security.PrivilegedEntity.EntityType; import org.apache.hadoop.yarn.security.YarnAuthorizationProvider; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; import com.google.common.collect.Sets; public abstract class AbstractCSQueue implements CSQueue { private static final Log LOG = LogFactory.getLog(AbstractCSQueue.class); CSQueue parent; final String queueName; volatile int numContainers; final Resource minimumAllocation; Resource maximumAllocation; QueueState state; final QueueMetrics metrics; protected final PrivilegedEntity queueEntity; final ResourceCalculator resourceCalculator; Set<String> accessibleLabels; RMNodeLabelsManager labelManager; String defaultLabelExpression; Map<AccessType, AccessControlList> acls = new HashMap<AccessType, AccessControlList>(); boolean reservationsContinueLooking; private boolean preemptionDisabled; // Track resource usage-by-label like used-resource/pending-resource, etc. ResourceUsage queueUsage; // Track capacities like used-capcity/abs-used-capacity/capacity/abs-capacity, // etc. QueueCapacities queueCapacities; private final RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null); protected CapacitySchedulerContext csContext; protected YarnAuthorizationProvider authorizer = null; private Resource clusterResource; public AbstractCSQueue(CapacitySchedulerContext cs, String queueName, CSQueue parent, CSQueue old) throws IOException { this.labelManager = cs.getRMContext().getNodeLabelManager(); this.parent = parent; this.queueName = queueName; this.resourceCalculator = cs.getResourceCalculator(); // must be called after parent and queueName is set this.metrics = old != null ? old.getMetrics() : QueueMetrics.forQueue(getQueuePath(), parent, cs.getConfiguration().getEnableUserMetrics(), cs.getConf()); this.csContext = cs; this.minimumAllocation = csContext.getMinimumResourceCapability(); // initialize ResourceUsage queueUsage = new ResourceUsage(); queueEntity = new PrivilegedEntity(EntityType.QUEUE, getQueuePath()); // initialize QueueCapacities queueCapacities = new QueueCapacities(parent == null); } protected void setupConfigurableCapacities() { CSQueueUtils.loadUpdateAndCheckCapacities( getQueuePath(), csContext.getConfiguration(), queueCapacities, parent == null ? null : parent.getQueueCapacities()); } @Override public synchronized float getCapacity() { return queueCapacities.getCapacity(); } @Override public synchronized float getAbsoluteCapacity() { return queueCapacities.getAbsoluteCapacity(); } @Override public float getAbsoluteMaximumCapacity() { return queueCapacities.getAbsoluteMaximumCapacity(); } @Override public synchronized float getAbsoluteUsedCapacity() { return queueCapacities.getAbsoluteUsedCapacity(); } @Override public float getMaximumCapacity() { return queueCapacities.getMaximumCapacity(); } @Override public synchronized float getUsedCapacity() { return queueCapacities.getUsedCapacity(); } @Override public Resource getUsedResources() { return queueUsage.getUsed(); } public synchronized int getNumContainers() { return numContainers; } @Override public synchronized QueueState getState() { return state; } @Override public QueueMetrics getMetrics() { return metrics; } @Override public String getQueueName() { return queueName; } public PrivilegedEntity getPrivilegedEntity() { return queueEntity; } @Override public synchronized CSQueue getParent() { return parent; } @Override public synchronized void setParent(CSQueue newParentQueue) { this.parent = (ParentQueue)newParentQueue; } public Set<String> getAccessibleNodeLabels() { return accessibleLabels; } @Override public boolean hasAccess(QueueACL acl, UserGroupInformation user) { return authorizer.checkPermission(SchedulerUtils.toAccessType(acl), queueEntity, user); } @Override public synchronized void setUsedCapacity(float usedCapacity) { queueCapacities.setUsedCapacity(usedCapacity); } @Override public synchronized void setAbsoluteUsedCapacity(float absUsedCapacity) { queueCapacities.setAbsoluteUsedCapacity(absUsedCapacity); } /** * Set maximum capacity - used only for testing. * @param maximumCapacity new max capacity */ synchronized void setMaxCapacity(float maximumCapacity) { // Sanity check CSQueueUtils.checkMaxCapacity(getQueueName(), queueCapacities.getCapacity(), maximumCapacity); float absMaxCapacity = CSQueueUtils.computeAbsoluteMaximumCapacity(maximumCapacity, parent); CSQueueUtils.checkAbsoluteCapacity(getQueueName(), queueCapacities.getAbsoluteCapacity(), absMaxCapacity); queueCapacities.setMaximumCapacity(maximumCapacity); queueCapacities.setAbsoluteMaximumCapacity(absMaxCapacity); } @Override public String getDefaultNodeLabelExpression() { return defaultLabelExpression; } synchronized void setupQueueConfigs(Resource clusterResource) throws IOException { // get labels this.accessibleLabels = csContext.getConfiguration().getAccessibleNodeLabels(getQueuePath()); this.defaultLabelExpression = csContext.getConfiguration() .getDefaultNodeLabelExpression(getQueuePath()); // inherit from parent if labels not set if (this.accessibleLabels == null && parent != null) { this.accessibleLabels = parent.getAccessibleNodeLabels(); } // inherit from parent if labels not set if (this.defaultLabelExpression == null && parent != null && this.accessibleLabels.containsAll(parent.getAccessibleNodeLabels())) { this.defaultLabelExpression = parent.getDefaultNodeLabelExpression(); } // After we setup labels, we can setup capacities setupConfigurableCapacities(); this.maximumAllocation = csContext.getConfiguration().getMaximumAllocationPerQueue( getQueuePath()); authorizer = YarnAuthorizationProvider.getInstance(csContext.getConf()); this.state = csContext.getConfiguration().getState(getQueuePath()); this.acls = csContext.getConfiguration().getAcls(getQueuePath()); // Update metrics CSQueueUtils.updateQueueStatistics( resourceCalculator, this, parent, clusterResource, minimumAllocation); // Check if labels of this queue is a subset of parent queue, only do this // when we not root if (parent != null && parent.getParent() != null) { if (parent.getAccessibleNodeLabels() != null && !parent.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { // if parent isn't "*", child shouldn't be "*" too if (this.getAccessibleNodeLabels().contains(RMNodeLabelsManager.ANY)) { throw new IOException("Parent's accessible queue is not ANY(*), " + "but child's accessible queue is *"); } else { Set<String> diff = Sets.difference(this.getAccessibleNodeLabels(), parent.getAccessibleNodeLabels()); if (!diff.isEmpty()) { throw new IOException("Some labels of child queue is not a subset " + "of parent queue, these labels=[" + StringUtils.join(diff, ",") + "]"); } } } } this.reservationsContinueLooking = csContext.getConfiguration() .getReservationContinueLook(); this.preemptionDisabled = isQueueHierarchyPreemptionDisabled(this); this.clusterResource = clusterResource; } protected QueueInfo getQueueInfo() { QueueInfo queueInfo = recordFactory.newRecordInstance(QueueInfo.class); queueInfo.setQueueName(queueName); queueInfo.setAccessibleNodeLabels(accessibleLabels); queueInfo.setCapacity(queueCapacities.getCapacity()); queueInfo.setMaximumCapacity(queueCapacities.getMaximumCapacity()); queueInfo.setQueueState(state); queueInfo.setDefaultNodeLabelExpression(defaultLabelExpression); queueInfo.setCurrentCapacity(getUsedCapacity()); return queueInfo; } @Private public synchronized Resource getMaximumAllocation() { return maximumAllocation; } @Private public Resource getMinimumAllocation() { return minimumAllocation; } synchronized void allocateResource(Resource clusterResource, Resource resource, Set<String> nodeLabels) { // Update usedResources by labels if (nodeLabels == null || nodeLabels.isEmpty()) { queueUsage.incUsed(resource); } else { Set<String> anls = (accessibleLabels.contains(RMNodeLabelsManager.ANY)) ? labelManager.getClusterNodeLabels() : accessibleLabels; for (String label : Sets.intersection(anls, nodeLabels)) { queueUsage.incUsed(label, resource); } } ++numContainers; CSQueueUtils.updateQueueStatistics(resourceCalculator, this, getParent(), clusterResource, minimumAllocation); } protected synchronized void releaseResource(Resource clusterResource, Resource resource, Set<String> nodeLabels) { // Update usedResources by labels if (null == nodeLabels || nodeLabels.isEmpty()) { queueUsage.decUsed(resource); } else { Set<String> anls = (accessibleLabels.contains(RMNodeLabelsManager.ANY)) ? labelManager.getClusterNodeLabels() : accessibleLabels; for (String label : Sets.intersection(anls, nodeLabels)) { queueUsage.decUsed(label, resource); } } CSQueueUtils.updateQueueStatistics(resourceCalculator, this, getParent(), clusterResource, minimumAllocation); --numContainers; } @Private public boolean getReservationContinueLooking() { return reservationsContinueLooking; } @Private public Map<AccessType, AccessControlList> getACLs() { return acls; } @Private public boolean getPreemptionDisabled() { return preemptionDisabled; } @Private public QueueCapacities getQueueCapacities() { return queueCapacities; } @Private public ResourceUsage getQueueResourceUsage() { return queueUsage; } /** * The specified queue is preemptable if system-wide preemption is turned on * unless any queue in the <em>qPath</em> hierarchy has explicitly turned * preemption off. * NOTE: Preemptability is inherited from a queue's parent. * * @return true if queue has preemption disabled, false otherwise */ private boolean isQueueHierarchyPreemptionDisabled(CSQueue q) { CapacitySchedulerConfiguration csConf = csContext.getConfiguration(); boolean systemWidePreemption = csConf.getBoolean(YarnConfiguration.RM_SCHEDULER_ENABLE_MONITORS, YarnConfiguration.DEFAULT_RM_SCHEDULER_ENABLE_MONITORS); CSQueue parentQ = q.getParent(); // If the system-wide preemption switch is turned off, all of the queues in // the qPath hierarchy have preemption disabled, so return true. if (!systemWidePreemption) return true; // If q is the root queue and the system-wide preemption switch is turned // on, then q does not have preemption disabled (default=false, below) // unless the preemption_disabled property is explicitly set. if (parentQ == null) { return csConf.getPreemptionDisabled(q.getQueuePath(), false); } // If this is not the root queue, inherit the default value for the // preemption_disabled property from the parent. Preemptability will be // inherited from the parent's hierarchy unless explicitly overridden at // this level. return csConf.getPreemptionDisabled(q.getQueuePath(), parentQ.getPreemptionDisabled()); } private Resource getCurrentLimitResource(String nodeLabel, Resource clusterResource, ResourceLimits currentResourceLimits) { /* * Current limit resource: For labeled resource: limit = queue-max-resource * (TODO, this part need update when we support labeled-limit) For * non-labeled resource: limit = min(queue-max-resource, * limit-set-by-parent) */ Resource queueMaxResource = Resources.multiplyAndNormalizeDown(resourceCalculator, labelManager.getResourceByLabel(nodeLabel, clusterResource), queueCapacities.getAbsoluteMaximumCapacity(nodeLabel), minimumAllocation); if (nodeLabel.equals(RMNodeLabelsManager.NO_LABEL)) { return Resources.min(resourceCalculator, clusterResource, queueMaxResource, currentResourceLimits.getLimit()); } return queueMaxResource; } synchronized boolean canAssignToThisQueue(Resource clusterResource, Set<String> nodeLabels, ResourceLimits currentResourceLimits, Resource nowRequired, Resource resourceCouldBeUnreserved) { // Get label of this queue can access, it's (nodeLabel AND queueLabel) Set<String> labelCanAccess; if (null == nodeLabels || nodeLabels.isEmpty()) { labelCanAccess = new HashSet<String>(); // Any queue can always access any node without label labelCanAccess.add(RMNodeLabelsManager.NO_LABEL); } else { labelCanAccess = new HashSet<String>( accessibleLabels.contains(CommonNodeLabelsManager.ANY) ? nodeLabels : Sets.intersection(accessibleLabels, nodeLabels)); } for (String label : labelCanAccess) { // New total resource = used + required Resource newTotalResource = Resources.add(queueUsage.getUsed(label), nowRequired); Resource currentLimitResource = getCurrentLimitResource(label, clusterResource, currentResourceLimits); if (Resources.greaterThan(resourceCalculator, clusterResource, newTotalResource, currentLimitResource)) { // if reservation continous looking enabled, check to see if could we // potentially use this node instead of a reserved node if the application // has reserved containers. // TODO, now only consider reservation cases when the node has no label if (this.reservationsContinueLooking && label.equals(RMNodeLabelsManager.NO_LABEL) && Resources.greaterThan(resourceCalculator, clusterResource, resourceCouldBeUnreserved, Resources.none())) { // resource-without-reserved = used - reserved Resource newTotalWithoutReservedResource = Resources.subtract(newTotalResource, resourceCouldBeUnreserved); // when total-used-without-reserved-resource < currentLimit, we still // have chance to allocate on this node by unreserving some containers if (Resources.lessThan(resourceCalculator, clusterResource, newTotalWithoutReservedResource, currentLimitResource)) { if (LOG.isDebugEnabled()) { LOG.debug("try to use reserved: " + getQueueName() + " usedResources: " + queueUsage.getUsed() + ", clusterResources: " + clusterResource + ", reservedResources: " + resourceCouldBeUnreserved + ", capacity-without-reserved: " + newTotalWithoutReservedResource + ", maxLimitCapacity: " + currentLimitResource); } currentResourceLimits.setAmountNeededUnreserve(Resources.subtract(newTotalResource, currentLimitResource)); return true; } } if (LOG.isDebugEnabled()) { LOG.debug(getQueueName() + "Check assign to queue, label=" + label + " usedResources: " + queueUsage.getUsed(label) + " clusterResources: " + clusterResource + " currentUsedCapacity " + Resources.divide(resourceCalculator, clusterResource, queueUsage.getUsed(label), labelManager.getResourceByLabel(label, clusterResource)) + " max-capacity: " + queueCapacities.getAbsoluteMaximumCapacity(label) + ")"); } return false; } return true; } // Actually, this will not happen, since labelCanAccess will be always // non-empty return false; } /** * @param nodePartition node label to check for accessibility * @return true if queue can access nodes with specified label, false if not. */ public final boolean accessibleToPartition(final String nodePartition) { // if queue's label is *, it can access any node if (accessibleLabels != null && accessibleLabels.contains(RMNodeLabelsManager.ANY)) { return true; } // any queue can access to a node without label if (nodePartition == null || nodePartition.equals(RMNodeLabelsManager.NO_LABEL)) { return true; } // a queue can access to a node only if it contains any label of the node if (accessibleLabels != null && accessibleLabels.contains(nodePartition)) { return true; } // sorry, you cannot access return false; } /** * Retrieve used resources by this queue for the specified node label. * Used capacity of label = * (queue's used resources labeled by nodeLabel) * / ( (all resources labeled by nodLabel) * X (percent of labeled resources allocated to this queue) ) * @param nodeLabel label for which to get used resources * @return used resources by this queue for specified label */ public final synchronized float getUsedCapacity(final String nodeLabel) { Resource availableToQueue = Resources.multiply( labelManager.getResourceByLabel(nodeLabel, this.clusterResource), queueCapacities.getAbsoluteCapacity(nodeLabel)); if (!Resources.greaterThan(resourceCalculator, this.clusterResource, availableToQueue, Resources.none())) { return 0.0f; } return Resources.divide(resourceCalculator, this.clusterResource, queueUsage.getUsed(nodeLabel), availableToQueue); } /** * Retrieve absolute used resources by this queue for the specified node label. * @param nodeLabel label for which to get absolute used resources * @return absolute used resources by this queue for specified label */ public final synchronized float getAbsoluteUsedCapacity(final String nodeLabel) { Resource labeledResources = labelManager.getResourceByLabel(nodeLabel, this.clusterResource); if (!Resources.greaterThan(resourceCalculator, this.clusterResource, labeledResources, Resources.none())) { return 0.0f; } return Resources.divide(resourceCalculator, this.clusterResource, queueUsage.getUsed(nodeLabel), labeledResources); } }