/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.Priority; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.util.resource.Resources; import java.io.Serializable; import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * Identifies over utilized resources within a queue and tries to normalize * them to resolve resource allocation anomalies w.r.t priority and user-limit. */ public class IntraQueueCandidatesSelector extends PreemptionCandidatesSelector { @SuppressWarnings("serial") static class TAPriorityComparator implements Serializable, Comparator<TempAppPerPartition> { @Override public int compare(TempAppPerPartition tq1, TempAppPerPartition tq2) { Priority p1 = Priority.newInstance(tq1.getPriority()); Priority p2 = Priority.newInstance(tq2.getPriority()); if (!p1.equals(p2)) { return p1.compareTo(p2); } return tq1.getApplicationId().compareTo(tq2.getApplicationId()); } } IntraQueuePreemptionComputePlugin fifoPreemptionComputePlugin = null; final CapacitySchedulerPreemptionContext context; private static final Log LOG = LogFactory.getLog(IntraQueueCandidatesSelector.class); IntraQueueCandidatesSelector( CapacitySchedulerPreemptionContext preemptionContext) { super(preemptionContext); fifoPreemptionComputePlugin = new FifoIntraQueuePreemptionPlugin(rc, preemptionContext); context = preemptionContext; } @Override public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates( Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource clusterResource, Resource totalPreemptedResourceAllowed) { // 1. Calculate the abnormality within each queue one by one. computeIntraQueuePreemptionDemand( clusterResource, totalPreemptedResourceAllowed, selectedCandidates); // 2. Previous selectors (with higher priority) could have already // selected containers. We need to deduct pre-emptable resources // based on already selected candidates. CapacitySchedulerPreemptionUtils .deductPreemptableResourcesBasedSelectedCandidates(preemptionContext, selectedCandidates); // 3. Loop through all partitions to select containers for preemption. for (String partition : preemptionContext.getAllPartitions()) { LinkedHashSet<String> queueNames = preemptionContext .getUnderServedQueuesPerPartition(partition); // Error check to handle non-mapped labels to queue. if (null == queueNames) { continue; } // 4. Iterate from most under-served queue in order. for (String queueName : queueNames) { LeafQueue leafQueue = preemptionContext.getQueueByPartition(queueName, RMNodeLabelsManager.NO_LABEL).leafQueue; // skip if not a leafqueue if (null == leafQueue) { continue; } // 5. Calculate the resource to obtain per partition Map<String, Resource> resToObtainByPartition = fifoPreemptionComputePlugin .getResourceDemandFromAppsPerQueue(queueName, partition); // 6. Based on the selected resource demand per partition, select // containers with known policy from inter-queue preemption. synchronized (leafQueue) { Iterator<FiCaSchedulerApp> desc = leafQueue.getOrderingPolicy() .getPreemptionIterator(); while (desc.hasNext()) { FiCaSchedulerApp app = desc.next(); preemptFromLeastStarvedApp(selectedCandidates, clusterResource, totalPreemptedResourceAllowed, resToObtainByPartition, leafQueue, app); } } } } return selectedCandidates; } private void preemptFromLeastStarvedApp( Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource clusterResource, Resource totalPreemptedResourceAllowed, Map<String, Resource> resToObtainByPartition, LeafQueue leafQueue, FiCaSchedulerApp app) { // ToDo: Reuse reservation selector here. List<RMContainer> liveContainers = new ArrayList<>( app.getLiveContainers()); sortContainers(liveContainers); if (LOG.isDebugEnabled()) { LOG.debug( "totalPreemptedResourceAllowed for preemption at this round is :" + totalPreemptedResourceAllowed); } for (RMContainer c : liveContainers) { // if there are no demand, return. if (resToObtainByPartition.isEmpty()) { return; } // skip preselected containers. if (CapacitySchedulerPreemptionUtils.isContainerAlreadySelected(c, selectedCandidates)) { continue; } // Skip already marked to killable containers if (null != preemptionContext.getKillableContainers() && preemptionContext .getKillableContainers().contains(c.getContainerId())) { continue; } // Skip AM Container from preemption for now. if (c.isAMContainer()) { continue; } // Try to preempt this container CapacitySchedulerPreemptionUtils.tryPreemptContainerAndDeductResToObtain( rc, preemptionContext, resToObtainByPartition, c, clusterResource, selectedCandidates, totalPreemptedResourceAllowed); } } private void computeIntraQueuePreemptionDemand(Resource clusterResource, Resource totalPreemptedResourceAllowed, Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates) { // 1. Iterate through all partition to calculate demand within a partition. for (String partition : context.getAllPartitions()) { LinkedHashSet<String> queueNames = context .getUnderServedQueuesPerPartition(partition); if (null == queueNames) { continue; } // 2. Its better to get partition based resource limit earlier before // starting calculation Resource partitionBasedResource = context.getPartitionResource(partition); // 3. loop through all queues corresponding to a partition. for (String queueName : queueNames) { TempQueuePerPartition tq = context.getQueueByPartition(queueName, partition); LeafQueue leafQueue = tq.leafQueue; // skip if its parent queue if (null == leafQueue) { continue; } // 4. Consider reassignableResource as (used - actuallyToBePreempted). // This provides as upper limit to split apps quota in a queue. Resource queueReassignableResource = Resources.subtract(tq.getUsed(), tq.getActuallyToBePreempted()); // 5. Check queue's used capacity. Make sure that the used capacity is // above certain limit to consider for intra queue preemption. if (leafQueue.getQueueCapacities().getUsedCapacity(partition) < context .getMinimumThresholdForIntraQueuePreemption()) { continue; } // 6. compute the allocation of all apps based on queue's unallocated // capacity fifoPreemptionComputePlugin.computeAppsIdealAllocation(clusterResource, partitionBasedResource, tq, selectedCandidates, totalPreemptedResourceAllowed, queueReassignableResource, context.getMaxAllowableLimitForIntraQueuePreemption()); } } } }