/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerPreemptEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueueCapacities;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.SystemClock;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.hadoop.yarn.util.resource.Resources;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableSet;

/**
 * This class implements a {@link SchedulingEditPolicy} that is designed to be
 * paired with the {@code CapacityScheduler}. At every invocation of {@code
 * editSchedule()} it computes the ideal amount of resources assigned to each
 * queue (for each queue in the hierarchy), and determines whether preemption
 * is needed. Overcapacity is distributed among queues in a weighted fair
 * manner, where the weight is the amount of guaranteed capacity for the queue.
 * Based on this ideal assignment it determines whether preemption is required
 * and selects a set of containers from each application that would be killed
 * if the corresponding amount of resources is not freed up by the application.
 *
 * If not in {@code observeOnly} mode, it triggers preemption requests via a
 * {@link ContainerPreemptEvent} that the {@code ResourceManager} will ensure
 * to deliver to the application (or to execute).
 *
 * If the deficit of resources is persistent over a long enough period of time
 * this policy will trigger forced termination of containers (again by
 * generating {@link ContainerPreemptEvent}).
 */
public class ProportionalCapacityPreemptionPolicy
    implements SchedulingEditPolicy {

  private static final Log LOG =
      LogFactory.getLog(ProportionalCapacityPreemptionPolicy.class);

  /** If true, run the policy but do not affect the cluster with preemption and
   * kill events. */
  public static final String OBSERVE_ONLY =
      "yarn.resourcemanager.monitor.capacity.preemption.observe_only";

  /** Time in milliseconds between invocations of this policy. */
  public static final String MONITORING_INTERVAL =
      "yarn.resourcemanager.monitor.capacity.preemption.monitoring_interval";

  /** Time in milliseconds between requesting a preemption from an application
   * and killing the container. */
  public static final String WAIT_TIME_BEFORE_KILL =
      "yarn.resourcemanager.monitor.capacity.preemption.max_wait_before_kill";

  /** Maximum percentage of resources preempted in a single round. By
   * controlling this value one can throttle the pace at which containers are
   * reclaimed from the cluster. After computing the total desired preemption,
   * the policy scales it back within this limit. */
  public static final String TOTAL_PREEMPTION_PER_ROUND =
      "yarn.resourcemanager.monitor.capacity.preemption.total_preemption_per_round";

  /** Maximum amount of resources above the target capacity that is ignored
   * for preemption. This defines a deadzone around the target capacity that
   * helps prevent thrashing and oscillations around the computed target
   * balance. High values slow convergence to the target capacity and (absent
   * natural completions) might prevent convergence to guaranteed capacity. */
  public static final String MAX_IGNORED_OVER_CAPACITY =
      "yarn.resourcemanager.monitor.capacity.preemption.max_ignored_over_capacity";

  /**
   * Given a computed preemption target, account for containers naturally
   * expiring and preempt only this percentage of the delta. This determines
   * the rate of geometric convergence into the deadzone ({@link
   * #MAX_IGNORED_OVER_CAPACITY}). For example, a termination factor of 0.5
   * will reclaim almost 95% of resources within 5 * {@link
   * #WAIT_TIME_BEFORE_KILL}, even absent natural termination.
   */
  public static final String NATURAL_TERMINATION_FACTOR =
      "yarn.resourcemanager.monitor.capacity.preemption.natural_termination_factor";

  private RMContext rmContext;

  private final Clock clock;
  private double maxIgnoredOverCapacity;
  private long maxWaitTime;
  private CapacityScheduler scheduler;
  private long monitoringInterval;
  private final Map<RMContainer, Long> preempted =
      new HashMap<RMContainer, Long>();

  private ResourceCalculator rc;
  private float percentageClusterPreemptionAllowed;
  private double naturalTerminationFactor;
  private boolean observeOnly;
  private Map<String, Map<String, TempQueuePerPartition>> queueToPartitions =
      new HashMap<>();
  private RMNodeLabelsManager nlm;

  public ProportionalCapacityPreemptionPolicy() {
    clock = new SystemClock();
  }

  public ProportionalCapacityPreemptionPolicy(Configuration config,
      RMContext context, CapacityScheduler scheduler) {
    this(config, context, scheduler, new SystemClock());
  }

  public ProportionalCapacityPreemptionPolicy(Configuration config,
      RMContext context, CapacityScheduler scheduler, Clock clock) {
    init(config, context, scheduler);
    this.clock = clock;
  }

  public void init(Configuration config, RMContext context,
      PreemptableResourceScheduler sched) {
    LOG.info("Preemption monitor:" + this.getClass().getCanonicalName());
    assert null == scheduler : "Unexpected duplicate call to init";
    if (!(sched instanceof CapacityScheduler)) {
      throw new YarnRuntimeException("Class " +
          sched.getClass().getCanonicalName() + " not instance of " +
          CapacityScheduler.class.getCanonicalName());
    }
    rmContext = context;
    scheduler = (CapacityScheduler) sched;
    maxIgnoredOverCapacity = config.getDouble(MAX_IGNORED_OVER_CAPACITY, 0.1);
    naturalTerminationFactor =
        config.getDouble(NATURAL_TERMINATION_FACTOR, 0.2);
    maxWaitTime = config.getLong(WAIT_TIME_BEFORE_KILL, 15000);
    monitoringInterval = config.getLong(MONITORING_INTERVAL, 3000);
    percentageClusterPreemptionAllowed =
        config.getFloat(TOTAL_PREEMPTION_PER_ROUND, (float) 0.1);
    observeOnly = config.getBoolean(OBSERVE_ONLY, false);
    rc = scheduler.getResourceCalculator();
    nlm = scheduler.getRMContext().getNodeLabelManager();
  }

  @VisibleForTesting
  public ResourceCalculator getResourceCalculator() {
    return rc;
  }

  @Override
  public void editSchedule() {
    CSQueue root = scheduler.getRootQueue();
    Resource clusterResources =
        Resources.clone(scheduler.getClusterResource());
    containerBasedPreemptOrKill(root, clusterResources);
  }

  /**
   * This method selects and tracks containers to be preempted. If a container
   * is in the target list for more than maxWaitTime it is killed.
   *
   * @param root the root of the CapacityScheduler queue hierarchy
   * @param clusterResources the total amount of resources in the cluster
   */
  @SuppressWarnings("unchecked")
  private void containerBasedPreemptOrKill(CSQueue root,
      Resource clusterResources) {
    // All partitions to look at
    Set<String> allPartitions = new HashSet<>();
    allPartitions.addAll(scheduler.getRMContext()
        .getNodeLabelManager().getClusterNodeLabelNames());
    allPartitions.add(RMNodeLabelsManager.NO_LABEL);

    // extract a summary of the queues from the scheduler
    synchronized (scheduler) {
      queueToPartitions.clear();

      for (String partitionToLookAt : allPartitions) {
        cloneQueues(root,
            nlm.getResourceByLabel(partitionToLookAt, clusterResources),
            partitionToLookAt);
      }
    }

    // compute total preemption allowed
    Resource totalPreemptionAllowed = Resources.multiply(clusterResources,
        percentageClusterPreemptionAllowed);

    Set<String> leafQueueNames = null;
    for (String partition : allPartitions) {
      TempQueuePerPartition tRoot =
          getQueueByPartition(CapacitySchedulerConfiguration.ROOT, partition);
      // compute the ideal distribution of resources among queues
      // updates cloned queues state accordingly
      tRoot.idealAssigned = tRoot.guaranteed;

      leafQueueNames =
          recursivelyComputeIdealAssignment(tRoot, totalPreemptionAllowed);
    }

    // based on ideal allocation select containers to be preempted from each
    // queue and each application
    Map<ApplicationAttemptId, Set<RMContainer>> toPreempt =
        getContainersToPreempt(leafQueueNames, clusterResources);

    if (LOG.isDebugEnabled()) {
      logToCSV(new ArrayList<String>(leafQueueNames));
    }

    // if we are in observeOnly mode return before any action is taken
    if (observeOnly) {
      return;
    }

    // preempt (or kill) the selected containers
    for (Map.Entry<ApplicationAttemptId, Set<RMContainer>> e
        : toPreempt.entrySet()) {
      ApplicationAttemptId appAttemptId = e.getKey();
      if (LOG.isDebugEnabled()) {
        LOG.debug("Send to scheduler: in app=" + appAttemptId
            + " #containers-to-be-preempted=" + e.getValue().size());
      }
      for (RMContainer container : e.getValue()) {
        // if we have been trying to preempt this container for more than
        // maxWaitTime
        if (preempted.get(container) != null
            && preempted.get(container) + maxWaitTime < clock.getTime()) {
          // kill it
          rmContext.getDispatcher().getEventHandler().handle(
              new ContainerPreemptEvent(appAttemptId, container,
                  SchedulerEventType.KILL_CONTAINER));
          preempted.remove(container);
        } else {
          if (preempted.get(container) != null) {
            // We already notified the scheduler earlier; no need to raise
            // another event.
            continue;
          }
          // otherwise just send preemption events
          rmContext.getDispatcher().getEventHandler().handle(
              new ContainerPreemptEvent(appAttemptId, container,
                  SchedulerEventType.PREEMPT_CONTAINER));
          preempted.put(container, clock.getTime());
        }
      }
    }

    // Keep the preempted list clean
    for (Iterator<RMContainer> i = preempted.keySet().iterator(); i.hasNext();) {
      RMContainer id = i.next();
      // garbage collect containers that are irrelevant for preemption
      if (preempted.get(id) + 2 * maxWaitTime < clock.getTime()) {
        i.remove();
      }
    }
  }

  /**
   * This method recursively computes the ideal assignment of resources to each
   * level of the hierarchy. This ensures that leaf queues that are over
   * capacity, but whose parents are within capacity, will not be preempted.
   * Preemptions are allowed within each subtree according to local
   * over/under capacity.
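   *
   * For example (an illustrative scenario, not taken from the source): if
   * parent queues A and B are both within their guarantees, no containers
   * are preempted between them even when a leaf under A is far above its own
   * guarantee; preemption for that leaf can only be driven by unmet demand
   * from its siblings under A.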
   *
   * @param root the root of the cloned queue hierarchy
   * @param totalPreemptionAllowed maximum amount of preemption allowed
   * @return a list of leaf queues updated with preemption targets
   */
  private Set<String> recursivelyComputeIdealAssignment(
      TempQueuePerPartition root, Resource totalPreemptionAllowed) {
    Set<String> leafQueueNames = new HashSet<>();
    if (root.getChildren() != null &&
        root.getChildren().size() > 0) {
      // compute ideal distribution at this level
      computeIdealResourceDistribution(rc, root.getChildren(),
          totalPreemptionAllowed, root.idealAssigned);
      // compute recursively for lower levels and build list of leaf queues
      for (TempQueuePerPartition t : root.getChildren()) {
        leafQueueNames.addAll(recursivelyComputeIdealAssignment(t,
            totalPreemptionAllowed));
      }
    } else {
      // we are in a leaf: nothing to do, just return yourself
      return ImmutableSet.of(root.queueName);
    }
    return leafQueueNames;
  }

  /**
   * This method computes (for a single level in the tree, passed as a {@code
   * List<TempQueue>}) the ideal assignment of resources. This is done
   * recursively to allocate capacity fairly across all queues with pending
   * demands. It terminates when no resources are left to assign, or when all
   * demand is satisfied.
   *
   * @param rc resource calculator
   * @param queues a list of cloned queues to be assigned capacity to (this is
   * an out param)
   * @param totalPreemptionAllowed total amount of preemption we allow
   * @param tot_guarant the amount of capacity assigned to this pool of queues
   */
  private void computeIdealResourceDistribution(ResourceCalculator rc,
      List<TempQueuePerPartition> queues, Resource totalPreemptionAllowed,
      Resource tot_guarant) {

    // qAlloc tracks currently active queues (will decrease progressively as
    // demand is met)
    List<TempQueuePerPartition> qAlloc =
        new ArrayList<TempQueuePerPartition>(queues);
    // unassigned tracks how much resources are still to assign, initialized
    // with the total capacity for this set of queues
    Resource unassigned = Resources.clone(tot_guarant);

    // group queues based on whether they have non-zero guaranteed capacity
    Set<TempQueuePerPartition> nonZeroGuarQueues =
        new HashSet<TempQueuePerPartition>();
    Set<TempQueuePerPartition> zeroGuarQueues =
        new HashSet<TempQueuePerPartition>();

    for (TempQueuePerPartition q : qAlloc) {
      if (Resources
          .greaterThan(rc, tot_guarant, q.guaranteed, Resources.none())) {
        nonZeroGuarQueues.add(q);
      } else {
        zeroGuarQueues.add(q);
      }
    }

    // first compute the allocation as a fixpoint based on guaranteed capacity
    computeFixpointAllocation(rc, tot_guarant, nonZeroGuarQueues, unassigned,
        false);

    // if any capacity is left unassigned, distribute it among zero-guarantee
    // queues uniformly (i.e., not based on guaranteed capacity, as this is
    // zero)
    if (!zeroGuarQueues.isEmpty()
        && Resources.greaterThan(rc, tot_guarant, unassigned,
            Resources.none())) {
      computeFixpointAllocation(rc, tot_guarant, zeroGuarQueues, unassigned,
          true);
    }

    // based on the ideal assignment computed above and the current assignment
    // we derive how much preemption is required overall
    Resource totPreemptionNeeded = Resource.newInstance(0, 0);
    for (TempQueuePerPartition t : queues) {
      if (Resources.greaterThan(rc, tot_guarant, t.current, t.idealAssigned)) {
        Resources.addTo(totPreemptionNeeded,
            Resources.subtract(t.current, t.idealAssigned));
      }
    }

    // if we need to preempt more than is allowed, compute a factor (0<f<1)
    // that is used to scale down how much we ask back from each queue
    float scalingFactor = 1.0F;
    if (Resources.greaterThan(rc, tot_guarant,
        totPreemptionNeeded, totalPreemptionAllowed)) {
      scalingFactor = Resources.divide(rc, tot_guarant,
          totalPreemptionAllowed, totPreemptionNeeded);
    }

    // assign to each queue the amount of actual preemption based on local
    // information of ideal preemption and scaling factor
    for (TempQueuePerPartition t : queues) {
      t.assignPreemption(scalingFactor, rc, tot_guarant);
    }
    if (LOG.isDebugEnabled()) {
      long time = clock.getTime();
      for (TempQueuePerPartition t : queues) {
        LOG.debug(time + ": " + t);
      }
    }
  }

  /**
   * Given a set of queues compute the fix-point distribution of unassigned
   * resources among them. As the pending requests of a queue are exhausted,
   * the queue is removed from the set and the remaining capacity is
   * redistributed among the remaining queues. The distribution is weighted
   * based on guaranteed capacity, unless asked to ignoreGuarantee, in which
   * case resources are distributed uniformly.
   */
  private void computeFixpointAllocation(ResourceCalculator rc,
      Resource tot_guarant, Collection<TempQueuePerPartition> qAlloc,
      Resource unassigned, boolean ignoreGuarantee) {
    // Prior to assigning the unused resources, process each queue as follows:
    // If current > guaranteed, idealAssigned = guaranteed + untouchable extra
    // Else idealAssigned = current;
    // Subtract idealAssigned resources from unassigned.
    // If the queue has all of its needs met (that is, if
    // idealAssigned >= current + pending), remove it from consideration.
    // Sort queues from most under-guaranteed to most over-guaranteed.
    TQComparator tqComparator = new TQComparator(rc, tot_guarant);
    PriorityQueue<TempQueuePerPartition> orderedByNeed =
        new PriorityQueue<TempQueuePerPartition>(10, tqComparator);
    for (Iterator<TempQueuePerPartition> i = qAlloc.iterator(); i.hasNext();) {
      TempQueuePerPartition q = i.next();
      if (Resources.greaterThan(rc, tot_guarant, q.current, q.guaranteed)) {
        q.idealAssigned = Resources.add(q.guaranteed, q.untouchableExtra);
      } else {
        q.idealAssigned = Resources.clone(q.current);
      }
      Resources.subtractFrom(unassigned, q.idealAssigned);
      // If idealAssigned < (current + pending), q needs more resources, so
      // add it to the list of underserved queues, ordered by need.
      Resource curPlusPend = Resources.add(q.current, q.pending);
      if (Resources.lessThan(rc, tot_guarant, q.idealAssigned, curPlusPend)) {
        orderedByNeed.add(q);
      }
    }

    // assign all cluster resources until no more demand, or no resources are
    // left
    while (!orderedByNeed.isEmpty()
        && Resources.greaterThan(rc, tot_guarant, unassigned,
            Resources.none())) {
      Resource wQassigned = Resource.newInstance(0, 0);
      // we compute normalizedGuarantees capacity based on currently active
      // queues
      resetCapacity(rc, unassigned, orderedByNeed, ignoreGuarantee);

      // For each underserved queue (or set of queues if multiple are equally
      // underserved), offer its share of the unassigned resources based on its
      // normalized guarantee. After the offer, if the queue is not satisfied,
      // place it back in the ordered list of queues, recalculating its place
      // in the order of most under-guaranteed to most over-guaranteed. In this
      // way, the most underserved queue(s) are always given resources first.
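      //
      // Illustrative example (hypothetical numbers, not from any real
      // cluster): with 30 GB unassigned and two equally underserved queues
      // whose guarantees are 20 GB and 10 GB, resetCapacity() normalizes the
      // guarantees to 2/3 and 1/3, so this pass offers them roughly 20 GB and
      // 10 GB respectively; whatever a queue cannot accept (see offer()) is
      // returned to the unassigned pool for the next iteration.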
      Collection<TempQueuePerPartition> underserved =
          getMostUnderservedQueues(orderedByNeed, tqComparator);
      for (Iterator<TempQueuePerPartition> i = underserved.iterator(); i
          .hasNext();) {
        TempQueuePerPartition sub = i.next();
        Resource wQavail = Resources.multiplyAndNormalizeUp(rc,
            unassigned, sub.normalizedGuarantee, Resource.newInstance(1, 1));
        Resource wQidle = sub.offer(wQavail, rc, tot_guarant);
        Resource wQdone = Resources.subtract(wQavail, wQidle);

        if (Resources.greaterThan(rc, tot_guarant,
            wQdone, Resources.none())) {
          // The queue is still asking for more. Put it back in the priority
          // queue, recalculating its order based on need.
          orderedByNeed.add(sub);
        }
        Resources.addTo(wQassigned, wQdone);
      }
      Resources.subtractFrom(unassigned, wQassigned);
    }
  }

  // Take the most underserved TempQueue (the one at the head). Collect and
  // return the list of all queues that have the same idealAssigned
  // percentage of guaranteed.
  protected Collection<TempQueuePerPartition> getMostUnderservedQueues(
      PriorityQueue<TempQueuePerPartition> orderedByNeed,
      TQComparator tqComparator) {
    ArrayList<TempQueuePerPartition> underserved =
        new ArrayList<TempQueuePerPartition>();
    while (!orderedByNeed.isEmpty()) {
      TempQueuePerPartition q1 = orderedByNeed.remove();
      underserved.add(q1);
      TempQueuePerPartition q2 = orderedByNeed.peek();
      // q1's pct of guaranteed won't be larger than q2's. If it's less, then
      // return what has already been collected. Otherwise, q1's pct of
      // guaranteed == that of q2, so add q2 to the underserved list during the
      // next pass.
      if (q2 == null || tqComparator.compare(q1, q2) < 0) {
        return underserved;
      }
    }
    return underserved;
  }

  /**
   * Computes a normalizedGuarantee capacity based on active queues.
   * @param rc resource calculator
   * @param clusterResource the total amount of resources in the cluster
   * @param queues the list of queues to consider
   */
  private void resetCapacity(ResourceCalculator rc, Resource clusterResource,
      Collection<TempQueuePerPartition> queues, boolean ignoreGuar) {
    Resource activeCap = Resource.newInstance(0, 0);

    if (ignoreGuar) {
      for (TempQueuePerPartition q : queues) {
        q.normalizedGuarantee = (float) 1.0f / ((float) queues.size());
      }
    } else {
      for (TempQueuePerPartition q : queues) {
        Resources.addTo(activeCap, q.guaranteed);
      }
      for (TempQueuePerPartition q : queues) {
        q.normalizedGuarantee = Resources.divide(rc, clusterResource,
            q.guaranteed, activeCap);
      }
    }
  }

  private String getPartitionByNodeId(NodeId nodeId) {
    return scheduler.getSchedulerNode(nodeId).getPartition();
  }

  /**
   * Return whether we should preempt rmContainer. If we should, deduct its
   * resource from <code>resourceToObtainByPartition</code>.
   */
  private boolean tryPreemptContainerAndDeductResToObtain(
      Map<String, Resource> resourceToObtainByPartitions,
      RMContainer rmContainer, Resource clusterResource,
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap) {
    ApplicationAttemptId attemptId = rmContainer.getApplicationAttemptId();

    // Do not account the resource of a container more than once
    if (preemptMapContains(preemptMap, attemptId, rmContainer)) {
      return false;
    }

    String nodePartition = getPartitionByNodeId(rmContainer.getAllocatedNode());
    Resource toObtainByPartition =
        resourceToObtainByPartitions.get(nodePartition);

    if (null != toObtainByPartition
        && Resources.greaterThan(rc, clusterResource, toObtainByPartition,
            Resources.none())) {
      Resources.subtractFrom(toObtainByPartition,
          rmContainer.getAllocatedResource());
      // When we have no more resources to obtain, remove the partition from
      // the map.
      if (Resources.lessThanOrEqual(rc, clusterResource, toObtainByPartition,
          Resources.none())) {
        resourceToObtainByPartitions.remove(nodePartition);
      }
      if (LOG.isDebugEnabled()) {
        LOG.debug("Marked container=" + rmContainer.getContainerId()
            + " in partition=" + nodePartition + " will be preempted");
      }
      // Add to preemptMap
      addToPreemptMap(preemptMap, attemptId, rmContainer);
      return true;
    }

    return false;
  }

  private boolean preemptMapContains(
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
      ApplicationAttemptId attemptId, RMContainer rmContainer) {
    Set<RMContainer> rmContainers;
    if (null == (rmContainers = preemptMap.get(attemptId))) {
      return false;
    }
    return rmContainers.contains(rmContainer);
  }

  private void addToPreemptMap(
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
      ApplicationAttemptId appAttemptId, RMContainer containerToPreempt) {
    Set<RMContainer> set;
    if (null == (set = preemptMap.get(appAttemptId))) {
      set = new HashSet<RMContainer>();
      preemptMap.put(appAttemptId, set);
    }
    set.add(containerToPreempt);
  }

  /**
   * Based on a resource preemption target, drop reservations of containers
   * and, if necessary, select containers for preemption from applications in
   * each over-capacity queue. It uses {@link #NATURAL_TERMINATION_FACTOR} to
   * account for containers that will naturally complete.
   *
   * @param leafQueueNames set of leaf queues to preempt from
   * @param clusterResource total amount of cluster resources
   * @return a map of applicationID to set of containers to preempt
   */
  private Map<ApplicationAttemptId, Set<RMContainer>> getContainersToPreempt(
      Set<String> leafQueueNames, Resource clusterResource) {

    Map<ApplicationAttemptId, Set<RMContainer>> preemptMap =
        new HashMap<ApplicationAttemptId, Set<RMContainer>>();
    List<RMContainer> skippedAMContainerlist = new ArrayList<RMContainer>();

    // Loop over all leaf queues
    for (String queueName : leafQueueNames) {
      // check if preemption is disabled for the queue
      if (getQueueByPartition(queueName,
          RMNodeLabelsManager.NO_LABEL).preemptionDisabled) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("skipping from queue=" + queueName
              + " because it's a non-preemptable queue");
        }
        continue;
      }

      // compute resToObtainByPartition considering inter-queue preemption
      LeafQueue leafQueue = null;

      Map<String, Resource> resToObtainByPartition =
          new HashMap<String, Resource>();
      for (TempQueuePerPartition qT : getQueuePartitions(queueName)) {
        leafQueue = qT.leafQueue;
        // we act only if we are violating balance by more than
        // maxIgnoredOverCapacity
        if (Resources.greaterThan(rc, clusterResource, qT.current,
            Resources.multiply(qT.guaranteed, 1.0 + maxIgnoredOverCapacity))) {
          // we introduce a dampening factor naturalTerminationFactor that
          // accounts for natural termination of containers
          Resource resToObtain = Resources.multiply(qT.toBePreempted,
              naturalTerminationFactor);
          // Only add resToObtain when it is > 0
          if (Resources.greaterThan(rc, clusterResource, resToObtain,
              Resources.none())) {
            resToObtainByPartition.put(qT.partition, resToObtain);
            if (LOG.isDebugEnabled()) {
              LOG.debug("Queue=" + queueName + " partition=" + qT.partition
                  + " resource-to-obtain=" + resToObtain);
            }
          }
          qT.actuallyPreempted = Resources.clone(resToObtain);
        } else {
          qT.actuallyPreempted = Resources.none();
        }
      }

      synchronized (leafQueue) {
        // go through all ignore-partition-exclusivity containers first to make
        // sure such containers will be preempted first
        Map<String, TreeSet<RMContainer>> ignorePartitionExclusivityContainers =
            leafQueue.getIgnoreExclusivityRMContainers();
        for (String partition : resToObtainByPartition.keySet()) {
          if (ignorePartitionExclusivityContainers.containsKey(partition)) {
            TreeSet<RMContainer> rmContainers =
                ignorePartitionExclusivityContainers.get(partition);
            // We check containers in reverse order, so containers of
            // later-submitted applications are preempted first.
            for (RMContainer c : rmContainers.descendingSet()) {
              boolean preempted = tryPreemptContainerAndDeductResToObtain(
                  resToObtainByPartition, c, clusterResource, preemptMap);
              if (!preempted) {
                break;
              }
            }
          }
        }

        // preempt other containers
        Resource skippedAMSize = Resource.newInstance(0, 0);
        Iterator<FiCaSchedulerApp> desc =
            leafQueue.getOrderingPolicy().getPreemptionIterator();
        while (desc.hasNext()) {
          FiCaSchedulerApp fc = desc.next();
          // When we complete preemption from one partition, it is removed
          // from resToObtainByPartition, so when the map becomes empty we
          // know no more preemption is needed
          if (resToObtainByPartition.isEmpty()) {
            break;
          }

          preemptFrom(fc, clusterResource, resToObtainByPartition,
              skippedAMContainerlist, skippedAMSize, preemptMap);
        }

        // We can try preempting AM containers (still retaining at most
        // maxAMCapacityForThisQueue of AM resources) if more resources need
        // to be preempted from this queue.
        Resource maxAMCapacityForThisQueue = Resources.multiply(
            Resources.multiply(clusterResource,
                leafQueue.getAbsoluteCapacity()),
            leafQueue.getMaxAMResourcePerQueuePercent());
        preemptAMContainers(clusterResource, preemptMap,
            skippedAMContainerlist, resToObtainByPartition, skippedAMSize,
            maxAMCapacityForThisQueue);
      }
    }
    return preemptMap;
  }

  /**
   * As more resources are needed for preemption, saved AM containers have to
   * be rescanned. Such AM containers can be preempted based on resToObtain,
   * but maxAMCapacityForThisQueue resources will still be retained.
   *
   * @param clusterResource
   * @param preemptMap
   * @param skippedAMContainerlist
   * @param resToObtainByPartition
   * @param skippedAMSize
   * @param maxAMCapacityForThisQueue
   */
  private void preemptAMContainers(Resource clusterResource,
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap,
      List<RMContainer> skippedAMContainerlist,
      Map<String, Resource> resToObtainByPartition, Resource skippedAMSize,
      Resource maxAMCapacityForThisQueue) {
    for (RMContainer c : skippedAMContainerlist) {
      // Got the required amount of resources for preemption, can stop now
      if (resToObtainByPartition.isEmpty()) {
        break;
      }
      // Once skippedAMSize drops to maxAMCapacityForThisQueue, stop selecting
      // containers for preemption.
      if (Resources.lessThanOrEqual(rc, clusterResource, skippedAMSize,
          maxAMCapacityForThisQueue)) {
        break;
      }
      boolean preempted =
          tryPreemptContainerAndDeductResToObtain(resToObtainByPartition, c,
              clusterResource, preemptMap);
      if (preempted) {
        Resources.subtractFrom(skippedAMSize, c.getAllocatedResource());
      }
    }
    skippedAMContainerlist.clear();
  }

  /**
   * Given a target preemption for a specific application, select containers
   * to preempt (after unreserving all reservations for that app).
   */
  @SuppressWarnings("unchecked")
  private void preemptFrom(FiCaSchedulerApp app,
      Resource clusterResource, Map<String, Resource> resToObtainByPartition,
      List<RMContainer> skippedAMContainerlist, Resource skippedAMSize,
      Map<ApplicationAttemptId, Set<RMContainer>> preemptMap) {
    ApplicationAttemptId appId = app.getApplicationAttemptId();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Looking at application=" + app.getApplicationAttemptId()
          + " resourceToObtain=" + resToObtainByPartition);
    }

    // first drop reserved containers towards rsrcPreempt
    List<RMContainer> reservedContainers =
        new ArrayList<RMContainer>(app.getReservedContainers());
    for (RMContainer c : reservedContainers) {
      if (resToObtainByPartition.isEmpty()) {
        return;
      }

      // Try to preempt this container
      tryPreemptContainerAndDeductResToObtain(resToObtainByPartition, c,
          clusterResource, preemptMap);

      if (!observeOnly) {
        rmContext.getDispatcher().getEventHandler().handle(
            new ContainerPreemptEvent(
                appId, c, SchedulerEventType.DROP_RESERVATION));
      }
    }

    // if more resources are to be freed go through all live containers in
    // reverse priority and reverse allocation order and mark them for
    // preemption
    List<RMContainer> liveContainers =
        new ArrayList<RMContainer>(app.getLiveContainers());

    sortContainers(liveContainers);

    for (RMContainer c : liveContainers) {
      if (resToObtainByPartition.isEmpty()) {
        return;
      }

      // Skip AM Container from preemption for now.
      if (c.isAMContainer()) {
        skippedAMContainerlist.add(c);
        Resources.addTo(skippedAMSize, c.getAllocatedResource());
        continue;
      }

      // Try to preempt this container
      tryPreemptContainerAndDeductResToObtain(resToObtainByPartition, c,
          clusterResource, preemptMap);
    }
  }

  /**
   * Compare by reversed priority order first, and then reversed containerId
   * order.
   * @param containers
   */
  @VisibleForTesting
  static void sortContainers(List<RMContainer> containers) {
    Collections.sort(containers, new Comparator<RMContainer>() {
      @Override
      public int compare(RMContainer a, RMContainer b) {
        Comparator<Priority> c = new org.apache.hadoop.yarn.server
            .resourcemanager.resource.Priority.Comparator();
        int priorityComp = c.compare(b.getContainer().getPriority(),
            a.getContainer().getPriority());
        if (priorityComp != 0) {
          return priorityComp;
        }
        return b.getContainerId().compareTo(a.getContainerId());
      }
    });
  }

  @Override
  public long getMonitoringInterval() {
    return monitoringInterval;
  }

  @Override
  public String getPolicyName() {
    return "ProportionalCapacityPreemptionPolicy";
  }

  /**
   * This method walks a tree of CSQueue and clones the portion of the state
   * relevant for preemption in TempQueue(s). It also maintains a pointer to
   * the leaves. Finally it aggregates pending resources in each queue and
   * rolls them up to higher levels.
   *
   * @param curQueue current queue being examined
   * @param partitionResource the total amount of resources in the partition
   * @param partitionToLookAt the partition whose state is being cloned
   * @return the root of the cloned queue hierarchy
   */
  private TempQueuePerPartition cloneQueues(CSQueue curQueue,
      Resource partitionResource, String partitionToLookAt) {
    TempQueuePerPartition ret;
    synchronized (curQueue) {
      String queueName = curQueue.getQueueName();
      QueueCapacities qc = curQueue.getQueueCapacities();
      float absCap = qc.getAbsoluteCapacity(partitionToLookAt);
      float absMaxCap = qc.getAbsoluteMaximumCapacity(partitionToLookAt);
      boolean preemptionDisabled = curQueue.getPreemptionDisabled();

      Resource current = curQueue.getQueueResourceUsage().getUsed(
          partitionToLookAt);
      Resource guaranteed = Resources.multiply(partitionResource, absCap);
      Resource maxCapacity = Resources.multiply(partitionResource, absMaxCap);

      // when the partition is a non-exclusive partition, the actual
      // maxCapacity could be more than the specified maxCapacity
      try {
        if (!scheduler.getRMContext().getNodeLabelManager()
            .isExclusiveNodeLabel(partitionToLookAt)) {
          maxCapacity =
              Resources.max(rc, partitionResource, maxCapacity, current);
        }
      } catch (IOException e) {
        // This can be caused by the partition being removed while the
        // capacity monitor is running; just ignore the error, it will be
        // corrected on the next check.
      }

      Resource extra = Resource.newInstance(0, 0);
      if (Resources.greaterThan(rc, partitionResource, current, guaranteed)) {
        extra = Resources.subtract(current, guaranteed);
      }
      if (curQueue instanceof LeafQueue) {
        LeafQueue l = (LeafQueue) curQueue;
        Resource pending =
            l.getQueueResourceUsage().getPending(partitionToLookAt);
        ret = new TempQueuePerPartition(queueName, current, pending,
            guaranteed, maxCapacity, preemptionDisabled, partitionToLookAt);
        if (preemptionDisabled) {
          ret.untouchableExtra = extra;
        } else {
          ret.preemptableExtra = extra;
        }
        ret.setLeafQueue(l);
      } else {
        Resource pending = Resource.newInstance(0, 0);
        ret = new TempQueuePerPartition(curQueue.getQueueName(), current,
            pending, guaranteed, maxCapacity, false, partitionToLookAt);
        Resource childrensPreemptable = Resource.newInstance(0, 0);
        for (CSQueue c : curQueue.getChildQueues()) {
          TempQueuePerPartition subq =
              cloneQueues(c, partitionResource, partitionToLookAt);
          Resources.addTo(childrensPreemptable, subq.preemptableExtra);
          ret.addChild(subq);
        }
        // untouchableExtra = max(extra - childrensPreemptable, 0)
        if (Resources.greaterThanOrEqual(
            rc, partitionResource, childrensPreemptable, extra)) {
          ret.untouchableExtra = Resource.newInstance(0, 0);
        } else {
          ret.untouchableExtra =
              Resources.subtract(extra, childrensPreemptable);
        }
        ret.preemptableExtra = Resources.min(
            rc, partitionResource, childrensPreemptable, extra);
      }
    }
    addTempQueuePartition(ret);
    return ret;
  }

  // simple printout function that reports internal queue state (useful for
  // plotting)
  private void logToCSV(List<String> leafQueueNames) {
    Collections.sort(leafQueueNames);
    String queueState = " QUEUESTATE: " + clock.getTime();
    StringBuilder sb = new StringBuilder();
    sb.append(queueState);

    for (String queueName : leafQueueNames) {
      TempQueuePerPartition tq =
          getQueueByPartition(queueName, RMNodeLabelsManager.NO_LABEL);
      sb.append(", ");
      tq.appendLogString(sb);
    }
    LOG.debug(sb.toString());
  }

  private void addTempQueuePartition(TempQueuePerPartition queuePartition) {
    String queueName = queuePartition.queueName;

    Map<String, TempQueuePerPartition> queuePartitions;
    if (null == (queuePartitions = queueToPartitions.get(queueName))) {
      queuePartitions = new HashMap<String, TempQueuePerPartition>();
      queueToPartitions.put(queueName, queuePartitions);
    }
    queuePartitions.put(queuePartition.partition, queuePartition);
  }

  /**
   * Get the queue partition for the given queueName and partitionName.
   */
  private TempQueuePerPartition getQueueByPartition(String queueName,
      String partition) {
    Map<String, TempQueuePerPartition> partitionToQueues = null;
    if (null == (partitionToQueues = queueToPartitions.get(queueName))) {
      return null;
    }
    return partitionToQueues.get(partition);
  }

  /**
   * Get all queue partitions for the given queueName.
   */
  private Collection<TempQueuePerPartition> getQueuePartitions(
      String queueName) {
    if (!queueToPartitions.containsKey(queueName)) {
      return null;
    }
    return queueToPartitions.get(queueName).values();
  }

  /**
   * Temporary data structure tracking resource availability, pending resource
   * need, and current utilization. This is a per-queue-per-partition data
   * structure.
   */
  static class TempQueuePerPartition {
    final String queueName;
    final Resource current;
    final Resource pending;
    final Resource guaranteed;
    final Resource maxCapacity;
    final String partition;
    Resource idealAssigned;
    Resource toBePreempted;

    // For logging purposes
    Resource actuallyPreempted;
    Resource untouchableExtra;
    Resource preemptableExtra;

    double normalizedGuarantee;

    final ArrayList<TempQueuePerPartition> children;
    LeafQueue leafQueue;
    boolean preemptionDisabled;

    TempQueuePerPartition(String queueName, Resource current, Resource pending,
        Resource guaranteed, Resource maxCapacity, boolean preemptionDisabled,
        String partition) {
      this.queueName = queueName;
      this.current = current;
      this.pending = pending;
      this.guaranteed = guaranteed;
      this.maxCapacity = maxCapacity;
      this.idealAssigned = Resource.newInstance(0, 0);
      this.actuallyPreempted = Resource.newInstance(0, 0);
      this.toBePreempted = Resource.newInstance(0, 0);
      this.normalizedGuarantee = Float.NaN;
      this.children = new ArrayList<TempQueuePerPartition>();
      this.untouchableExtra = Resource.newInstance(0, 0);
      this.preemptableExtra = Resource.newInstance(0, 0);
      this.preemptionDisabled = preemptionDisabled;
      this.partition = partition;
    }

    public void setLeafQueue(LeafQueue l) {
      assert children.size() == 0;
      this.leafQueue = l;
    }

    /**
     * When adding a child we also aggregate its pending resource needs.
     * @param q the child queue to add to this queue
     */
    public void addChild(TempQueuePerPartition q) {
      assert leafQueue == null;
      children.add(q);
      Resources.addTo(pending, q.pending);
    }

    public void addChildren(ArrayList<TempQueuePerPartition> queues) {
      assert leafQueue == null;
      children.addAll(queues);
    }

    public ArrayList<TempQueuePerPartition> getChildren() {
      return children;
    }

    // This function "accepts" all the resources it can (pending) and returns
    // the unused ones
    Resource offer(Resource avail, ResourceCalculator rc,
        Resource clusterResource) {
      Resource absMaxCapIdealAssignedDelta = Resources.componentwiseMax(
          Resources.subtract(maxCapacity, idealAssigned),
          Resource.newInstance(0, 0));
      // remain = avail - min(avail, (max - assigned),
      //                      (current + pending - assigned))
      Resource accepted =
          Resources.min(rc, clusterResource,
              absMaxCapIdealAssignedDelta,
              Resources.min(rc, clusterResource, avail, Resources.subtract(
                  Resources.add(current, pending), idealAssigned)));
      Resource remain = Resources.subtract(avail, accepted);
      Resources.addTo(idealAssigned, accepted);
      return remain;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append(" NAME: " + queueName)
          .append(" CUR: ").append(current)
          .append(" PEN: ").append(pending)
          .append(" GAR: ").append(guaranteed)
          .append(" NORM: ").append(normalizedGuarantee)
          .append(" IDEAL_ASSIGNED: ").append(idealAssigned)
          .append(" IDEAL_PREEMPT: ").append(toBePreempted)
          .append(" ACTUAL_PREEMPT: ").append(actuallyPreempted)
          .append(" UNTOUCHABLE: ").append(untouchableExtra)
          .append(" PREEMPTABLE: ").append(preemptableExtra)
          .append("\n");
      return sb.toString();
    }

    public void printAll() {
      LOG.info(this.toString());
      for (TempQueuePerPartition sub : this.getChildren()) {
        sub.printAll();
      }
    }

    public void assignPreemption(float scalingFactor, ResourceCalculator rc,
        Resource clusterResource) {
      if (Resources.greaterThan(rc, clusterResource, current, idealAssigned)) {
        toBePreempted = Resources.multiply(
            Resources.subtract(current, idealAssigned), scalingFactor);
      } else {
        toBePreempted = Resource.newInstance(0, 0);
      }
    }

    void appendLogString(StringBuilder sb) {
      sb.append(queueName).append(", ")
          .append(current.getMemory()).append(", ")
          .append(current.getVirtualCores()).append(", ")
          .append(pending.getMemory()).append(", ")
          .append(pending.getVirtualCores()).append(", ")
          .append(guaranteed.getMemory()).append(", ")
          .append(guaranteed.getVirtualCores()).append(", ")
          .append(idealAssigned.getMemory()).append(", ")
          .append(idealAssigned.getVirtualCores()).append(", ")
          .append(toBePreempted.getMemory()).append(", ")
          .append(toBePreempted.getVirtualCores()).append(", ")
          .append(actuallyPreempted.getMemory()).append(", ")
          .append(actuallyPreempted.getVirtualCores());
    }
  }

  static class TQComparator implements Comparator<TempQueuePerPartition> {
    private ResourceCalculator rc;
    private Resource clusterRes;

    TQComparator(ResourceCalculator rc, Resource clusterRes) {
      this.rc = rc;
      this.clusterRes = clusterRes;
    }

    @Override
    public int compare(TempQueuePerPartition tq1, TempQueuePerPartition tq2) {
      if (getIdealPctOfGuaranteed(tq1) < getIdealPctOfGuaranteed(tq2)) {
        return -1;
      }
      if (getIdealPctOfGuaranteed(tq1) > getIdealPctOfGuaranteed(tq2)) {
        return 1;
      }
      return 0;
    }

    // Calculates idealAssigned / guaranteed.
    // TempQueues with 0 guarantees are always considered the most over
    // capacity and therefore considered last for resources.
    private double getIdealPctOfGuaranteed(TempQueuePerPartition q) {
      double pctOver = Integer.MAX_VALUE;
      if (q != null && Resources.greaterThan(
          rc, clusterRes, q.guaranteed, Resources.none())) {
        pctOver =
            Resources.divide(rc, clusterRes, q.idealAssigned, q.guaranteed);
      }
      return (pctOver);
    }
  }

  @VisibleForTesting
  public Map<String, Map<String, TempQueuePerPartition>> getQueuePartitions() {
    return queueToPartitions;
  }
}
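
// A minimal usage sketch (for illustration only; rmContext and
// capacityScheduler are assumed to come from a running ResourceManager, as
// they do in this policy's unit tests):
//
//   Configuration conf = new Configuration();
//   conf.setBoolean(ProportionalCapacityPreemptionPolicy.OBSERVE_ONLY, true);
//   conf.setLong(ProportionalCapacityPreemptionPolicy.MONITORING_INTERVAL,
//       3000L);
//   ProportionalCapacityPreemptionPolicy policy =
//       new ProportionalCapacityPreemptionPolicy(conf, rmContext,
//           capacityScheduler);
//   policy.editSchedule();  // one monitoring round: computes the ideal
//                           // assignment and, because OBSERVE_ONLY is set,
//                           // only logs instead of preempting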