/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerPreemptEvent;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerPreemptEventType;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue;
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
import org.apache.hadoop.yarn.util.Clock;
import org.apache.hadoop.yarn.util.SystemClock;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.hadoop.yarn.util.resource.Resources;

import com.google.common.annotations.VisibleForTesting;

/**
 * This class implements a {@link SchedulingEditPolicy} that is designed to be
 * paired with the {@code CapacityScheduler}. At every invocation of
 * {@code editSchedule()} it computes the ideal amount of resources assigned to
 * each queue (for each queue in the hierarchy), and determines whether
 * preemption is needed. Overcapacity is distributed among queues in a weighted
 * fair manner, where the weight is the amount of guaranteed capacity for the
 * queue. Based on this ideal assignment it determines whether preemption is
 * required and selects a set of containers from each application that would be
 * killed if the corresponding amount of resources is not freed up by the
 * application.
 *
 * If not in {@code observeOnly} mode, it triggers preemption requests via a
 * {@link ContainerPreemptEvent} that the {@code ResourceManager} delivers to
 * the application (or acts upon directly).
 *
 * If the deficit of resources is persistent over a long enough period of
 * time, this policy will trigger forced termination of containers (again by
 * generating {@link ContainerPreemptEvent}).
 */
public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolicy {

  private static final Log LOG =
      LogFactory.getLog(ProportionalCapacityPreemptionPolicy.class);

  /** If true, run the policy but do not affect the cluster with preemption and
   * kill events. */
  public static final String OBSERVE_ONLY =
      "yarn.resourcemanager.monitor.capacity.preemption.observe_only";

  /** Time in milliseconds between invocations of this policy. */
  public static final String MONITORING_INTERVAL =
      "yarn.resourcemanager.monitor.capacity.preemption.monitoring_interval";

  /** Time in milliseconds between requesting a preemption from an application
   * and killing the container. */
  public static final String WAIT_TIME_BEFORE_KILL =
      "yarn.resourcemanager.monitor.capacity.preemption.max_wait_before_kill";

  /** Maximum percentage of resources preempted in a single round. By
   * controlling this value one can throttle the pace at which containers are
   * reclaimed from the cluster. After computing the total desired preemption,
   * the policy scales it back within this limit. */
  public static final String TOTAL_PREEMPTION_PER_ROUND =
      "yarn.resourcemanager.monitor.capacity.preemption.total_preemption_per_round";

  /** Maximum amount of resources above the target capacity that is ignored for
   * preemption. This defines a deadzone around the target capacity that helps
   * prevent thrashing and oscillations around the computed target balance.
   * High values slow convergence to the target capacity and (absent natural
   * completions) might prevent convergence to the guaranteed capacity. */
  public static final String MAX_IGNORED_OVER_CAPACITY =
      "yarn.resourcemanager.monitor.capacity.preemption.max_ignored_over_capacity";

  /**
   * Given a computed preemption target, account for containers naturally
   * expiring and preempt only this percentage of the delta. This determines
   * the rate of geometric convergence into the deadzone
   * ({@link #MAX_IGNORED_OVER_CAPACITY}). For example, a termination factor of
   * 0.5 will reclaim almost 95% of resources within
   * 5 * {@link #WAIT_TIME_BEFORE_KILL}, even absent natural termination.
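   *
   * As an illustrative worked example of the geometric rate: with a factor of
   * 0.5, each round requests back half of the still-outstanding excess, so
   * after k rounds roughly (1 - 0.5)^k of the original delta remains, i.e.
   * about 3% after 5 rounds.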
   */
  public static final String NATURAL_TERMINATION_FACTOR =
      "yarn.resourcemanager.monitor.capacity.preemption.natural_termination_factor";

  // the dispatcher to send preempt and kill events
  public EventHandler<ContainerPreemptEvent> dispatcher;

  private final Clock clock;
  private double maxIgnoredOverCapacity;
  private long maxWaitTime;
  private CapacityScheduler scheduler;
  private long monitoringInterval;
  private final Map<RMContainer, Long> preempted =
      new HashMap<RMContainer, Long>();
  private ResourceCalculator rc;
  private float percentageClusterPreemptionAllowed;
  private double naturalTerminationFactor;
  private boolean observeOnly;

  public ProportionalCapacityPreemptionPolicy() {
    clock = new SystemClock();
  }

  public ProportionalCapacityPreemptionPolicy(Configuration config,
      EventHandler<ContainerPreemptEvent> dispatcher,
      CapacityScheduler scheduler) {
    this(config, dispatcher, scheduler, new SystemClock());
  }

  public ProportionalCapacityPreemptionPolicy(Configuration config,
      EventHandler<ContainerPreemptEvent> dispatcher,
      CapacityScheduler scheduler, Clock clock) {
    init(config, dispatcher, scheduler);
    this.clock = clock;
  }

  public void init(Configuration config,
      EventHandler<ContainerPreemptEvent> disp,
      PreemptableResourceScheduler sched) {
    LOG.info("Preemption monitor:" + this.getClass().getCanonicalName());
    assert null == scheduler : "Unexpected duplicate call to init";
    if (!(sched instanceof CapacityScheduler)) {
      throw new YarnRuntimeException("Class " +
          sched.getClass().getCanonicalName() + " not instance of " +
          CapacityScheduler.class.getCanonicalName());
    }
    dispatcher = disp;
    scheduler = (CapacityScheduler) sched;
    maxIgnoredOverCapacity = config.getDouble(MAX_IGNORED_OVER_CAPACITY, 0.1);
    naturalTerminationFactor =
        config.getDouble(NATURAL_TERMINATION_FACTOR, 0.2);
    maxWaitTime = config.getLong(WAIT_TIME_BEFORE_KILL, 15000);
    monitoringInterval = config.getLong(MONITORING_INTERVAL, 3000);
    percentageClusterPreemptionAllowed =
        config.getFloat(TOTAL_PREEMPTION_PER_ROUND, (float) 0.1);
    observeOnly = config.getBoolean(OBSERVE_ONLY, false);
    rc = scheduler.getResourceCalculator();
  }

  @Override
  public void editSchedule() {
    CSQueue root = scheduler.getRootQueue();
    Resource clusterResources =
        Resources.clone(scheduler.getClusterResources());
    containerBasedPreemptOrKill(root, clusterResources);
  }

  /**
   * This method selects and tracks containers to be preempted. If a container
   * is in the target list for more than maxWaitTime it is killed.
   *
   * @param root the root of the CapacityScheduler queue hierarchy
   * @param clusterResources the total amount of resources in the cluster
   */
  private void containerBasedPreemptOrKill(CSQueue root,
      Resource clusterResources) {

    // extract a summary of the queues from the scheduler
    TempQueue tRoot;
    synchronized (scheduler) {
      tRoot = cloneQueues(root, clusterResources);
    }

    // compute the ideal distribution of resources among queues
    // updates the cloned queues' state accordingly
    tRoot.idealAssigned = tRoot.guaranteed;
    Resource totalPreemptionAllowed = Resources.multiply(clusterResources,
        percentageClusterPreemptionAllowed);
    List<TempQueue> queues =
        recursivelyComputeIdealAssignment(tRoot, totalPreemptionAllowed);

    // based on ideal allocation select containers to be preempted from each
    // queue and each application
    Map<ApplicationAttemptId, Set<RMContainer>> toPreempt =
        getContainersToPreempt(queues, clusterResources);

    logToCSV(queues);

    // if we are in observeOnly mode return before any action is taken
    if (observeOnly) {
      return;
    }

    // preempt (or kill) the selected containers
    for (Map.Entry<ApplicationAttemptId, Set<RMContainer>> e
        : toPreempt.entrySet()) {
      for (RMContainer container : e.getValue()) {
        // if we tried to preempt this container for more than maxWaitTime
        if (preempted.get(container) != null &&
            preempted.get(container) + maxWaitTime < clock.getTime()) {
          // kill it
          dispatcher.handle(new ContainerPreemptEvent(e.getKey(), container,
              ContainerPreemptEventType.KILL_CONTAINER));
          preempted.remove(container);
        } else {
          // otherwise just send preemption events
          dispatcher.handle(new ContainerPreemptEvent(e.getKey(), container,
              ContainerPreemptEventType.PREEMPT_CONTAINER));
          if (preempted.get(container) == null) {
            preempted.put(container, clock.getTime());
          }
        }
      }
    }

    // keep the preempted list clean
    for (Iterator<RMContainer> i = preempted.keySet().iterator(); i.hasNext();) {
      RMContainer id = i.next();
      // garbage collect containers that are irrelevant for preemption
      if (preempted.get(id) + 2 * maxWaitTime < clock.getTime()) {
        i.remove();
      }
    }
  }

  /**
   * This method recursively computes the ideal assignment of resources to each
   * level of the hierarchy. This ensures that leaf queues that are
   * over-capacity but whose parents are within capacity will not be preempted.
   * Preemptions are allowed within each subtree according to local over/under
   * capacity.
   *
   * @param root the root of the cloned queue hierarchy
   * @param totalPreemptionAllowed maximum amount of preemption allowed
   * @return a list of leaf queues updated with preemption targets
   */
  private List<TempQueue> recursivelyComputeIdealAssignment(
      TempQueue root, Resource totalPreemptionAllowed) {
    List<TempQueue> leafs = new ArrayList<TempQueue>();
    if (root.getChildren() != null && root.getChildren().size() > 0) {
      // compute ideal distribution at this level
      computeIdealResourceDistribution(rc, root.getChildren(),
          totalPreemptionAllowed, root.idealAssigned);
      // compute recursively for lower levels and build the list of leafs
      for (TempQueue t : root.getChildren()) {
        leafs.addAll(
            recursivelyComputeIdealAssignment(t, totalPreemptionAllowed));
      }
    } else {
      // we are in a leaf: nothing to do, just return ourselves
      return Collections.singletonList(root);
    }
    return leafs;
  }

  /**
   * This method computes (for a single level in the tree, passed as a
   * {@code List<TempQueue>}) the ideal assignment of resources. This is done
   * iteratively, offering capacity fairly across all queues with pending
   * demands. It terminates when no resources are left to assign, or when all
   * demand is satisfied.
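   *
   * As an illustrative sketch of one level: given two sibling queues A and B
   * with guarantees of 60 and 40 units out of 100, the first pass offers A 60
   * and B 40; if B has no unsatisfied demand it accepts nothing and is dropped
   * from the active set, and the following passes re-offer the remaining 40
   * units entirely to A (up to A's own demand).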
   *
   * @param rc resource calculator
   * @param queues a list of cloned queues to be assigned capacity to (this is
   * an out param)
   * @param totalPreemptionAllowed total amount of preemption we allow
   * @param tot_guarant the amount of capacity assigned to this pool of queues
   */
  private void computeIdealResourceDistribution(ResourceCalculator rc,
      List<TempQueue> queues, Resource totalPreemptionAllowed,
      Resource tot_guarant) {

    // qAlloc tracks currently active queues (will decrease progressively as
    // demand is met)
    List<TempQueue> qAlloc = new ArrayList<TempQueue>(queues);
    // unassigned tracks how much resources are still to assign, initialized
    // with the total capacity for this set of queues
    Resource unassigned = Resources.clone(tot_guarant);

    // assign all cluster resources until no more demand, or no resources are
    // left
    while (!qAlloc.isEmpty() && Resources.greaterThan(rc, tot_guarant,
        unassigned, Resources.none())) {
      Resource wQassigned = Resource.newInstance(0, 0);

      // we compute normalizedGuarantees capacity based on currently active
      // queues
      resetCapacity(rc, unassigned, qAlloc);

      // offer each queue its capacity first and, in following iterations, its
      // share of over-capacity
      for (Iterator<TempQueue> i = qAlloc.iterator(); i.hasNext();) {
        TempQueue sub = i.next();
        Resource wQavail =
            Resources.multiply(unassigned, sub.normalizedGuarantee);
        Resource wQidle = sub.offer(wQavail, rc, tot_guarant);
        Resource wQdone = Resources.subtract(wQavail, wQidle);
        // if the queue accepted nothing (wQdone is zero) it is fully satisfied
        // and it is removed from the list of active queues qAlloc
        if (!Resources.greaterThan(rc, tot_guarant, wQdone, Resources.none())) {
          i.remove();
        }
        Resources.addTo(wQassigned, wQdone);
      }
      Resources.subtractFrom(unassigned, wQassigned);
    }

    // based on the ideal assignment computed above and the current assignment
    // we derive how much preemption is required overall
    Resource totPreemptionNeeded = Resource.newInstance(0, 0);
    for (TempQueue t : queues) {
      if (Resources.greaterThan(rc, tot_guarant, t.current, t.idealAssigned)) {
        Resources.addTo(totPreemptionNeeded,
            Resources.subtract(t.current, t.idealAssigned));
      }
    }

    // if we need to preempt more than is allowed, compute a factor (0 < f < 1)
    // that is used to scale down how much we ask back from each queue
    float scalingFactor = 1.0F;
    if (Resources.greaterThan(rc, tot_guarant,
        totPreemptionNeeded, totalPreemptionAllowed)) {
      scalingFactor = Resources.divide(rc, tot_guarant,
          totalPreemptionAllowed, totPreemptionNeeded);
    }

    // assign to each queue the amount of actual preemption based on local
    // information of ideal preemption and the scaling factor
    for (TempQueue t : queues) {
      t.assignPreemption(scalingFactor, rc, tot_guarant);
    }
    if (LOG.isDebugEnabled()) {
      long time = clock.getTime();
      for (TempQueue t : queues) {
        LOG.debug(time + ": " + t);
      }
    }
  }

  /**
   * Computes the normalizedGuarantee capacity based on the active queues.
   * @param rc resource calculator
   * @param clusterResource the total amount of resources in the cluster
   * @param queues the list of queues to consider
   */
  private void resetCapacity(ResourceCalculator rc, Resource clusterResource,
      List<TempQueue> queues) {
    Resource activeCap = Resource.newInstance(0, 0);
    for (TempQueue q : queues) {
      Resources.addTo(activeCap, q.guaranteed);
    }
    for (TempQueue q : queues) {
      q.normalizedGuarantee =
          Resources.divide(rc, clusterResource, q.guaranteed, activeCap);
    }
  }

  /**
   * Based on a resource preemption target, drop reservations of containers
   * and, if necessary, select containers for preemption from applications in
   * each
   * over-capacity queue. It uses {@link #NATURAL_TERMINATION_FACTOR} to
   * account for containers that will naturally complete.
   *
   * @param queues set of leaf queues to preempt from
   * @param clusterResource total amount of cluster resources
   * @return a map of applicationID to set of containers to preempt
   */
  private Map<ApplicationAttemptId, Set<RMContainer>> getContainersToPreempt(
      List<TempQueue> queues, Resource clusterResource) {

    Map<ApplicationAttemptId, Set<RMContainer>> list =
        new HashMap<ApplicationAttemptId, Set<RMContainer>>();

    for (TempQueue qT : queues) {
      // we act only if we are violating balance by more than
      // maxIgnoredOverCapacity
      if (Resources.greaterThan(rc, clusterResource, qT.current,
          Resources.multiply(qT.guaranteed, 1.0 + maxIgnoredOverCapacity))) {
        // we introduce a dampening factor naturalTerminationFactor that
        // accounts for natural termination of containers
        Resource resToObtain =
            Resources.multiply(qT.toBePreempted, naturalTerminationFactor);

        // lock the leaf queue while we scan applications and unreserve
        synchronized (qT.leafQueue) {
          NavigableSet<FiCaSchedulerApp> ns =
              (NavigableSet<FiCaSchedulerApp>) qT.leafQueue.getApplications();
          Iterator<FiCaSchedulerApp> desc = ns.descendingIterator();
          qT.actuallyPreempted = Resources.clone(resToObtain);
          while (desc.hasNext()) {
            FiCaSchedulerApp fc = desc.next();
            if (Resources.lessThanOrEqual(rc, clusterResource,
                resToObtain, Resources.none())) {
              break;
            }
            list.put(fc.getApplicationAttemptId(),
                preemptFrom(fc, clusterResource, resToObtain));
          }
        }
      }
    }
    return list;
  }

  /**
   * Given a target preemption for a specific application, select containers
   * to preempt (after unreserving all reservations for that app).
   *
   * @param app the application to preempt containers from
   * @param clusterResource the total amount of resources in the cluster
   * @param rsrcPreempt the amount of resources still to reclaim (decremented
   * in place as containers are selected)
   * @return the set of containers selected for preemption
   */
  private Set<RMContainer> preemptFrom(
      FiCaSchedulerApp app, Resource clusterResource, Resource rsrcPreempt) {
    Set<RMContainer> ret = new HashSet<RMContainer>();
    ApplicationAttemptId appId = app.getApplicationAttemptId();

    // first drop reserved containers towards rsrcPreempt
    List<RMContainer> reservations =
        new ArrayList<RMContainer>(app.getReservedContainers());
    for (RMContainer c : reservations) {
      if (Resources.lessThanOrEqual(rc, clusterResource,
          rsrcPreempt, Resources.none())) {
        return ret;
      }
      if (!observeOnly) {
        dispatcher.handle(new ContainerPreemptEvent(appId, c,
            ContainerPreemptEventType.DROP_RESERVATION));
      }
      Resources.subtractFrom(rsrcPreempt, c.getContainer().getResource());
    }

    // if more resources are to be freed go through all live containers in
    // reverse priority and reverse allocation order and mark them for
    // preemption
    List<RMContainer> containers =
        new ArrayList<RMContainer>(app.getLiveContainers());
    sortContainers(containers);
    for (RMContainer c : containers) {
      if (Resources.lessThanOrEqual(rc, clusterResource,
          rsrcPreempt, Resources.none())) {
        return ret;
      }
      ret.add(c);
      Resources.subtractFrom(rsrcPreempt, c.getContainer().getResource());
    }
    return ret;
  }

  /**
   * Compare by reversed priority order first, and then reversed containerId
   * order.
   * @param containers the list of containers to sort in place
   */
  @VisibleForTesting
  static void sortContainers(List<RMContainer> containers) {
    Collections.sort(containers, new Comparator<RMContainer>() {
      @Override
      public int compare(RMContainer a, RMContainer b) {
        Comparator<Priority> c = new org.apache.hadoop.yarn.server
            .resourcemanager.resource.Priority.Comparator();
        int priorityComp = c.compare(b.getContainer().getPriority(),
            a.getContainer().getPriority());
        if (priorityComp != 0) {
          return priorityComp;
        }
        return b.getContainerId().getId() -
            a.getContainerId().getId();
      }
    });
  }

  @Override
  public long getMonitoringInterval() {
    return monitoringInterval;
  }

  @Override
  public String getPolicyName() {
    return "ProportionalCapacityPreemptionPolicy";
  }

  /**
   * This method walks a tree of CSQueue and clones the portion of the state
   * relevant for preemption in TempQueue(s). It also maintains a pointer to
   * the leaves. Finally it aggregates pending resources in each queue and
   * rolls them up to higher levels.
   *
   * @param root the root of the CapacityScheduler queue hierarchy
   * @param clusterResources the total amount of resources in the cluster
   * @return the root of the cloned queue hierarchy
   */
  private TempQueue cloneQueues(CSQueue root, Resource clusterResources) {
    TempQueue ret;
    synchronized (root) {
      float absUsed = root.getAbsoluteUsedCapacity();
      Resource current = Resources.multiply(clusterResources, absUsed);
      Resource guaranteed =
          Resources.multiply(clusterResources, root.getAbsoluteCapacity());
      if (root instanceof LeafQueue) {
        LeafQueue l = (LeafQueue) root;
        Resource pending = l.getTotalResourcePending();
        ret = new TempQueue(root.getQueueName(), current, pending, guaranteed);
        ret.setLeafQueue(l);
      } else {
        Resource pending = Resource.newInstance(0, 0);
        ret = new TempQueue(root.getQueueName(), current, pending, guaranteed);
        for (CSQueue c : root.getChildQueues()) {
          ret.addChild(cloneQueues(c, clusterResources));
        }
      }
    }
    return ret;
  }

  // simple printout function that reports internal queue state (useful for
  // plotting)
  private void logToCSV(List<TempQueue> unorderedqueues) {
    List<TempQueue> queues = new ArrayList<TempQueue>(unorderedqueues);
    Collections.sort(queues, new Comparator<TempQueue>() {
      @Override
      public int compare(TempQueue o1, TempQueue o2) {
        return o1.queueName.compareTo(o2.queueName);
      }
    });
    String queueState = " QUEUESTATE: " + clock.getTime();
    StringBuilder sb = new StringBuilder();
    sb.append(queueState);
    for (TempQueue tq : queues) {
      sb.append(", ");
      tq.appendLogString(sb);
    }
    LOG.info(sb.toString());
  }

  /**
   * Temporary data-structure tracking resource availability, pending resource
   * need, and current utilization. Used to clone {@link CSQueue}.
   */
  static class TempQueue {
    final String queueName;
    final Resource current;
    final Resource pending;
    final Resource guaranteed;
    Resource idealAssigned;
    Resource toBePreempted;
    Resource actuallyPreempted;
    double normalizedGuarantee;

    final ArrayList<TempQueue> children;
    LeafQueue leafQueue;

    TempQueue(String queueName, Resource current, Resource pending,
        Resource guaranteed) {
      this.queueName = queueName;
      this.current = current;
      this.pending = pending;
      this.guaranteed = guaranteed;
      this.idealAssigned = Resource.newInstance(0, 0);
      this.actuallyPreempted = Resource.newInstance(0, 0);
      this.toBePreempted = Resource.newInstance(0, 0);
      this.normalizedGuarantee = Float.NaN;
      this.children = new ArrayList<TempQueue>();
    }

    public void setLeafQueue(LeafQueue l) {
      assert children.size() == 0;
      this.leafQueue = l;
    }

    /**
     * When adding a child we also aggregate its pending resource needs.
     * @param q the child queue to add to this queue
     */
    public void addChild(TempQueue q) {
      assert leafQueue == null;
      children.add(q);
      Resources.addTo(pending, q.pending);
    }

    public void addChildren(ArrayList<TempQueue> queues) {
      assert leafQueue == null;
      children.addAll(queues);
    }

    public ArrayList<TempQueue> getChildren() {
      return children;
    }

    // This function "accepts" all the resources it can (pending) and returns
    // the unused ones
    Resource offer(Resource avail, ResourceCalculator rc,
        Resource clusterResource) {
      // remain = avail - min(avail, current + pending - assigned)
      Resource accepted = Resources.min(rc, clusterResource,
          avail,
          Resources.subtract(
              Resources.add(current, pending), idealAssigned));
      Resource remain = Resources.subtract(avail, accepted);
      Resources.addTo(idealAssigned, accepted);
      return remain;
    }

    @Override
    public String toString() {
      StringBuilder sb = new StringBuilder();
      sb.append("CUR: ").append(current)
        .append(" PEN: ").append(pending)
        .append(" GAR: ").append(guaranteed)
        .append(" NORM: ").append(normalizedGuarantee)
        .append(" IDEAL_ASSIGNED: ").append(idealAssigned)
        .append(" IDEAL_PREEMPT: ").append(toBePreempted)
        .append(" ACTUAL_PREEMPT: ").append(actuallyPreempted);
      return sb.toString();
    }

    public void assignPreemption(float scalingFactor,
        ResourceCalculator rc, Resource clusterResource) {
      if (Resources.greaterThan(rc, clusterResource, current, idealAssigned)) {
        toBePreempted = Resources.multiply(
            Resources.subtract(current, idealAssigned), scalingFactor);
      } else {
        toBePreempted = Resource.newInstance(0, 0);
      }
    }

    void appendLogString(StringBuilder sb) {
      sb.append(queueName).append(", ")
        .append(current.getMemory()).append(", ")
        .append(current.getVirtualCores()).append(", ")
        .append(pending.getMemory()).append(", ")
        .append(pending.getVirtualCores()).append(", ")
        .append(guaranteed.getMemory()).append(", ")
        .append(guaranteed.getVirtualCores()).append(", ")
        .append(idealAssigned.getMemory()).append(", ")
        .append(idealAssigned.getVirtualCores()).append(", ")
        .append(toBePreempted.getMemory()).append(", ")
        .append(toBePreempted.getVirtualCores()).append(", ")
        .append(actuallyPreempted.getMemory()).append(", ")
        .append(actuallyPreempted.getVirtualCores());
    }
  }
}
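
// A minimal configuration sketch for enabling this policy (assuming the
// standard ResourceManager scheduler-monitor properties,
// "yarn.resourcemanager.scheduler.monitor.enable" and
// "yarn.resourcemanager.scheduler.monitor.policies"; verify against the YARN
// release in use):
//
//   <property>
//     <name>yarn.resourcemanager.scheduler.monitor.enable</name>
//     <value>true</value>
//   </property>
//   <property>
//     <name>yarn.resourcemanager.scheduler.monitor.policies</name>
//     <value>org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy</value>
//   </property>
//
// The preemption knobs defined above (OBSERVE_ONLY, MONITORING_INTERVAL,
// WAIT_TIME_BEFORE_KILL, TOTAL_PREEMPTION_PER_ROUND, MAX_IGNORED_OVER_CAPACITY
// and NATURAL_TERMINATION_FACTOR) are read in init() from the same
// configuration, falling back to the defaults shown there.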