/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableSet; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.ContainerId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; import org.apache.hadoop.yarn.server.resourcemanager.RMContext; import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy; import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CSQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.ParentQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.QueueCapacities; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.preemption.PreemptableQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.ContainerPreemptEvent; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType; import org.apache.hadoop.yarn.util.Clock; import org.apache.hadoop.yarn.util.SystemClock; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; import java.io.IOException; import java.text.MessageFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; /** * This class implement a {@link SchedulingEditPolicy} that is designed to be * paired with the {@code CapacityScheduler}. At every invocation of {@code * editSchedule()} it computes the ideal amount of resources assigned to each * queue (for each queue in the hierarchy), and determines whether preemption * is needed. Overcapacity is distributed among queues in a weighted fair manner, * where the weight is the amount of guaranteed capacity for the queue. * Based on this ideal assignment it determines whether preemption is required * and select a set of containers from each application that would be killed if * the corresponding amount of resources is not freed up by the application. * * If not in {@code observeOnly} mode, it triggers preemption requests via a * {@link ContainerPreemptEvent} that the {@code ResourceManager} will ensure * to deliver to the application (or to execute). * * If the deficit of resources is persistent over a long enough period of time * this policy will trigger forced termination of containers (again by generating * {@link ContainerPreemptEvent}). */ public class ProportionalCapacityPreemptionPolicy implements SchedulingEditPolicy, CapacitySchedulerPreemptionContext { private static final Log LOG = LogFactory.getLog(ProportionalCapacityPreemptionPolicy.class); private final Clock clock; // Configurable fields private double maxIgnoredOverCapacity; private long maxWaitTime; private long monitoringInterval; private float percentageClusterPreemptionAllowed; private double naturalTerminationFactor; private boolean observeOnly; private boolean lazyPreempionEnabled; private float maxAllowableLimitForIntraQueuePreemption; private float minimumThresholdForIntraQueuePreemption; // Pointer to other RM components private RMContext rmContext; private ResourceCalculator rc; private CapacityScheduler scheduler; private RMNodeLabelsManager nlm; // Internal properties to make decisions of what to preempt private final Map<RMContainer,Long> preemptionCandidates = new HashMap<>(); private Map<String, Map<String, TempQueuePerPartition>> queueToPartitions = new HashMap<>(); private Map<String, LinkedHashSet<String>> partitionToUnderServedQueues = new HashMap<String, LinkedHashSet<String>>(); private List<PreemptionCandidatesSelector> candidatesSelectionPolicies = new ArrayList<>(); private Set<String> allPartitions; private Set<String> leafQueueNames; // Preemptable Entities, synced from scheduler at every run private Map<String, PreemptableQueue> preemptableQueues; private Set<ContainerId> killableContainers; @SuppressWarnings("unchecked") public ProportionalCapacityPreemptionPolicy() { clock = new SystemClock(); allPartitions = Collections.EMPTY_SET; leafQueueNames = Collections.EMPTY_SET; preemptableQueues = Collections.EMPTY_MAP; } @SuppressWarnings("unchecked") @VisibleForTesting public ProportionalCapacityPreemptionPolicy(RMContext context, CapacityScheduler scheduler, Clock clock) { init(context.getYarnConfiguration(), context, scheduler); this.clock = clock; allPartitions = Collections.EMPTY_SET; leafQueueNames = Collections.EMPTY_SET; preemptableQueues = Collections.EMPTY_MAP; } public void init(Configuration config, RMContext context, PreemptableResourceScheduler sched) { LOG.info("Preemption monitor:" + this.getClass().getCanonicalName()); assert null == scheduler : "Unexpected duplicate call to init"; if (!(sched instanceof CapacityScheduler)) { throw new YarnRuntimeException("Class " + sched.getClass().getCanonicalName() + " not instance of " + CapacityScheduler.class.getCanonicalName()); } rmContext = context; scheduler = (CapacityScheduler) sched; CapacitySchedulerConfiguration csConfig = scheduler.getConfiguration(); maxIgnoredOverCapacity = csConfig.getDouble( CapacitySchedulerConfiguration.PREEMPTION_MAX_IGNORED_OVER_CAPACITY, CapacitySchedulerConfiguration.DEFAULT_PREEMPTION_MAX_IGNORED_OVER_CAPACITY); naturalTerminationFactor = csConfig.getDouble( CapacitySchedulerConfiguration.PREEMPTION_NATURAL_TERMINATION_FACTOR, CapacitySchedulerConfiguration.DEFAULT_PREEMPTION_NATURAL_TERMINATION_FACTOR); maxWaitTime = csConfig.getLong( CapacitySchedulerConfiguration.PREEMPTION_WAIT_TIME_BEFORE_KILL, CapacitySchedulerConfiguration.DEFAULT_PREEMPTION_WAIT_TIME_BEFORE_KILL); monitoringInterval = csConfig.getLong( CapacitySchedulerConfiguration.PREEMPTION_MONITORING_INTERVAL, CapacitySchedulerConfiguration.DEFAULT_PREEMPTION_MONITORING_INTERVAL); percentageClusterPreemptionAllowed = csConfig.getFloat( CapacitySchedulerConfiguration.TOTAL_PREEMPTION_PER_ROUND, CapacitySchedulerConfiguration.DEFAULT_TOTAL_PREEMPTION_PER_ROUND); observeOnly = csConfig.getBoolean( CapacitySchedulerConfiguration.PREEMPTION_OBSERVE_ONLY, CapacitySchedulerConfiguration.DEFAULT_PREEMPTION_OBSERVE_ONLY); lazyPreempionEnabled = csConfig.getBoolean( CapacitySchedulerConfiguration.LAZY_PREEMPTION_ENALBED, CapacitySchedulerConfiguration.DEFAULT_LAZY_PREEMPTION_ENABLED); maxAllowableLimitForIntraQueuePreemption = csConfig.getFloat( CapacitySchedulerConfiguration. INTRAQUEUE_PREEMPTION_MAX_ALLOWABLE_LIMIT, CapacitySchedulerConfiguration. DEFAULT_INTRAQUEUE_PREEMPTION_MAX_ALLOWABLE_LIMIT); minimumThresholdForIntraQueuePreemption = csConfig.getFloat( CapacitySchedulerConfiguration. INTRAQUEUE_PREEMPTION_MINIMUM_THRESHOLD, CapacitySchedulerConfiguration. DEFAULT_INTRAQUEUE_PREEMPTION_MINIMUM_THRESHOLD); rc = scheduler.getResourceCalculator(); nlm = scheduler.getRMContext().getNodeLabelManager(); // Do we need white queue-priority preemption policy? boolean isQueuePriorityPreemptionEnabled = csConfig.getPUOrderingPolicyUnderUtilizedPreemptionEnabled(); if (isQueuePriorityPreemptionEnabled) { candidatesSelectionPolicies.add( new QueuePriorityContainerCandidateSelector(this)); } // Do we need to specially consider reserved containers? boolean selectCandidatesForResevedContainers = csConfig.getBoolean( CapacitySchedulerConfiguration. PREEMPTION_SELECT_CANDIDATES_FOR_RESERVED_CONTAINERS, CapacitySchedulerConfiguration. DEFAULT_PREEMPTION_SELECT_CANDIDATES_FOR_RESERVED_CONTAINERS); if (selectCandidatesForResevedContainers) { candidatesSelectionPolicies .add(new ReservedContainerCandidatesSelector(this)); } // initialize candidates preemption selection policies candidatesSelectionPolicies.add(new FifoCandidatesSelector(this)); // Do we need to specially consider intra queue boolean isIntraQueuePreemptionEnabled = csConfig.getBoolean( CapacitySchedulerConfiguration.INTRAQUEUE_PREEMPTION_ENABLED, CapacitySchedulerConfiguration.DEFAULT_INTRAQUEUE_PREEMPTION_ENABLED); if (isIntraQueuePreemptionEnabled) { candidatesSelectionPolicies.add(new IntraQueueCandidatesSelector(this)); } } @Override public ResourceCalculator getResourceCalculator() { return rc; } @Override public synchronized void editSchedule() { long startTs = clock.getTime(); CSQueue root = scheduler.getRootQueue(); Resource clusterResources = Resources.clone(scheduler.getClusterResource()); containerBasedPreemptOrKill(root, clusterResources); if (LOG.isDebugEnabled()) { LOG.debug("Total time used=" + (clock.getTime() - startTs) + " ms."); } } @SuppressWarnings("unchecked") private void preemptOrkillSelectedContainerAfterWait( Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, long currentTime) { if (LOG.isDebugEnabled()) { LOG.debug( "Starting to preempt containers for selectedCandidates and size:" + selectedCandidates.size()); } // preempt (or kill) the selected containers for (Map.Entry<ApplicationAttemptId, Set<RMContainer>> e : selectedCandidates .entrySet()) { ApplicationAttemptId appAttemptId = e.getKey(); if (LOG.isDebugEnabled()) { LOG.debug("Send to scheduler: in app=" + appAttemptId + " #containers-to-be-preemptionCandidates=" + e.getValue().size()); } for (RMContainer container : e.getValue()) { // if we tried to preempt this for more than maxWaitTime if (preemptionCandidates.get(container) != null && preemptionCandidates.get(container) + maxWaitTime <= currentTime) { // kill it rmContext.getDispatcher().getEventHandler().handle( new ContainerPreemptEvent(appAttemptId, container, SchedulerEventType.MARK_CONTAINER_FOR_KILLABLE)); preemptionCandidates.remove(container); } else { if (preemptionCandidates.get(container) != null) { // We already updated the information to scheduler earlier, we need // not have to raise another event. continue; } //otherwise just send preemption events rmContext.getDispatcher().getEventHandler().handle( new ContainerPreemptEvent(appAttemptId, container, SchedulerEventType.MARK_CONTAINER_FOR_PREEMPTION)); preemptionCandidates.put(container, currentTime); } } } } private void syncKillableContainersFromScheduler() { // sync preemptable entities from scheduler preemptableQueues = scheduler.getPreemptionManager().getShallowCopyOfPreemptableQueues(); killableContainers = new HashSet<>(); for (Map.Entry<String, PreemptableQueue> entry : preemptableQueues .entrySet()) { PreemptableQueue entity = entry.getValue(); for (Map<ContainerId, RMContainer> map : entity.getKillableContainers() .values()) { killableContainers.addAll(map.keySet()); } } } private void cleanupStaledPreemptionCandidates(long currentTime) { // Keep the preemptionCandidates list clean for (Iterator<RMContainer> i = preemptionCandidates.keySet().iterator(); i.hasNext(); ) { RMContainer id = i.next(); // garbage collect containers that are irrelevant for preemption // And avoid preempt selected containers for *this execution* // or within 1 ms if (preemptionCandidates.get(id) + 2 * maxWaitTime < currentTime) { i.remove(); } } } private Set<String> getLeafQueueNames(TempQueuePerPartition q) { if (q.children == null || q.children.isEmpty()) { return ImmutableSet.of(q.queueName); } Set<String> leafQueueNames = new HashSet<>(); for (TempQueuePerPartition child : q.children) { leafQueueNames.addAll(getLeafQueueNames(child)); } return leafQueueNames; } /** * This method selects and tracks containers to be preemptionCandidates. If a container * is in the target list for more than maxWaitTime it is killed. * * @param root the root of the CapacityScheduler queue hierarchy * @param clusterResources the total amount of resources in the cluster */ private void containerBasedPreemptOrKill(CSQueue root, Resource clusterResources) { // Sync killable containers from scheduler when lazy preemption enabled if (lazyPreempionEnabled) { syncKillableContainersFromScheduler(); } // All partitions to look at Set<String> partitions = new HashSet<>(); partitions.addAll(scheduler.getRMContext() .getNodeLabelManager().getClusterNodeLabelNames()); partitions.add(RMNodeLabelsManager.NO_LABEL); this.allPartitions = ImmutableSet.copyOf(partitions); // extract a summary of the queues from scheduler synchronized (scheduler) { queueToPartitions.clear(); for (String partitionToLookAt : allPartitions) { cloneQueues(root, Resources .clone(nlm.getResourceByLabel(partitionToLookAt, clusterResources)), partitionToLookAt); } // Update effective priority of queues } this.leafQueueNames = ImmutableSet.copyOf(getLeafQueueNames( getQueueByPartition(CapacitySchedulerConfiguration.ROOT, RMNodeLabelsManager.NO_LABEL))); // compute total preemption allowed Resource totalPreemptionAllowed = Resources.multiply(clusterResources, percentageClusterPreemptionAllowed); // based on ideal allocation select containers to be preemptionCandidates from each // queue and each application Map<ApplicationAttemptId, Set<RMContainer>> toPreempt = new HashMap<>(); for (PreemptionCandidatesSelector selector : candidatesSelectionPolicies) { long startTime = 0; if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat .format("Trying to use {0} to select preemption candidates", selector.getClass().getName())); startTime = clock.getTime(); } toPreempt = selector.selectCandidates(toPreempt, clusterResources, totalPreemptionAllowed); if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat .format("{0} uses {1} millisecond to run", selector.getClass().getName(), clock.getTime() - startTime)); int totalSelected = 0; for (Set<RMContainer> set : toPreempt.values()) { totalSelected += set.size(); } LOG.debug(MessageFormat .format("So far, total {0} containers selected to be preempted", totalSelected)); } } if (LOG.isDebugEnabled()) { logToCSV(new ArrayList<>(leafQueueNames)); } // if we are in observeOnly mode return before any action is taken if (observeOnly) { return; } // TODO: need consider revert killable containers when no more demandings. // Since we could have several selectors to make decisions concurrently. // So computed ideal-allocation varies between different selectors. // // We may need to "score" killable containers and revert the most preferred // containers. The bottom line is, we shouldn't preempt a queue which is already // below its guaranteed resource. long currentTime = clock.getTime(); // preempt (or kill) the selected containers preemptOrkillSelectedContainerAfterWait(toPreempt, currentTime); // cleanup staled preemption candidates cleanupStaledPreemptionCandidates(currentTime); } @Override public long getMonitoringInterval() { return monitoringInterval; } @Override public String getPolicyName() { return "ProportionalCapacityPreemptionPolicy"; } @VisibleForTesting public Map<RMContainer, Long> getToPreemptContainers() { return preemptionCandidates; } /** * This method walks a tree of CSQueue and clones the portion of the state * relevant for preemption in TempQueue(s). It also maintains a pointer to * the leaves. Finally it aggregates pending resources in each queue and rolls * it up to higher levels. * * @param curQueue current queue which I'm looking at now * @param partitionResource the total amount of resources in the cluster * @return the root of the cloned queue hierarchy */ private TempQueuePerPartition cloneQueues(CSQueue curQueue, Resource partitionResource, String partitionToLookAt) { TempQueuePerPartition ret; synchronized (curQueue) { String queueName = curQueue.getQueueName(); QueueCapacities qc = curQueue.getQueueCapacities(); float absCap = qc.getAbsoluteCapacity(partitionToLookAt); float absMaxCap = qc.getAbsoluteMaximumCapacity(partitionToLookAt); boolean preemptionDisabled = curQueue.getPreemptionDisabled(); Resource current = Resources.clone( curQueue.getQueueResourceUsage().getUsed(partitionToLookAt)); Resource killable = Resources.none(); Resource reserved = Resources.clone( curQueue.getQueueResourceUsage().getReserved(partitionToLookAt)); if (null != preemptableQueues.get(queueName)) { killable = Resources.clone(preemptableQueues.get(queueName) .getKillableResource(partitionToLookAt)); } // when partition is a non-exclusive partition, the actual maxCapacity // could more than specified maxCapacity try { if (!scheduler.getRMContext().getNodeLabelManager() .isExclusiveNodeLabel(partitionToLookAt)) { absMaxCap = 1.0f; } } catch (IOException e) { // This may cause by partition removed when running capacity monitor, // just ignore the error, this will be corrected when doing next check. } ret = new TempQueuePerPartition(queueName, current, preemptionDisabled, partitionToLookAt, killable, absCap, absMaxCap, partitionResource, reserved, curQueue); if (curQueue instanceof ParentQueue) { String configuredOrderingPolicy = ((ParentQueue) curQueue).getQueueOrderingPolicy().getConfigName(); // Recursively add children for (CSQueue c : curQueue.getChildQueues()) { TempQueuePerPartition subq = cloneQueues(c, partitionResource, partitionToLookAt); // If we respect priority if (StringUtils.equals( CapacitySchedulerConfiguration.QUEUE_PRIORITY_UTILIZATION_ORDERING_POLICY, configuredOrderingPolicy)) { subq.relativePriority = c.getPriority().getPriority(); } ret.addChild(subq); subq.parent = ret; } } } addTempQueuePartition(ret); return ret; } // simple printout function that reports internal queue state (useful for // plotting) private void logToCSV(List<String> leafQueueNames){ Collections.sort(leafQueueNames); String queueState = " QUEUESTATE: " + clock.getTime(); StringBuilder sb = new StringBuilder(); sb.append(queueState); for (String queueName : leafQueueNames) { TempQueuePerPartition tq = getQueueByPartition(queueName, RMNodeLabelsManager.NO_LABEL); sb.append(", "); tq.appendLogString(sb); } LOG.debug(sb.toString()); } private void addTempQueuePartition(TempQueuePerPartition queuePartition) { String queueName = queuePartition.queueName; Map<String, TempQueuePerPartition> queuePartitions; if (null == (queuePartitions = queueToPartitions.get(queueName))) { queuePartitions = new HashMap<>(); queueToPartitions.put(queueName, queuePartitions); } queuePartitions.put(queuePartition.partition, queuePartition); } /** * Get queue partition by given queueName and partitionName */ @Override public TempQueuePerPartition getQueueByPartition(String queueName, String partition) { Map<String, TempQueuePerPartition> partitionToQueues; if (null == (partitionToQueues = queueToPartitions.get(queueName))) { throw new YarnRuntimeException("This shouldn't happen, cannot find " + "TempQueuePerPartition for queueName=" + queueName); } return partitionToQueues.get(partition); } /** * Get all queue partitions by given queueName */ @Override public Collection<TempQueuePerPartition> getQueuePartitions(String queueName) { if (!queueToPartitions.containsKey(queueName)) { throw new YarnRuntimeException("This shouldn't happen, cannot find " + "TempQueuePerPartition collection for queueName=" + queueName); } return queueToPartitions.get(queueName).values(); } @Override public CapacityScheduler getScheduler() { return scheduler; } @Override public RMContext getRMContext() { return rmContext; } @Override public boolean isObserveOnly() { return observeOnly; } @Override public Set<ContainerId> getKillableContainers() { return killableContainers; } @Override public double getMaxIgnoreOverCapacity() { return maxIgnoredOverCapacity; } @Override public double getNaturalTerminationFactor() { return naturalTerminationFactor; } @Override public Set<String> getLeafQueueNames() { return leafQueueNames; } @Override public Set<String> getAllPartitions() { return allPartitions; } @VisibleForTesting Map<String, Map<String, TempQueuePerPartition>> getQueuePartitions() { return queueToPartitions; } @Override public float getMaxAllowableLimitForIntraQueuePreemption() { return maxAllowableLimitForIntraQueuePreemption; } @Override public float getMinimumThresholdForIntraQueuePreemption() { return minimumThresholdForIntraQueuePreemption; } @Override public Resource getPartitionResource(String partition) { return Resources.clone(nlm.getResourceByLabel(partition, Resources.clone(scheduler.getClusterResource()))); } public LinkedHashSet<String> getUnderServedQueuesPerPartition( String partition) { return partitionToUnderServedQueues.get(partition); } public void addPartitionToUnderServedQueues(String queueName, String partition) { LinkedHashSet<String> underServedQueues = partitionToUnderServedQueues .get(partition); if (null == underServedQueues) { underServedQueues = new LinkedHashSet<String>(); partitionToUnderServedQueues.put(partition, underServedQueues); } underServedQueues.add(queueName); } }