/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity; import java.util.Collection; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.PriorityQueue; import java.util.Set; import java.util.TreeSet; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.IntraQueueCandidatesSelector.TAPriorityComparator; import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue; import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp; import org.apache.hadoop.yarn.util.resource.ResourceCalculator; import org.apache.hadoop.yarn.util.resource.Resources; /** * FifoIntraQueuePreemptionPlugin will handle intra-queue preemption for * priority and user-limit. */ public class FifoIntraQueuePreemptionPlugin implements IntraQueuePreemptionComputePlugin { protected final CapacitySchedulerPreemptionContext context; protected final ResourceCalculator rc; private static final Log LOG = LogFactory.getLog(FifoIntraQueuePreemptionPlugin.class); public FifoIntraQueuePreemptionPlugin(ResourceCalculator rc, CapacitySchedulerPreemptionContext preemptionContext) { this.context = preemptionContext; this.rc = rc; } @Override public Map<String, Resource> getResourceDemandFromAppsPerQueue( String queueName, String partition) { Map<String, Resource> resToObtainByPartition = new HashMap<>(); TempQueuePerPartition tq = context .getQueueByPartition(queueName, partition); Collection<TempAppPerPartition> appsOrderedByPriority = tq.getApps(); Resource actualPreemptNeeded = resToObtainByPartition.get(partition); // Updating pending resource per-partition level. if (actualPreemptNeeded == null) { actualPreemptNeeded = Resources.createResource(0, 0); resToObtainByPartition.put(partition, actualPreemptNeeded); } for (TempAppPerPartition a1 : appsOrderedByPriority) { Resources.addTo(actualPreemptNeeded, a1.getActuallyToBePreempted()); } if (LOG.isDebugEnabled()) { LOG.debug("Selected to preempt " + actualPreemptNeeded + " resource from partition:" + partition); } return resToObtainByPartition; } @Override public void computeAppsIdealAllocation(Resource clusterResource, Resource partitionBasedResource, TempQueuePerPartition tq, Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource totalPreemptedResourceAllowed, Resource queueReassignableResource, float maxAllowablePreemptLimit) { // 1. AM used resource can be considered as a frozen resource for now. // Hence such containers in a queue can be omitted from the preemption // calculation. Map<String, Resource> perUserAMUsed = new HashMap<String, Resource>(); Resource amUsed = calculateUsedAMResourcesPerQueue(tq.partition, tq.leafQueue, perUserAMUsed); Resources.subtractFrom(queueReassignableResource, amUsed); // 2. tq.leafQueue will not be null as we validated it in caller side Collection<FiCaSchedulerApp> apps = tq.leafQueue.getAllApplications(); // We do not need preemption for a single app if (apps.size() == 1) { return; } // 3. Create all tempApps for internal calculation and return a list from // high priority to low priority order. TAPriorityComparator taComparator = new TAPriorityComparator(); PriorityQueue<TempAppPerPartition> orderedByPriority = createTempAppForResCalculation(tq.partition, apps, taComparator); // 4. Calculate idealAssigned per app by checking based on queue's // unallocated resource.Also return apps arranged from lower priority to // higher priority. TreeSet<TempAppPerPartition> orderedApps = calculateIdealAssignedResourcePerApp(clusterResource, partitionBasedResource, tq, selectedCandidates, queueReassignableResource, orderedByPriority, perUserAMUsed); // 5. A configurable limit that could define an ideal allowable preemption // limit. Based on current queue's capacity,defined how much % could become // preemptable. Resource maxIntraQueuePreemptable = Resources.multiply(tq.getGuaranteed(), maxAllowablePreemptLimit); if (Resources.greaterThan(rc, clusterResource, maxIntraQueuePreemptable, tq.getActuallyToBePreempted())) { Resources.subtractFrom(maxIntraQueuePreemptable, tq.getActuallyToBePreempted()); } else { maxIntraQueuePreemptable = Resource.newInstance(0, 0); } // 6. We have two configurations here, one is intra queue limit and second // one is per-round limit for any time preemption. Take a minimum of these Resource preemptionLimit = Resources.min(rc, clusterResource, maxIntraQueuePreemptable, totalPreemptedResourceAllowed); // 7. From lowest priority app onwards, calculate toBePreempted resource // based on demand. calculateToBePreemptedResourcePerApp(clusterResource, orderedApps, preemptionLimit); // Save all apps (low to high) to temp queue for further reference tq.addAllApps(orderedApps); // 8. There are chances that we may preempt for the demand from same // priority level, such cases are to be validated out. validateOutSameAppPriorityFromDemand(clusterResource, (TreeSet<TempAppPerPartition>) tq.getApps()); if (LOG.isDebugEnabled()) { LOG.debug("Queue Name:" + tq.queueName + ", partition:" + tq.partition); for (TempAppPerPartition tmpApp : tq.getApps()) { LOG.debug(tmpApp); } } } private void calculateToBePreemptedResourcePerApp(Resource clusterResource, TreeSet<TempAppPerPartition> orderedApps, Resource preemptionLimit) { for (TempAppPerPartition tmpApp : orderedApps) { if (Resources.lessThanOrEqual(rc, clusterResource, preemptionLimit, Resources.none()) || Resources.lessThanOrEqual(rc, clusterResource, tmpApp.getUsed(), Resources.none())) { continue; } Resource preemtableFromApp = Resources.subtract(tmpApp.getUsed(), tmpApp.idealAssigned); Resources.subtractFrom(preemtableFromApp, tmpApp.selected); Resources.subtractFrom(preemtableFromApp, tmpApp.getAMUsed()); // Calculate toBePreempted from apps as follows: // app.preemptable = min(max(app.used - app.selected - app.ideal, 0), // intra_q_preemptable) tmpApp.toBePreempted = Resources.min(rc, clusterResource, Resources .max(rc, clusterResource, preemtableFromApp, Resources.none()), preemptionLimit); preemptionLimit = Resources.subtract(preemptionLimit, tmpApp.toBePreempted); } } /** * Algorithm for calculating idealAssigned is as follows: * For each partition: * Q.reassignable = Q.used - Q.selected; * * # By default set ideal assigned 0 for app. * app.idealAssigned as 0 * # get user limit from scheduler. * userLimitRes = Q.getUserLimit(userName) * * # initial all value to 0 * Map<String, Resource> userToAllocated * * # Loop from highest priority to lowest priority app to calculate ideal * for app in sorted-by(priority) { * if Q.reassignable < 0: * break; * * if (user-to-allocated.get(app.user) < userLimitRes) { * idealAssigned = min((userLimitRes - userToAllocated.get(app.user)), * (app.used + app.pending - app.selected)) * app.idealAssigned = min(Q.reassignable, idealAssigned) * userToAllocated.get(app.user) += app.idealAssigned; * } else { * // skip this app because user-limit reached * } * Q.reassignable -= app.idealAssigned * } * * @param clusterResource Cluster Resource * @param partitionBasedResource resource per partition * @param tq TempQueue * @param selectedCandidates Already Selected preemption candidates * @param queueReassignableResource Resource used in a queue * @param orderedByPriority List of running apps * @param perUserAMUsed AM used resource * @return List of temp apps ordered from low to high priority */ private TreeSet<TempAppPerPartition> calculateIdealAssignedResourcePerApp( Resource clusterResource, Resource partitionBasedResource, TempQueuePerPartition tq, Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource queueReassignableResource, PriorityQueue<TempAppPerPartition> orderedByPriority, Map<String, Resource> perUserAMUsed) { Comparator<TempAppPerPartition> reverseComp = Collections .reverseOrder(new TAPriorityComparator()); TreeSet<TempAppPerPartition> orderedApps = new TreeSet<>(reverseComp); Map<String, Resource> userIdealAssignedMapping = new HashMap<>(); String partition = tq.partition; Map<String, Resource> preCalculatedUserLimit = new HashMap<String, Resource>(); while (!orderedByPriority.isEmpty()) { // Remove app from the next highest remaining priority and process it to // calculate idealAssigned per app. TempAppPerPartition tmpApp = orderedByPriority.remove(); orderedApps.add(tmpApp); // Once unallocated resource is 0, we can stop assigning ideal per app. if (Resources.lessThanOrEqual(rc, clusterResource, queueReassignableResource, Resources.none())) { continue; } String userName = tmpApp.app.getUser(); Resource userLimitResource = preCalculatedUserLimit.get(userName); // Verify whether we already calculated headroom for this user. if (userLimitResource == null) { userLimitResource = Resources.clone(tq.leafQueue .getUserLimitPerUser(userName, partitionBasedResource, partition)); Resource amUsed = perUserAMUsed.get(userName); if (null == amUsed) { amUsed = Resources.createResource(0, 0); } // Real AM used need not have to be considered for user-limit as well. userLimitResource = Resources.subtract(userLimitResource, amUsed); if (LOG.isDebugEnabled()) { LOG.debug("Userlimit for user '" + userName + "' is :" + userLimitResource + ", and amUsed is:" + amUsed); } preCalculatedUserLimit.put(userName, userLimitResource); } Resource idealAssignedForUser = userIdealAssignedMapping.get(userName); if (idealAssignedForUser == null) { idealAssignedForUser = Resources.createResource(0, 0); userIdealAssignedMapping.put(userName, idealAssignedForUser); } // Calculate total selected container resources from current app. getAlreadySelectedPreemptionCandidatesResource(selectedCandidates, tmpApp, partition); // For any app, used+pending will give its idealAssigned. However it will // be tightly linked to queue's unallocated quota. So lower priority apps // idealAssigned may fall to 0 if higher priority apps demand is more. Resource appIdealAssigned = Resources.add(tmpApp.getUsedDeductAM(), tmpApp.getPending()); Resources.subtractFrom(appIdealAssigned, tmpApp.selected); if (Resources.lessThan(rc, clusterResource, idealAssignedForUser, userLimitResource)) { appIdealAssigned = Resources.min(rc, clusterResource, appIdealAssigned, Resources.subtract(userLimitResource, idealAssignedForUser)); tmpApp.idealAssigned = Resources.clone(Resources.min(rc, clusterResource, queueReassignableResource, appIdealAssigned)); Resources.addTo(idealAssignedForUser, tmpApp.idealAssigned); } else { continue; } // Also set how much resource is needed by this app from others. Resource appUsedExcludedSelected = Resources .subtract(tmpApp.getUsedDeductAM(), tmpApp.selected); if (Resources.greaterThan(rc, clusterResource, tmpApp.idealAssigned, appUsedExcludedSelected)) { tmpApp.setToBePreemptFromOther( Resources.subtract(tmpApp.idealAssigned, appUsedExcludedSelected)); } Resources.subtractFrom(queueReassignableResource, tmpApp.idealAssigned); } return orderedApps; } /* * Previous policies would have already selected few containers from an * application. Calculate total resource from these selected containers. */ private void getAlreadySelectedPreemptionCandidatesResource( Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, TempAppPerPartition tmpApp, String partition) { tmpApp.selected = Resources.createResource(0, 0); Set<RMContainer> containers = selectedCandidates .get(tmpApp.app.getApplicationAttemptId()); if (containers == null) { return; } for (RMContainer cont : containers) { if (partition.equals(cont.getNodeLabelExpression())) { Resources.addTo(tmpApp.selected, cont.getAllocatedResource()); } } } private PriorityQueue<TempAppPerPartition> createTempAppForResCalculation( String partition, Collection<FiCaSchedulerApp> apps, TAPriorityComparator taComparator) { PriorityQueue<TempAppPerPartition> orderedByPriority = new PriorityQueue<>( 100, taComparator); // have an internal temp app structure to store intermediate data(priority) for (FiCaSchedulerApp app : apps) { Resource used = app.getAppAttemptResourceUsage().getUsed(partition); Resource amUsed = null; if (!app.isWaitingForAMContainer()) { amUsed = app.getAMResource(partition); } Resource pending = app.getTotalPendingRequestsPerPartition() .get(partition); Resource reserved = app.getAppAttemptResourceUsage() .getReserved(partition); used = (used == null) ? Resources.createResource(0, 0) : used; amUsed = (amUsed == null) ? Resources.createResource(0, 0) : amUsed; pending = (pending == null) ? Resources.createResource(0, 0) : pending; reserved = (reserved == null) ? Resources.createResource(0, 0) : reserved; HashSet<String> partitions = new HashSet<String>( app.getAppAttemptResourceUsage().getNodePartitionsSet()); partitions.addAll(app.getTotalPendingRequestsPerPartition().keySet()); // Create TempAppPerQueue for further calculation. TempAppPerPartition tmpApp = new TempAppPerPartition(app, Resources.clone(used), Resources.clone(amUsed), Resources.clone(reserved), Resources.clone(pending)); // Set ideal allocation of app as 0. tmpApp.idealAssigned = Resources.createResource(0, 0); orderedByPriority.add(tmpApp); } return orderedByPriority; } /* * Fifo+Priority based preemption policy need not have to preempt resources at * same priority level. Such cases will be validated out. */ public void validateOutSameAppPriorityFromDemand(Resource cluster, TreeSet<TempAppPerPartition> appsOrderedfromLowerPriority) { TempAppPerPartition[] apps = appsOrderedfromLowerPriority .toArray(new TempAppPerPartition[appsOrderedfromLowerPriority.size()]); if (apps.length <= 0) { return; } int lPriority = 0; int hPriority = apps.length - 1; while (lPriority < hPriority && !apps[lPriority].equals(apps[hPriority]) && apps[lPriority].getPriority() < apps[hPriority].getPriority()) { Resource toPreemptFromOther = apps[hPriority] .getToBePreemptFromOther(); Resource actuallyToPreempt = apps[lPriority].getActuallyToBePreempted(); Resource delta = Resources.subtract(apps[lPriority].toBePreempted, actuallyToPreempt); if (Resources.greaterThan(rc, cluster, delta, Resources.none())) { Resource toPreempt = Resources.min(rc, cluster, toPreemptFromOther, delta); apps[hPriority].setToBePreemptFromOther( Resources.subtract(toPreemptFromOther, toPreempt)); apps[lPriority].setActuallyToBePreempted( Resources.add(actuallyToPreempt, toPreempt)); } if (Resources.lessThanOrEqual(rc, cluster, apps[lPriority].toBePreempted, apps[lPriority].getActuallyToBePreempted())) { lPriority++; continue; } if (Resources.equals(apps[hPriority].getToBePreemptFromOther(), Resources.none())) { hPriority--; continue; } } } private Resource calculateUsedAMResourcesPerQueue(String partition, LeafQueue leafQueue, Map<String, Resource> perUserAMUsed) { Collection<FiCaSchedulerApp> runningApps = leafQueue.getApplications(); Resource amUsed = Resources.createResource(0, 0); for (FiCaSchedulerApp app : runningApps) { Resource userAMResource = perUserAMUsed.get(app.getUser()); if (null == userAMResource) { userAMResource = Resources.createResource(0, 0); perUserAMUsed.put(app.getUser(), userAMResource); } Resources.addTo(userAMResource, app.getAMResource(partition)); Resources.addTo(amUsed, app.getAMResource(partition)); } return amUsed; } }