/** * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.aurora.scheduler.preemptor; import java.time.Instant; import java.util.List; import java.util.Set; import javax.inject.Inject; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Optional; import com.google.common.base.Predicate; import com.google.common.collect.FluentIterable; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import com.google.common.collect.Ordering; import com.google.common.collect.Sets; import org.apache.aurora.scheduler.HostOffer; import org.apache.aurora.scheduler.TierManager; import org.apache.aurora.scheduler.configuration.executor.ExecutorSettings; import org.apache.aurora.scheduler.filter.AttributeAggregate; import org.apache.aurora.scheduler.filter.SchedulingFilter; import org.apache.aurora.scheduler.filter.SchedulingFilter.ResourceRequest; import org.apache.aurora.scheduler.filter.SchedulingFilter.UnusedResource; import org.apache.aurora.scheduler.filter.SchedulingFilter.Veto; import org.apache.aurora.scheduler.resources.ResourceBag; import org.apache.aurora.scheduler.resources.ResourceManager; import org.apache.aurora.scheduler.storage.Storage.StoreProvider; import org.apache.aurora.scheduler.storage.entities.IHostAttributes; import org.apache.aurora.scheduler.storage.entities.ITaskConfig; import static java.util.Objects.requireNonNull; import static org.apache.aurora.scheduler.resources.ResourceBag.EMPTY; import static org.apache.aurora.scheduler.resources.ResourceBag.IS_MESOS_REVOCABLE; import static org.apache.aurora.scheduler.resources.ResourceManager.bagFromMesosResources; import static org.apache.aurora.scheduler.resources.ResourceManager.getNonRevocableOfferResources; /** * Filters active tasks (victims) and available offer (slack) resources that can accommodate a * given task (candidate), provided victims are preempted. * <p> * A task may preempt another task if the following conditions hold true: * <ol> * <li>The resources reserved for a victim (or a set of victims) are sufficient to satisfy * the candidate. * </li> * <li>Both candidate and victim are owned by the same user and the * {@link ITaskConfig#getPriority} of a victim is lower OR a victim is non-production and the * candidate is production. * </li> * </ol> */ public interface PreemptionVictimFilter { /** * Returns a set of {@link PreemptionVictim} that can accommodate a given task if preempted. * * @param pendingTask Task to search preemption slot for. * @param victims Active tasks on a slave. * @param attributeAggregate An {@link AttributeAggregate} instance for the task's job. * @param offer A resource offer for a slave. * @param storeProvider A store provider to access task data. * @return A set of {@code PreemptionVictim} instances to preempt for a given task. */ Optional<ImmutableSet<PreemptionVictim>> filterPreemptionVictims( ITaskConfig pendingTask, Iterable<PreemptionVictim> victims, AttributeAggregate attributeAggregate, Optional<HostOffer> offer, StoreProvider storeProvider); class PreemptionVictimFilterImpl implements PreemptionVictimFilter { private final SchedulingFilter schedulingFilter; private final ExecutorSettings executorSettings; private final PreemptorMetrics metrics; private final TierManager tierManager; @Inject PreemptionVictimFilterImpl( SchedulingFilter schedulingFilter, ExecutorSettings executorSettings, PreemptorMetrics metrics, TierManager tierManager) { this.schedulingFilter = requireNonNull(schedulingFilter); this.executorSettings = requireNonNull(executorSettings); this.metrics = requireNonNull(metrics); this.tierManager = requireNonNull(tierManager); } private static final Function<HostOffer, String> OFFER_TO_HOST = offer -> offer.getOffer().getHostname(); private static final Function<PreemptionVictim, String> VICTIM_TO_HOST = PreemptionVictim::getSlaveHost; private final Function<PreemptionVictim, ResourceBag> victimToResources = new Function<PreemptionVictim, ResourceBag>() { @Override public ResourceBag apply(PreemptionVictim victim) { ResourceBag bag = victim.getResourceBag(); if (victim.getConfig().isSetExecutorConfig()) { // Be pessimistic about revocable resource available if config is not available bag.add(executorSettings.getExecutorOverhead( victim.getConfig().getExecutorConfig().getName()).orElse(EMPTY)); } if (tierManager.getTier(victim.getConfig()).isRevocable()) { // Revocable task CPU cannot be used for preemption purposes as it's a compressible // resource. We can still use RAM, DISK and PORTS as they are not compressible. bag = bag.filter(IS_MESOS_REVOCABLE.negate()); } return bag; } }; private static final java.util.function.Predicate<Integer> IS_ZERO = e -> e == 0; /** * A Resources object is greater than another iff _all_ of its resource components are greater. * A Resources object compares as equal if some but not all components are greater * than or equal to the other. */ @VisibleForTesting static final Ordering<ResourceBag> ORDER = new Ordering<ResourceBag>() { @Override public int compare(ResourceBag left, ResourceBag right) { ImmutableList.Builder<Integer> builder = ImmutableList.builder(); left.streamResourceVectors().forEach( entry -> builder.add(entry.getValue().compareTo(right.valueOf(entry.getKey())))); List<Integer> results = builder.build(); if (results.stream().allMatch(IS_ZERO)) { return 0; } if (results.stream().filter(IS_ZERO.negate()).allMatch(e -> e > 0)) { return 1; } if (results.stream().filter(IS_ZERO.negate()).allMatch(e -> e < 0)) { return -1; } return 0; } }; // TODO(zmanji) Consider using Dominant Resource Fairness for ordering instead of the vector // ordering private final Ordering<PreemptionVictim> resourceOrder = ORDER.onResultOf(victimToResources).reverse(); @Override public Optional<ImmutableSet<PreemptionVictim>> filterPreemptionVictims( ITaskConfig pendingTask, Iterable<PreemptionVictim> possibleVictims, AttributeAggregate jobState, Optional<HostOffer> offer, StoreProvider storeProvider) { // This enforces the precondition that all of the resources are from the same host. We need to // get the host for the schedulingFilter. Set<String> hosts = ImmutableSet.<String>builder() .addAll(Iterables.transform(possibleVictims, VICTIM_TO_HOST)) .addAll(Iterables.transform(offer.asSet(), OFFER_TO_HOST)).build(); ResourceBag slackResources = offer.asSet().stream() .map(o -> bagFromMesosResources(getNonRevocableOfferResources(o.getOffer()))) .reduce((l, r) -> l.add(r)) .orElse(EMPTY); FluentIterable<PreemptionVictim> preemptableTasks = FluentIterable.from(possibleVictims) .filter(preemptionFilter(pendingTask)); List<PreemptionVictim> sortedVictims = resourceOrder.immutableSortedCopy(preemptableTasks); if (sortedVictims.isEmpty()) { return Optional.absent(); } Set<PreemptionVictim> toPreemptTasks = Sets.newHashSet(); Optional<IHostAttributes> attributes = storeProvider.getAttributeStore().getHostAttributes(Iterables.getOnlyElement(hosts)); if (!attributes.isPresent()) { metrics.recordMissingAttributes(); return Optional.absent(); } ResourceBag overhead = pendingTask.isSetExecutorConfig() ? executorSettings.getExecutorOverhead( pendingTask.getExecutorConfig().getName()).orElse(EMPTY) : EMPTY; ResourceBag totalResource = slackResources; for (PreemptionVictim victim : sortedVictims) { toPreemptTasks.add(victim); totalResource = totalResource.add(victimToResources.apply(victim)); Optional<Instant> unavailability = Optional.absent(); if (offer.isPresent()) { unavailability = offer.get().getUnavailabilityStart(); } Set<Veto> vetoes = schedulingFilter.filter( new UnusedResource(totalResource, attributes.get(), unavailability), new ResourceRequest( pendingTask, ResourceManager.bagFromResources(pendingTask.getResources()).add(overhead), jobState)); if (vetoes.isEmpty()) { return Optional.of(ImmutableSet.copyOf(toPreemptTasks)); } } return Optional.absent(); } /** * Creates a filter that will find tasks that the provided {@code pendingTask} may preempt. * * @param pendingTask A task that is not scheduled to possibly preempt other tasks for. * @return A filter that will compare the priorities and resources required by other tasks * with {@code preemptableTask}. */ private Predicate<PreemptionVictim> preemptionFilter(final ITaskConfig pendingTask) { return possibleVictim -> { boolean pendingIsPreemptible = tierManager.getTier(pendingTask).isPreemptible(); boolean victimIsPreemptible = tierManager.getTier(possibleVictim.getConfig()).isPreemptible(); if (!pendingIsPreemptible && victimIsPreemptible) { return true; } else if (pendingIsPreemptible == victimIsPreemptible) { // If preemptible flags are equal, preemption is based on priority within the same role. if (pendingTask.getJob().getRole().equals(possibleVictim.getRole())) { return pendingTask.getPriority() > possibleVictim.getPriority(); } else { return false; } } else { return false; } }; } } }