/** * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.aurora.scheduler.mesos; import java.time.Instant; import java.time.temporal.ChronoUnit; import java.util.List; import java.util.concurrent.Executor; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import javax.inject.Inject; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Optional; import org.apache.aurora.common.application.Lifecycle; import org.apache.aurora.common.quantity.Amount; import org.apache.aurora.common.quantity.Time; import org.apache.aurora.common.stats.StatsProvider; import org.apache.aurora.common.util.Clock; import org.apache.aurora.scheduler.HostOffer; import org.apache.aurora.scheduler.TaskStatusHandler; import org.apache.aurora.scheduler.base.Conversions; import org.apache.aurora.scheduler.base.SchedulerException; import org.apache.aurora.scheduler.events.EventSink; import org.apache.aurora.scheduler.events.PubsubEvent; import org.apache.aurora.scheduler.offers.OfferManager; import org.apache.aurora.scheduler.offers.OffersModule; import org.apache.aurora.scheduler.state.MaintenanceController; import org.apache.aurora.scheduler.storage.AttributeStore; import org.apache.aurora.scheduler.storage.Storage; import org.apache.aurora.scheduler.storage.entities.IHostAttributes; import org.apache.mesos.v1.Protos.AgentID; import org.apache.mesos.v1.Protos.ExecutorID; import org.apache.mesos.v1.Protos.Filters; import org.apache.mesos.v1.Protos.FrameworkID; import org.apache.mesos.v1.Protos.InverseOffer; import org.apache.mesos.v1.Protos.MasterInfo; import org.apache.mesos.v1.Protos.Offer; import org.apache.mesos.v1.Protos.OfferID; import org.apache.mesos.v1.Protos.TaskStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static java.util.Objects.requireNonNull; import static org.apache.mesos.v1.Protos.TaskStatus.Reason.REASON_RECONCILIATION; /** * Abstracts the logic of handling scheduler events/callbacks from Mesos. * This interface allows the two different Mesos Scheduler Callback classes * to share logic and to simplify testing. */ public interface MesosCallbackHandler { void handleRegistration(FrameworkID frameworkId, MasterInfo masterInfo); void handleReregistration(MasterInfo masterInfo); void handleOffers(List<Offer> offers); void handleDisconnection(); void handleRescind(OfferID offerId); void handleMessage(ExecutorID executor, AgentID agent); void handleError(String message); void handleUpdate(TaskStatus status); void handleLostAgent(AgentID agentId); void handleLostExecutor(ExecutorID executorID, AgentID slaveID, int status); void handleInverseOffer(List<InverseOffer> offers); class MesosCallbackHandlerImpl implements MesosCallbackHandler { private final TaskStatusHandler taskStatusHandler; private final OfferManager offerManager; private final Storage storage; private final Lifecycle lifecycle; private final EventSink eventSink; private final Executor executor; private final Logger log; private final Driver driver; private final Clock clock; private final MaintenanceController maintenanceController; private final Amount<Long, Time> unavailabilityThreshold; private final AtomicLong offersRescinded; private final AtomicLong slavesLost; private final AtomicLong reRegisters; private final AtomicLong offersReceived; private final AtomicLong inverseOffersReceived; private final AtomicLong disconnects; private final AtomicLong executorsLost; private final AtomicBoolean frameworkRegistered; /** * Creates a new handler for callbacks. * * @param storage Store to save host attributes into. * @param lifecycle Application lifecycle manager. * @param taskStatusHandler Task status update manager. * @param offerManager Offer manager. * @param eventSink Pubsub sink to send driver status changes to. * @param executor Executor for async work */ @Inject public MesosCallbackHandlerImpl( Storage storage, Lifecycle lifecycle, TaskStatusHandler taskStatusHandler, OfferManager offerManager, EventSink eventSink, @SchedulerDriverModule.SchedulerExecutor Executor executor, StatsProvider statsProvider, Driver driver, Clock clock, MaintenanceController controller, @OffersModule.UnavailabilityThreshold Amount<Long, Time> unavailabilityThreshold) { this( storage, lifecycle, taskStatusHandler, offerManager, eventSink, executor, LoggerFactory.getLogger(MesosCallbackHandlerImpl.class), statsProvider, driver, clock, controller, unavailabilityThreshold); } @VisibleForTesting MesosCallbackHandlerImpl( Storage storage, Lifecycle lifecycle, TaskStatusHandler taskStatusHandler, OfferManager offerManager, EventSink eventSink, Executor executor, Logger log, StatsProvider statsProvider, Driver driver, Clock clock, MaintenanceController maintenanceController, Amount<Long, Time> unavailabilityThreshold) { this.storage = requireNonNull(storage); this.lifecycle = requireNonNull(lifecycle); this.taskStatusHandler = requireNonNull(taskStatusHandler); this.offerManager = requireNonNull(offerManager); this.eventSink = requireNonNull(eventSink); this.executor = requireNonNull(executor); this.log = requireNonNull(log); this.driver = requireNonNull(driver); this.clock = requireNonNull(clock); this.maintenanceController = requireNonNull(maintenanceController); this.unavailabilityThreshold = requireNonNull(unavailabilityThreshold); this.offersRescinded = statsProvider.makeCounter("offers_rescinded"); this.slavesLost = statsProvider.makeCounter("slaves_lost"); this.reRegisters = statsProvider.makeCounter("scheduler_framework_reregisters"); this.offersReceived = statsProvider.makeCounter("scheduler_resource_offers"); this.inverseOffersReceived = statsProvider.makeCounter("scheduler_inverse_offers"); this.disconnects = statsProvider.makeCounter("scheduler_framework_disconnects"); this.executorsLost = statsProvider.makeCounter("scheduler_lost_executors"); this.frameworkRegistered = new AtomicBoolean(false); statsProvider.makeGauge("framework_registered", () -> frameworkRegistered.get() ? 1 : 0); } @Override public void handleRegistration(FrameworkID frameworkId, MasterInfo masterInfo) { log.info("Registered with ID " + frameworkId + ", master: " + masterInfo); storage.write( (Storage.MutateWork.NoResult.Quiet) storeProvider -> storeProvider.getSchedulerStore().saveFrameworkId(frameworkId.getValue())); frameworkRegistered.set(true); eventSink.post(new PubsubEvent.DriverRegistered()); } @Override public void handleReregistration(MasterInfo masterInfo) { log.info("Framework re-registered with master " + masterInfo); frameworkRegistered.set(true); reRegisters.incrementAndGet(); } @Override public void handleOffers(List<Offer> offers) { // Don't invoke the executor or storage lock if the list of offers is empty. if (offers.isEmpty()) { return; } executor.execute(() -> { // TODO(wfarner): Reconsider the requirements here, augment the task scheduler to skip over // offers when the host attributes cannot be found. (AURORA-137) storage.write((Storage.MutateWork.NoResult.Quiet) storeProvider -> { for (Offer offer : offers) { IHostAttributes attributes = AttributeStore.Util.mergeOffer(storeProvider.getAttributeStore(), offer); storeProvider.getAttributeStore().saveHostAttributes(attributes); log.debug("Received offer: {}", offer); offersReceived.incrementAndGet(); offerManager.addOffer(new HostOffer(offer, attributes)); } }); }); } @Override public void handleDisconnection() { log.warn("Framework disconnected."); disconnects.incrementAndGet(); frameworkRegistered.set(false); eventSink.post(new PubsubEvent.DriverDisconnected()); } @Override public void handleRescind(OfferID offerId) { log.info("Offer rescinded: {}", offerId.getValue()); offerManager.cancelOffer(offerId); offersRescinded.incrementAndGet(); } @Override public void handleMessage(ExecutorID executorID, AgentID agentID) { log.warn( "Ignoring framework message from {} on {}.", executorID.getValue(), agentID.getValue()); } @Override public void handleError(String message) { log.error("Received error message: " + message); lifecycle.shutdown(); } private static void logStatusUpdate(Logger logger, TaskStatus status) { // Periodic task reconciliation runs generate a large amount of no-op messages. // Suppress logging for reconciliation status updates by default. boolean debugLevel = status.hasReason() && status.getReason() == REASON_RECONCILIATION; StringBuilder message = new StringBuilder("Received status update for task ") .append(status.getTaskId().getValue()) .append(" in state ") .append(status.getState()); if (status.hasSource()) { message.append(" from ").append(status.getSource()); } if (status.hasReason()) { message.append(" with ").append(status.getReason()); } if (status.hasMessage()) { message.append(": ").append(status.getMessage()); } if (debugLevel) { logger.debug(message.toString()); } else { logger.info(message.toString()); } } private static final Function<Double, Long> SECONDS_TO_MICROS = seconds -> (long) (seconds * 1E6); @Override public void handleUpdate(TaskStatus status) { logStatusUpdate(log, status); eventSink.post(new PubsubEvent.TaskStatusReceived( status.getState(), // Source and Reason are enums. They cannot be null so we we need to use `hasXXX`. status.hasSource() ? Optional.of(status.getSource()) : Optional.absent(), status.hasReason() ? Optional.of(status.getReason()) : Optional.absent(), Optional.fromNullable(status.getTimestamp()).transform(SECONDS_TO_MICROS))); try { // The status handler is responsible for acknowledging the update. taskStatusHandler.statusUpdate(status); } catch (SchedulerException e) { log.error("Status update failed due to scheduler exception: " + e, e); // We re-throw the exception here to trigger an abort of the driver. throw e; } } @Override public void handleLostAgent(AgentID agentId) { log.info("Received notification of lost agent: " + agentId); slavesLost.incrementAndGet(); } @Override public void handleLostExecutor(ExecutorID executorID, AgentID slaveID, int status) { // With the current implementation of MESOS-313, Mesos is also reporting clean terminations of // custom executors via the executorLost callback. if (status != 0) { log.warn("Lost executor " + executorID + " on slave " + slaveID + " with status " + status); executorsLost.incrementAndGet(); } } @Override public void handleInverseOffer(List<InverseOffer> offers) { if (offers.isEmpty()) { return; } executor.execute(() -> { for (InverseOffer offer: offers) { inverseOffersReceived.incrementAndGet(); log.debug("Received inverse offer: {}", offer); // Use the default filter for accepting inverse offers. driver.acceptInverseOffer(offer.getId(), Filters.newBuilder().build()); Instant start = Conversions.getStart(offer.getUnavailability()); Instant drainTime = start .minus(unavailabilityThreshold.as(Time.MILLISECONDS), ChronoUnit.MILLIS); if (clock.nowInstant().isAfter(drainTime)) { maintenanceController.drainForInverseOffer(offer); } } }); } } }