/** * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.aurora.scheduler.mesos; import java.util.concurrent.Executor; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import javax.inject.Inject; import com.google.common.base.Optional; import com.google.common.cache.CacheBuilder; import com.google.common.cache.CacheLoader; import com.google.common.cache.LoadingCache; import org.apache.aurora.common.inject.TimedInterceptor; import org.apache.aurora.common.stats.StatsProvider; import org.apache.aurora.common.util.BackoffHelper; import org.apache.aurora.scheduler.stats.CachedCounters; import org.apache.aurora.scheduler.storage.Storage; import org.apache.mesos.v1.Protos; import org.apache.mesos.v1.scheduler.Mesos; import org.apache.mesos.v1.scheduler.Protos.Call; import org.apache.mesos.v1.scheduler.Protos.Event; import org.apache.mesos.v1.scheduler.Scheduler; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import static java.util.Objects.requireNonNull; import static com.google.common.base.Preconditions.checkState; /** * Implementation of Scheduler callback interfaces for the V1 Driver. */ public class VersionedMesosSchedulerImpl implements Scheduler { private static final Logger LOG = LoggerFactory.getLogger(VersionedMesosSchedulerImpl.class); private final CachedCounters counters; private final MesosCallbackHandler handler; private final Storage storage; private final FrameworkInfoFactory infoFactory; private final Executor executor; private final BackoffHelper backoffHelper; private final AtomicBoolean isSubscribed = new AtomicBoolean(false); private final AtomicBoolean isConnected = new AtomicBoolean(false); private final AtomicBoolean isRegistered = new AtomicBoolean(false); private final AtomicLong subcriptionCalls; private static final String EVENT_COUNTER_STAT_PREFIX = "mesos_scheduler_event_"; // A cache to hold the metric names to prevent us from creating strings for every event private final LoadingCache<Event.Type, String> eventMetricNameCache = CacheBuilder.newBuilder() .maximumSize(Event.Type.values().length) .initialCapacity(Event.Type.values().length) .build(new CacheLoader<Event.Type, String>() { @Override public String load(Event.Type key) throws Exception { return EVENT_COUNTER_STAT_PREFIX + key.name(); } }); @Inject VersionedMesosSchedulerImpl( MesosCallbackHandler handler, CachedCounters counters, StatsProvider statsProvider, Storage storage, @SchedulerDriverModule.SchedulerExecutor Executor executor, BackoffHelper backoffHelper, FrameworkInfoFactory factory) { this.handler = requireNonNull(handler); this.counters = requireNonNull(counters); this.storage = requireNonNull(storage); this.infoFactory = requireNonNull(factory); this.executor = requireNonNull(executor); this.backoffHelper = requireNonNull(backoffHelper); initializeEventMetrics(); this.subcriptionCalls = statsProvider.makeCounter("mesos_scheduler_subscription_attempts"); } @Override public void connected(Mesos mesos) { LOG.info("Connected to Mesos master."); isConnected.set(true); Optional<String> frameworkId = storage.read( storeProvider -> storeProvider.getSchedulerStore().fetchFrameworkId()); Protos.FrameworkInfo.Builder frameworkBuilder = infoFactory.getFrameworkInfo().toBuilder(); Call.Builder call = Call.newBuilder().setType(Call.Type.SUBSCRIBE); if (frameworkId.isPresent()) { LOG.info("Found persisted framework ID: " + frameworkId); Protos.FrameworkID id = Protos.FrameworkID.newBuilder().setValue(frameworkId.get()).build(); frameworkBuilder.setId(id); call.setFrameworkId(id); } else { frameworkBuilder.clearId(); call.clearFrameworkId(); LOG.warn("Did not find a persisted framework ID, connecting as a new framework."); } call.setSubscribe(Call.Subscribe.newBuilder().setFrameworkInfo(frameworkBuilder)); executor.execute(() -> { LOG.info("Starting to subscribe to Mesos with backoff."); try { backoffHelper.doUntilSuccess(() -> { if (!isConnected.get()) { LOG.info("Disconnected while attempting to subscribe. Stopping attempt."); return true; } if (!isSubscribed.get()) { LOG.info("Sending subscribe call."); mesos.send(call.build()); subcriptionCalls.incrementAndGet(); return false; } LOG.info("Subscribed to Mesos"); return true; }); } catch (InterruptedException e) { throw new RuntimeException(e); } }); } @Override public void disconnected(Mesos mesos) { isSubscribed.set(false); isConnected.set(false); handler.handleDisconnection(); } private void initializeEventMetrics() { // For variable named metrics that are keyed on mesos enums, this ensures that we set // all possible metrics to 0. for (Event.Type type : Event.Type.values()) { this.counters.get(eventMetricNameCache.getUnchecked(type)); } } private void countEventMetrics(Event event) { this.counters.get(eventMetricNameCache.getUnchecked(event.getType())).incrementAndGet(); } @TimedInterceptor.Timed("scheduler_received") @Override public void received(Mesos mesos, Event event) { countEventMetrics(event); switch(event.getType()) { case SUBSCRIBED: Event.Subscribed subscribed = event.getSubscribed(); if (isRegistered.get()) { handler.handleReregistration(subscribed.getMasterInfo()); } else { isRegistered.set(true); handler.handleRegistration(subscribed.getFrameworkId(), subscribed.getMasterInfo()); } isSubscribed.set(true); break; case OFFERS: checkState(isSubscribed.get(), "Must be registered before receiving offers."); handler.handleOffers(event.getOffers().getOffersList()); break; case RESCIND: handler.handleRescind(event.getRescind().getOfferId()); break; case INVERSE_OFFERS: handler.handleInverseOffer(event.getInverseOffers().getInverseOffersList()); break; case RESCIND_INVERSE_OFFER: Protos.OfferID id = event.getRescindInverseOffer().getInverseOfferId(); LOG.warn("Ignoring rescinded inverse offer: {}", id); break; case UPDATE: Protos.TaskStatus status = event.getUpdate().getStatus(); handler.handleUpdate(status); break; case MESSAGE: Event.Message m = event.getMessage(); handler.handleMessage(m.getExecutorId(), m.getAgentId()); break; case ERROR: handler.handleError(event.getError().getMessage()); break; case FAILURE: Event.Failure failure = event.getFailure(); if (failure.hasExecutorId()) { handler.handleLostExecutor( failure.getExecutorId(), failure.getAgentId(), failure.getStatus()); } else { handler.handleLostAgent(failure.getAgentId()); } break; // TODO(zmanji): handle HEARTBEAT in a graceful manner // For now it is ok to silently ignore heart beats because the driver wil // detect disconnections for us. case HEARTBEAT: break; default: LOG.warn("Unknown event from Mesos \n{}", event); break; } } }