package com.hubspot.baragon.agent.managed; import java.io.BufferedWriter; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.io.Writer; import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.lang3.tuple.Pair; import org.apache.curator.framework.recipes.leader.LeaderLatch; import org.eclipse.jetty.server.Server; import org.slf4j.ILoggerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.core.type.TypeReference; import com.github.rholder.retry.Retryer; import com.github.rholder.retry.RetryerBuilder; import com.github.rholder.retry.StopStrategies; import com.github.rholder.retry.WaitStrategies; import com.google.common.base.Optional; import com.google.common.base.Stopwatch; import com.google.common.base.Throwables; import com.google.inject.Inject; import com.google.inject.name.Named; import com.hubspot.baragon.agent.BaragonAgentServiceModule; import com.hubspot.baragon.agent.ServerProvider; import com.hubspot.baragon.agent.config.BaragonAgentConfiguration; import com.hubspot.baragon.agent.lbs.BootstrapFileChecker; import com.hubspot.baragon.agent.lbs.FilesystemConfigHelper; import com.hubspot.baragon.data.BaragonAuthDatastore; import com.hubspot.baragon.data.BaragonStateDatastore; import com.hubspot.baragon.data.BaragonWorkerDatastore; import com.hubspot.baragon.exceptions.AgentServiceNotifyException; import com.hubspot.baragon.exceptions.LockTimeoutException; import com.hubspot.baragon.models.AgentCheckInResponse; import com.hubspot.baragon.models.BaragonAgentMetadata; import com.hubspot.baragon.models.BaragonAgentState; import com.hubspot.baragon.models.BaragonAuthKey; import com.hubspot.baragon.models.BaragonConfigFile; import com.hubspot.baragon.models.BaragonServiceState; import com.hubspot.baragon.models.ServiceContext; import com.hubspot.baragon.models.TrafficSourceState; import com.hubspot.horizon.HttpClient; import com.hubspot.horizon.HttpRequest; import com.hubspot.horizon.HttpRequest.Method; import com.hubspot.horizon.HttpResponse; import ch.qos.logback.classic.LoggerContext; import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; public class LifecycleHelper { private static final Logger LOG = LoggerFactory.getLogger(LifecycleHelper.class); private static final String SERVICE_CHECKIN_URL_FORMAT = "%s/checkin/%s/%s"; private static final String GLOBAL_STATE_FORMAT = "%s/state"; private final BaragonAuthDatastore authDatastore; private final BaragonWorkerDatastore workerDatastore; private final BaragonAgentConfiguration configuration; private final BaragonAgentMetadata baragonAgentMetadata; private final FilesystemConfigHelper configHelper; private final BaragonStateDatastore stateDatastore; private final ServerProvider serverProvider; private final AtomicReference<BaragonAgentState> agentState; private final HttpClient httpClient; private final ScheduledExecutorService executorService; private final LeaderLatch leaderLatch; private final ReentrantLock agentLock; private final long agentLockTimeoutMs; private final AtomicInteger bootstrapStateNodeVersion = new AtomicInteger(0); @Inject public LifecycleHelper(BaragonWorkerDatastore workerDatastore, BaragonAuthDatastore authDatastore, BaragonAgentConfiguration configuration, BaragonAgentMetadata baragonAgentMetadata, FilesystemConfigHelper configHelper, BaragonStateDatastore stateDatastore, ServerProvider serverProvider, AtomicReference<BaragonAgentState> agentState, @Named(BaragonAgentServiceModule.BARAGON_AGENT_HTTP_CLIENT) HttpClient httpClient, @Named(BaragonAgentServiceModule.AGENT_SCHEDULED_EXECUTOR) ScheduledExecutorService executorService, @Named(BaragonAgentServiceModule.AGENT_LEADER_LATCH) LeaderLatch leaderLatch, @Named(BaragonAgentServiceModule.AGENT_LOCK) ReentrantLock agentLock, @Named(BaragonAgentServiceModule.AGENT_LOCK_TIMEOUT_MS) long agentLockTimeoutMs) { this.workerDatastore = workerDatastore; this.authDatastore = authDatastore; this.configuration = configuration; this.baragonAgentMetadata = baragonAgentMetadata; this.configHelper = configHelper; this.stateDatastore = stateDatastore; this.serverProvider = serverProvider; this.agentState = agentState; this.httpClient = httpClient; this.executorService = executorService; this.leaderLatch = leaderLatch; this.agentLock = agentLock; this.agentLockTimeoutMs = agentLockTimeoutMs; } public void notifyService(String action) throws Exception { long start = System.currentTimeMillis(); Retryer<AgentCheckInResponse> retryer = RetryerBuilder.<AgentCheckInResponse>newBuilder() .retryIfException() .withStopStrategy(StopStrategies.stopAfterAttempt(configuration.getMaxNotifyServiceAttempts())) .withWaitStrategy(WaitStrategies.exponentialWait(1, TimeUnit.SECONDS)) .build(); AgentCheckInResponse agentCheckInResponse = retryer.call(checkInCallable(action, false)); while ((agentCheckInResponse.getState() != TrafficSourceState.DONE && System.currentTimeMillis() - start < configuration.getAgentCheckInTimeoutMs())) { try { Thread.sleep(agentCheckInResponse.getWaitTime()); } catch (InterruptedException ie) { LOG.error("Interrupted waiting for check in with service, shutting down early"); break; } agentCheckInResponse = retryer.call(checkInCallable(action, true)); } LOG.info("Finished agent check in"); } private Callable<AgentCheckInResponse> checkInCallable(String action, boolean addStatusParam) { return () -> { HttpResponse response = httpClient.execute(buildNotifyServiceRequest(action, addStatusParam)); LOG.info(String.format("Got %s response from BaragonService", response.getStatusCode())); if (response.isError()) { throw new AgentServiceNotifyException(String.format("Bad response received from BaragonService %s", response.getAsString())); } try { LOG.debug("Got {} response {}", action, response.getAsString()); return response.getAs(AgentCheckInResponse.class); } catch (Exception e) { if (response.isSuccess()) { LOG.warn("Unable to parse response ({}) from successful shutdown call", response.getAsString()); return null; } else { throw e; } } }; } private HttpRequest buildNotifyServiceRequest(String action, boolean addStatusParam) throws AgentServiceNotifyException { Collection<String> baseUris = workerDatastore.getBaseUris(); if (!baseUris.isEmpty()) { HttpRequest.Builder requestBuilder = HttpRequest.newBuilder() .setUrl(String.format(SERVICE_CHECKIN_URL_FORMAT, baseUris.iterator().next(), configuration.getLoadBalancerConfiguration().getName(), action)) .setMethod(HttpRequest.Method.POST) .setBody(baragonAgentMetadata); if (addStatusParam) { requestBuilder.setQueryParam("status").to(true); } Map<String, BaragonAuthKey> authKeys = authDatastore.getAuthKeyMap(); if (!authKeys.isEmpty()) { requestBuilder.setQueryParam("authkey").to(authKeys.entrySet().iterator().next().getValue().getValue()); } return requestBuilder.build(); } else { throw new AgentServiceNotifyException("No services available to notify"); } } public void writeStateFileIfConfigured() throws IOException { if (configuration.getStateFile().isPresent()) { LOG.info("Writing state file..."); Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(configuration.getStateFile().get()), "UTF-8")); try { writer.write("RUNNING"); } finally { writer.close(); } } } public boolean removeStateFile() { File stateFile = new File(configuration.getStateFile().get()); return (!stateFile.exists() || stateFile.delete()); } public void applyCurrentConfigs() throws AgentServiceNotifyException { LOG.info("Getting current state of the world from Baragon Service..."); final Stopwatch stopwatch = Stopwatch.createStarted(); final long now = System.currentTimeMillis(); final Collection<String> services = stateDatastore.getServices(); if (services.size() > 0) { ExecutorService executorService = Executors.newFixedThreadPool(services.size()); List<Callable<Optional<Pair<ServiceContext, Collection<BaragonConfigFile>>>>> todo = new ArrayList<>(services.size()); Optional<Integer> maybeVersion = stateDatastore.getStateVersion(); if (maybeVersion.isPresent()) { bootstrapStateNodeVersion.set(maybeVersion.get()); } for (BaragonServiceState serviceState : getGlobalStateWithRetry()) { if (!(serviceState.getService().getLoadBalancerGroups() == null) && serviceState.getService().getLoadBalancerGroups().contains(configuration.getLoadBalancerConfiguration().getName())) { todo.add(new BootstrapFileChecker(configHelper, serviceState, now)); } } LOG.info("Going to apply {} services...", todo.size()); try { List<Future<Optional<Pair<ServiceContext, Collection<BaragonConfigFile>>>>> applied = executorService.invokeAll(todo); for (Future<Optional<Pair<ServiceContext, Collection<BaragonConfigFile>>>> serviceFuture : applied) { Optional<Pair<ServiceContext, Collection<BaragonConfigFile>>> maybeToApply = serviceFuture.get(); if (maybeToApply.isPresent()) { try { configHelper.bootstrapApply(maybeToApply.get().getKey(), maybeToApply.get().getValue()); } catch (Exception e) { LOG.error(String.format("Caught exception while applying %s during bootstrap", maybeToApply.get().getKey().getService().getServiceId()), e); } } } configHelper.checkAndReload(); } catch (Exception e) { LOG.error("Caught exception while applying and parsing configs", e); if (configuration.isExitOnStartupError()) { Throwables.propagate(e); } } LOG.info("Applied {} services in {}ms", todo.size(), stopwatch.elapsed(TimeUnit.MILLISECONDS)); } else { LOG.info("No services were found to apply"); } } private Collection<BaragonServiceState> getGlobalStateWithRetry() throws AgentServiceNotifyException { Callable<Collection<BaragonServiceState>> callable = new Callable<Collection<BaragonServiceState>>() { public Collection<BaragonServiceState> call() throws Exception { return getGlobalState(); } }; Retryer<Collection<BaragonServiceState>> retryer = RetryerBuilder.<Collection<BaragonServiceState>>newBuilder() .retryIfException() .withStopStrategy(StopStrategies.stopAfterAttempt(configuration.getMaxGetGloablStateAttempts())) .withWaitStrategy(WaitStrategies.exponentialWait(1, TimeUnit.SECONDS)) .build(); try { return retryer.call(callable); } catch (Exception e) { LOG.error("Could not get global state from Baragon Service"); throw Throwables.propagate(e); } } private Collection<BaragonServiceState> getGlobalState() throws AgentServiceNotifyException { Collection<String> baseUris = workerDatastore.getBaseUris(); HttpRequest.Builder requestBuilder = HttpRequest.newBuilder() .setUrl(String.format(GLOBAL_STATE_FORMAT, baseUris.iterator().next())) .setMethod(Method.GET); Map<String, BaragonAuthKey> authKeys = authDatastore.getAuthKeyMap(); if (!authKeys.isEmpty()) { requestBuilder.setQueryParam("authkey").to(authKeys.entrySet().iterator().next().getValue().getValue()); } HttpRequest request = requestBuilder.build(); HttpResponse response = httpClient.execute(request); LOG.info(String.format("Got %s response from BaragonService", response.getStatusCode())); if (response.isError()) { throw new AgentServiceNotifyException(String.format("Bad response received from BaragonService %s", response.getAsString())); } return response.getAs(new TypeReference<Collection<BaragonServiceState>>() {}); } public void shutdown() throws Exception { leaderLatch.close(); executorService.shutdown(); if (configuration.isDeregisterOnGracefulShutdown()) { LOG.info("Notifying BaragonService of shutdown..."); notifyService("shutdown"); } if (configuration.getStateFile().isPresent()) { LOG.info("Removing state file"); removeStateFile(); } } public void checkStateNodeVersion() { agentState.set(BaragonAgentState.BOOTSTRAPING); try { Optional<Integer> maybeStateVersion = stateDatastore.getStateVersion(); if (maybeStateVersion.isPresent()) { if (!agentLock.tryLock(agentLockTimeoutMs, TimeUnit.MILLISECONDS)) { throw new LockTimeoutException("Could not acquire lock to reapply configs", agentLock); } try { if (bootstrapStateNodeVersion.get() < maybeStateVersion.get()) { applyCurrentConfigs(); bootstrapStateNodeVersion.set(maybeStateVersion.get()); } } catch (Exception e) { abort("Could not ensure configs are up to date, aborting", e); } finally { agentLock.unlock(); } } } catch (Exception e) { abort("Interrupted while trying to reapply configs, shutting down", e); } agentState.set(BaragonAgentState.ACCEPTING); } @SuppressFBWarnings("DM_EXIT") public void abort(String message, Exception exception) { LOG.error(message, exception); flushLogs(); Optional<Server> server = serverProvider.get(); if (server.isPresent()) { try { server.get().stop(); shutdown(); } catch (Exception e) { LOG.warn("While aborting server", e); } } else { LOG.warn("Baragon Agent abort called before server has fully initialized!"); } System.exit(1); } private void flushLogs() { final long millisToWait = 100; ILoggerFactory loggerFactory = LoggerFactory.getILoggerFactory(); if (loggerFactory instanceof LoggerContext) { LoggerContext context = (LoggerContext) loggerFactory; context.stop(); } try { Thread.sleep(millisToWait); } catch (Exception e) { LOG.info("While sleeping for log flush", e); } } }