/*-
* -\-\-
* Helios Services
* --
* Copyright (C) 2016 Spotify AB
* --
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* -/-/-
*/
package com.spotify.helios.agent;
import static com.google.common.base.Charsets.UTF_8;
import static com.google.common.base.Strings.isNullOrEmpty;
import static com.spotify.helios.agent.Agent.EMPTY_EXECUTIONS;
import static com.spotify.helios.servicescommon.ServiceRegistrars.createServiceRegistrar;
import static com.spotify.helios.servicescommon.ZooKeeperAclProviders.digest;
import static com.spotify.helios.servicescommon.ZooKeeperAclProviders.heliosAclProvider;
import static java.lang.management.ManagementFactory.getOperatingSystemMXBean;
import static java.lang.management.ManagementFactory.getRuntimeMXBean;
import static java.nio.file.StandardOpenOption.CREATE;
import static java.nio.file.StandardOpenOption.WRITE;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.jvm.GarbageCollectorMetricSet;
import com.codahale.metrics.jvm.MemoryUsageGaugeSet;
import com.fasterxml.jackson.core.type.TypeReference;
import com.google.common.base.Strings;
import com.google.common.base.Suppliers;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.io.Resources;
import com.google.common.util.concurrent.AbstractIdleService;
import com.spotify.docker.client.DefaultDockerClient;
import com.spotify.docker.client.DockerCertificates;
import com.spotify.docker.client.DockerClient;
import com.spotify.docker.client.exceptions.DockerCertificateException;
import com.spotify.helios.common.HeliosRuntimeException;
import com.spotify.helios.common.SystemClock;
import com.spotify.helios.common.descriptors.JobId;
import com.spotify.helios.common.descriptors.TaskStatusEvent;
import com.spotify.helios.master.metrics.HealthCheckGauge;
import com.spotify.helios.master.metrics.TotalHealthCheckGauge;
import com.spotify.helios.serviceregistration.ServiceRegistrar;
import com.spotify.helios.servicescommon.EventSender;
import com.spotify.helios.servicescommon.EventSenderFactory;
import com.spotify.helios.servicescommon.FastForwardConfig;
import com.spotify.helios.servicescommon.ManagedStatsdReporter;
import com.spotify.helios.servicescommon.PersistentAtomicReference;
import com.spotify.helios.servicescommon.ReactorFactory;
import com.spotify.helios.servicescommon.ServiceUtil;
import com.spotify.helios.servicescommon.ZooKeeperRegistrarService;
import com.spotify.helios.servicescommon.coordination.CuratorClientFactoryImpl;
import com.spotify.helios.servicescommon.coordination.DefaultZooKeeperClient;
import com.spotify.helios.servicescommon.coordination.ZooKeeperClient;
import com.spotify.helios.servicescommon.coordination.ZooKeeperClientProvider;
import com.spotify.helios.servicescommon.coordination.ZooKeeperHealthChecker;
import com.spotify.helios.servicescommon.coordination.ZooKeeperModelReporter;
import com.spotify.helios.servicescommon.coordination.ZooKeeperNodeUpdaterFactory;
import com.spotify.helios.servicescommon.statistics.DockerVersionSupplier;
import com.spotify.helios.servicescommon.statistics.FastForwardReporter;
import com.spotify.helios.servicescommon.statistics.Metrics;
import com.spotify.helios.servicescommon.statistics.MetricsImpl;
import com.spotify.helios.servicescommon.statistics.NoopMetrics;
import com.sun.management.OperatingSystemMXBean;
import io.dropwizard.configuration.ConfigurationException;
import io.dropwizard.lifecycle.Managed;
import io.dropwizard.setup.Environment;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import org.apache.curator.RetryPolicy;
import org.apache.curator.framework.AuthInfo;
import org.apache.curator.framework.CuratorFramework;
import org.apache.curator.framework.api.ACLProvider;
import org.apache.curator.retry.ExponentialBackoffRetry;
import org.eclipse.jetty.server.Server;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* The Helios agent.
*/
public class AgentService extends AbstractIdleService implements Managed {
private static final Logger log = LoggerFactory.getLogger(AgentService.class);
private static final String TASK_HISTORY_FILENAME = "task-history.json";
private static final TypeReference<Map<JobId, Execution>> JOBID_EXECUTIONS_MAP =
new TypeReference<Map<JobId, Execution>>() {
};
private final Agent agent;
private final Server server;
private final ZooKeeperClient zooKeeperClient;
private final HostInfoReporter hostInfoReporter;
private final AgentInfoReporter agentInfoReporter;
private final EnvironmentVariableReporter environmentVariableReporter;
private final LabelReporter labelReporter;
private final FileChannel stateLockFile;
private final FileLock stateLock;
private final ZooKeeperAgentModel model;
private final Metrics metrics;
private final ServiceRegistrar serviceRegistrar;
private ZooKeeperRegistrarService zkRegistrar;
/**
* Create a new agent instance.
*
* @param config The service configuration.
* @param environment The DropWizard environment.
* @throws ConfigurationException If an error occurs with the DropWizard configuration.
* @throws InterruptedException If the thread is interrupted.
* @throws IOException IOException
*/
public AgentService(final AgentConfig config, final Environment environment)
throws ConfigurationException, InterruptedException, IOException {
// Create state directory, if necessary
final Path stateDirectory = config.getStateDirectory().toAbsolutePath().normalize();
if (!Files.exists(stateDirectory)) {
try {
Files.createDirectories(stateDirectory);
} catch (IOException e) {
log.error("Failed to create state directory: {}", stateDirectory, e);
throw new RuntimeException(e);
}
}
// Take a file lock in the state directory to ensure this is the only agent using it
final Path lockPath = config.getStateDirectory().resolve("lock");
try {
stateLockFile = FileChannel.open(lockPath, CREATE, WRITE);
stateLock = stateLockFile.tryLock();
if (stateLock == null) {
throw new IllegalStateException("State lock file already locked: " + lockPath);
}
} catch (OverlappingFileLockException e) {
throw new IllegalStateException("State lock file already locked: " + lockPath);
} catch (IOException e) {
log.error("Failed to take state lock: {}", lockPath, e);
throw new RuntimeException(e);
}
final Path idPath = config.getStateDirectory().resolve("id");
final String id;
try {
if (Files.exists(idPath)) {
id = new String(Files.readAllBytes(idPath), UTF_8);
} else {
id = config.getId();
Files.write(idPath, id.getBytes(UTF_8));
}
} catch (IOException e) {
log.error("Failed to set up id file: {}", idPath, e);
throw new RuntimeException(e);
}
// Configure metrics
final MetricRegistry metricsRegistry = environment.metrics();
metricsRegistry.registerAll(new GarbageCollectorMetricSet());
metricsRegistry.registerAll(new MemoryUsageGaugeSet());
final DockerClient dockerClient = createDockerClient(config);
if (config.isInhibitMetrics()) {
log.info("Not starting metrics");
metrics = new NoopMetrics();
} else {
log.info("Starting metrics");
metrics = new MetricsImpl(metricsRegistry, MetricsImpl.Type.AGENT);
if (!Strings.isNullOrEmpty(config.getStatsdHostPort())) {
environment.lifecycle().manage(new ManagedStatsdReporter(config.getStatsdHostPort(),
metricsRegistry));
}
final FastForwardConfig ffwdConfig = config.getFfwdConfig();
if (ffwdConfig != null) {
// include the version docker as an additional attribute in FastForwardReporter
final DockerVersionSupplier versionSupplier = new DockerVersionSupplier(dockerClient);
final Supplier<Map<String, String>> attributesSupplier =
() -> ImmutableMap.of("docker_version", versionSupplier.get());
final FastForwardReporter reporter = FastForwardReporter.create(
metricsRegistry,
ffwdConfig.getAddress(),
ffwdConfig.getMetricKey(),
ffwdConfig.getReportingIntervalSeconds(),
attributesSupplier);
environment.lifecycle().manage(reporter);
}
}
// This CountDownLatch will signal EnvironmentVariableReporter and LabelReporter when to report
// data to ZK. They only report once and then stop, so we need to tell them when to start
// reporting otherwise they'll race with ZooKeeperRegistrarService and might have their data
// erased if they are too fast.
final CountDownLatch zkRegistrationSignal = new CountDownLatch(1);
this.zooKeeperClient = setupZookeeperClient(config, id, zkRegistrationSignal);
final DockerHealthChecker dockerHealthChecker = new DockerHealthChecker(
metrics.getSupervisorMetrics(), TimeUnit.SECONDS, 30);
environment.lifecycle().manage(dockerHealthChecker);
// Set up model
final ZooKeeperModelReporter modelReporter =
new ZooKeeperModelReporter(metrics.getZooKeeperMetrics());
final ZooKeeperClientProvider zkClientProvider = new ZooKeeperClientProvider(
zooKeeperClient, modelReporter);
final String taskStatusEventTopic = TaskStatusEvent.TASK_STATUS_EVENT_TOPIC;
final List<EventSender> eventSenders = EventSenderFactory
.build(environment, config, metricsRegistry, taskStatusEventTopic);
final TaskHistoryWriter historyWriter;
if (config.isJobHistoryDisabled()) {
historyWriter = null;
} else {
historyWriter = new TaskHistoryWriter(
config.getName(), zooKeeperClient, stateDirectory.resolve(TASK_HISTORY_FILENAME));
}
try {
this.model =
new ZooKeeperAgentModel(zkClientProvider, config.getName(), stateDirectory, historyWriter,
eventSenders, taskStatusEventTopic);
} catch (IOException e) {
throw new RuntimeException(e);
}
// Set up service registrar
this.serviceRegistrar = createServiceRegistrar(config.getServiceRegistrarPlugin(),
config.getServiceRegistryAddress(),
config.getDomain());
final ZooKeeperNodeUpdaterFactory nodeUpdaterFactory =
new ZooKeeperNodeUpdaterFactory(zooKeeperClient);
this.hostInfoReporter =
new HostInfoReporter((OperatingSystemMXBean) getOperatingSystemMXBean(), nodeUpdaterFactory,
config.getName(), dockerClient, config.getDockerHost(),
1, TimeUnit.MINUTES, zkRegistrationSignal);
this.agentInfoReporter =
new AgentInfoReporter(getRuntimeMXBean(), nodeUpdaterFactory, config.getName(),
1, TimeUnit.MINUTES, zkRegistrationSignal);
this.environmentVariableReporter = new EnvironmentVariableReporter(
config.getName(), config.getEnvVars(), nodeUpdaterFactory, zkRegistrationSignal);
this.labelReporter = new LabelReporter(
config.getName(), config.getLabels(), nodeUpdaterFactory, zkRegistrationSignal);
final String namespace = "helios-" + id;
final List<ContainerDecorator> decorators = Lists.newArrayList();
if (!isNullOrEmpty(config.getRedirectToSyslog())) {
decorators.add(new SyslogRedirectingContainerDecorator(config.getRedirectToSyslog()));
}
if (!config.getBinds().isEmpty()) {
decorators.add(new BindVolumeContainerDecorator(config.getBinds()));
}
if (!config.getExtraHosts().isEmpty()) {
decorators.add(new AddExtraHostContainerDecorator(config.getExtraHosts()));
}
final SupervisorFactory supervisorFactory = new SupervisorFactory(
model, dockerClient,
config.getEnvVars(), serviceRegistrar,
decorators,
config.getDockerHost(),
config.getName(),
metrics.getSupervisorMetrics(),
namespace,
config.getDomain(),
config.getDns());
final ReactorFactory reactorFactory = new ReactorFactory();
final PortAllocator portAllocator = new PortAllocator(config.getPortRangeStart(),
config.getPortRangeEnd());
final PersistentAtomicReference<Map<JobId, Execution>> executions;
try {
executions = PersistentAtomicReference.create(stateDirectory.resolve("executions.json"),
JOBID_EXECUTIONS_MAP,
Suppliers.ofInstance(EMPTY_EXECUTIONS));
} catch (IOException e) {
throw new RuntimeException(e);
}
final Reaper reaper = new Reaper(dockerClient, namespace);
this.agent = new Agent(model, supervisorFactory, reactorFactory, executions, portAllocator,
reaper);
final ZooKeeperHealthChecker zkHealthChecker = new ZooKeeperHealthChecker(zooKeeperClient);
final DockerDaemonHealthChecker dockerDaemonHealthChecker =
new DockerDaemonHealthChecker(dockerClient);
if (!config.getNoHttp()) {
environment.healthChecks().register("docker", dockerHealthChecker);
environment.healthChecks().register("zookeeper", zkHealthChecker);
environment.healthChecks().register("dockerd", dockerDaemonHealthChecker);
// Report each individual healthcheck as a gauge metric
environment.healthChecks().getNames().forEach(
name -> environment.metrics().register(
"helios." + name + ".ok", new HealthCheckGauge(environment.healthChecks(), name)));
// and add one gauge for the overall health, similar to what HealthCheckServlet does - if
// any healthcheck fails, then report overall health of false.
// this causes each healthcheck to be executed twice each time metrics are reported, but
// this feels ok since each check is cheap.
environment.metrics().register("helios.healthy",
new TotalHealthCheckGauge(environment.healthChecks()));
environment.jersey().register(new AgentModelTaskResource(model));
environment.jersey().register(new AgentModelTaskStatusResource(model));
environment.lifecycle().manage(this);
this.server = ServiceUtil.createServerFactory(config.getHttpEndpoint(),
config.getAdminEndpoint(),
config.getNoHttp())
.build(environment);
} else {
this.server = null;
}
environment.lifecycle().manage(this);
}
private DockerClient createDockerClient(final AgentConfig config) {
final DefaultDockerClient.Builder builder = DefaultDockerClient.builder()
.uri(config.getDockerHost().uri());
if (config.getConnectionPoolSize() != -1) {
builder.connectionPoolSize(config.getConnectionPoolSize());
}
if (!isNullOrEmpty(config.getDockerHost().dockerCertPath())) {
final Path dockerCertPath = java.nio.file.Paths.get(config.getDockerHost().dockerCertPath());
final DockerCertificates dockerCertificates;
try {
dockerCertificates = new DockerCertificates(dockerCertPath);
} catch (DockerCertificateException e) {
throw new RuntimeException(e);
}
builder.dockerCertificates(dockerCertificates);
}
return new PollingDockerClient(builder);
}
/**
* Create a Zookeeper client and create the control and state nodes if needed.
*
* @param config The service configuration.
* @return A zookeeper client.
*/
private ZooKeeperClient setupZookeeperClient(final AgentConfig config, final String id,
final CountDownLatch zkRegistrationSignal) {
ACLProvider aclProvider = null;
List<AuthInfo> authorization = null;
final String agentUser = config.getZookeeperAclAgentUser();
final String agentPassword = config.getZooKeeperAclAgentPassword();
final String masterUser = config.getZookeeperAclMasterUser();
final String masterDigest = config.getZooKeeperAclMasterDigest();
if (!isNullOrEmpty(agentPassword)) {
if (isNullOrEmpty(agentUser)) {
throw new HeliosRuntimeException(
"Agent username must be set if a password is set");
}
authorization = Lists.newArrayList(new AuthInfo(
"digest", String.format("%s:%s", agentUser, agentPassword).getBytes()));
}
if (config.isZooKeeperEnableAcls()) {
if (isNullOrEmpty(agentUser) || isNullOrEmpty(agentPassword)) {
throw new HeliosRuntimeException(
"ZooKeeper ACLs enabled but agent username and/or password not set");
}
if (isNullOrEmpty(masterUser) || isNullOrEmpty(masterDigest)) {
throw new HeliosRuntimeException(
"ZooKeeper ACLs enabled but master username and/or digest not set");
}
aclProvider = heliosAclProvider(
masterUser, masterDigest,
agentUser, digest(agentUser, agentPassword));
}
final RetryPolicy zooKeeperRetryPolicy = new ExponentialBackoffRetry(1000, 3);
final CuratorFramework curator = new CuratorClientFactoryImpl().newClient(
config.getZooKeeperConnectionString(),
config.getZooKeeperSessionTimeoutMillis(),
config.getZooKeeperConnectionTimeoutMillis(),
zooKeeperRetryPolicy,
aclProvider,
authorization);
final ZooKeeperClient client = new DefaultZooKeeperClient(curator,
config.getZooKeeperClusterId());
client.start();
// Register the agent
final AgentZooKeeperRegistrar agentZooKeeperRegistrar = new AgentZooKeeperRegistrar(
config.getName(), id, config.getZooKeeperRegistrationTtlMinutes(), new SystemClock());
zkRegistrar = ZooKeeperRegistrarService.newBuilder()
.setZooKeeperClient(client)
.setZooKeeperRegistrar(agentZooKeeperRegistrar)
.setZkRegistrationSignal(zkRegistrationSignal)
.build();
return client;
}
@Override
protected void startUp() throws Exception {
logBanner();
zkRegistrar.startAsync().awaitRunning();
model.startAsync().awaitRunning();
agent.startAsync().awaitRunning();
hostInfoReporter.startAsync();
agentInfoReporter.startAsync();
environmentVariableReporter.startAsync();
labelReporter.startAsync();
metrics.start();
if (server != null) {
try {
server.start();
} catch (Exception e) {
log.error("Unable to start server, shutting down", e);
server.stop();
}
}
}
private void logBanner() {
try {
final String banner = Resources.toString(Resources.getResource("agent-banner.txt"), UTF_8);
log.info("\n{}", banner);
} catch (IllegalArgumentException | IOException ignored) {
// ignore
}
}
@Override
protected void shutDown() throws Exception {
if (server != null) {
server.stop();
}
hostInfoReporter.stopAsync().awaitTerminated();
agentInfoReporter.stopAsync().awaitTerminated();
environmentVariableReporter.stopAsync().awaitTerminated();
labelReporter.stopAsync().awaitTerminated();
agent.stopAsync().awaitTerminated();
if (serviceRegistrar != null) {
serviceRegistrar.close();
}
zkRegistrar.stopAsync().awaitTerminated();
model.stopAsync().awaitTerminated();
metrics.stop();
zooKeeperClient.close();
try {
stateLock.release();
} catch (IOException e) {
log.error("Failed to release state lock", e);
}
try {
stateLockFile.close();
} catch (IOException e) {
log.error("Failed to close state lock file", e);
}
}
@Override
public void start() throws Exception {
}
@Override
public void stop() throws Exception {
shutDown();
}
}