/*-
 * -\-\-
 * Helios Services
 * --
 * Copyright (C) 2016 Spotify AB
 * --
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * -/-/-
 */

package com.spotify.helios.agent;

import static java.util.concurrent.TimeUnit.SECONDS;

import com.google.common.base.Splitter;
import com.google.common.base.Strings;
import com.google.common.base.Throwables;
import com.google.common.collect.Iterables;
import com.spotify.docker.client.DockerClient;
import com.spotify.docker.client.DockerHost;
import com.spotify.docker.client.LogStream;
import com.spotify.docker.client.exceptions.DockerException;
import com.spotify.docker.client.messages.ExecCreation;
import com.spotify.helios.common.descriptors.ExecHealthCheck;
import com.spotify.helios.common.descriptors.HealthCheck;
import com.spotify.helios.common.descriptors.HttpHealthCheck;
import com.spotify.helios.common.descriptors.TcpHealthCheck;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Socket;
import java.net.URL;
import java.util.Arrays;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Static factory that builds the {@link HealthChecker} implementation matching the type of
 * health check configured on a task (exec, HTTP or TCP).
 */
public final class HealthCheckerFactory {

  private HealthCheckerFactory() {
  }

  /**
   * Creates the health checker for the health check configured in {@code taskConfig}.
   *
   * @param taskConfig       the task whose health check (and port mappings) should be used
   * @param docker           docker client handed to the created checker
   * @param dockerHost       docker host handed to the HTTP and TCP checkers
   * @param agentInContainer passed to the HTTP checker, which uses it to decide whether to
   *                         probe via the container's bridge address
   * @return a checker for the configured health check, or {@code null} if the task has no
   *         health check configured
   * @throws IllegalArgumentException if the configured health check is of an unknown type
   */
  public static HealthChecker create(final TaskConfig taskConfig,
                                     final DockerClient docker,
                                     final DockerHost dockerHost,
                                     final boolean agentInContainer) {
    final HealthCheck healthCheck = taskConfig.healthCheck();

    if (healthCheck == null) {
      return null;
    } else if (healthCheck instanceof ExecHealthCheck) {
      return new ExecHealthChecker((ExecHealthCheck) healthCheck, docker);
    } else if (healthCheck instanceof HttpHealthCheck) {
      return new HttpHealthChecker((HttpHealthCheck) healthCheck, taskConfig, docker, dockerHost,
          agentInContainer);
    } else if (healthCheck instanceof TcpHealthCheck) {
      return new TcpHealthChecker((TcpHealthCheck) healthCheck, taskConfig, docker, dockerHost);
    }

    throw new IllegalArgumentException("Unknown healthCheck type");
  }

  /**
   * Health checker that runs the configured command inside the container via docker exec and
   * treats a zero exit code as healthy.
   */
  static class ExecHealthChecker implements HealthChecker {

    private static final Logger log = LoggerFactory.getLogger(ExecHealthChecker.class);

    private final ExecHealthCheck healthCheck;
    private final DockerClient docker;

    ExecHealthChecker(final ExecHealthCheck healthCheck, final DockerClient docker) {
      this.healthCheck = healthCheck;
      this.docker = docker;
    }

    /**
     * Executes the health check command in the given container.
     *
     * @param containerId the container to run the command in
     * @return {@code true} if the command exited with code 0; {@code false} if it exited
     *         non-zero, if docker reported an error, or if the thread was interrupted
     * @throws UnsupportedOperationException if the docker API version is not recognised as
     *                                       supporting exec health checks
     */
    @Override
    public boolean check(final String containerId) {
      // Make sure we are on a docker version that supports exec health checks
      if (!compatibleDockerVersion(docker)) {
        throw new UnsupportedOperationException(
            "docker exec healthcheck is not supported on your docker version");
      }

      final String[] cmd =
          healthCheck.getCommand().toArray(new String[healthCheck.getCommand().size()]);

      try {
        final ExecCreation execCreation = docker.execCreate(
            containerId, cmd,
            DockerClient.ExecCreateParam.attachStdout(),
            DockerClient.ExecCreateParam.attachStderr());
        final String execId = execCreation.id();

        // The stream is read to the end before the exit code is inspected.
        String output = "";
        try (LogStream stream = docker.execStart(execId)) {
          output = stream.readFully();
        }

        final int exitCode = docker.execInspect(execId).exitCode();
        if (exitCode != 0) {
          log.warn("exec healthcheck containerId={} cmd={} failed with exitCode={} output={}",
              containerId, Arrays.toString(cmd), exitCode, output);
          return false;
        }

        return true;
      } catch (DockerException e) {
        log.warn("exec healthcheck containerId={} cmd={} failed due to DockerException",
            containerId, Arrays.toString(cmd), e);
        return false;
      } catch (InterruptedException e) {
        log.warn("exec healthcheck containerId={} cmd={} failed due to InterruptedException",
            containerId, Arrays.toString(cmd), e);
        // Restore the interrupt flag so callers further up can observe the interruption.
        Thread.currentThread().interrupt();
        return false;
      }
    }

    /**
     * Reports whether the docker daemon's API version is at least 1.18, the minimum this
     * checker accepts for exec health checks.
     *
     * @return {@code false} if the version cannot be fetched, is empty, or cannot be parsed
     */
    private static boolean compatibleDockerVersion(final DockerClient docker) {
      final String apiVersion;
      try {
        apiVersion = docker.version().apiVersion();
      } catch (DockerException e) {
        log.warn("unable to determine docker api version for exec healthcheck", e);
        return false;
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        return false;
      }

      if (Strings.isNullOrEmpty(apiVersion)) {
        return false;
      }

      final Iterable<String> split = Splitter.on(".").split(apiVersion);
      final int major;
      final int minor;
      try {
        major = Integer.parseInt(Iterables.get(split, 0, "0"));
        minor = Integer.parseInt(Iterables.get(split, 1, "0"));
      } catch (NumberFormatException e) {
        // A non-numeric component is treated as "unknown version" rather than crashing the check.
        log.warn("unable to parse docker api version {}", apiVersion, e);
        return false;
      }

      // Any major version above 1 is also accepted, not only 1.x with x >= 18.
      return major > 1 || (major == 1 && minor >= 18);
    }
  }

  /**
   * Base class for checkers that probe a container over the network; provides lookups shared
   * by the HTTP and TCP checkers.
   */
  private abstract static class NetworkHealthchecker implements HealthChecker {

    private final DockerClient dockerClient;

    protected NetworkHealthchecker(final DockerClient dockerClient) {
      this.dockerClient = dockerClient;
    }

    /**
     * Returns the gateway address from the network settings docker reports for the container.
     */
    protected String getBridgeAddress(String containerId)
        throws DockerException, InterruptedException {
      return dockerClient.inspectContainer(containerId).networkSettings().gateway();
    }

    /**
     * Looks up the external port mapped to the named port of the task.
     *
     * @throws IllegalStateException if the task has no mapping for {@code portName} or the
     *                               mapping has no external port
     */
    protected static int resolveExternalPort(final TaskConfig taskConfig, final String portName) {
      if (taskConfig.ports().get(portName) == null) {
        throw new IllegalStateException(
            "no port mapping named '" + portName + "' found for healthcheck");
      }
      final Integer externalPort = taskConfig.ports().get(portName).getExternalPort();
      if (externalPort == null) {
        throw new IllegalStateException(
            "port mapping '" + portName + "' has no external port for healthcheck");
      }
      return externalPort;
    }
  }

  /**
   * Health checker that issues an HTTP request to the task's mapped port and treats any
   * status in the range 200-399 as healthy.
   */
  private static class HttpHealthChecker extends NetworkHealthchecker {

    private static final Logger log = LoggerFactory.getLogger(HttpHealthChecker.class);

    private static final int CONNECT_TIMEOUT_MILLIS = 500;
    private static final long READ_TIMEOUT_MILLIS = SECONDS.toMillis(10);

    private final HttpHealthCheck healthCheck;
    private final TaskConfig taskConfig;
    private final DockerHost dockerHost;
    private final boolean agentInContainer;

    private HttpHealthChecker(final HttpHealthCheck healthCheck,
                              final TaskConfig taskConfig,
                              final DockerClient dockerClient,
                              final DockerHost dockerHost,
                              final boolean agentInContainer) {
      super(dockerClient);
      this.healthCheck = healthCheck;
      this.taskConfig = taskConfig;
      this.dockerHost = dockerHost;
      this.agentInContainer = agentInContainer;
    }

    /**
     * Performs the HTTP health check against the given container.
     *
     * @param containerId the container being checked
     * @return {@code true} if the response status is between 200 and 399 inclusive;
     *         {@code false} otherwise or if the request fails with an exception
     * @throws IllegalStateException if no external port is mapped for the health check port
     * @throws RuntimeException      if the health check URL is malformed
     */
    @Override
    public boolean check(final String containerId) throws InterruptedException, DockerException {
      final String host;

      // Special case for running the agent inside helios-solo and DOCKER_HOST is a unix socket:
      // in this case we cannot reach the job's container with "localhost" at the external port
      // since "localhost" will refer to the agent's container and its network namespace.
      // The agent is only run in a container sibling to the job's container when in helios-solo.
      if (agentInContainer && dockerHost.host().startsWith("unix://")) {
        host = getBridgeAddress(containerId);
        log.info("Using bridge address {} for healthchecks", host);
      } else {
        host = dockerHost.address();
      }

      // Fails with a descriptive exception instead of a bare NPE when the port is not mapped.
      final int port = resolveExternalPort(taskConfig, healthCheck.getPort());

      final URL url;
      try {
        url = new URL("http", host, port, healthCheck.getPath());
      } catch (MalformedURLException e) {
        log.warn("MalformedURLException in http healthchecking containerId={}", containerId, e);
        throw new RuntimeException(e);
      }

      log.info("about to http healthcheck containerId={} with url={} for task={}",
          containerId, url, taskConfig);

      HttpURLConnection conn = null;
      try {
        conn = (HttpURLConnection) url.openConnection();
        conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        conn.setReadTimeout((int) READ_TIMEOUT_MILLIS);

        final int response = conn.getResponseCode();
        final boolean healthy = response >= 200 && response <= 399;
        // Only unhealthy responses are worth a warning; healthy ones are logged at info.
        if (healthy) {
          log.info("http healthcheck for containerId={} with url={} returned status={}",
              containerId, url, response);
        } else {
          log.warn("http healthcheck for containerId={} with url={} returned status={}",
              containerId, url, response);
        }
        return healthy;
      } catch (Exception e) {
        log.warn("exception in http healthchecking containerId={} with url={}",
            containerId, url, e);
        return false;
      } finally {
        // Release the connection; the response body is never read by this check.
        if (conn != null) {
          conn.disconnect();
        }
      }
    }
  }

  /**
   * Health checker that considers the task healthy when a TCP connection to its mapped port
   * can be established within the connect timeout.
   */
  private static class TcpHealthChecker extends NetworkHealthchecker {

    private static final Logger log = LoggerFactory.getLogger(TcpHealthChecker.class);

    private static final int CONNECT_TIMEOUT_MILLIS = 500;

    private final TcpHealthCheck healthCheck;
    private final TaskConfig taskConfig;
    private final DockerHost dockerHost;

    private TcpHealthChecker(final TcpHealthCheck healthCheck,
                             final TaskConfig taskConfig,
                             final DockerClient docker,
                             final DockerHost dockerHost) {
      super(docker);
      this.healthCheck = healthCheck;
      this.taskConfig = taskConfig;
      this.dockerHost = dockerHost;
    }

    /**
     * Performs the TCP health check against the given container.
     *
     * @param containerId the container being checked
     * @return {@code true} if the connection was established; {@code false} if connecting
     *         failed with any exception
     * @throws IllegalStateException if no external port is mapped for the health check port
     */
    @Override
    public boolean check(final String containerId) throws InterruptedException, DockerException {
      // Fails with a descriptive exception instead of a bare NPE when the port is not mapped.
      final int port = resolveExternalPort(taskConfig, healthCheck.getPort());

      InetSocketAddress address = new InetSocketAddress(dockerHost.address(), port);

      // getAddress() is null for an unresolved host; in that case skip the loopback test and
      // let the connect attempt below fail and report the check as unhealthy.
      if (!address.isUnresolved() && address.getAddress().isLoopbackAddress()) {
        // tcp connections to a container-mapped port on loopback always succeed,
        // regardless of if the container is listening or not. use the bridge address instead.
        address = new InetSocketAddress(getBridgeAddress(containerId), port);
      }

      log.info("about to tcp healthcheck containerId={} with address={} for task={}",
          containerId, address, taskConfig);

      try (final Socket s = new Socket()) {
        s.connect(address, CONNECT_TIMEOUT_MILLIS);
      } catch (Exception e) {
        log.warn("tcp healthcheck failed for containerId={} due to exception={}",
            containerId, e.toString());
        return false;
      }

      return true;
    }
  }
}