package com.intrbiz.bergamot.health; import java.util.Map; import java.util.Set; import java.util.Timer; import java.util.TimerTask; import java.util.TreeSet; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.function.Consumer; import org.apache.log4j.Logger; import com.intrbiz.bergamot.health.model.KnownDaemon; import com.intrbiz.bergamot.model.message.health.HealthCheckHeartbeat; import com.intrbiz.bergamot.model.message.health.HealthCheckJoin; import com.intrbiz.bergamot.model.message.health.HealthCheckKill; import com.intrbiz.bergamot.model.message.health.HealthCheckMessage; import com.intrbiz.bergamot.model.message.health.HealthCheckRequestJoin; import com.intrbiz.bergamot.model.message.health.HealthCheckUnjoin; import com.intrbiz.bergamot.queue.HealthCheckQueue; import com.intrbiz.queue.Producer; import com.intrbiz.queue.name.NullKey; public class HealthTracker { private static final HealthTracker US = new HealthTracker(); public static HealthTracker getInstance() { return US; } private ConcurrentMap<UUID, KnownDaemon> knownDaemons = new ConcurrentHashMap<UUID, KnownDaemon>(); private volatile boolean inited = false; private HealthCheckQueue queue; @SuppressWarnings("unused") private com.intrbiz.queue.Consumer<HealthCheckMessage, NullKey> healthcheckConsumer; private Producer<HealthCheckMessage> healthcheckControlProducer; private Producer<HealthCheckMessage> healthcheckEventProducer; private Logger logger = Logger.getLogger(HealthTracker.class); private Timer timer; private final CopyOnWriteArrayList<Consumer<KnownDaemon>> alertHandlers = new CopyOnWriteArrayList<Consumer<KnownDaemon>>(); private HealthTracker() { super(); } public void init() { synchronized (this) { if (! this.inited) { this.inited = true; // setup queues this.setupQueue(); // request daemons to join this.requestJoin(); // setup the check timer this.setupTimer(); } } } public Set<KnownDaemon> getDaemons() { return new TreeSet<KnownDaemon>(this.knownDaemons.values()); } public KnownDaemon getDaemon(UUID instanceId) { return this.knownDaemons.get(instanceId); } public void removeDaemon(UUID daemon) { this.knownDaemons.remove(daemon); } public void addAlertHandler(Consumer<KnownDaemon> alertHandler) { this.alertHandlers.add(alertHandler); } public void removeAlertHandler(Consumer<KnownDaemon> alertHandler) { this.alertHandlers.remove(alertHandler); } /** * Unjoin the given instance from this health check cluster * @param instanceId the instance id to unjoin * @param daemonKind the daemin kind * @param daemonName the daemon name */ public void unjoinDaemon(UUID instanceId, String daemonKind, String daemonName) { this.healthcheckEventProducer.publish(new HealthCheckUnjoin(instanceId, daemonKind, daemonName)); } /** * Unjoin the given instance from this health check cluster * @param instanceId the instance id to unjoin */ public void unjoinDaemon(UUID instanceId) { this.unjoinDaemon(instanceId, null, "unknown"); } /** * Request that the given running daemon with the given instance and runtime id immediately terminates * @param instanceId the daemon instance id * @param runtimeId the daemon runtime id */ public void killDaemon(UUID instanceId, UUID runtimeId, String password) { this.healthcheckControlProducer.publish(new HealthCheckKill(instanceId, runtimeId, password)); } private void setupTimer() { this.timer = new Timer(); this.timer.scheduleAtFixedRate(new TimerTask() { @Override public void run() { checkDaemons(); } }, 11_000L, 5_000L); } private void checkDaemons() { for (KnownDaemon daemon : this.knownDaemons.values()) { if (daemon.isAlive() && daemon.isLastHeartbeatTooOld()) { // daemon was alive but we've missed enough heartbeats // for it to now be considered dead daemon.setAlive(false); daemon.incAlertCount(); daemon.setLastAlertTime(System.currentTimeMillis()); // raise alert for (Consumer<KnownDaemon> handler : this.alertHandlers) { handler.accept(daemon); } } else if ((! daemon.isAlive()) && daemon.isDaemonLongGone()) { // remove the daemon from our list this.knownDaemons.remove(daemon.getInstanceId()); } } } private void setupQueue() { this.queue = HealthCheckQueue.open(); this.healthcheckControlProducer = this.queue.publishHealthCheckControlEvents(); this.healthcheckEventProducer = this.queue.publishHealthCheckEvents(); this.healthcheckConsumer = this.queue.consumeHealthCheckEvents(this::handleMessage); } private void handleMessage(Map<String, Object> headers, HealthCheckMessage message) { if (message instanceof HealthCheckHeartbeat) { // heartbeat this.processHeartbeat((HealthCheckHeartbeat) message); } else if (message instanceof HealthCheckJoin) { // join this.processJoin((HealthCheckJoin) message); } else if (message instanceof HealthCheckUnjoin) { // unjoin this.processUnjoin((HealthCheckUnjoin) message); } } private void processHeartbeat(HealthCheckHeartbeat heartbeat) { // lookup the daemon KnownDaemon daemon = this.knownDaemons.get(heartbeat.getInstanceId()); if (daemon != null) { // update the state daemon.setLastHeartbeatSequence(heartbeat.getSequence()); // use nanoTime as that is monotonic daemon.setLastHeartbeatAt(System.nanoTime()); daemon.setLastHeartbeatTime(System.currentTimeMillis()); // recovery handling if (! daemon.isAlive()) { if (daemon.incRecoveryHeartbeatCount() >= 60) { // we've have 5 minutes of successful heartbeats daemon.incRecoveryCount(); daemon.setRecoveryHeartbeatCount(0); daemon.setAlive(true); daemon.setLastRecoveryTime(System.currentTimeMillis()); } } } else { logger.warn("Got heartbeat for unkown daemon " + heartbeat.getInstanceId() + ", requesting join"); this.requestJoin(); } } private void processUnjoin(HealthCheckUnjoin unjoin) { this.knownDaemons.remove(unjoin.getInstanceId()); logger.info("Received unjoin event from " + unjoin.getInstanceId() + " daemon " + unjoin.getDaemonName()); } private void processJoin(HealthCheckJoin join) { this.knownDaemons.put(join.getInstanceId(), new KnownDaemon(join.getInstanceId(), join.getRuntimeId(), join.getDaemonKind(), join.getDaemonName(), join.getStarted(), join.getHostId(), join.getHostName())); logger.info("Received join event from " + join.getInstanceId() + " daemon " + join.getDaemonKind() + "::" + join.getDaemonName()); } private void requestJoin() { this.healthcheckControlProducer.publish(new HealthCheckRequestJoin()); } }