package com.hubspot.blazar.externalservice;

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import org.apache.curator.framework.recipes.leader.LeaderLatchListener;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Optional;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.hubspot.blazar.config.BlazarConfiguration;
import com.hubspot.singularity.SingularityState;
import com.hubspot.singularity.client.SingularityClient;

import io.dropwizard.lifecycle.Managed;
import io.reactivex.Observable;
import io.reactivex.disposables.Disposable;
import io.reactivex.schedulers.Schedulers;

/**
 * Periodically polls the state of every configured Singularity cluster and records whether
 * each cluster is healthy enough to receive builds. The polling only runs on the instance
 * that currently holds leadership.
 */
@Singleton
public class BuildClusterHealthChecker implements LeaderLatchListener, Managed {
  private static final int HEALTH_CHECK_INTERVAL_SECONDS = 10;
  private static final Logger LOG = LoggerFactory.getLogger(BuildClusterHealthChecker.class);

  private final AtomicBoolean leader;
  private final AtomicBoolean running;
  private final Map<String, SingularityClient> singularityClusterClients;
  private final BlazarConfiguration blazarConfiguration;
  private final Map<String, ClusterHealthCheck> clusterHealthCheckMap = new ConcurrentHashMap<>();
  private final AtomicReference<Disposable> clusterHealthObserver;

  @Inject
  public BuildClusterHealthChecker(Map<String, SingularityClient> singularityClusterClients,
                                   BlazarConfiguration blazarConfiguration) {
    this.singularityClusterClients = singularityClusterClients;
    this.blazarConfiguration = blazarConfiguration;
    leader = new AtomicBoolean(false);
    running = new AtomicBoolean(false);
    clusterHealthObserver = new AtomicReference<>();
  }

  @Override
  public void start() throws Exception {
    running.compareAndSet(false, true);
    LOG.info("We have been started");
  }

  @Override
  public void stop() throws Exception {
    running.compareAndSet(true, false);
    LOG.info("We have been stopped. Stopping build cluster health checks");
    Disposable disposable = clusterHealthObserver.get();
    if (disposable != null) {
      disposable.dispose();
    }
  }

  @Override
  public void isLeader() {
    LOG.info("We are the leader. Starting build cluster health check monitoring");
    leader.set(true);
    Disposable disposable = getObservableOfAllClustersHealth()
        .subscribe(clusterHealthCheck -> clusterHealthCheckMap.put(clusterHealthCheck.getClusterName(), clusterHealthCheck));
    // Dispose any subscription left over from a previous leadership term so we never
    // run two polling loops at once.
    Disposable previous = clusterHealthObserver.getAndSet(disposable);
    if (previous != null) {
      previous.dispose();
    }
  }
  @Override
  public void notLeader() {
    LOG.info("We are not the leader. Stopping build cluster health check monitoring");
    leader.set(false);
    Disposable disposable = clusterHealthObserver.get();
    if (disposable != null) {
      disposable.dispose();
    }
  }

  public boolean isSomeClusterAvailable() {
    return clusterHealthCheckMap.values().stream().anyMatch(ClusterHealthCheck::isHealthy);
  }

  public boolean isClusterAvailable(String clusterName) {
    ClusterHealthCheck healthCheck = clusterHealthCheckMap.get(clusterName);
    return healthCheck != null && healthCheck.isHealthy();
  }

  /**
   * Merges the per-cluster health check streams into a single stream covering all
   * configured clusters.
   */
  private Observable<ClusterHealthCheck> getObservableOfAllClustersHealth() {
    List<Observable<ClusterHealthCheck>> clusterHealthCheckObservables = singularityClusterClients.keySet().stream()
        .map(this::getObservableSingularityClusterHealth)
        .collect(Collectors.toList());
    return Observable.merge(clusterHealthCheckObservables)
        .subscribeOn(Schedulers.io())
        .observeOn(Schedulers.computation());
  }

  /**
   * Emits a {@link ClusterHealthCheck} for the given cluster every
   * {@value #HEALTH_CHECK_INTERVAL_SECONDS} seconds. A cluster that cannot be reached,
   * returns no state, or is overloaded is reported as unhealthy.
   */
  private Observable<ClusterHealthCheck> getObservableSingularityClusterHealth(String clusterName) {
    return Observable.interval(0, HEALTH_CHECK_INTERVAL_SECONDS, TimeUnit.SECONDS)
        .observeOn(Schedulers.io())
        .map(tick -> {
          SingularityClient singularityClient = singularityClusterClients.get(clusterName);
          try {
            SingularityState singularityState = singularityClient.getState(Optional.of(false), Optional.of(false));
            if (singularityState != null) {
              if (singularityClusterHasAvailableResources(singularityState)) {
                LOG.debug("Cluster {} is healthy", clusterName);
                return new ClusterHealthCheck(clusterName, true);
              } else {
                LOG.warn("Cluster {} does not have enough resources and will not be used for running builds in this cycle. The ratio of overdue tasks to active tasks is greater than 10% ({}%)",
                    clusterName, getRatioOfOverdueOverActiveTasks(singularityState));
                return new ClusterHealthCheck(clusterName, false);
              }
            }
            LOG.warn("Could not retrieve cluster state for cluster {}. It will not be used for running builds in this cycle", clusterName);
            return new ClusterHealthCheck(clusterName, false);
          } catch (Exception e) {
            LOG.warn("An error occurred while checking the health of cluster {}. It will be marked as unhealthy and retried in the next cycle", clusterName, e);
            return new ClusterHealthCheck(clusterName, false);
          }
        });
  }

  private boolean singularityClusterHasAvailableResources(SingularityState singularityState) {
    // If overdue tasks are more than 10% of active tasks we consider the cluster overloaded
    // and will not send it builds. For example, 5 late tasks out of 100 active is 5% (healthy),
    // while 15 late tasks out of 100 active is 15% (overloaded).
    return getRatioOfOverdueOverActiveTasks(singularityState) < 10;
  }

  private double getRatioOfOverdueOverActiveTasks(SingularityState singularityState) {
    // Guard against an idle cluster: with zero active tasks the division below yields
    // NaN (0/0) or Infinity, and NaN < 10 is false, which would wrongly mark an idle
    // cluster unhealthy. Any late tasks with no active tasks still count as overloaded.
    if (singularityState.getActiveTasks() == 0) {
      return singularityState.getLateTasks() == 0 ? 0 : 100;
    }
    return (double) singularityState.getLateTasks() / singularityState.getActiveTasks() * 100;
  }

  private static final class ClusterHealthCheck {
    private final String clusterName;
    private final boolean healthy;

    public ClusterHealthCheck(String clusterName, boolean healthy) {
      this.clusterName = clusterName;
      this.healthy = healthy;
    }

    public String getClusterName() {
      return clusterName;
    }

    public boolean isHealthy() {
      return healthy;
    }
  }
}
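// Usage sketch (not part of this class): one way a Dropwizard application could wire the
// checker into Curator leader election and the service lifecycle. This is a minimal,
// hypothetical example: the `injector`, `curatorFramework`, and `environment` variables
// and the "/blazar/leader-latch" path are assumptions, not code from this repository.
//
//   BuildClusterHealthChecker healthChecker = injector.getInstance(BuildClusterHealthChecker.class);
//   LeaderLatch leaderLatch = new LeaderLatch(curatorFramework, "/blazar/leader-latch");
//   leaderLatch.addListener(healthChecker);        // delivers isLeader()/notLeader() callbacks
//   environment.lifecycle().manage(healthChecker); // invokes start()/stop() via Managed
//   leaderLatch.start();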