package com.linkedin.d2.balancer.strategies.degrader; import com.linkedin.common.callback.Callback; import com.linkedin.common.util.None; import com.linkedin.d2.balancer.clients.TrackerClient; import com.linkedin.d2.balancer.util.RateLimitedLogger; import com.linkedin.d2.balancer.util.healthcheck.HealthCheck; import com.linkedin.d2.balancer.util.healthcheck.HealthCheckClientBuilder; import com.linkedin.util.clock.Clock; import java.net.URISyntaxException; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * DegraderLoadBalancerQuarantine quarantines the TrackerClients with problems. The advantages * of using quarantine includes: * * . Quick isolating the single host/service failures * . Can use sideband idempotent requests (instead of real traffic) to check/monitor the hosts * with problem. * . Exponential backoff checking avoid unnecessary operations for bad hosts/networks. * * The quarantine state transition: * * +-----------------+ Send Reqs +----------------+ +------------------+ * | +--------------->| | success | | * | FAILURE | | WAIT +-------->| SUCCESS | * | |<---------------+ | | | * +-----------------+ Req Failed +----------------+ +------------------+ * (exponential backoff before send req again) * * * Note: DegraderLoadBalancerQuarantine is not thread safe and supposed to updated only under the * lock of PartitionState update. */ public class DegraderLoadBalancerQuarantine { private enum QuarantineStates { FAILURE, WAIT, SUCCESS, DISABLED, } private static final Logger _log = LoggerFactory.getLogger(DegraderLoadBalancerQuarantine.class); private static final long ERROR_REPORT_PERIOD = 60 * 1000; // Millisecond = 1 minute private volatile QuarantineStates _quarantineState; // TrackerClient with problem final private TrackerClient _trackerClient; final private HealthCheck _healthCheckClient; final private String _serviceName; final private ScheduledExecutorService _executorService; final private Clock _clock; final private long _timeBetweenHC; private volatile boolean _isShutdown; private long _lastChecked; // Waiting duration, ie the exponential back off time private long _timeTilNextCheck; final private DegraderLoadBalancerStrategyConfig _config; private final RateLimitedLogger _rateLimitedLogger; DegraderLoadBalancerQuarantine(TrackerClientUpdater client, DegraderLoadBalancerStrategyConfig config, String serviceName) { _trackerClient = client.getTrackerClient(); _config = config; _executorService = config.getExecutorService(); _clock = config.getClock(); _timeBetweenHC = DegraderLoadBalancerStrategyConfig.DEFAULT_QUARANTINE_CHECK_INTERVAL; _serviceName = serviceName; _quarantineState = QuarantineStates.FAILURE; // Initial interval is the same as update interval _timeTilNextCheck = config.getUpdateIntervalMs(); _lastChecked = Integer.MIN_VALUE; _isShutdown = false; _rateLimitedLogger = new RateLimitedLogger(_log, ERROR_REPORT_PERIOD, config.getClock()); if (_timeBetweenHC < _config.getQuarantineLatency()) { _log.error("Illegal quarantine configurations for service {}: Interval {} too short", _serviceName, _timeBetweenHC); throw new IllegalArgumentException("Quarantine interval too short"); } // create healthCheckClient for the trackerClient. The quarantine object will be saved for future // use so this only need once for each trackerClient. HealthCheck healthCheckClient = null; try { healthCheckClient = new HealthCheckClientBuilder() .setHealthCheckOperations(config.getHealthCheckOperations()) .setHealthCheckPath(config.getHealthCheckPath()) .setServicePath(config.getServicePath()) .setClock(config.getClock()) .setLatency(config.getQuarantineLatency()) .setMethod(config.getHealthCheckMethod()) .setClient(_trackerClient) .build(); } catch (URISyntaxException e) { _log.error("Error to generate healthCheckClient", e); } _healthCheckClient = healthCheckClient; } /** * healthCheckNTimes responsible for checking the health of the transportClient multiple times * at the given interval. * * @param n: repeat times */ private void healthCheckNTimes(int n) { if (n <= 0 || _isShutdown) { return; } final long startTime = _clock.currentTimeMillis(); Callback<None> healthCheckCallback = new Callback<None>() { @Override public void onError(Throwable e) { _rateLimitedLogger.warn("Healthchecking failed for {} (service={}): {}", new Object[] {_trackerClient.getUri(), _serviceName, e}); _quarantineState = QuarantineStates.FAILURE; } @Override public void onSuccess(None result) { if (n > 1) { // do not schedule next checking if _isShutdown flag is set if (!_isShutdown) { // schedule next check long nextCheckDelay = _timeBetweenHC - (_clock.currentTimeMillis() - startTime); if (nextCheckDelay > 0) { _executorService.schedule(() -> healthCheckNTimes(n - 1), nextCheckDelay, TimeUnit.MILLISECONDS); } else { // should never happen since the delay time should be within the range for a successful callback. _log.error("Delay exceeded the defined checking interval"); } } } else { _quarantineState = QuarantineStates.SUCCESS; } } }; _healthCheckClient.checkHealth(healthCheckCallback); } /** * Check and update the quarantine state * @return: true if current client is ready to exist quarantine, false otherwise. */ boolean checkUpdateQuarantineState() { long currentTime = _config.getClock().currentTimeMillis(); _lastChecked = currentTime; int repeatNum = DegraderLoadBalancerStrategyConfig.DEFAULT_QUARANTINE_CHECKNUM; switch(_quarantineState) { case DISABLED: throw new IllegalStateException("State update for disabled quarantine"); case FAILURE: if (_isShutdown) { _log.error("Could not check quarantine state since the executor is shutdown"); } else { // Either this is a newly quarantined host, or previous checking fails. // Schedule new health checking task _executorService.schedule(() -> healthCheckNTimes(repeatNum), _timeTilNextCheck, TimeUnit.MILLISECONDS); // exponential backoff: double the interval time _timeTilNextCheck *= 2; _quarantineState = QuarantineStates.WAIT; } break; case WAIT: // Nothing to do for now. Just keep waiting if (_timeTilNextCheck > ERROR_REPORT_PERIOD) { _rateLimitedLogger.error("Client {} for service {} is being kept in quarantine for {} seconds, " + "Please check to make sure it is healthy", new Object[] {_trackerClient.getUri(), _serviceName, (1.0 *_timeTilNextCheck / 1000)}); } break; case SUCCESS: // success! ready to evict current trackerclient out of quarantine _quarantineState = QuarantineStates.DISABLED; _log.info("checkUpdateQuarantineState: quarantine state for client {} service {} is DISABLED", _trackerClient.getUri(), _serviceName); return true; } return false; } /** * To shutdown quarantine, we only need to stop sending new requests. * Shutting down the executor is not feasible, because it is shared among strategies. */ public void shutdown() { if (_isShutdown) { _log.error("Quarantine already shutdown"); return; } _isShutdown = true; } /** * When resetInterval set to true, reset the interval time to Update Interval time. * Otherwise reuse the existing interval time * @param resetInterval: */ public void reset(boolean resetInterval) { _quarantineState = QuarantineStates.FAILURE; if (resetInterval) { _timeTilNextCheck = _config.getUpdateIntervalMs(); } else { _log.warn("HealthCheck: Interval {}ms for client {}", _timeTilNextCheck, _trackerClient.getUri()); } } long getLastChecked() { return _lastChecked; } // For testing only HealthCheck getHealthCheckClient() { return _healthCheckClient; } @Override public String toString() { return "TrackerClientQuarantine [_client=" + _trackerClient.getUri() + ", _quarantineState=" + _quarantineState + ", _timeTilNextCheck=" + (_timeTilNextCheck / 1000) + "s" +"]"; } }