/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.brooklyn.policy.ha; import java.util.Map; import java.util.Map.Entry; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import org.apache.brooklyn.api.sensor.SensorEvent; import org.apache.brooklyn.config.ConfigKey; import org.apache.brooklyn.core.config.BasicConfigKey; import org.apache.brooklyn.core.config.ConfigKeys; import org.apache.brooklyn.core.entity.Attributes; import org.apache.brooklyn.core.entity.EntityInternal; import org.apache.brooklyn.core.entity.lifecycle.Lifecycle; import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic; import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic.ComputeServiceState; import org.apache.brooklyn.core.sensor.BasicNotificationSensor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.brooklyn.policy.ha.HASensors.FailureDescriptor; import org.apache.brooklyn.util.collections.MutableMap; import org.apache.brooklyn.util.core.config.ConfigBag; import org.apache.brooklyn.util.core.flags.SetFromFlag; import org.apache.brooklyn.util.core.task.BasicTask; import org.apache.brooklyn.util.core.task.ScheduledTask; import org.apache.brooklyn.util.exceptions.Exceptions; import org.apache.brooklyn.util.time.Duration; import org.apache.brooklyn.util.time.Time; /** * Emits {@link HASensors#ENTITY_FAILED} whenever the parent's default logic ({@link ComputeServiceState}) would detect a problem, * and similarly {@link HASensors#ENTITY_RECOVERED} when recovered. * <p> * gives more control over suppressing {@link Lifecycle#ON_FIRE}, * for some period of time * (or until another process manually sets {@link Attributes#SERVICE_STATE_ACTUAL} to {@value Lifecycle#ON_FIRE}, * which this enricher will not clear until all problems have gone away) */ //@Catalog(name="Service Failure Detector", description="HA policy for deteting failure of a service") public class ServiceFailureDetector extends ServiceStateLogic.ComputeServiceState { // TODO Remove duplication between this and MemberFailureDetectionPolicy. // The latter could be re-written to use this. Or could even be deprecated // in favour of this. public enum LastPublished { NONE, FAILED, RECOVERED; } private static final Logger LOG = LoggerFactory.getLogger(ServiceFailureDetector.class); private static final long MIN_PERIOD_BETWEEN_EXECS_MILLIS = 100; public static final BasicNotificationSensor<FailureDescriptor> ENTITY_FAILED = HASensors.ENTITY_FAILED; @SetFromFlag("onlyReportIfPreviouslyUp") public static final ConfigKey<Boolean> ENTITY_FAILED_ONLY_IF_PREVIOUSLY_UP = ConfigKeys.newBooleanConfigKey("onlyReportIfPreviouslyUp", "Prevents the policy from emitting ENTITY_FAILED if the entity fails on startup (ie has never been up)", true); public static final ConfigKey<Boolean> MONITOR_SERVICE_PROBLEMS = ConfigKeys.newBooleanConfigKey("monitorServiceProblems", "Whether to monitor service problems, and emit on failures there (if set to false, this monitors only service up)", true); @SetFromFlag("serviceOnFireStabilizationDelay") public static final ConfigKey<Duration> SERVICE_ON_FIRE_STABILIZATION_DELAY = BasicConfigKey.builder(Duration.class) .name("serviceOnFire.stabilizationDelay") .description("Time period for which the service must be consistently down for (e.g. doesn't report down-up-down) before concluding ON_FIRE") .defaultValue(Duration.ZERO) .build(); @SetFromFlag("entityFailedStabilizationDelay") public static final ConfigKey<Duration> ENTITY_FAILED_STABILIZATION_DELAY = BasicConfigKey.builder(Duration.class) .name("entityFailed.stabilizationDelay") .description("Time period for which the service must be consistently down for (e.g. doesn't report down-up-down) before emitting ENTITY_FAILED") .defaultValue(Duration.ZERO) .build(); @SetFromFlag("entityRecoveredStabilizationDelay") public static final ConfigKey<Duration> ENTITY_RECOVERED_STABILIZATION_DELAY = BasicConfigKey.builder(Duration.class) .name("entityRecovered.stabilizationDelay") .description("For a failed entity, time period for which the service must be consistently up for (e.g. doesn't report up-down-up) before emitting ENTITY_RECOVERED") .defaultValue(Duration.ZERO) .build(); @SetFromFlag("entityFailedRepublishTime") public static final ConfigKey<Duration> ENTITY_FAILED_REPUBLISH_TIME = BasicConfigKey.builder(Duration.class) .name("entityFailed.republishTime") .description("Publish failed state periodically at the specified intervals, null to disable.") .build(); protected Long firstUpTime; protected Long currentFailureStartTime = null; protected Long currentRecoveryStartTime = null; protected Long publishEntityFailedTime = null; protected Long publishEntityRecoveredTime = null; protected Long setEntityOnFireTime = null; protected LastPublished lastPublished = LastPublished.NONE; private final AtomicBoolean executorQueued = new AtomicBoolean(false); private volatile long executorTime = 0; /** * TODO Really don't want this mutex! * ServiceStateLogic.setExpectedState() will call into `onEvent(null)`, so could get concurrent calls. * How to handle that? I don't think `ServiceStateLogic.setExpectedState` should be making the call, but * presumably that is their to remove a race condition so it is set before method returns. Caller shouldn't * rely on that though. * e.g. see `ServiceFailureDetectorTest.testNotifiedOfFailureOnStateOnFire`, where we get two notifications. */ private final Object mutex = new Object(); public ServiceFailureDetector() { this(new ConfigBag()); } public ServiceFailureDetector(Map<String,?> flags) { this(new ConfigBag().putAll(flags)); } public ServiceFailureDetector(ConfigBag configBag) { // TODO hierarchy should use ConfigBag, and not change flags super(configBag.getAllConfigMutable()); } @Override public void onEvent(SensorEvent<Object> event) { if (firstUpTime==null) { if (event!=null && Attributes.SERVICE_UP.equals(event.getSensor()) && Boolean.TRUE.equals(event.getValue())) { firstUpTime = event.getTimestamp(); } else if (event == null && Boolean.TRUE.equals(entity.getAttribute(Attributes.SERVICE_UP))) { // If this enricher is registered after the entity is up, then we'll get a "synthetic" onEvent(null) firstUpTime = System.currentTimeMillis(); } } super.onEvent(event); } @Override protected void setActualState(Lifecycle state) { long now = System.currentTimeMillis(); synchronized (mutex) { if (state==Lifecycle.ON_FIRE) { if (lastPublished == LastPublished.FAILED) { if (currentRecoveryStartTime != null) { if (LOG.isDebugEnabled()) LOG.debug("{} health-check for {}, component was recovering, now failing: {}", new Object[] {this, entity, getExplanation(state)}); currentRecoveryStartTime = null; publishEntityRecoveredTime = null; } else { if (LOG.isTraceEnabled()) LOG.trace("{} health-check for {}, component still failed: {}", new Object[] {this, entity, getExplanation(state)}); } } else { if (firstUpTime == null && getConfig(ENTITY_FAILED_ONLY_IF_PREVIOUSLY_UP)) { // suppress; won't publish } else if (currentFailureStartTime == null) { if (LOG.isDebugEnabled()) LOG.debug("{} health-check for {}, component now failing: {}", new Object[] {this, entity, getExplanation(state)}); currentFailureStartTime = now; publishEntityFailedTime = currentFailureStartTime + getConfig(ENTITY_FAILED_STABILIZATION_DELAY).toMilliseconds(); } else { if (LOG.isTraceEnabled()) LOG.trace("{} health-check for {}, component continuing failing: {}", new Object[] {this, entity, getExplanation(state)}); } } if (setEntityOnFireTime == null) { setEntityOnFireTime = now + getConfig(SERVICE_ON_FIRE_STABILIZATION_DELAY).toMilliseconds(); } currentRecoveryStartTime = null; publishEntityRecoveredTime = null; } else if (state == Lifecycle.RUNNING) { if (lastPublished == LastPublished.FAILED) { if (currentRecoveryStartTime == null) { if (LOG.isDebugEnabled()) LOG.debug("{} health-check for {}, component now recovering: {}", new Object[] {this, entity, getExplanation(state)}); currentRecoveryStartTime = now; publishEntityRecoveredTime = currentRecoveryStartTime + getConfig(ENTITY_RECOVERED_STABILIZATION_DELAY).toMilliseconds(); } else { if (LOG.isTraceEnabled()) LOG.trace("{} health-check for {}, component continuing recovering: {}", new Object[] {this, entity, getExplanation(state)}); } } else { if (currentFailureStartTime != null) { if (LOG.isDebugEnabled()) LOG.debug("{} health-check for {}, component was failing, now healthy: {}", new Object[] {this, entity, getExplanation(state)}); } else { if (LOG.isTraceEnabled()) LOG.trace("{} health-check for {}, component still healthy: {}", new Object[] {this, entity, getExplanation(state)}); } } currentFailureStartTime = null; publishEntityFailedTime = null; setEntityOnFireTime = null; } else { if (LOG.isTraceEnabled()) LOG.trace("{} health-check for {}, in unconfirmed sate: {}", new Object[] {this, entity, getExplanation(state)}); } long recomputeIn = Long.MAX_VALUE; // For whether to call recomputeAfterDelay if (publishEntityFailedTime != null) { long delayBeforeCheck = publishEntityFailedTime - now; if (delayBeforeCheck<=0) { if (LOG.isDebugEnabled()) LOG.debug("{} publishing failed (state={}; currentFailureStartTime={}; now={}", new Object[] {this, state, Time.makeDateString(currentFailureStartTime), Time.makeDateString(now)}); Duration republishDelay = getConfig(ENTITY_FAILED_REPUBLISH_TIME); if (republishDelay == null) { publishEntityFailedTime = null; } else { publishEntityFailedTime = now + republishDelay.toMilliseconds(); recomputeIn = Math.min(recomputeIn, republishDelay.toMilliseconds()); } lastPublished = LastPublished.FAILED; entity.sensors().emit(HASensors.ENTITY_FAILED, new HASensors.FailureDescriptor(entity, getFailureDescription(now))); } else { recomputeIn = Math.min(recomputeIn, delayBeforeCheck); } } else if (publishEntityRecoveredTime != null) { long delayBeforeCheck = publishEntityRecoveredTime - now; if (delayBeforeCheck<=0) { if (LOG.isDebugEnabled()) LOG.debug("{} publishing recovered (state={}; currentRecoveryStartTime={}; now={}", new Object[] {this, state, Time.makeDateString(currentRecoveryStartTime), Time.makeDateString(now)}); publishEntityRecoveredTime = null; lastPublished = LastPublished.RECOVERED; entity.sensors().emit(HASensors.ENTITY_RECOVERED, new HASensors.FailureDescriptor(entity, null)); } else { recomputeIn = Math.min(recomputeIn, delayBeforeCheck); } } if (setEntityOnFireTime != null) { long delayBeforeCheck = setEntityOnFireTime - now; if (delayBeforeCheck<=0) { if (LOG.isDebugEnabled()) LOG.debug("{} setting on-fire, now that deferred period has passed (state={})", new Object[] {this, state}); setEntityOnFireTime = null; super.setActualState(state); } else { recomputeIn = Math.min(recomputeIn, delayBeforeCheck); } } else { super.setActualState(state); } if (recomputeIn < Long.MAX_VALUE) { recomputeAfterDelay(recomputeIn); } } } protected String getExplanation(Lifecycle state) { Duration serviceFailedStabilizationDelay = getConfig(ENTITY_FAILED_STABILIZATION_DELAY); Duration serviceRecoveredStabilizationDelay = getConfig(ENTITY_RECOVERED_STABILIZATION_DELAY); return String.format("location=%s; status=%s; lastPublished=%s; timeNow=%s; "+ "currentFailurePeriod=%s; currentRecoveryPeriod=%s", entity.getLocations(), (state != null ? state : "<unreported>"), lastPublished, Time.makeDateString(System.currentTimeMillis()), (currentFailureStartTime != null ? getTimeStringSince(currentFailureStartTime) : "<none>") + " (stabilization "+Time.makeTimeStringRounded(serviceFailedStabilizationDelay) + ")", (currentRecoveryStartTime != null ? getTimeStringSince(currentRecoveryStartTime) : "<none>") + " (stabilization "+Time.makeTimeStringRounded(serviceRecoveredStabilizationDelay) + ")"); } private String getFailureDescription(long now) { String description = null; Map<String, Object> serviceProblems = entity.getAttribute(Attributes.SERVICE_PROBLEMS); if (serviceProblems!=null && !serviceProblems.isEmpty()) { Entry<String, Object> problem = serviceProblems.entrySet().iterator().next(); description = problem.getKey()+": "+problem.getValue(); if (serviceProblems.size()>1) { description = serviceProblems.size()+" service problems, including "+description; } else { description = "service problem: "+description; } } else if (Boolean.FALSE.equals(entity.getAttribute(Attributes.SERVICE_UP))) { description = "service not up"; } else { description = "service failure detected"; } if (publishEntityFailedTime!=null && currentFailureStartTime!=null && publishEntityFailedTime > currentFailureStartTime) description = " (stabilized for "+Duration.of(now - currentFailureStartTime, TimeUnit.MILLISECONDS)+")"; return description; } @SuppressWarnings({ "unchecked", "rawtypes" }) protected void recomputeAfterDelay(long delay) { if (isRunning() && executorQueued.compareAndSet(false, true)) { long now = System.currentTimeMillis(); delay = Math.max(0, Math.max(delay, (executorTime + MIN_PERIOD_BETWEEN_EXECS_MILLIS) - now)); if (LOG.isTraceEnabled()) LOG.trace("{} scheduling publish in {}ms", this, delay); Runnable job = new Runnable() { @Override public void run() { try { executorTime = System.currentTimeMillis(); executorQueued.set(false); onEvent(null); } catch (Exception e) { if (isRunning()) { LOG.error("Error in enricher "+this+": "+e, e); } else { if (LOG.isDebugEnabled()) LOG.debug("Error in enricher "+this+" (but no longer running): "+e, e); } } catch (Throwable t) { LOG.error("Error in enricher "+this+": "+t, t); throw Exceptions.propagate(t); } } }; ScheduledTask task = new ScheduledTask(MutableMap.of("delay", Duration.of(delay, TimeUnit.MILLISECONDS)), new BasicTask(job)); ((EntityInternal)entity).getExecutionContext().submit(task); } } private String getTimeStringSince(Long time) { return time == null ? null : Time.makeTimeStringRounded(System.currentTimeMillis() - time); } }