/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.brooklyn.policy.ha;

import java.util.Map;
import java.util.concurrent.atomic.AtomicReference;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.brooklyn.api.catalog.Catalog;
import org.apache.brooklyn.api.entity.EntityLocal;
import org.apache.brooklyn.api.sensor.Sensor;
import org.apache.brooklyn.api.sensor.SensorEvent;
import org.apache.brooklyn.api.sensor.SensorEventListener;
import org.apache.brooklyn.config.ConfigKey;
import org.apache.brooklyn.core.config.ConfigKeys;
import org.apache.brooklyn.core.entity.Entities;
import org.apache.brooklyn.core.entity.EntityInternal;
import org.apache.brooklyn.core.entity.lifecycle.Lifecycle;
import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic;
import org.apache.brooklyn.core.entity.trait.Startable;
import org.apache.brooklyn.core.policy.AbstractPolicy;
import org.apache.brooklyn.core.sensor.BasicNotificationSensor;
import org.apache.brooklyn.policy.ha.HASensors.FailureDescriptor;
import org.apache.brooklyn.util.collections.MutableMap;
import org.apache.brooklyn.util.core.config.ConfigBag;
import org.apache.brooklyn.util.core.flags.SetFromFlag;
import org.apache.brooklyn.util.javalang.JavaClassNames;
import org.apache.brooklyn.util.time.Duration;
import org.apache.brooklyn.util.time.Time;

import com.google.common.base.Preconditions;

/**
 * Attaches to a SoftwareProcess (or anything {@link Startable} that emits
 * {@link HASensors#ENTITY_FAILED} or another configurable sensor) and invokes
 * restart on failure.
 * <p>
 * If there is a subsequent failure within a configurable time interval, or if the
 * restart itself fails, this gives up and emits {@link #ENTITY_RESTART_FAILED}
 * (optionally also marking the entity's expected state as {@link Lifecycle#ON_FIRE}).
 */
@Catalog(name="Service Restarter", description="HA policy for restarting a service automatically, "
        + "and for emitting an events if the service repeatedly fails")
public class ServiceRestarter extends AbstractPolicy {

    private static final Logger LOG = LoggerFactory.getLogger(ServiceRestarter.class);

    /** Emitted on the entity when this policy gives up restarting it. */
    public static final BasicNotificationSensor<FailureDescriptor> ENTITY_RESTART_FAILED = new BasicNotificationSensor<FailureDescriptor>(
            FailureDescriptor.class, "ha.entityFailed.restart", "Indicates that an entity restart attempt has failed");

    /** skips retry if a failure re-occurs within this time interval */
    @SetFromFlag("failOnRecurringFailuresInThisDuration")
    public static final ConfigKey<Duration> FAIL_ON_RECURRING_FAILURES_IN_THIS_DURATION = ConfigKeys.newConfigKey(
            Duration.class,
            "failOnRecurringFailuresInThisDuration",
            "Reports entity as failed if it fails two or more times in this time window",
            Duration.minutes(3));

    /** whether the entity's expected state is set to {@link Lifecycle#ON_FIRE} when this policy gives up; defaults to true */
    @SetFromFlag("setOnFireOnFailure")
    public static final ConfigKey<Boolean> SET_ON_FIRE_ON_FAILURE = ConfigKeys.newBooleanConfigKey("setOnFireOnFailure", "", true);

    /** monitors this sensor, by default ENTITY_FAILED */
    @SetFromFlag("failureSensorToMonitor")
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static final ConfigKey<Sensor<?>> FAILURE_SENSOR_TO_MONITOR = (ConfigKey) ConfigKeys.newConfigKey(Sensor.class, "failureSensorToMonitor", "", HASensors.ENTITY_FAILED);

    /**
     * Wall-clock time (millis since epoch) of the most recent failure this policy acted on,
     * or a null value if none has been seen yet. It is updated on every handled failure,
     * including those that are then reported as recurring.
     */
    protected final AtomicReference<Long> lastFailureTime = new AtomicReference<Long>();

    /** Creates a restarter with default configuration. */
    public ServiceRestarter() {
        this(new ConfigBag());
    }

    /**
     * Creates a restarter configured from the given flags.
     *
     * @param flags configuration flags, keyed by flag name
     */
    public ServiceRestarter(Map<String,?> flags) {
        this(new ConfigBag().putAll(flags));
    }

    /**
     * Creates a restarter configured from the given bag; the unique tag is derived from
     * the simple class name and the name of the monitored failure sensor.
     *
     * @param configBag configuration for this policy
     */
    public ServiceRestarter(ConfigBag configBag) {
        // TODO hierarchy should use ConfigBag, and not change flags
        super(configBag.getAllConfigMutable());
        uniqueTag = JavaClassNames.simpleClassName(getClass())+":"+getConfig(FAILURE_SENSOR_TO_MONITOR).getName();
    }

    /**
     * Creates a restarter that monitors the given sensor instead of the default.
     *
     * @param failureSensorToMonitor sensor whose events are treated as failures
     */
    public ServiceRestarter(Sensor<?> failureSensorToMonitor) {
        this(new ConfigBag().configure(FAILURE_SENSOR_TO_MONITOR, failureSensorToMonitor));
    }

    /**
     * Attaches this policy to the entity and subscribes to the configured failure sensor.
     *
     * @param entity the entity to watch and restart; must be {@link Startable}
     * @throws IllegalArgumentException if the entity is not {@link Startable}
     */
    @Override
    public void setEntity(final EntityLocal entity) {
        Preconditions.checkArgument(entity instanceof Startable, "Restarter must take a Startable, not "+entity);

        super.setEntity(entity);

        subscriptions().subscribe(entity, getConfig(FAILURE_SENSOR_TO_MONITOR), new SensorEventListener<Object>() {
                @Override
                public void onEvent(final SensorEvent<Object> event) {
                    // Must execute in another thread - if we called entity.restart in the event-listener's thread
                    // then we'd block all other events being delivered to this entity's other subscribers.
                    // Relies on synchronization of `onDetectedFailure`.
                    // See same pattern used in ServiceReplacer.

                    // TODO Could use BasicExecutionManager.setTaskSchedulerForTag to prevent race of two
                    // events being received in rapid succession, and onDetectedFailure being executed out-of-order
                    // for them; or could write events to a blocking queue and have onDetectedFailure read from that.

                    if (isRunning()) {
                        LOG.info("ServiceRestarter notified; dispatching job for "+entity+" ("+event.getValue()+")");
                        ((EntityInternal)entity).getExecutionContext().submit(MutableMap.of(), new Runnable() {
                            @Override
                            public void run() {
                                onDetectedFailure(event);
                            }});
                    } else {
                        LOG.warn("ServiceRestarter not running, so not acting on failure detected at "+entity+" ("+event.getValue()+")");
                    }
                }
            });
    }

    /**
     * Handles a failure event: either restarts the entity or, if the failure recurred within
     * {@link #FAIL_ON_RECURRING_FAILURES_IN_THIS_DURATION} of the previous one, gives up via
     * {@link #onRestartFailed(String)}. Any exception while restarting is also reported
     * through {@link #onRestartFailed(String)} rather than propagated.
     * <p>
     * The subscription set up in {@link #setEntity(EntityLocal)} invokes this from a task
     * submitted to the entity's execution context (not from the event-delivery thread);
     * the method is synchronized so that concurrent failure events are handled one at a time.
     *
     * @param event the failure event that triggered this handling
     */
    // TODO semaphores would be better to allow at-most-one-blocking behaviour
    protected synchronized void onDetectedFailure(SensorEvent<Object> event) {
        if (isSuspended()) {
            LOG.warn("ServiceRestarter suspended, so not acting on failure detected at "+entity+" ("+event.getValue()+")");
            return;
        }

        LOG.warn("ServiceRestarter acting on failure detected at "+entity+" ("+event.getValue()+")");
        long current = System.currentTimeMillis();
        Long last = lastFailureTime.getAndSet(current);
        // -1 marks "no previous failure"; a negative value never counts as a recurring failure
        long elapsed = last==null ? -1 : current-last;
        if (elapsed>=0 && elapsed <= getConfig(FAIL_ON_RECURRING_FAILURES_IN_THIS_DURATION).toMilliseconds()) {
            onRestartFailed("Restart failure (failed again after "+Time.makeTimeStringRounded(elapsed)+") at "+entity+": "+event.getValue());
            return;
        }
        try {
            ServiceStateLogic.setExpectedState(entity, Lifecycle.STARTING);
            Entities.invokeEffector(entity, entity, Startable.RESTART).get();
        } catch (Exception e) {
            // The failure message below only carries e.toString(); keep the stack trace available too.
            LOG.debug("ServiceRestarter restart of "+entity+" failed", e);
            onRestartFailed("Restart failure (error "+e+") at "+entity+": "+event.getValue());
            if (e instanceof InterruptedException) {
                // Do not swallow the interruption: restore the flag (after the failure has been
                // reported) so the code running this task can still observe it.
                Thread.currentThread().interrupt();
            }
        }
    }

    /**
     * Reports that this policy has given up restarting the entity: logs a warning, sets the
     * expected state to {@link Lifecycle#ON_FIRE} if {@link #SET_ON_FIRE_ON_FAILURE} is true,
     * and emits {@link #ENTITY_RESTART_FAILED} on the entity.
     *
     * @param msg human-readable description of why the restart failed
     */
    protected void onRestartFailed(String msg) {
        LOG.warn("ServiceRestarter failed for "+entity+": "+msg);
        if (getConfig(SET_ON_FIRE_ON_FAILURE)) {
            ServiceStateLogic.setExpectedState(entity, Lifecycle.ON_FIRE);
        }
        entity.sensors().emit(ENTITY_RESTART_FAILED, new FailureDescriptor(entity, msg));
    }
}