/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.brooklyn.policy.ha; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.brooklyn.api.catalog.Catalog; import org.apache.brooklyn.api.entity.Entity; import org.apache.brooklyn.api.entity.EntityLocal; import org.apache.brooklyn.api.entity.Group; import org.apache.brooklyn.api.sensor.Sensor; import org.apache.brooklyn.api.sensor.SensorEvent; import org.apache.brooklyn.api.sensor.SensorEventListener; import org.apache.brooklyn.config.ConfigKey; import org.apache.brooklyn.core.config.BasicConfigKey; import org.apache.brooklyn.core.config.ConfigKeys; import org.apache.brooklyn.core.entity.Entities; import org.apache.brooklyn.core.entity.EntityInternal; import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic.ServiceProblemsLogic; import org.apache.brooklyn.core.entity.trait.MemberReplaceable; import org.apache.brooklyn.core.policy.AbstractPolicy; import org.apache.brooklyn.core.sensor.BasicNotificationSensor; import org.apache.brooklyn.entity.group.StopFailedRuntimeException; import org.apache.brooklyn.policy.ha.HASensors.FailureDescriptor; import org.apache.brooklyn.util.collections.MutableMap; import org.apache.brooklyn.util.core.config.ConfigBag; import org.apache.brooklyn.util.core.flags.SetFromFlag; import org.apache.brooklyn.util.exceptions.Exceptions; import com.google.common.base.Ticker; import com.google.common.collect.Lists; /** attaches to a DynamicCluster and replaces a failed member in response to HASensors.ENTITY_FAILED or other sensor; * if this fails, it sets the Cluster state to on-fire */ @Catalog(name="Service Replacer", description="HA policy for replacing a failed member of a group") public class ServiceReplacer extends AbstractPolicy { private static final Logger LOG = LoggerFactory.getLogger(ServiceReplacer.class); // TODO if there are multiple failures perhaps we should abort quickly public static final BasicNotificationSensor<FailureDescriptor> ENTITY_REPLACEMENT_FAILED = new BasicNotificationSensor<FailureDescriptor>( FailureDescriptor.class, "ha.entityFailed.replacement", "Indicates that an entity replacement attempt has failed"); @SetFromFlag("setOnFireOnFailure") public static final ConfigKey<Boolean> SET_ON_FIRE_ON_FAILURE = ConfigKeys.newBooleanConfigKey("setOnFireOnFailure", "", true); /** monitors this sensor, by default ENTITY_RESTART_FAILED */ @SetFromFlag("failureSensorToMonitor") @SuppressWarnings("rawtypes") public static final ConfigKey<Sensor> FAILURE_SENSOR_TO_MONITOR = new BasicConfigKey<Sensor>(Sensor.class, "failureSensorToMonitor", "", ServiceRestarter.ENTITY_RESTART_FAILED); /** skips replace if replacement has failed this many times failure re-occurs within this time interval */ @SetFromFlag("failOnRecurringFailuresInThisDuration") public static final ConfigKey<Long> FAIL_ON_RECURRING_FAILURES_IN_THIS_DURATION = ConfigKeys.newLongConfigKey( "failOnRecurringFailuresInThisDuration", "abandon replace if replacement has failed many times within this time interval", 5*60*1000L); /** skips replace if replacement has failed this many times failure re-occurs within this time interval */ @SetFromFlag("failOnNumRecurringFailures") public static final ConfigKey<Integer> FAIL_ON_NUM_RECURRING_FAILURES = ConfigKeys.newIntegerConfigKey( "failOnNumRecurringFailures", "abandon replace if replacement has failed this many times (100% of attempts) within the time interval", 5); @SetFromFlag("ticker") public static final ConfigKey<Ticker> TICKER = ConfigKeys.newConfigKey(Ticker.class, "ticker", "A time source (defaults to system-clock, which is almost certainly what's wanted, except in tests)", null); protected final List<Long> consecutiveReplacementFailureTimes = Lists.newCopyOnWriteArrayList(); public ServiceReplacer() { this(new ConfigBag()); } public ServiceReplacer(Map<String,?> flags) { this(new ConfigBag().putAll(flags)); } public ServiceReplacer(ConfigBag configBag) { // TODO hierarchy should use ConfigBag, and not change flags super(configBag.getAllConfigMutable()); } public ServiceReplacer(Sensor<?> failureSensorToMonitor) { this(new ConfigBag().configure(FAILURE_SENSOR_TO_MONITOR, failureSensorToMonitor)); } @Override public void setEntity(final EntityLocal entity) { checkArgument(entity instanceof MemberReplaceable, "ServiceReplacer must take a MemberReplaceable, not %s", entity); Sensor<?> failureSensorToMonitor = checkNotNull(getConfig(FAILURE_SENSOR_TO_MONITOR), "failureSensorToMonitor"); super.setEntity(entity); subscriptions().subscribeToMembers((Group)entity, failureSensorToMonitor, new SensorEventListener<Object>() { @Override public void onEvent(final SensorEvent<Object> event) { // Must execute in another thread - if we called entity.replaceMember in the event-listener's thread // then we'd block all other events being delivered to this entity's other subscribers. // Relies on synchronization of `onDetectedFailure`. // See same pattern used in ServiceRestarter. // TODO Could use BasicExecutionManager.setTaskSchedulerForTag to prevent race of two // events being received in rapid succession, and onDetectedFailure being executed out-of-order // for them; or could write events to a blocking queue and have onDetectedFailure read from that. if (isRunning()) { LOG.warn("ServiceReplacer notified; dispatching job for "+entity+" ("+event.getValue()+")"); ((EntityInternal)entity).getExecutionContext().submit(MutableMap.of(), new Runnable() { @Override public void run() { onDetectedFailure(event); }}); } else { LOG.warn("ServiceReplacer not running, so not acting on failure detected at "+entity+" ("+event.getValue()+", child of "+entity+")"); } } }); } // TODO semaphores would be better to allow at-most-one-blocking behaviour protected synchronized void onDetectedFailure(SensorEvent<Object> event) { final Entity failedEntity = event.getSource(); final Object reason = event.getValue(); if (isSuspended()) { LOG.warn("ServiceReplacer suspended, so not acting on failure detected at "+failedEntity+" ("+reason+", child of "+entity+")"); return; } if (isRepeatedlyFailingTooMuch()) { LOG.error("ServiceReplacer not acting on failure detected at "+failedEntity+" ("+reason+", child of "+entity+"), because too many recent replacement failures"); return; } LOG.warn("ServiceReplacer acting on failure detected at "+failedEntity+" ("+reason+", child of "+entity+")"); ((EntityInternal)entity).getManagementSupport().getExecutionContext().submit(MutableMap.of(), new Runnable() { @Override public void run() { try { Entities.invokeEffectorWithArgs(entity, entity, MemberReplaceable.REPLACE_MEMBER, failedEntity.getId()).get(); consecutiveReplacementFailureTimes.clear(); } catch (Exception e) { if (Exceptions.getFirstThrowableOfType(e, StopFailedRuntimeException.class) != null) { LOG.info("ServiceReplacer: ignoring error reported from stopping failed node "+failedEntity); return; } onReplacementFailed("Replace failure ("+Exceptions.collapseText(e)+") at "+entity+": "+reason); } } }); } private boolean isRepeatedlyFailingTooMuch() { Integer failOnNumRecurringFailures = getConfig(FAIL_ON_NUM_RECURRING_FAILURES); long failOnRecurringFailuresInThisDuration = getConfig(FAIL_ON_RECURRING_FAILURES_IN_THIS_DURATION); long oldestPermitted = currentTimeMillis() - failOnRecurringFailuresInThisDuration; // trim old ones for (Iterator<Long> iter = consecutiveReplacementFailureTimes.iterator(); iter.hasNext();) { Long timestamp = iter.next(); if (timestamp < oldestPermitted) { iter.remove(); } else { break; } } return (consecutiveReplacementFailureTimes.size() >= failOnNumRecurringFailures); } protected long currentTimeMillis() { Ticker ticker = getConfig(TICKER); return (ticker == null) ? System.currentTimeMillis() : TimeUnit.NANOSECONDS.toMillis(ticker.read()); } protected void onReplacementFailed(String msg) { LOG.warn("ServiceReplacer failed for "+entity+": "+msg); consecutiveReplacementFailureTimes.add(currentTimeMillis()); if (getConfig(SET_ON_FIRE_ON_FAILURE)) { ServiceProblemsLogic.updateProblemsIndicator(entity, "ServiceReplacer", "replacement failed: "+msg); } entity.sensors().emit(ENTITY_REPLACEMENT_FAILED, new FailureDescriptor(entity, msg)); } }