/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.brooklyn.policy.ha;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertNotEquals;
import static org.testng.Assert.assertTrue;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.brooklyn.api.entity.Entity;
import org.apache.brooklyn.api.entity.EntitySpec;
import org.apache.brooklyn.api.location.Location;
import org.apache.brooklyn.api.location.LocationSpec;
import org.apache.brooklyn.api.mgmt.ManagementContext;
import org.apache.brooklyn.api.policy.PolicySpec;
import org.apache.brooklyn.api.sensor.SensorEvent;
import org.apache.brooklyn.api.sensor.SensorEventListener;
import org.apache.brooklyn.core.entity.Attributes;
import org.apache.brooklyn.core.entity.Entities;
import org.apache.brooklyn.core.entity.EntityInternal;
import org.apache.brooklyn.core.entity.factory.ApplicationBuilder;
import org.apache.brooklyn.core.entity.lifecycle.Lifecycle;
import org.apache.brooklyn.core.entity.lifecycle.ServiceStateLogic.ComputeServiceIndicatorsFromChildrenAndMembers;
import org.apache.brooklyn.core.entity.trait.FailingEntity;
import org.apache.brooklyn.core.location.SimulatedLocation;
import org.apache.brooklyn.core.test.entity.LocalManagementContextForTests;
import org.apache.brooklyn.core.test.entity.TestApplication;
import org.apache.brooklyn.core.test.entity.TestEntity;
import org.apache.brooklyn.entity.group.DynamicCluster;
import org.apache.brooklyn.test.Asserts;
import org.apache.brooklyn.test.EntityTestUtils;
import org.apache.brooklyn.util.collections.QuorumCheck;
import org.apache.brooklyn.util.core.config.ConfigBag;
import org.apache.brooklyn.util.javalang.JavaClassNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.Assert;
import org.testng.annotations.AfterMethod;
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
import org.apache.brooklyn.policy.ha.HASensors.FailureDescriptor;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
/**
 * Tests for the {@link ServiceReplacer} policy when attached to a {@link DynamicCluster}.
 * <p>
 * Each test starts a cluster in a {@link SimulatedLocation}, attaches the policy configured to monitor
 * {@link HASensors#ENTITY_FAILED}, emits that sensor on a member to simulate a failure, and then asserts
 * on the resulting cluster membership, the cluster's service state and/or the
 * {@link ServiceReplacer#ENTITY_REPLACEMENT_FAILED} events that were published.
 */
public class ServiceReplacerTest {

    private static final Logger log = LoggerFactory.getLogger(ServiceReplacerTest.class);

    private ManagementContext managementContext;
    private TestApplication app;
    private SimulatedLocation loc;

    /** Listener that records every event it receives into {@link #events}. */
    private SensorEventListener<Object> eventListener;

    /** Events recorded by {@link #eventListener}; copy-on-write because it is read while events may still arrive. */
    private List<SensorEvent<?>> events;

    /** Creates a fresh management context, application, simulated location and event recorder for each test. */
    @BeforeMethod(alwaysRun=true)
    public void setUp() throws Exception {
        managementContext = new LocalManagementContextForTests();
        app = ApplicationBuilder.newManagedApp(TestApplication.class, managementContext);
        loc = managementContext.getLocationManager().createLocation(LocationSpec.create(SimulatedLocation.class));
        events = Lists.newCopyOnWriteArrayList();
        eventListener = new SensorEventListener<Object>() {
            @Override public void onEvent(SensorEvent<Object> event) {
                events.add(event);
            }
        };
    }

    /** Destroys everything in the management context (if one was created) after each test. */
    @AfterMethod(alwaysRun=true)
    public void tearDown() throws Exception {
        if (managementContext != null) {
            Entities.destroyAll(managementContext);
        }
    }

    /** A member that reports failure is removed, stopped and unmanaged, and exactly one new member is started. */
    @Test
    public void testReplacesFailedMember() throws Exception {
        final DynamicCluster cluster = app.createAndManageChild(EntitySpec.create(DynamicCluster.class)
                .configure(DynamicCluster.MEMBER_SPEC, EntitySpec.create(TestEntity.class))
                .configure(DynamicCluster.INITIAL_SIZE, 3));
        app.start(ImmutableList.<Location>of(loc));

        ServiceReplacer policy = new ServiceReplacer(new ConfigBag()
                .configure(ServiceReplacer.FAILURE_SENSOR_TO_MONITOR, HASensors.ENTITY_FAILED));
        cluster.policies().add(policy);

        final Set<Entity> initialMembers = ImmutableSet.copyOf(cluster.getMembers());
        final TestEntity e1 = (TestEntity) Iterables.get(initialMembers, 1);

        e1.sensors().emit(HASensors.ENTITY_FAILED, new FailureDescriptor(e1, "simulate failure"));

        // Expect e1 to be replaced
        assertEventuallyReplaced(cluster, initialMembers, e1);
    }

    /**
     * Runs {@link #testSetsOnFireWhenFailToReplaceMember()} 100 times (via {@code invocationCount});
     * presumably to expose intermittent, timing-dependent failures.
     */
    @Test(invocationCount=100)
    public void testSetsOnFireWhenFailToReplaceMemberManyTimes() throws Exception {
        testSetsOnFireWhenFailToReplaceMember();
    }

    /**
     * Fails the startup of the replacement entity (but not the original), and expects the cluster to go
     * on-fire and an {@link ServiceReplacer#ENTITY_REPLACEMENT_FAILED} event to be published.
     */
    @Test
    public void testSetsOnFireWhenFailToReplaceMember() throws Exception {
        app.subscriptions().subscribe(null, ServiceReplacer.ENTITY_REPLACEMENT_FAILED, eventListener);

        // The shared predicate is false for the first start() call and true for every later one,
        // so the initial member starts but its replacement does not.
        final DynamicCluster cluster = app.createAndManageChild(EntitySpec.create(DynamicCluster.class)
                .configure(DynamicCluster.MEMBER_SPEC, EntitySpec.create(FailingEntity.class)
                        .configure(FailingEntity.FAIL_ON_START_CONDITION, predicateOnlyTrueForCallAtOrAfter(2)))
                .configure(DynamicCluster.INITIAL_SIZE, 1)
                .configure(DynamicCluster.QUARANTINE_FAILED_ENTITIES, true)
                .configure(ComputeServiceIndicatorsFromChildrenAndMembers.UP_QUORUM_CHECK, QuorumCheck.QuorumChecks.alwaysTrue())
                .configure(ComputeServiceIndicatorsFromChildrenAndMembers.RUNNING_QUORUM_CHECK, QuorumCheck.QuorumChecks.alwaysTrue()));
        app.start(ImmutableList.<Location>of(loc));

        // should not be on fire
        Assert.assertNotEquals(cluster.getAttribute(Attributes.SERVICE_STATE_ACTUAL), Lifecycle.ON_FIRE);
        // and should eventually be running
        EntityTestUtils.assertAttributeEqualsEventually(cluster, Attributes.SERVICE_STATE_ACTUAL, Lifecycle.RUNNING);

        log.info("started {} for {}", app, JavaClassNames.niceClassAndMethod());

        ServiceReplacer policy = new ServiceReplacer(new ConfigBag()
                .configure(ServiceReplacer.FAILURE_SENSOR_TO_MONITOR, HASensors.ENTITY_FAILED));
        cluster.policies().add(policy);

        final Set<Entity> initialMembers = ImmutableSet.copyOf(cluster.getMembers());
        final TestEntity e1 = (TestEntity) Iterables.get(initialMembers, 0);

        e1.sensors().emit(HASensors.ENTITY_FAILED, new FailureDescriptor(e1, "simulate failure"));

        // Expect cluster to go on-fire when fails to start replacement
        // Note that we've set up-quorum and running-quorum to be "alwaysTrue" so that we don't get a transient onFire
        // when the failed node fails to start (but before it has been removed from the group to be put in quarantine).
        EntityTestUtils.assertAttributeEqualsEventually(cluster, Attributes.SERVICE_STATE_ACTUAL, Lifecycle.ON_FIRE);

        // Expect to have the second failed entity still kicking around as proof (in quarantine)
        // The cluster should NOT go on fire until after the 2nd failure
        Iterable<Entity> members = Iterables.filter(
                managementContext.getEntityManager().getEntities(), Predicates.instanceOf(FailingEntity.class));
        assertEquals(Iterables.size(members), 2);

        // e2 failed to start, so it won't have called stop on e1
        TestEntity e2 = (TestEntity) Iterables.getOnlyElement(
                Sets.difference(ImmutableSet.copyOf(members), initialMembers));
        assertEquals(e1.getCallHistory(), ImmutableList.of("start"), "e1.history="+e1.getCallHistory());
        assertEquals(e2.getCallHistory(), ImmutableList.of("start"), "e2.history="+e2.getCallHistory());

        // And will have received notification event about it
        assertEventuallyHasEntityReplacementFailedEvent(cluster);
    }

    /**
     * With {@link ServiceReplacer#SET_ON_FIRE_ON_FAILURE} set to false, a failed replacement must not mark
     * the cluster on-fire, but the replacement-failed event is still expected.
     */
    @Test(groups="Integration") // has a 1 second wait
    public void testDoesNotOnFireWhenFailToReplaceMember() throws Exception {
        app.subscriptions().subscribe(null, ServiceReplacer.ENTITY_REPLACEMENT_FAILED, eventListener);

        final DynamicCluster cluster = app.createAndManageChild(EntitySpec.create(DynamicCluster.class)
                .configure(DynamicCluster.MEMBER_SPEC, EntitySpec.create(FailingEntity.class)
                        .configure(FailingEntity.FAIL_ON_START_CONDITION, predicateOnlyTrueForCallAtOrAfter(2)))
                .configure(DynamicCluster.INITIAL_SIZE, 1)
                .configure(DynamicCluster.QUARANTINE_FAILED_ENTITIES, true));
        app.start(ImmutableList.<Location>of(loc));

        ServiceReplacer policy = new ServiceReplacer(new ConfigBag()
                .configure(ServiceReplacer.FAILURE_SENSOR_TO_MONITOR, HASensors.ENTITY_FAILED)
                .configure(ServiceReplacer.SET_ON_FIRE_ON_FAILURE, false));
        cluster.policies().add(policy);

        final Set<Entity> initialMembers = ImmutableSet.copyOf(cluster.getMembers());
        final TestEntity e1 = (TestEntity) Iterables.get(initialMembers, 0);

        e1.sensors().emit(HASensors.ENTITY_FAILED, new FailureDescriptor(e1, "simulate failure"));

        // Configured to not mark cluster as on fire
        assertContinuallyNotOnFire(cluster);

        // And will have received notification event about it
        assertEventuallyHasEntityReplacementFailedEvent(cluster);
    }

    /** A failure while stopping the old (failed) member must still result in replacement, without on-fire. */
    @Test(groups="Integration") // 1s wait
    public void testStopFailureOfOldEntityDoesNotSetClusterOnFire() throws Exception {
        app.subscriptions().subscribe(null, ServiceReplacer.ENTITY_REPLACEMENT_FAILED, eventListener);

        // The shared predicate is true only for the first stop() call made on any member.
        final DynamicCluster cluster = app.createAndManageChild(EntitySpec.create(DynamicCluster.class)
                .configure(DynamicCluster.MEMBER_SPEC, EntitySpec.create(FailingEntity.class)
                        .configure(FailingEntity.FAIL_ON_STOP_CONDITION, predicateOnlyTrueForCallAt(1)))
                .configure(DynamicCluster.INITIAL_SIZE, 2));
        app.start(ImmutableList.<Location>of(loc));

        cluster.policies().add(PolicySpec.create(ServiceReplacer.class)
                .configure(ServiceReplacer.FAILURE_SENSOR_TO_MONITOR, HASensors.ENTITY_FAILED));

        final Set<Entity> initialMembers = ImmutableSet.copyOf(cluster.getMembers());
        final TestEntity e1 = (TestEntity) Iterables.get(initialMembers, 0);

        e1.sensors().emit(HASensors.ENTITY_FAILED, new FailureDescriptor(e1, "simulate failure"));

        // Expect e1 to be replaced
        assertEventuallyReplaced(cluster, initialMembers, e1);

        // Failure to stop the failed member should not cause "on-fire" of cluster
        assertContinuallyNotOnFire(cluster);
    }

    /**
     * If we keep on getting failure reports, never managing to replace the failed node, then don't keep trying
     * (i.e. avoid infinite loop).
     *
     * TODO This code + configuration needs some work; it's not testing quite the scenarios that I
     * was thinking of!
     * I saw problem where a node failed, and the replacements failed, and we ended up trying thousands of times.
     * (describing this scenario is made more complex by me having temporarily disabled the cluster from
     * removing failed members, for debugging purposes!)
     * Imagine these two scenarios:
     * <ol>
     *   <li>Entity fails during call to start().
     *       Here, the cluster removes it as a member (either unmanages it or puts it in quarantine)
     *       So the ENTITY_FAILED is ignored because the entity is not a member at that point.
     *   <li>Entity returns from start(), but quickly goes to service-down.
     *       Here we'll keep trying to replace that entity. Depending how long that takes, we'll either
     *       enter a horrible infinite loop, or we'll just provision a huge number of VMs over a long
     *       time period.
     *       Unfortunately this scenario is not catered for in the code yet.
     * </ol>
     */
    @Test(groups="Integration") // because takes 1.2 seconds
    public void testAbandonsReplacementAfterNumFailures() throws Exception {
        app.subscriptions().subscribe(null, ServiceReplacer.ENTITY_REPLACEMENT_FAILED, eventListener);

        // The first 10 start() calls (the initial members) succeed; every later start() fails.
        final DynamicCluster cluster = app.createAndManageChild(EntitySpec.create(DynamicCluster.class)
                .configure(DynamicCluster.MEMBER_SPEC, EntitySpec.create(FailingEntity.class)
                        .configure(FailingEntity.FAIL_ON_START_CONDITION, predicateOnlyTrueForCallAtOrAfter(11)))
                .configure(DynamicCluster.INITIAL_SIZE, 10)
                .configure(DynamicCluster.QUARANTINE_FAILED_ENTITIES, true));
        app.start(ImmutableList.<Location>of(loc));

        ServiceReplacer policy = new ServiceReplacer(new ConfigBag()
                .configure(ServiceReplacer.FAILURE_SENSOR_TO_MONITOR, HASensors.ENTITY_FAILED)
                .configure(ServiceReplacer.FAIL_ON_NUM_RECURRING_FAILURES, 3));
        cluster.policies().add(policy);

        final Set<Entity> initialMembers = ImmutableSet.copyOf(cluster.getMembers());

        for (int i = 0; i < 5; i++) {
            final int counter = i+1;
            EntityInternal entity = (EntityInternal) Iterables.get(initialMembers, i);
            entity.sensors().emit(HASensors.ENTITY_FAILED, new FailureDescriptor(entity, "simulate failure"));

            if (i <= 3) {
                // For each of the first four failures a replacement attempt is expected: one more
                // (failed, non-member) replacement entity should appear per failure.
                Asserts.succeedsEventually(new Runnable() {
                    @Override public void run() {
                        Set<FailingEntity> replacements = findReplacements(initialMembers);
                        // copyOf (not of): compare against the individual members, not a set holding
                        // the members collection as its single element.
                        Set<?> replacementMembers = Sets.intersection(
                                ImmutableSet.copyOf(cluster.getMembers()), replacements);
                        assertTrue(replacementMembers.isEmpty());
                        assertEquals(replacements.size(), counter);
                    }});
            } else {
                // For the fifth failure no further replacement attempt is expected.
                Asserts.succeedsContinually(new Runnable() {
                    @Override public void run() {
                        Set<FailingEntity> replacements = findReplacements(initialMembers);
                        assertEquals(replacements.size(), 4);
                    }});
            }
        }
    }

    /**
     * Returns every managed {@link FailingEntity} that was not one of the given initial members.
     *
     * @param initialMembers the cluster members captured before any failure was simulated
     */
    private Set<FailingEntity> findReplacements(Set<Entity> initialMembers) {
        Set<FailingEntity> all = ImmutableSet.copyOf(
                Iterables.filter(managementContext.getEntityManager().getEntities(), FailingEntity.class));
        return Sets.difference(all, initialMembers);
    }

    /**
     * Asserts that, eventually, {@code failed} is the only initial member missing from the cluster, exactly
     * one new member has appeared and only been started, and {@code failed} was started then stopped and is
     * no longer managed.
     *
     * @param cluster the cluster whose membership is inspected
     * @param initialMembers the cluster members captured before the failure was simulated
     * @param failed the member on which the failure was simulated
     */
    private void assertEventuallyReplaced(final DynamicCluster cluster, final Set<Entity> initialMembers,
            final TestEntity failed) {
        Asserts.succeedsEventually(new Runnable() {
            @Override public void run() {
                // Single snapshot, so both differences are computed against the same membership.
                Set<Entity> currentMembers = ImmutableSet.copyOf(cluster.getMembers());
                Set<Entity> newMembers = Sets.difference(currentMembers, initialMembers);
                Set<Entity> removedMembers = Sets.difference(initialMembers, currentMembers);

                assertEquals(removedMembers, ImmutableSet.of(failed));
                assertEquals(newMembers.size(), 1);
                assertEquals(((TestEntity)Iterables.getOnlyElement(newMembers)).getCallHistory(), ImmutableList.of("start"));
                assertEquals(failed.getCallHistory(), ImmutableList.of("start", "stop"));
                assertFalse(Entities.isManaged(failed));
            }});
    }

    /** Asserts that the cluster's actual service state continually stays something other than on-fire. */
    private void assertContinuallyNotOnFire(final DynamicCluster cluster) {
        Asserts.succeedsContinually(new Runnable() {
            @Override public void run() {
                assertNotEquals(cluster.getAttribute(Attributes.SERVICE_STATE_ACTUAL), Lifecycle.ON_FIRE);
            }});
    }

    /** Predicate that is true only on its {@code callNumber}-th invocation (1-based). */
    private Predicate<Object> predicateOnlyTrueForCallAt(final int callNumber) {
        return predicateOnlyTrueForCallRange(callNumber, callNumber);
    }

    /** Predicate that is false until its {@code callLowerNumber}-th invocation (1-based), and true from then on. */
    private Predicate<Object> predicateOnlyTrueForCallAtOrAfter(final int callLowerNumber) {
        return predicateOnlyTrueForCallRange(callLowerNumber, Integer.MAX_VALUE);
    }

    /**
     * Predicate that counts its own invocations (starting at 1, ignoring the input) and is true only while
     * that count is within {@code [callLowerNumber, callUpperNumber]}.
     */
    private Predicate<Object> predicateOnlyTrueForCallRange(final int callLowerNumber, final int callUpperNumber) {
        return new Predicate<Object>() {
            private final AtomicInteger counter = new AtomicInteger(0);
            @Override public boolean apply(Object input) {
                int num = counter.incrementAndGet();
                return num >= callLowerNumber && num <= callUpperNumber;
            }
        };
    }

    /**
     * Asserts that, eventually, exactly one event has been recorded, and that it is an
     * {@link ServiceReplacer#ENTITY_REPLACEMENT_FAILED} event whose source and whose failure descriptor's
     * component are both {@code expectedCluster}.
     */
    private void assertEventuallyHasEntityReplacementFailedEvent(final Entity expectedCluster) {
        Asserts.succeedsEventually(new Runnable() {
            @Override public void run() {
                SensorEvent<?> event = Iterables.getOnlyElement(events);
                assertEquals(event.getSensor(), ServiceReplacer.ENTITY_REPLACEMENT_FAILED, "events="+events);
                assertEquals(event.getSource(), expectedCluster, "events="+events);
                assertEquals(((FailureDescriptor)event.getValue()).getComponent(), expectedCluster, "events="+events);
            }});
    }
}