/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.brooklyn.core.mgmt.ha; import static org.testng.Assert.assertEquals; import static org.testng.Assert.fail; import java.util.Collections; import java.util.Date; import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; import org.apache.brooklyn.api.entity.EntitySpec; import org.apache.brooklyn.api.location.Location; import org.apache.brooklyn.api.mgmt.ha.HighAvailabilityMode; import org.apache.brooklyn.api.mgmt.ha.ManagementNodeState; import org.apache.brooklyn.api.mgmt.ha.ManagementNodeSyncRecord; import org.apache.brooklyn.api.mgmt.ha.ManagementPlaneSyncRecord; import org.apache.brooklyn.api.mgmt.ha.ManagementPlaneSyncRecordPersister; import org.apache.brooklyn.core.BrooklynFeatureEnablement; import org.apache.brooklyn.core.entity.Entities; import org.apache.brooklyn.core.entity.factory.ApplicationBuilder; import org.apache.brooklyn.core.mgmt.ha.HighAvailabilityManagerImpl; import org.apache.brooklyn.core.mgmt.ha.ManagementPlaneSyncRecordPersisterToObjectStore; import org.apache.brooklyn.core.mgmt.ha.TestEntityFailingRebind.RebindException; import org.apache.brooklyn.core.mgmt.internal.ManagementContextInternal; import org.apache.brooklyn.core.mgmt.persist.BrooklynMementoPersisterToObjectStore; import org.apache.brooklyn.core.mgmt.persist.InMemoryObjectStore; import org.apache.brooklyn.core.mgmt.persist.ListeningObjectStore; import org.apache.brooklyn.core.mgmt.persist.PersistMode; import org.apache.brooklyn.core.mgmt.persist.PersistenceObjectStore; import org.apache.brooklyn.core.mgmt.rebind.PersistenceExceptionHandlerImpl; import org.apache.brooklyn.core.test.entity.LocalManagementContextForTests; import org.apache.brooklyn.core.test.entity.TestApplication; import org.apache.brooklyn.test.Asserts; import org.apache.brooklyn.util.collections.MutableList; import org.apache.brooklyn.util.collections.MutableMap; import org.apache.brooklyn.util.exceptions.Exceptions; import org.apache.brooklyn.util.time.Duration; import org.apache.brooklyn.util.time.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; import com.google.common.base.Stopwatch; import com.google.common.base.Ticker; import com.google.common.collect.ImmutableList; @Test public class HighAvailabilityManagerSplitBrainTest { private static final Logger log = LoggerFactory.getLogger(HighAvailabilityManagerSplitBrainTest.class); private List<HaMgmtNode> nodes = new MutableList<HighAvailabilityManagerSplitBrainTest.HaMgmtNode>(); Map<String,String> sharedBackingStore = MutableMap.of(); Map<String,Date> sharedBackingStoreDates = MutableMap.of(); private AtomicLong sharedTime; // used to set the ticker's return value private ClassLoader classLoader = getClass().getClassLoader(); public class HaMgmtNode { // TODO share with HotStandbyTest and WarmStandbyTest and a few others (minor differences but worth it ultimately) private ManagementContextInternal mgmt; private String ownNodeId; private String nodeName; private ListeningObjectStore objectStore; private ManagementPlaneSyncRecordPersister persister; private HighAvailabilityManagerImpl ha; private Ticker ticker; private AtomicLong currentTime; // used to set the ticker's return value public void setUp() throws Exception { if (sharedTime==null) currentTime = new AtomicLong(System.currentTimeMillis()); ticker = new Ticker() { // strictly not a ticker because returns millis UTC, but it works fine even so @Override public long read() { if (sharedTime!=null) return sharedTime.get(); return currentTime.get(); } }; nodeName = "node "+nodes.size(); mgmt = newLocalManagementContext(); ownNodeId = mgmt.getManagementNodeId(); objectStore = new ListeningObjectStore(newPersistenceObjectStore()); objectStore.injectManagementContext(mgmt); objectStore.prepareForSharedUse(PersistMode.CLEAN, HighAvailabilityMode.DISABLED); persister = new ManagementPlaneSyncRecordPersisterToObjectStore(mgmt, objectStore, classLoader); ((ManagementPlaneSyncRecordPersisterToObjectStore)persister).preferRemoteTimestampInMemento(); BrooklynMementoPersisterToObjectStore persisterObj = new BrooklynMementoPersisterToObjectStore(objectStore, mgmt.getBrooklynProperties(), classLoader); mgmt.getRebindManager().setPersister(persisterObj, PersistenceExceptionHandlerImpl.builder().build()); ha = ((HighAvailabilityManagerImpl)mgmt.getHighAvailabilityManager()) .setPollPeriod(Duration.PRACTICALLY_FOREVER) .setHeartbeatTimeout(Duration.THIRTY_SECONDS) .setLocalTicker(ticker) .setRemoteTicker(ticker) .setPersister(persister); log.info("Created "+nodeName+" "+ownNodeId); } public void tearDown() throws Exception { if (ha != null) ha.stop(); if (mgmt != null) Entities.destroyAll(mgmt); if (objectStore != null) objectStore.deleteCompletely(); } private long tickerCurrentMillis() { return ticker.read(); } private long tickerAdvance(Duration duration) { if (sharedTime!=null) throw new IllegalStateException("Using shared ticker; cannot advance private node clock"); currentTime.addAndGet(duration.toMilliseconds()); return tickerCurrentMillis(); } @Override public String toString() { return nodeName+" "+ownNodeId; } } private Boolean prevThrowOnRebind; @BeforeMethod(alwaysRun=true) public void setUp() throws Exception { prevThrowOnRebind = TestEntityFailingRebind.getThrowOnRebind(); TestEntityFailingRebind.setThrowOnRebind(true); nodes.clear(); sharedBackingStore.clear(); } @AfterMethod(alwaysRun=true) public void tearDown() throws Exception { try { for (HaMgmtNode n: nodes) n.tearDown(); } finally { if (prevThrowOnRebind != null) TestEntityFailingRebind.setThrowOnRebind(prevThrowOnRebind); } } public HaMgmtNode newNode() throws Exception { HaMgmtNode node = new HaMgmtNode(); node.setUp(); nodes.add(node); return node; } private void sharedTickerAdvance(Duration duration) { if (sharedTime==null) { for (HaMgmtNode n: nodes) n.tickerAdvance(duration); } else { sharedTime.addAndGet(duration.toMilliseconds()); } } private long sharedTickerCurrentMillis() { return sharedTime.get(); } protected void useSharedTime() { if (!nodes.isEmpty()) throw new IllegalStateException("shared time must be set up before any nodes created"); sharedTime = new AtomicLong(System.currentTimeMillis()); } protected ManagementContextInternal newLocalManagementContext() { return new LocalManagementContextForTests(); } protected PersistenceObjectStore newPersistenceObjectStore() { return new InMemoryObjectStore(sharedBackingStore, sharedBackingStoreDates); } @Test public void testDoubleRebindFails() throws Exception { useSharedTime(); HaMgmtNode n1 = newNode(); HaMgmtNode n2 = newNode(); // first auto should become master n1.ha.start(HighAvailabilityMode.AUTO); n2.ha.start(HighAvailabilityMode.AUTO); assertEquals(n1.ha.getNodeState(), ManagementNodeState.MASTER); TestApplication app = ApplicationBuilder.newManagedApp( EntitySpec.create(TestApplication.class).impl(TestEntityFailingRebind.class), n1.mgmt); app.start(ImmutableList.<Location>of()); n1.mgmt.getRebindManager().forcePersistNow(false, null); //don't publish state for a while (i.e. long store delays, failures) sharedTickerAdvance(Duration.ONE_MINUTE); try { n2.ha.publishAndCheck(false); fail("n2 rebind failure expected"); } catch (Exception e) { assertNestedRebindException(e); } // re-check should re-assert successfully, no rebind expected as he was previously master n1.ha.publishAndCheck(false); ManagementPlaneSyncRecord memento; memento = n1.ha.loadManagementPlaneSyncRecord(true); assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.MASTER); assertEquals(memento.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.FAILED); // hot backup permitted by the TestEntityFailingRebind n1.ha.changeMode(HighAvailabilityMode.HOT_BACKUP); memento = n1.ha.loadManagementPlaneSyncRecord(true); assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.HOT_BACKUP); try { n1.ha.changeMode(HighAvailabilityMode.MASTER); fail("n1 rebind failure expected"); } catch (Exception e) { assertNestedRebindException(e); } memento = n1.ha.loadManagementPlaneSyncRecord(true); assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.FAILED); assertEquals(memento.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.FAILED); } @Test public void testStandbyRebind() throws Exception { useSharedTime(); HaMgmtNode n1 = newNode(); HaMgmtNode n2 = newNode(); // first auto should become master n1.ha.start(HighAvailabilityMode.AUTO); n2.ha.start(HighAvailabilityMode.AUTO); TestApplication app = ApplicationBuilder.newManagedApp( EntitySpec.create(TestApplication.class).impl(TestEntityFailingRebind.class), n1.mgmt); app.start(ImmutableList.<Location>of()); n1.mgmt.getRebindManager().forcePersistNow(false, null); //don't publish state for a while (i.e. long store delays, failures) sharedTickerAdvance(Duration.ONE_MINUTE); try { n2.ha.publishAndCheck(false); fail("n2 rebind failure expected"); } catch (Exception e) { assertNestedRebindException(e); } TestEntityFailingRebind.setThrowOnRebind(false); n1.ha.publishAndCheck(false); ManagementPlaneSyncRecord memento = n1.ha.loadManagementPlaneSyncRecord(true); assertEquals(memento.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.MASTER); assertEquals(memento.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.FAILED); } private void assertNestedRebindException(Throwable t) { Throwable ptr = t; while (ptr != null) { if (ptr instanceof RebindException) { return; } ptr = ptr.getCause(); } Exceptions.propagate(t); } @Test public void testIfNodeStopsBeingAbleToWrite() throws Exception { useSharedTime(); log.info("time at start "+sharedTickerCurrentMillis()); HaMgmtNode n1 = newNode(); HaMgmtNode n2 = newNode(); // first auto should become master n1.ha.start(HighAvailabilityMode.AUTO); ManagementPlaneSyncRecord memento1 = n1.ha.loadManagementPlaneSyncRecord(true); log.info(n1+" HA: "+memento1); assertEquals(memento1.getMasterNodeId(), n1.ownNodeId); Long time0 = sharedTickerCurrentMillis(); assertEquals(memento1.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(), time0); assertEquals(memento1.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.MASTER); // second - make explicit hot; that's a strictly more complex case than cold standby, so provides pretty good coverage n2.ha.start(HighAvailabilityMode.HOT_STANDBY); ManagementPlaneSyncRecord memento2 = n2.ha.loadManagementPlaneSyncRecord(true); log.info(n2+" HA: "+memento2); assertEquals(memento2.getMasterNodeId(), n1.ownNodeId); assertEquals(memento2.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.MASTER); assertEquals(memento2.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.HOT_STANDBY); assertEquals(memento2.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(), time0); assertEquals(memento2.getManagementNodes().get(n2.ownNodeId).getRemoteTimestamp(), time0); // and no entities at either assertEquals(n1.mgmt.getApplications().size(), 0); assertEquals(n2.mgmt.getApplications().size(), 0); // create TestApplication app = ApplicationBuilder.newManagedApp(EntitySpec.create(TestApplication.class), n1.mgmt); app.start(ImmutableList.<Location>of()); app.sensors().set(TestApplication.MY_ATTRIBUTE, "hello"); assertEquals(n1.mgmt.getApplications().size(), 1); assertEquals(n2.mgmt.getApplications().size(), 0); log.info("persisting "+n1.ownNodeId); n1.mgmt.getRebindManager().forcePersistNow(false, null); n1.objectStore.setWritesFailSilently(true); log.info(n1+" writes off"); sharedTickerAdvance(Duration.ONE_MINUTE); log.info("time now "+sharedTickerCurrentMillis()); Long time1 = sharedTickerCurrentMillis(); log.info("publish "+n2.ownNodeId); n2.ha.publishAndCheck(false); ManagementPlaneSyncRecord memento2b = n2.ha.loadManagementPlaneSyncRecord(true); log.info(n2+" HA now: "+memento2b); // n2 infers n1 as failed assertEquals(memento2b.getManagementNodes().get(n1.ownNodeId).getStatus(), ManagementNodeState.FAILED); assertEquals(memento2b.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.MASTER); assertEquals(memento2b.getMasterNodeId(), n2.ownNodeId); assertEquals(memento2b.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(), time0); assertEquals(memento2b.getManagementNodes().get(n2.ownNodeId).getRemoteTimestamp(), time1); assertEquals(n1.mgmt.getApplications().size(), 1); assertEquals(n2.mgmt.getApplications().size(), 1); assertEquals(n1.mgmt.getApplications().iterator().next().getAttribute(TestApplication.MY_ATTRIBUTE), "hello"); n1.objectStore.setWritesFailSilently(false); log.info(n1+" writes on"); sharedTickerAdvance(Duration.ONE_SECOND); log.info("time now "+sharedTickerCurrentMillis()); Long time2 = sharedTickerCurrentMillis(); log.info("publish "+n1.ownNodeId); n1.ha.publishAndCheck(false); ManagementPlaneSyncRecord memento1b = n1.ha.loadManagementPlaneSyncRecord(true); log.info(n1+" HA now: "+memento1b); ManagementNodeState expectedStateAfterDemotion = BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY) ? ManagementNodeState.HOT_STANDBY : ManagementNodeState.STANDBY; // n1 comes back and demotes himself assertEquals(memento1b.getManagementNodes().get(n1.ownNodeId).getStatus(), expectedStateAfterDemotion); assertEquals(memento1b.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.MASTER); assertEquals(memento1b.getMasterNodeId(), n2.ownNodeId); assertEquals(memento1b.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(), time2); assertEquals(memento1b.getManagementNodes().get(n2.ownNodeId).getRemoteTimestamp(), time1); // n2 now sees itself as master, with n1 in standby again ManagementPlaneSyncRecord memento2c = n2.ha.loadManagementPlaneSyncRecord(true); log.info(n2+" HA now: "+memento2c); assertEquals(memento2c.getManagementNodes().get(n1.ownNodeId).getStatus(), expectedStateAfterDemotion); assertEquals(memento2c.getManagementNodes().get(n2.ownNodeId).getStatus(), ManagementNodeState.MASTER); assertEquals(memento2c.getMasterNodeId(), n2.ownNodeId); assertEquals(memento2c.getManagementNodes().get(n1.ownNodeId).getRemoteTimestamp(), time2); assertEquals(memento2c.getManagementNodes().get(n2.ownNodeId).getRemoteTimestamp(), time2); // right number of entities at n2; n1 may or may not depending whether hot standby is default assertEquals(n2.mgmt.getApplications().size(), 1); assertEquals(n1.mgmt.getApplications().size(), BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY) ? 1 : 0); } @Test(invocationCount=50, groups="Integration") public void testIfNodeStopsBeingAbleToWriteManyTimes() throws Exception { testIfNodeStopsBeingAbleToWrite(); } @Test public void testSimultaneousStartup() throws Exception { doTestConcurrentStartup(5, null); } @Test public void testNearSimultaneousStartup() throws Exception { doTestConcurrentStartup(20, Duration.millis(20)); } @Test(invocationCount=50, groups="Integration") public void testNearSimultaneousStartupManyTimes() throws Exception { doTestConcurrentStartup(20, Duration.millis(20)); } protected void doTestConcurrentStartup(int size, final Duration staggerStart) throws Exception { useSharedTime(); List<Thread> spawned = MutableList.of(); for (int i=0; i<size; i++) { final HaMgmtNode n = newNode(); Thread t = new Thread() { public void run() { if (staggerStart!=null) Time.sleep(staggerStart.multiply(Math.random())); n.ha.start(HighAvailabilityMode.AUTO); n.ha.setPollPeriod(Duration.millis(20)); } }; spawned.add(t); t.start(); } try { final Stopwatch timer = Stopwatch.createStarted(); Asserts.succeedsEventually(new Runnable() { @Override public void run() { ManagementPlaneSyncRecord memento = nodes.get(0).ha.loadManagementPlaneSyncRecord(true); List<ManagementNodeState> counts = MutableList.of(), savedCounts = MutableList.of(); for (HaMgmtNode n: nodes) { counts.add(n.ha.getNodeState()); ManagementNodeSyncRecord m = memento.getManagementNodes().get(n.ownNodeId); if (m!=null) { savedCounts.add(m.getStatus()); } } log.info("while starting "+nodes.size()+" nodes: " +Collections.frequency(counts, ManagementNodeState.MASTER)+" M + " +Collections.frequency(counts, ManagementNodeState.HOT_STANDBY)+" hot + " +Collections.frequency(counts, ManagementNodeState.STANDBY)+" warm + " +Collections.frequency(counts, ManagementNodeState.INITIALIZING)+" init; " + memento.getManagementNodes().size()+" saved, " +Collections.frequency(savedCounts, ManagementNodeState.MASTER)+" M + " +Collections.frequency(savedCounts, ManagementNodeState.HOT_STANDBY)+" hot + " +Collections.frequency(savedCounts, ManagementNodeState.STANDBY)+" warm + " +Collections.frequency(savedCounts, ManagementNodeState.INITIALIZING)+" init"); if (timer.isRunning() && Duration.of(timer).compareTo(Duration.TEN_SECONDS)>0) { log.warn("we seem to have a problem stabilizing"); //handy place to set a suspend-VM breakpoint! timer.stop(); } assertEquals(Collections.frequency(counts, ManagementNodeState.MASTER), 1); assertEquals(Collections.frequency(counts, ManagementNodeState.HOT_STANDBY)+Collections.frequency(counts, ManagementNodeState.STANDBY), nodes.size()-1); assertEquals(Collections.frequency(savedCounts, ManagementNodeState.MASTER), 1); assertEquals(Collections.frequency(savedCounts, ManagementNodeState.HOT_STANDBY)+Collections.frequency(savedCounts, ManagementNodeState.STANDBY), nodes.size()-1); }}); } catch (Throwable t) { log.warn("Failed to stabilize (rethrowing): "+t, t); throw Exceptions.propagate(t); } for (Thread t: spawned) t.join(Duration.THIRTY_SECONDS.toMilliseconds()); } }