/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.apache.brooklyn.core.mgmt.ha; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Preconditions.checkState; import java.io.IOException; import java.net.URI; import java.util.List; import java.util.Map; import java.util.concurrent.Callable; import javax.annotation.Nullable; import org.apache.brooklyn.api.entity.Application; import org.apache.brooklyn.api.entity.Entity; import org.apache.brooklyn.api.location.Location; import org.apache.brooklyn.api.mgmt.Task; import org.apache.brooklyn.api.mgmt.ha.HighAvailabilityManager; import org.apache.brooklyn.api.mgmt.ha.HighAvailabilityMode; import org.apache.brooklyn.api.mgmt.ha.ManagementNodeState; import org.apache.brooklyn.api.mgmt.ha.ManagementNodeSyncRecord; import org.apache.brooklyn.api.mgmt.ha.ManagementPlaneSyncRecord; import org.apache.brooklyn.api.mgmt.ha.ManagementPlaneSyncRecordPersister; import org.apache.brooklyn.api.mgmt.ha.MementoCopyMode; import org.apache.brooklyn.api.mgmt.ha.ManagementPlaneSyncRecordPersister.Delta; import org.apache.brooklyn.api.mgmt.rebind.RebindManager; import org.apache.brooklyn.config.ConfigKey; import org.apache.brooklyn.core.BrooklynFeatureEnablement; 
import org.apache.brooklyn.core.BrooklynVersion; import org.apache.brooklyn.core.catalog.internal.BasicBrooklynCatalog; import org.apache.brooklyn.core.catalog.internal.CatalogDto; import org.apache.brooklyn.core.config.ConfigKeys; import org.apache.brooklyn.core.entity.EntityInternal; import org.apache.brooklyn.core.mgmt.BrooklynTaskTags; import org.apache.brooklyn.core.mgmt.ha.BasicMasterChooser.AlphabeticMasterChooser; import org.apache.brooklyn.core.mgmt.ha.dto.BasicManagementNodeSyncRecord; import org.apache.brooklyn.core.mgmt.ha.dto.ManagementPlaneSyncRecordImpl; import org.apache.brooklyn.core.mgmt.ha.dto.ManagementPlaneSyncRecordImpl.Builder; import org.apache.brooklyn.core.mgmt.internal.BrooklynObjectManagementMode; import org.apache.brooklyn.core.mgmt.internal.LocalEntityManager; import org.apache.brooklyn.core.mgmt.internal.LocationManagerInternal; import org.apache.brooklyn.core.mgmt.internal.ManagementContextInternal; import org.apache.brooklyn.core.mgmt.internal.ManagementTransitionMode; import org.apache.brooklyn.core.mgmt.persist.BrooklynPersistenceUtils; import org.apache.brooklyn.core.mgmt.persist.PersistenceActivityMetrics; import org.apache.brooklyn.core.mgmt.persist.BrooklynPersistenceUtils.CreateBackupMode; import org.apache.brooklyn.core.mgmt.rebind.RebindManagerImpl; import org.apache.brooklyn.core.server.BrooklynServerConfig; import org.apache.brooklyn.util.collections.MutableList; import org.apache.brooklyn.util.collections.MutableMap; import org.apache.brooklyn.util.core.task.ScheduledTask; import org.apache.brooklyn.util.core.task.Tasks; import org.apache.brooklyn.util.exceptions.Exceptions; import org.apache.brooklyn.util.exceptions.ReferenceWithError; import org.apache.brooklyn.util.text.Strings; import org.apache.brooklyn.util.time.Duration; import org.apache.brooklyn.util.time.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.annotations.Beta; import 
com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Objects; import com.google.common.base.Preconditions; import com.google.common.base.Stopwatch; import com.google.common.base.Ticker; import com.google.common.collect.Iterables; /** * This is the guts of the high-availability solution in Brooklyn. * <p> * Multiple brooklyn nodes can be started to form a single management plane, where one node is * designated master and the others are "warm standbys". On termination or failure of the master, * the standbys deterministically decide which standby should become master (see {@link MasterChooser}). * That standby promotes itself. * <p> * The management nodes communicate their health/status via the {@link ManagementPlaneSyncRecordPersister}. * For example, if using {@link ManagementPlaneSyncRecordPersisterToObjectStore} with a shared blobstore or * filesystem/NFS mount, then each management-node periodically writes its state. * This acts as a heartbeat, being read by the other management-nodes. * <p> * Promotion to master involves: * <ol> * <li>notifying the other management-nodes that it is now master * <li>calling {@link RebindManager#rebind(ClassLoader, org.apache.brooklyn.api.mgmt.rebind.RebindExceptionHandler, ManagementNodeState)} to read all persisted entity state, and thus reconstitute the entities. * </ol> * <p> * Future improvements in this area will include brooklyn-managing-brooklyn to decide + promote * the standby. 
* * @since 0.7.0 * * @author aled */ @Beta public class HighAvailabilityManagerImpl implements HighAvailabilityManager { public final ConfigKey<Duration> POLL_PERIOD = ConfigKeys.newConfigKey(Duration.class, "brooklyn.ha.pollPeriod", "How often nodes should poll to detect whether master is healthy", Duration.seconds(1)); public final ConfigKey<Duration> HEARTBEAT_TIMEOUT = ConfigKeys.newConfigKey(Duration.class, "brooklyn.ha.heartbeatTimeout", "Maximum allowable time for detection of a peer's heartbeat; if no sign of master after this time, " + "another node may promote itself", Duration.THIRTY_SECONDS); @VisibleForTesting /* only used in tests currently */ public static interface PromotionListener { public void promotingToMaster(); } private static final Logger LOG = LoggerFactory.getLogger(HighAvailabilityManagerImpl.class); private final ManagementContextInternal managementContext; private volatile String ownNodeId; private volatile ManagementPlaneSyncRecordPersister persister; private volatile PromotionListener promotionListener; private volatile MasterChooser masterChooser = new AlphabeticMasterChooser(); private volatile Ticker localTickerUtc = new Ticker() { // strictly not a ticker because returns millis UTC, but it works fine even so @Override public long read() { return System.currentTimeMillis(); } }; private volatile Ticker optionalRemoteTickerUtc = null; private volatile Task<?> pollingTask; private volatile boolean disabled; private volatile boolean running; private volatile ManagementNodeState nodeState = ManagementNodeState.INITIALIZING; private volatile boolean nodeStateTransitionComplete = false; private volatile long priority = 0; private final static int MAX_NODE_STATE_HISTORY = 200; private final List<Map<String,Object>> nodeStateHistory = MutableList.of(); private volatile transient Duration pollPeriodLocalOverride; private volatile transient Duration heartbeatTimeoutOverride; private volatile ManagementPlaneSyncRecord lastSyncRecord; private 
volatile PersistenceActivityMetrics managementStateWritePersistenceMetrics = new PersistenceActivityMetrics();
    // metrics for reads of the management-plane sync record; presumably updated by loadManagementPlaneSyncRecord -- TODO confirm (not visible in this chunk)
    private volatile PersistenceActivityMetrics managementStateReadPersistenceMetrics = new PersistenceActivityMetrics();
    // millis-UTC reading of localTickerUtc taken at construction time
    private final long startTimeUtc;

    public HighAvailabilityManagerImpl(ManagementContextInternal managementContext) {
        this.managementContext = managementContext;
        startTimeUtc = localTickerUtc.read();
    }

    /** Sets the persister used to read/write management-plane sync records; must not be null. */
    @Override
    public HighAvailabilityManagerImpl setPersister(ManagementPlaneSyncRecordPersister persister) {
        this.persister = checkNotNull(persister, "persister");
        return this;
    }

    @Override
    public ManagementPlaneSyncRecordPersister getPersister() {
        return persister;
    }

    /** Returns the poll period, preferring the local override (if set) to the brooklyn config value. */
    protected synchronized Duration getPollPeriod() {
        if (pollPeriodLocalOverride!=null) return pollPeriodLocalOverride;
        return managementContext.getBrooklynProperties().getConfig(POLL_PERIOD);
    }

    /** Overrides {@link #POLL_PERIOD} from brooklyn config,
     * including e.g. {@link Duration#PRACTICALLY_FOREVER} to disable polling;
     * or <code>null</code> to clear a local override */
    public HighAvailabilityManagerImpl setPollPeriod(Duration val) {
        this.pollPeriodLocalOverride = val;
        // re-register the poll task so a new period takes effect immediately on a running manager
        if (running) {
            registerPollTask();
        }
        return this;
    }

    /** Sets the strategy used to decide which node should be master; must not be null. */
    public HighAvailabilityManagerImpl setMasterChooser(MasterChooser val) {
        this.masterChooser = checkNotNull(val, "masterChooser");
        return this;
    }

    /** Returns the heartbeat timeout, preferring the local override (if set) to the brooklyn config value. */
    public synchronized Duration getHeartbeatTimeout() {
        if (heartbeatTimeoutOverride!=null) return heartbeatTimeoutOverride;
        return managementContext.getBrooklynProperties().getConfig(HEARTBEAT_TIMEOUT);
    }

    /** Overrides {@link #HEARTBEAT_TIMEOUT} from brooklyn config,
     * including e.g. {@link Duration#PRACTICALLY_FOREVER} to prevent failover due to heartbeat absence;
     * or <code>null</code> to clear a local override */
    public HighAvailabilityManagerImpl setHeartbeatTimeout(Duration val) {
        this.heartbeatTimeoutOverride = val;
        return this;
    }

    /** A ticker that reads in milliseconds, for populating local timestamps.
* Defaults to System.currentTimeMillis(); may be overridden e.g. for testing. */ public HighAvailabilityManagerImpl setLocalTicker(Ticker val) { this.localTickerUtc = checkNotNull(val); return this; } /** A ticker that reads in milliseconds, for overriding remote timestamps. * Defaults to null which means to use the remote timestamp. * Only for testing as this records the remote timestamp in the object. * <p> * If this is supplied, one must also set {@link ManagementPlaneSyncRecordPersisterToObjectStore#useRemoteTimestampInMemento()}. */ @VisibleForTesting public HighAvailabilityManagerImpl setRemoteTicker(Ticker val) { this.optionalRemoteTickerUtc = val; return this; } public HighAvailabilityManagerImpl setPromotionListener(PromotionListener val) { this.promotionListener = checkNotNull(val, "promotionListener"); return this; } @Override public boolean isRunning() { return running; } @Override public void disabled() { disabled = true; ownNodeId = managementContext.getManagementNodeId(); // this is notionally the master, just not running; see javadoc for more info stop(ManagementNodeState.MASTER); } @Override public void start(HighAvailabilityMode startMode) { nodeStateTransitionComplete = true; disabled = false; running = true; changeMode(startMode, true, true); } @Override public void changeMode(HighAvailabilityMode startMode) { changeMode(startMode, false, false); } @VisibleForTesting @Beta public void changeMode(HighAvailabilityMode startMode, boolean preventElectionOnExplicitStandbyMode, boolean failOnExplicitModesIfUnusual) { if (!running) { // if was not running then start as disabled mode, then proceed as normal LOG.info("HA changing mode to "+startMode+" from "+getInternalNodeState()+" when not running, forcing an intermediate start as DISABLED then will convert to "+startMode); start(HighAvailabilityMode.DISABLED); } if (getNodeState()==ManagementNodeState.FAILED || getNodeState()==ManagementNodeState.INITIALIZING) { if 
(startMode!=HighAvailabilityMode.DISABLED) { // if coming from FAILED (or INITIALIZING because we skipped start call) then treat as initializing setInternalNodeState(ManagementNodeState.INITIALIZING); } } ownNodeId = managementContext.getManagementNodeId(); // TODO Small race in that we first check, and then we'll do checkMaster() on first poll, // so another node could have already become master or terminated in that window. ManagementNodeSyncRecord existingMaster = hasHealthyMaster(); boolean weAreRecognisedAsMaster = existingMaster!=null && ownNodeId.equals(existingMaster.getNodeId()); boolean weAreMasterLocally = getInternalNodeState()==ManagementNodeState.MASTER; // catch error in some tests where mgmt context has a different HA manager if (managementContext.getHighAvailabilityManager()!=this) throw new IllegalStateException("Cannot start an HA manager on a management context with a different HA manager!"); if (weAreMasterLocally) { // demotion may be required; do this before triggering an election switch (startMode) { case MASTER: case AUTO: case DISABLED: // no action needed, will do anything necessary below (or above) break; case HOT_STANDBY: case HOT_BACKUP: case STANDBY: demoteTo(ManagementNodeState.of(startMode).get()); break; default: throw new IllegalStateException("Unexpected high availability mode "+startMode+" requested for "+this); } } ManagementNodeState oldState = getInternalNodeState(); // now do election switch (startMode) { case AUTO: // don't care; let's start and see if we promote ourselves if (getInternalNodeState()==ManagementNodeState.INITIALIZING) { setInternalNodeState(ManagementNodeState.STANDBY); } publishAndCheck(true); switch (getInternalNodeState()) { case HOT_BACKUP: if (!nodeStateTransitionComplete) throw new IllegalStateException("Cannot switch to AUTO when in the middle of a transition to "+getInternalNodeState()); // else change us to standby, desiring to go to hot standby, and continue to below 
setInternalNodeState(ManagementNodeState.STANDBY); startMode = HighAvailabilityMode.HOT_BACKUP; case HOT_STANDBY: case STANDBY: if (getInternalNodeState()==ManagementNodeState.STANDBY && oldState==ManagementNodeState.INITIALIZING && startMode!=HighAvailabilityMode.HOT_BACKUP && BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY)) { // auto requested; not promoted; so it should become hot standby startMode = HighAvailabilityMode.HOT_STANDBY; } ManagementPlaneSyncRecord newState = loadManagementPlaneSyncRecord(true); String masterNodeId = newState.getMasterNodeId(); ManagementNodeSyncRecord masterNodeDetails = newState.getManagementNodes().get(masterNodeId); LOG.info("Management node "+ownNodeId+" running as HA " + getInternalNodeState() + " autodetected" + (startMode == HighAvailabilityMode.HOT_STANDBY || startMode == HighAvailabilityMode.HOT_BACKUP ? " (will change to "+startMode+")" : "") + ", " + (Strings.isBlank(masterNodeId) ? "no master currently (other node should promote itself soon)" : "master " + (existingMaster==null ? "(new) " : "") + "is "+masterNodeId + (masterNodeDetails==null || masterNodeDetails.getUri()==null ? 
" (no url)" : " at "+masterNodeDetails.getUri())));
            break;
        case MASTER:
            LOG.info("Management node "+ownNodeId+" running as HA MASTER autodetected");
            break;
        default:
            throw new IllegalStateException("Management node "+ownNodeId+" set to HA AUTO, encountered unexpected mode "+getInternalNodeState());
        }
        break;
    case MASTER:
        // explicit master requested; allowed if not required-to-be-unusual, or if no healthy master exists
        if (!failOnExplicitModesIfUnusual || existingMaster==null) {
            promoteToMaster();
            // BUGFIX: condition was inverted ("existingMaster!=null" selected the plain message),
            // which logged "stealing from null" when no master existed and suppressed the
            // steal notice when one did; the steal message belongs to the non-null branch.
            if (existingMaster==null) {
                LOG.info("Management node "+ownNodeId+" running as HA MASTER explicitly");
            } else {
                LOG.info("Management node "+ownNodeId+" running as HA MASTER explicitly, stealing from "+existingMaster);
            }
        } else if (!weAreRecognisedAsMaster) {
            throw new IllegalStateException("Master already exists; cannot run as master (master "+existingMaster.toVerboseString()+"); "
                + "to trigger a promotion, set a priority and demote the current master");
        } else {
            LOG.info("Management node "+ownNodeId+" already running as HA MASTER, when set explicitly");
        }
        break;
    case HOT_BACKUP:
        setInternalNodeState(ManagementNodeState.HOT_BACKUP);
        // then continue into next block
    case STANDBY:
    case HOT_STANDBY:
        if (startMode!=HighAvailabilityMode.HOT_BACKUP) {
            if (ManagementNodeState.isHotProxy(getInternalNodeState()) && startMode==HighAvailabilityMode.HOT_STANDBY) {
                // if was hot_backup, we can immediately go hot_standby
                setInternalNodeState(ManagementNodeState.HOT_STANDBY);
            } else {
                // from any other state, set standby, then perhaps switch to hot_standby later on (or might become master in the next block)
                setInternalNodeState(ManagementNodeState.STANDBY);
            }
        }
        if (ManagementNodeState.isStandby(getInternalNodeState())) {
            if (!preventElectionOnExplicitStandbyMode) {
                publishAndCheck(true);
            }
            if (failOnExplicitModesIfUnusual && existingMaster==null) {
                LOG.error("Management node "+ownNodeId+" detected no master when "+startMode+" requested and existing master required; failing.");
                throw new IllegalStateException("No existing master; cannot start as "+startMode);
            }
        }
        String message =
"Management node "+ownNodeId+" running as HA "+getNodeState()+" ("; if (getNodeState().toString().equals(startMode.toString())) message += "explicitly requested"; else if (startMode==HighAvailabilityMode.HOT_STANDBY && getNodeState()==ManagementNodeState.STANDBY) message += "caller requested "+startMode+", will attempt rebind for HOT_STANDBY next"; else message += "caller requested "+startMode; if (getNodeState()==ManagementNodeState.MASTER) { message += " but election re-promoted this node)"; } else { ManagementPlaneSyncRecord newState = loadManagementPlaneSyncRecord(true); if (Strings.isBlank(newState.getMasterNodeId())) { message += "); no master currently"; if (startMode != HighAvailabilityMode.HOT_BACKUP) message += " (subsequent election may repair)"; } else { message += "); master "+newState.getMasterNodeId(); } } LOG.info(message); break; case DISABLED: // safe just to run even if we weren't master LOG.info("Management node "+ownNodeId+" HA DISABLED (was "+getInternalNodeState()+")"); demoteTo(ManagementNodeState.FAILED); if (pollingTask!=null) pollingTask.cancel(true); break; default: throw new IllegalStateException("Unexpected high availability mode "+startMode+" requested for "+this); } if ((startMode==HighAvailabilityMode.HOT_STANDBY || startMode==HighAvailabilityMode.HOT_BACKUP)) { if (!ManagementNodeState.isHotProxy(oldState)) { // now transition to hot proxy nodeStateTransitionComplete = false; if (startMode==HighAvailabilityMode.HOT_STANDBY) { // if it should be hot standby, then we may need to promote // inform the world that we are transitioning (but not eligible for promotion while going in to hot standby) // (no harm in doing this twice) publishHealth(); } try { activateHotProxy(ManagementNodeState.of(startMode).get()).get(); // error above now throws nodeStateTransitionComplete = true; publishHealth(); if (getNodeState()==ManagementNodeState.HOT_STANDBY || getNodeState()==ManagementNodeState.HOT_BACKUP) { LOG.info("Management node "+ownNodeId+" 
now running as HA "+getNodeState()+"; " + managementContext.getApplications().size()+" application"+Strings.s(managementContext.getApplications().size())+" loaded"); } else { // shouldn't come here, we should have gotten an error above LOG.warn("Management node "+ownNodeId+" unable to promote to "+startMode+" (currently "+getNodeState()+"); " + "(see log for further details)"); } } catch (Exception e) { LOG.warn("Management node "+ownNodeId+" unable to promote to "+startMode+" (currently "+getNodeState()+"); rethrowing: "+Exceptions.collapseText(e)); nodeStateTransitionComplete = true; throw Exceptions.propagate(e); } } else { // transitioning among hot proxy states - tell the rebind manager managementContext.getRebindManager().stopReadOnly(); managementContext.getRebindManager().startReadOnly(ManagementNodeState.of(startMode).get()); nodeStateTransitionComplete = true; } } else { nodeStateTransitionComplete = true; } if (startMode!=HighAvailabilityMode.DISABLED) registerPollTask(); } @Override public void setPriority(long priority) { this.priority = priority; if (persister!=null) publishHealth(); } @Override public long getPriority() { return priority; } @Override public void stop() { LOG.debug("Stopping "+this); stop(ManagementNodeState.TERMINATED); } private void stop(ManagementNodeState newState) { boolean wasRunning = running; running = false; setInternalNodeState(newState); if (pollingTask != null) pollingTask.cancel(true); if (wasRunning) { try { publishHealth(); } catch (Exception e) { Exceptions.propagateIfFatal(e); LOG.error("Problem publishing manager-node health on termination (continuing)", e); } } } /** returns the node state this node is trying to be in */ public ManagementNodeState getTransitionTargetNodeState() { return getInternalNodeState(); } protected ManagementNodeState getInternalNodeState() { return nodeState; } protected void setInternalNodeState(ManagementNodeState newState) { ManagementNodeState oldState = getInternalNodeState(); 
synchronized (nodeStateHistory) { if (this.nodeState != newState) { nodeStateHistory.add(0, MutableMap.<String,Object>of("state", newState, "timestamp", currentTimeMillis())); while (nodeStateHistory.size()>MAX_NODE_STATE_HISTORY) { nodeStateHistory.remove(nodeStateHistory.size()-1); } } ((RebindManagerImpl)managementContext.getRebindManager()).setAwaitingInitialRebind(running && (ManagementNodeState.isHotProxy(newState) || newState==ManagementNodeState.MASTER)); this.nodeState = newState; } if (ManagementNodeState.isHotProxy(oldState) && !ManagementNodeState.isHotProxy(newState)) { // could perhaps promote standby items on some transitions; but for now we stop the old read-only and re-load them // TODO ideally there'd be an incremental rebind as well as an incremental persist managementContext.getRebindManager().stopReadOnly(); clearManagedItems(ManagementTransitionMode.transitioning(BrooklynObjectManagementMode.LOADED_READ_ONLY, BrooklynObjectManagementMode.UNMANAGED_PERSISTED)); } } @Override public ManagementNodeState getNodeState() { ManagementNodeState myNodeState = getInternalNodeState(); if (myNodeState==ManagementNodeState.FAILED) return getInternalNodeState(); // if target is master then we claim already being master, to prevent other nodes from taking it // (we may fail subsequently of course) if (myNodeState==ManagementNodeState.MASTER) return myNodeState; if (!nodeStateTransitionComplete) return ManagementNodeState.INITIALIZING; return myNodeState; } public ManagementPlaneSyncRecord getLastManagementPlaneSyncRecord() { return lastSyncRecord; } @SuppressWarnings("unchecked") protected void registerPollTask() { final Runnable job = new Runnable() { private boolean lastFailed; @Override public void run() { try { publishAndCheck(false); lastFailed = false; } catch (Exception e) { if (running) { if (lastFailed) { if (LOG.isDebugEnabled()) LOG.debug("Recurring problem in HA-poller: "+e, e); } else { LOG.error("Problem in HA-poller: "+e, e); lastFailed = 
true; } } else { if (LOG.isDebugEnabled()) LOG.debug("Problem in HA-poller, but no longer running: "+e, e); } } catch (Throwable t) { LOG.error("Problem in HA-poller: "+t, t); throw Exceptions.propagate(t); } } }; Callable<Task<?>> taskFactory = new Callable<Task<?>>() { @Override public Task<?> call() { return Tasks.builder().dynamic(false).body(job).displayName("HA poller task").tag(BrooklynTaskTags.TRANSIENT_TASK_TAG) .description("polls HA status to see whether this node should promote").build(); } }; Duration pollPeriod = getPollPeriod(); LOG.debug("Registering poll task for "+this+", period "+pollPeriod); if (pollPeriod.equals(Duration.PRACTICALLY_FOREVER)) { // don't schedule - used for tests // (scheduling fires off one initial task in the background before the delay, // which affects tests that want to know exactly when publishing happens; // TODO would be nice if scheduled task had a "no initial submission" flag ) } else { if (pollingTask!=null) pollingTask.cancel(true); ScheduledTask task = new ScheduledTask(MutableMap.of("period", pollPeriod, "displayName", "scheduled:[HA poller task]"), taskFactory); pollingTask = managementContext.getExecutionManager().submit(task); } } /** invoked manually when initializing, and periodically thereafter */ @VisibleForTesting public synchronized void publishAndCheck(boolean initializing) { publishHealth(); checkMaster(initializing); } protected synchronized void publishHealth() { if (persister == null) { LOG.info("Cannot publish management-node health as no persister"); return; } Stopwatch timer = Stopwatch.createStarted(); try { ManagementNodeSyncRecord memento = createManagementNodeSyncRecord(false); Delta delta = ManagementPlaneSyncRecordDeltaImpl.builder().node(memento).build(); persister.delta(delta); managementStateWritePersistenceMetrics.noteSuccess(Duration.of(timer)); if (LOG.isTraceEnabled()) LOG.trace("Published management-node health: {}", memento); } catch (Throwable t) { 
managementStateWritePersistenceMetrics.noteFailure(Duration.of(timer)); managementStateWritePersistenceMetrics.noteError(t.toString()); LOG.debug("Error publishing management-node health (rethrowing): "+t); throw Exceptions.propagate(t); } } public void publishClearNonMaster() { ManagementPlaneSyncRecord plane = getLastManagementPlaneSyncRecord(); if (plane==null || persister==null) { LOG.warn("Cannot clear HA node records; HA not active (or not yet loaded)"); return; } org.apache.brooklyn.core.mgmt.ha.ManagementPlaneSyncRecordDeltaImpl.Builder db = ManagementPlaneSyncRecordDeltaImpl.builder(); for (Map.Entry<String,ManagementNodeSyncRecord> node: plane.getManagementNodes().entrySet()) { // only keep a node if it both claims master and is recognised as master; // else ex-masters who died are kept around! if (!ManagementNodeState.MASTER.equals(node.getValue().getStatus()) || !Objects.equal(plane.getMasterNodeId(), node.getValue().getNodeId())) { db.removedNodeId(node.getKey()); } } persister.delta(db.build()); // then get, so model is updated loadManagementPlaneSyncRecord(true); } protected synchronized void publishDemotion(boolean demotingFromMaster) { checkState(getNodeState() != ManagementNodeState.MASTER, "node status must not be master when demoting", getNodeState()); if (persister == null) { LOG.info("Cannot publish management-node health as no persister"); return; } ManagementNodeSyncRecord memento = createManagementNodeSyncRecord(false); ManagementPlaneSyncRecordDeltaImpl.Builder deltaBuilder = ManagementPlaneSyncRecordDeltaImpl.builder() .node(memento); if (demotingFromMaster) { deltaBuilder.clearMaster(ownNodeId); } Delta delta = deltaBuilder.build(); persister.delta(delta); if (LOG.isTraceEnabled()) LOG.trace("Published management-node health: {}", memento); } /** * Publishes (via {@link #persister}) the state of this management node with itself set to master. 
*/
    protected synchronized void publishPromotionToMaster() {
        checkState(getNodeState() == ManagementNodeState.MASTER, "node status must be master on publish, but is %s", getNodeState());
        if (persister == null) {
            // without a persister there is nowhere to publish; log and no-op
            LOG.info("Cannot publish management-node health as no persister");
            return;
        }
        ManagementNodeSyncRecord memento = createManagementNodeSyncRecord(false);
        Delta delta = ManagementPlaneSyncRecordDeltaImpl.builder()
            .node(memento)
            .setMaster(ownNodeId)
            .build();
        persister.delta(delta);
        if (LOG.isTraceEnabled()) LOG.trace("Published management-node health: {}", memento);
    }

    /** Returns true if the master's heartbeat (remote timestamp) is within {@link #getHeartbeatTimeout()}
     * of our own record's remote timestamp; false if the master record is null or either timestamp is missing.
     * If our own record (meNode) is null we cannot compare, so the master is assumed healthy. */
    protected boolean isHeartbeatOk(ManagementNodeSyncRecord masterNode, ManagementNodeSyncRecord meNode) {
        if (masterNode==null) return false;
        if (meNode==null) {
            // we can't confirm it's healthy, but it appears so as far as we can tell
            return true;
        }
        Long timestampMaster = masterNode.getRemoteTimestamp();
        Long timestampMe = meNode.getRemoteTimestamp();
        if (timestampMaster==null || timestampMe==null) return false;
        return (timestampMe - timestampMaster) <= getHeartbeatTimeout().toMilliseconds();
    }

    /** Loads the management-plane sync record and returns the master's node record if that node
     * both claims MASTER status and has an acceptable heartbeat per {@link #isHeartbeatOk};
     * otherwise returns null. */
    protected ManagementNodeSyncRecord hasHealthyMaster() {
        ManagementPlaneSyncRecord memento = loadManagementPlaneSyncRecord(false);
        String nodeId = memento.getMasterNodeId();
        ManagementNodeSyncRecord masterMemento = (nodeId == null) ? null : memento.getManagementNodes().get(nodeId);
        ManagementNodeSyncRecord ourMemento = memento.getManagementNodes().get(ownNodeId);
        boolean result = masterMemento != null && masterMemento.getStatus() == ManagementNodeState.MASTER &&
            isHeartbeatOk(masterMemento, ourMemento);
        if (LOG.isDebugEnabled()) LOG.debug("Healthy-master check result={}; masterId={}; masterMemento={}; ourMemento={}",
            new Object[] {result, nodeId, (masterMemento == null ? "<none>" : masterMemento.toVerboseString()), (ourMemento == null ? "<none>" : ourMemento.toVerboseString())});
        return (result ?
masterMemento : null); } /** * Looks up the state of all nodes in the management plane, and checks if the master is still ok. * If it's not then determines which node should be promoted to master. If it is ourself, then promotes. */ protected void checkMaster(boolean initializing) { ManagementPlaneSyncRecord memento = loadManagementPlaneSyncRecord(false); if (getNodeState()==ManagementNodeState.FAILED || getNodeState()==ManagementNodeState.HOT_BACKUP) { // if failed or hot backup then we can't promote ourselves, so no point in checking who is master return; } String currMasterNodeId = memento.getMasterNodeId(); ManagementNodeSyncRecord currMasterNodeRecord = memento.getManagementNodes().get(currMasterNodeId); ManagementNodeSyncRecord ownNodeRecord = memento.getManagementNodes().get(ownNodeId); ManagementNodeSyncRecord newMasterNodeRecord = null; boolean demotingSelfInFavourOfOtherMaster = false; if (currMasterNodeRecord != null && currMasterNodeRecord.getStatus() == ManagementNodeState.MASTER && isHeartbeatOk(currMasterNodeRecord, ownNodeRecord)) { // master seems healthy if (ownNodeId.equals(currMasterNodeId)) { if (LOG.isTraceEnabled()) LOG.trace("Existing master healthy (us): master={}", currMasterNodeRecord.toVerboseString()); return; } else { if (ownNodeRecord!=null && ownNodeRecord.getStatus() == ManagementNodeState.MASTER) { LOG.error("Management node "+ownNodeId+" detected master change, stolen from us, deferring to "+currMasterNodeId); newMasterNodeRecord = currMasterNodeRecord; demotingSelfInFavourOfOtherMaster = true; } else { if (LOG.isTraceEnabled()) LOG.trace("Existing master healthy (remote): master={}", currMasterNodeRecord.toVerboseString()); return; } } } else if (ownNodeRecord == null || !isHeartbeatOk(ownNodeRecord, ownNodeRecord)) { // our heartbeats are also out-of-date! perhaps something wrong with persistence? just log, and don't over-react! 
if (ownNodeRecord == null) { LOG.error("No management node memento for self ("+ownNodeId+"); perhaps persister unwritable? " + "Master ("+currMasterNodeId+") reported failed but no-op as cannot tell conclusively"); } else { LOG.error("This management node ("+ownNodeId+") memento heartbeats out-of-date; perhaps perister unwritable? " + "Master ("+currMasterNodeId+") reported failed but no-op as cannot tell conclusively" + ": self="+ownNodeRecord.toVerboseString()); } return; } else if (ownNodeId.equals(currMasterNodeId)) { // we are supposed to be the master, but seem to be unhealthy! LOG.warn("This management node ("+ownNodeId+") supposed to be master but reportedly unhealthy? " + "no-op as expect other node to fix: self="+ownNodeRecord.toVerboseString()); return; } if (demotingSelfInFavourOfOtherMaster) { LOG.debug("Master-change for this node only, demoting "+ownNodeRecord.toVerboseString()+" in favour of official master "+newMasterNodeRecord.toVerboseString()); demoteTo( BrooklynFeatureEnablement.isEnabled(BrooklynFeatureEnablement.FEATURE_DEFAULT_STANDBY_IS_HOT_PROPERTY) ? ManagementNodeState.HOT_STANDBY : ManagementNodeState.STANDBY); return; } else { LOG.debug("Detected master heartbeat timeout. Initiating a new master election. Master was " + currMasterNodeRecord); } // Need to choose a new master newMasterNodeRecord = masterChooser.choose(memento, getHeartbeatTimeout(), ownNodeId); String newMasterNodeId = (newMasterNodeRecord == null) ? null : newMasterNodeRecord.getNodeId(); URI newMasterNodeUri = (newMasterNodeRecord == null) ? null : newMasterNodeRecord.getUri(); boolean weAreNewMaster = ownNodeId.equals(newMasterNodeId); if (LOG.isDebugEnabled()) { LOG.debug("Management node master-change required: newMaster={}; oldMaster={}; plane={}, self={}; heartbeatTimeout={}", new Object[] { (newMasterNodeRecord == null ? "<none>" : newMasterNodeRecord.toVerboseString()), (currMasterNodeRecord == null ? 
currMasterNodeId+" (no memento)": currMasterNodeRecord.toVerboseString()), memento, ownNodeRecord.toVerboseString(), getHeartbeatTimeout() }); } String message = "Management node "+ownNodeId+" detected "; String currMasterSummary = currMasterNodeId + "(" + (currMasterNodeRecord==null ? "<none>" : timestampString(currMasterNodeRecord.getRemoteTimestamp())) + ")"; if (weAreNewMaster && (ownNodeRecord.getStatus() == ManagementNodeState.MASTER)) { LOG.warn(message + "we must reassert master status, as was stolen and then failed at "+ (currMasterNodeRecord==null ? "a node which has gone away" : currMasterSummary)); publishPromotionToMaster(); publishHealth(); return; } if (!initializing) { if (weAreNewMaster) { message += "we should be master, changing from "; } else if (currMasterNodeRecord==null && newMasterNodeId==null) message += "master change attempted but no candidates "; else message += "master change, from "; message += currMasterSummary + " to " + (newMasterNodeId == null ? "<none>" : (weAreNewMaster ? "us " : "") + newMasterNodeId + " (" + timestampString(newMasterNodeRecord.getRemoteTimestamp()) + ")" + (newMasterNodeUri!=null ? 
" "+newMasterNodeUri : "") ); // always log, if you're looking at a standby node it's useful to see the new master's URL LOG.info(message); } // New master is ourself: promote if (weAreNewMaster) { promoteToMaster(); } } private static String timestampString(Long remoteTimestamp) { if (remoteTimestamp==null) return null; return remoteTimestamp+" / "+Time.makeTimeStringRounded( Duration.sinceUtc(remoteTimestamp))+" ago"; } protected void promoteToMaster() { if (!running) { LOG.warn("Ignoring promote-to-master request, as HighAvailabilityManager is not running"); return; } if (promotionListener != null) { try { promotionListener.promotingToMaster(); } catch (Exception e) { Exceptions.propagateIfFatal(e); LOG.warn("Problem in promption-listener (continuing)", e); } } setInternalNodeState(ManagementNodeState.MASTER); publishPromotionToMaster(); try { managementContext.getRebindManager().rebind(managementContext.getCatalogClassLoader(), null, getInternalNodeState()); } catch (Exception e) { LOG.error("Management node "+managementContext.getManagementNodeId()+" enountered problem during rebind when promoting self to master; demoting to FAILED and rethrowing: "+e); demoteTo(ManagementNodeState.FAILED); throw Exceptions.propagate(e); } managementContext.getRebindManager().start(); } protected void backupOnDemotionIfNeeded() { if (managementContext.getBrooklynProperties().getConfig(BrooklynServerConfig.PERSISTENCE_BACKUPS_REQUIRED_ON_DEMOTION)) { BrooklynPersistenceUtils.createBackup(managementContext, CreateBackupMode.DEMOTION, MementoCopyMode.LOCAL); } } /** @deprecated since 0.7.0, use {@link #demoteTo(ManagementNodeState)} */ @Deprecated protected void demoteToFailed() { demoteTo(ManagementNodeState.FAILED); } /** @deprecated since 0.7.0, use {@link #demoteTo(ManagementNodeState)} */ @Deprecated protected void demoteToStandby(boolean hot) { demoteTo(hot ? 
ManagementNodeState.HOT_STANDBY : ManagementNodeState.STANDBY); }

    /**
     * Demotes this node to the given target state, stopping persistence and unmanaging all items.
     * For HOT_STANDBY/HOT_BACKUP the hot proxy (read-only mirroring) is then activated in the foreground.
     * A FAILED demotion is honoured even after this manager has stopped running.
     */
    protected void demoteTo(ManagementNodeState toState) {
        if (toState!=ManagementNodeState.FAILED && !running) {
            LOG.warn("Ignoring demote-from-master request, as HighAvailabilityManager is no longer running");
            return;
        }

        boolean wasMaster = (getInternalNodeState() == ManagementNodeState.MASTER);
        if (wasMaster) backupOnDemotionIfNeeded();
        // TODO target may be RO ?
        ManagementTransitionMode mode = ManagementTransitionMode.transitioning(
            wasMaster ? BrooklynObjectManagementMode.MANAGED_PRIMARY : BrooklynObjectManagementMode.LOADED_READ_ONLY,
            BrooklynObjectManagementMode.UNMANAGED_PERSISTED);

        // mark transition in progress while we tear down
        nodeStateTransitionComplete = false;

        switch (toState) {
        case FAILED:
        case HOT_BACKUP:
        case STANDBY:
            setInternalNodeState(toState); break;
        case HOT_STANDBY:
            // go via STANDBY; activateHotProxy below moves us to HOT_STANDBY once read-only rebind starts
            setInternalNodeState(ManagementNodeState.STANDBY); break;
        default:
            throw new IllegalStateException("Illegal target state: "+toState);
        }

        onDemotionStopItems(mode);
        nodeStateTransitionComplete = true;
        publishDemotion(wasMaster);

        if (toState==ManagementNodeState.HOT_BACKUP || toState==ManagementNodeState.HOT_STANDBY) {
            // second transition phase: switch on the hot proxy, then publish the resulting health
            nodeStateTransitionComplete = false;
            try {
                activateHotProxy(toState).get();
            } finally {
                nodeStateTransitionComplete = true;
            }
            publishHealth();
        }
    }

    /** Stops persistence (read-write and read-only) and unmanages all items, as part of a demotion. */
    protected void onDemotionStopItems(ManagementTransitionMode mode) {
        // stop persistence and remove all apps etc
        managementContext.getRebindManager().stopPersistence();
        managementContext.getRebindManager().stopReadOnly();
        clearManagedItems(mode);

        // tasks are cleared as part of unmanaging entities above
    }

    /** clears all managed items from the management context; same items destroyed as in the course of a rebind cycle */
    protected void clearManagedItems(ManagementTransitionMode mode) {
        // start with the root applications
        for (Application app: managementContext.getApplications()) {
            if (((EntityInternal)app).getManagementSupport().isDeployed()) {
((LocalEntityManager)((EntityInternal)app).getManagementContext().getEntityManager()).unmanage(app, mode);
            }
        }

        // for active management, call above will remove recursively at present,
        // but for read-only, and if we stop recursively, go through them all
        for (Entity entity: managementContext.getEntityManager().getEntities()) {
            ((LocalEntityManager)managementContext.getEntityManager()).unmanage(entity, mode);
        }

        // again, for locations, call unmanage on parents first
        for (Location loc: managementContext.getLocationManager().getLocations()) {
            if (loc.getParent()==null)
                ((LocationManagerInternal)managementContext.getLocationManager()).unmanage(loc, mode);
        }
        // then sweep any locations that remain (e.g. whose parents were not managed here)
        for (Location loc: managementContext.getLocationManager().getLocations()) {
            ((LocationManagerInternal)managementContext.getLocationManager()).unmanage(loc, mode);
        }

        // finally wipe the catalog back to an empty instance
        ((BasicBrooklynCatalog)managementContext.getCatalog()).reset(CatalogDto.newEmptyInstance("<reset-by-ha-status-change>"));
    }

    /** @deprecated since 0.7.0, use {@link #activateHotProxy(ManagementNodeState)} */
    @Deprecated
    protected boolean attemptHotStandby() {
        return activateHotProxy(ManagementNodeState.HOT_STANDBY).getWithoutError();
    }

    /** Starts hot standby or hot backup, in foreground
     * <p>
     * In the case of the former, the caller is responsible for publishing health afterwards,
     * but if it fails, this method will {@link #demoteTo(ManagementNodeState)} {@link ManagementNodeState#FAILED}.
* <p> * @return whether the requested {@link ManagementNodeState} was possible; * (if not, errors should be stored elsewhere), callers may want to rethrow */ protected ReferenceWithError<Boolean> activateHotProxy(ManagementNodeState toState) { try { Preconditions.checkState(nodeStateTransitionComplete==false, "Must be in transitioning state to go into "+toState); setInternalNodeState(toState); managementContext.getRebindManager().startReadOnly(toState); return ReferenceWithError.newInstanceWithoutError(true); } catch (Exception e) { Exceptions.propagateIfFatal(e); LOG.warn("Unable to change "+ownNodeId+" to "+toState+", switching to FAILED: "+e, e); demoteTo(ManagementNodeState.FAILED); return ReferenceWithError.newInstanceThrowingError(false, e); } } @Override public ManagementPlaneSyncRecord loadManagementPlaneSyncRecord(boolean useLocalKnowledgeForThisNode) { ManagementPlaneSyncRecord record = loadManagementPlaneSyncRecordInternal(useLocalKnowledgeForThisNode); lastSyncRecord = record; return record; } private ManagementPlaneSyncRecord loadManagementPlaneSyncRecordInternal(boolean useLocalKnowledgeForThisNode) { if (disabled) { // if HA is disabled, then we are the only node - no persistence; just load a memento to describe this node Builder builder = ManagementPlaneSyncRecordImpl.builder() .node(createManagementNodeSyncRecord(true)); if (getTransitionTargetNodeState() == ManagementNodeState.MASTER) { builder.masterNodeId(ownNodeId); } return builder.build(); } if (persister == null) { // e.g. 
web-console may be polling before we've started up LOG.debug("High availablity manager has no persister; returning empty record"); return ManagementPlaneSyncRecordImpl.builder().build(); } int maxLoadAttempts = 5; Exception lastException = null; Stopwatch timer = Stopwatch.createStarted(); for (int i = 0; i < maxLoadAttempts; i++) { try { ManagementPlaneSyncRecord result = persister.loadSyncRecord(); if (useLocalKnowledgeForThisNode) { // Report this node's most recent state, and detect AWOL nodes ManagementNodeSyncRecord me = BasicManagementNodeSyncRecord.builder() .from(result.getManagementNodes().get(ownNodeId), true) .from(createManagementNodeSyncRecord(false), true) .build(); Iterable<ManagementNodeSyncRecord> allNodes = result.getManagementNodes().values(); if (me.getRemoteTimestamp()!=null) allNodes = Iterables.transform(allNodes, new MarkAwolNodes(me)); Builder builder = ManagementPlaneSyncRecordImpl.builder() .masterNodeId(result.getMasterNodeId()) .nodes(allNodes); builder.node(me); if (getTransitionTargetNodeState() == ManagementNodeState.MASTER) { builder.masterNodeId(ownNodeId); } result = builder.build(); } if (i>0) { managementStateReadPersistenceMetrics.noteError("Succeeded only on attempt "+(i+1)+": "+lastException); } managementStateReadPersistenceMetrics.noteSuccess(Duration.of(timer)); return result; } catch (IOException e) { if (i < (maxLoadAttempts - 1)) { if (LOG.isDebugEnabled()) LOG.debug("Problem loading mangement-plane memento attempt "+(i+1)+"/"+maxLoadAttempts+"; retrying", e); } lastException = e; } } String message = "Failed to load mangement-plane memento "+maxLoadAttempts+" consecutive times"; managementStateReadPersistenceMetrics.noteError(message+": "+lastException); managementStateReadPersistenceMetrics.noteFailure(Duration.of(timer)); throw new IllegalStateException(message, lastException); } protected ManagementNodeSyncRecord createManagementNodeSyncRecord(boolean useLocalTimestampAsRemoteTimestamp) { long timestamp = 
currentTimeMillis();
        org.apache.brooklyn.core.mgmt.ha.dto.BasicManagementNodeSyncRecord.Builder builder = BasicManagementNodeSyncRecord.builder()
                .brooklynVersion(BrooklynVersion.get())
                .nodeId(ownNodeId)
                .status(getNodeState())
                .priority(getPriority())
                .localTimestamp(timestamp)
                .uri(managementContext.getManagementNodeUri().orNull());
        // remote timestamp: either mirror the local one, or read from the optional remote ticker if present
        if (useLocalTimestampAsRemoteTimestamp)
            builder.remoteTimestamp(timestamp);
        else if (optionalRemoteTickerUtc!=null) {
            builder.remoteTimestamp(optionalRemoteTickerUtc.read());
        }
        return builder.build();
    }

    /**
     * Gets the current time, using the {@link #localTickerUtc}. Normally this is equivalent of {@link System#currentTimeMillis()},
     * but in test environments a custom {@link Ticker} can be injected via {@link #setLocalTicker(Ticker)} to allow testing of
     * specific timing scenarios.
     */
    protected long currentTimeMillis() {
        return localTickerUtc.read();
    }

    /**
     * Infers the health of a node - if it last reported itself as healthy (standby or master), but we haven't heard
     * from it in a long time then report that node as failed; otherwise report its health as-is.
     */
    private class MarkAwolNodes implements Function<ManagementNodeSyncRecord, ManagementNodeSyncRecord> {
        // the record whose timestamp we compare other nodes' heartbeats against
        private final ManagementNodeSyncRecord referenceNode;
        private MarkAwolNodes(ManagementNodeSyncRecord referenceNode) {
            this.referenceNode = referenceNode;
        }
        @Nullable
        @Override
        public ManagementNodeSyncRecord apply(@Nullable ManagementNodeSyncRecord input) {
            if (input == null) return null;
            // only active-ish states can be marked AWOL; anything else is passed through unchanged
            if (!(input.getStatus() == ManagementNodeState.STANDBY || input.getStatus() == ManagementNodeState.HOT_STANDBY || input.getStatus() == ManagementNodeState.MASTER || input.getStatus() == ManagementNodeState.HOT_BACKUP)) return input;

            if (isHeartbeatOk(input, referenceNode)) return input;

            // heartbeat stale relative to the reference node: rewrite the record with status FAILED
            return BasicManagementNodeSyncRecord.builder()
                    .from(input)
                    .status(ManagementNodeState.FAILED)
                    .build();
        }
    }

    @Override
    public String toString() {
        return super.toString()+"[node:"+ownNodeId+";running="+running+"]";
    }

    /** Returns a snapshot of node state, uptime, HA settings/history, and persistence metrics. */
    @Override
    public Map<String,Object> getMetrics() {
        Map<String,Object> result = MutableMap.of();

        result.put("state", getNodeState());
        result.put("uptime", Time.makeTimeStringRounded(Duration.millis(currentTimeMillis()-startTimeUtc)));
        result.put("currentTimeUtc", currentTimeMillis());
        result.put("startTimeUtc", startTimeUtc);

        result.put("highAvailability", MutableMap.<String,Object>of(
            "priority", getPriority(),
            "pollPeriod", getPollPeriod().toMilliseconds(),
            "heartbeatTimeout", getHeartbeatTimeout().toMilliseconds(),
            "history", nodeStateHistory));

        result.putAll(managementContext.getRebindManager().getMetrics());
        result.put("managementStatePersistence",
            MutableMap.of("read", managementStateReadPersistenceMetrics, "write", managementStateWritePersistenceMetrics));

        return result;
    }

    /** Returns the timestamp of the most recent state change, or 0 if there is no recorded history. */
    @Override
    public long getLastStateChange() {
        if (nodeStateHistory.size() > 0) {
            return (Long)nodeStateHistory.get(0).get("timestamp");
        } else {
            return 0;
        }
    }

}