/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.activemq.artemis.core.server.impl; import java.util.List; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import org.apache.activemq.artemis.api.core.ActiveMQAlreadyReplicatingException; import org.apache.activemq.artemis.api.core.ActiveMQException; import org.apache.activemq.artemis.api.core.ActiveMQIllegalStateException; import org.apache.activemq.artemis.api.core.DiscoveryGroupConfiguration; import org.apache.activemq.artemis.api.core.Pair; import org.apache.activemq.artemis.api.core.SimpleString; import org.apache.activemq.artemis.api.core.TransportConfiguration; import org.apache.activemq.artemis.api.core.client.ActiveMQClient; import org.apache.activemq.artemis.api.core.client.ClientSession; import org.apache.activemq.artemis.api.core.client.ClientSessionFactory; import org.apache.activemq.artemis.api.core.client.ClusterTopologyListener; import org.apache.activemq.artemis.api.core.client.ServerLocator; import org.apache.activemq.artemis.api.core.client.TopologyMember; import org.apache.activemq.artemis.core.client.impl.ClientSessionFactoryInternal; import org.apache.activemq.artemis.core.client.impl.ServerLocatorInternal; import org.apache.activemq.artemis.core.config.ClusterConnectionConfiguration; import org.apache.activemq.artemis.core.config.ConfigurationUtils; import org.apache.activemq.artemis.core.protocol.core.Channel; import org.apache.activemq.artemis.core.protocol.core.ChannelHandler; import org.apache.activemq.artemis.core.protocol.core.CoreRemotingConnection; import org.apache.activemq.artemis.core.protocol.core.Packet; import org.apache.activemq.artemis.core.protocol.core.impl.PacketImpl; import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.BackupRegistrationMessage; import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.BackupReplicationStartFailedMessage; import org.apache.activemq.artemis.core.protocol.core.impl.wireformat.ReplicationLiveIsStoppingMessage; import org.apache.activemq.artemis.core.remoting.CloseListener; import org.apache.activemq.artemis.core.remoting.FailureListener; import org.apache.activemq.artemis.core.remoting.server.RemotingService; import org.apache.activemq.artemis.core.replication.ReplicationManager; import org.apache.activemq.artemis.core.server.ActiveMQMessageBundle; import org.apache.activemq.artemis.core.server.ActiveMQServerLogger; import org.apache.activemq.artemis.core.server.NodeManager; import org.apache.activemq.artemis.core.server.cluster.ClusterConnection; import org.apache.activemq.artemis.core.server.cluster.ha.ReplicatedPolicy; import org.apache.activemq.artemis.core.server.cluster.qourum.QuorumManager; import org.apache.activemq.artemis.core.server.cluster.qourum.QuorumVoteServerConnect; import org.apache.activemq.artemis.spi.core.remoting.Acceptor; import org.jboss.logging.Logger; public class SharedNothingLiveActivation extends LiveActivation { private static final Logger logger = Logger.getLogger(SharedNothingLiveActivation.class); //this is how we act when we initially start as a live private ReplicatedPolicy replicatedPolicy; private ActiveMQServerImpl activeMQServer; private ReplicationManager replicationManager; private final Object replicationLock = new Object(); public SharedNothingLiveActivation(ActiveMQServerImpl activeMQServer, ReplicatedPolicy replicatedPolicy) { this.activeMQServer = activeMQServer; this.replicatedPolicy = replicatedPolicy; } @Override public void freezeConnections(RemotingService remotingService) { ReplicationManager localReplicationManager = replicationManager; if (remotingService != null && localReplicationManager != null) { remotingService.freeze(null, localReplicationManager.getBackupTransportConnection()); } else if (remotingService != null) { remotingService.freeze(null, null); } } @Override public void run() { try { if (replicatedPolicy.isCheckForLiveServer() && isNodeIdUsed()) { //set for when we failback if (logger.isTraceEnabled()) { logger.tracef("@@@ setting up replicatedPolicy.getReplicaPolicy for back start, replicaPolicy::%s, isBackup=%s, server=%s", replicatedPolicy.getReplicaPolicy(), replicatedPolicy.isBackup(), activeMQServer); } replicatedPolicy.getReplicaPolicy().setReplicatedPolicy(replicatedPolicy); activeMQServer.setHAPolicy(replicatedPolicy.getReplicaPolicy()); return; } logger.trace("@@@ did not do it now"); activeMQServer.initialisePart1(false); activeMQServer.initialisePart2(false); activeMQServer.completeActivation(); if (activeMQServer.getIdentity() != null) { ActiveMQServerLogger.LOGGER.serverIsLive(activeMQServer.getIdentity()); } else { ActiveMQServerLogger.LOGGER.serverIsLive(); } } catch (Exception e) { ActiveMQServerLogger.LOGGER.initializationError(e); activeMQServer.callActivationFailureListeners(e); } } @Override public ChannelHandler getActivationChannelHandler(final Channel channel, final Acceptor acceptorUsed) { return new ChannelHandler() { @Override public void handlePacket(Packet packet) { if (packet.getType() == PacketImpl.BACKUP_REGISTRATION) { BackupRegistrationMessage msg = (BackupRegistrationMessage) packet; ClusterConnection clusterConnection = acceptorUsed.getClusterConnection(); try { startReplication(channel.getConnection(), clusterConnection, getPair(msg.getConnector(), true), msg.isFailBackRequest()); } catch (ActiveMQAlreadyReplicatingException are) { channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.ALREADY_REPLICATING)); } catch (ActiveMQException e) { logger.debug("Failed to process backup registration packet", e); channel.send(new BackupReplicationStartFailedMessage(BackupReplicationStartFailedMessage.BackupRegistrationProblem.EXCEPTION)); } } } }; } public void startReplication(CoreRemotingConnection rc, final ClusterConnection clusterConnection, final Pair<TransportConfiguration, TransportConfiguration> pair, final boolean isFailBackRequest) throws ActiveMQException { if (replicationManager != null) { throw new ActiveMQAlreadyReplicatingException(); } if (!activeMQServer.isStarted()) { throw new ActiveMQIllegalStateException(); } synchronized (replicationLock) { if (replicationManager != null) { throw new ActiveMQAlreadyReplicatingException(); } ReplicationFailureListener listener = new ReplicationFailureListener(); rc.addCloseListener(listener); rc.addFailureListener(listener); replicationManager = new ReplicationManager(rc, clusterConnection.getCallTimeout(), activeMQServer.getExecutorFactory()); replicationManager.start(); Thread t = new Thread(new Runnable() { @Override public void run() { try { activeMQServer.getStorageManager().startReplication(replicationManager, activeMQServer.getPagingManager(), activeMQServer.getNodeID().toString(), isFailBackRequest && replicatedPolicy.isAllowAutoFailBack(), replicatedPolicy.getInitialReplicationSyncTimeout()); clusterConnection.nodeAnnounced(System.currentTimeMillis(), activeMQServer.getNodeID().toString(), replicatedPolicy.getGroupName(), replicatedPolicy.getScaleDownGroupName(), pair, true); //todo, check why this was set here //backupUpToDate = false; if (isFailBackRequest && replicatedPolicy.isAllowAutoFailBack()) { BackupTopologyListener listener1 = new BackupTopologyListener(activeMQServer.getNodeID().toString(), clusterConnection.getConnector()); clusterConnection.addClusterTopologyListener(listener1); if (listener1.waitForBackup()) { //if we have to many backups kept or are not configured to restart just stop, otherwise restart as a backup activeMQServer.stop(true); ActiveMQServerLogger.LOGGER.restartingReplicatedBackupAfterFailback(); // activeMQServer.moveServerData(replicatedPolicy.getReplicaPolicy().getMaxSavedReplicatedJournalsSize()); activeMQServer.setHAPolicy(replicatedPolicy.getReplicaPolicy()); activeMQServer.start(); } else { ActiveMQServerLogger.LOGGER.failbackMissedBackupAnnouncement(); } } } catch (Exception e) { if (activeMQServer.getState() == ActiveMQServerImpl.SERVER_STATE.STARTED) { /* * The reasoning here is that the exception was either caused by (1) the * (interaction with) the backup, or (2) by an IO Error at the storage. If (1), we * can swallow the exception and ignore the replication request. If (2) the live * will crash shortly. */ ActiveMQServerLogger.LOGGER.errorStartingReplication(e); } try { ActiveMQServerImpl.stopComponent(replicationManager); } catch (Exception amqe) { ActiveMQServerLogger.LOGGER.errorStoppingReplication(amqe); } finally { synchronized (replicationLock) { replicationManager = null; } } } } }); t.start(); } } private final class ReplicationFailureListener implements FailureListener, CloseListener { @Override public void connectionFailed(ActiveMQException exception, boolean failedOver) { handleClose(true); } @Override public void connectionFailed(final ActiveMQException me, boolean failedOver, String scaleDownTargetNodeID) { connectionFailed(me, failedOver); } @Override public void connectionClosed() { handleClose(false); } private void handleClose(boolean failed) { ExecutorService executorService = activeMQServer.getThreadPool(); if (executorService != null) { executorService.execute(new Runnable() { @Override public void run() { synchronized (replicationLock) { if (replicationManager != null) { activeMQServer.getStorageManager().stopReplication(); replicationManager = null; if (failed && replicatedPolicy.isVoteOnReplicationFailure()) { QuorumManager quorumManager = activeMQServer.getClusterManager().getQuorumManager(); int size = replicatedPolicy.getQuorumSize() == -1 ? quorumManager.getMaxClusterSize() : replicatedPolicy.getQuorumSize(); QuorumVoteServerConnect quorumVote = new QuorumVoteServerConnect(size, activeMQServer.getStorageManager()); quorumManager.vote(quorumVote); try { quorumVote.await(5, TimeUnit.SECONDS); } catch (InterruptedException interruption) { // No-op. The best the quorum can do now is to return the latest number it has } quorumManager.voteComplete(quorumVote); if (!quorumVote.getDecision()) { try { Thread startThread = new Thread(new Runnable() { @Override public void run() { try { if (logger.isTraceEnabled()) { logger.trace("Calling activeMQServer.stop() to stop the server"); } activeMQServer.stop(); } catch (Exception e) { ActiveMQServerLogger.LOGGER.errorRestartingBackupServer(e, activeMQServer); } } }); startThread.start(); startThread.join(); } catch (Exception e) { e.printStackTrace(); } } } } } } }); } } } private Pair<TransportConfiguration, TransportConfiguration> getPair(TransportConfiguration conn, boolean isBackup) { if (isBackup) { return new Pair<>(null, conn); } return new Pair<>(conn, null); } /** * Determines whether there is another server already running with this server's nodeID. * <p> * This can happen in case of a successful fail-over followed by the live's restart * (attempting a fail-back). * * @throws Exception */ private boolean isNodeIdUsed() throws Exception { if (activeMQServer.getConfiguration().getClusterConfigurations().isEmpty()) return false; SimpleString nodeId0; try { nodeId0 = activeMQServer.getNodeManager().readNodeId(); } catch (ActiveMQIllegalStateException e) { nodeId0 = null; } ClusterConnectionConfiguration config = ConfigurationUtils.getReplicationClusterConfiguration(activeMQServer.getConfiguration(), replicatedPolicy.getClusterName()); NodeIdListener listener = new NodeIdListener(nodeId0, activeMQServer.getConfiguration().getClusterUser(), activeMQServer.getConfiguration().getClusterPassword()); try (ServerLocatorInternal locator = getLocator(config)) { locator.addClusterTopologyListener(listener); locator.setReconnectAttempts(0); try (ClientSessionFactoryInternal factory = locator.connectNoWarnings()) { // Just try connecting listener.latch.await(5, TimeUnit.SECONDS); } catch (Exception notConnected) { return false; } return listener.isNodePresent; } } @Override public void close(boolean permanently, boolean restarting) throws Exception { replicationManager = null; // To avoid a NPE cause by the stop NodeManager nodeManagerInUse = activeMQServer.getNodeManager(); if (nodeManagerInUse != null) { //todo does this actually make any difference, we only set a different flag in the lock file which replication doesn't use if (permanently) { nodeManagerInUse.crashLiveServer(); } else { nodeManagerInUse.pauseLiveServer(); } } } @Override public void sendLiveIsStopping() { final ReplicationManager localReplicationManager = replicationManager; if (localReplicationManager != null) { localReplicationManager.sendLiveIsStopping(ReplicationLiveIsStoppingMessage.LiveStopping.STOP_CALLED); // Schedule for 10 seconds // this pool gets a 'hard' shutdown, no need to manage the Future of this Runnable. activeMQServer.getScheduledPool().schedule(new Runnable() { @Override public void run() { localReplicationManager.clearReplicationTokens(); } }, 30, TimeUnit.SECONDS); } } @Override public ReplicationManager getReplicationManager() { synchronized (replicationLock) { return replicationManager; } } private ServerLocatorInternal getLocator(ClusterConnectionConfiguration config) throws ActiveMQException { ServerLocatorInternal locator; if (config.getDiscoveryGroupName() != null) { DiscoveryGroupConfiguration dg = activeMQServer.getConfiguration().getDiscoveryGroupConfigurations().get(config.getDiscoveryGroupName()); if (dg == null) { throw ActiveMQMessageBundle.BUNDLE.noDiscoveryGroupFound(dg); } locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(dg); } else { TransportConfiguration[] tcConfigs = config.getStaticConnectors() != null ? connectorNameListToArray(config.getStaticConnectors()) : null; locator = (ServerLocatorInternal) ActiveMQClient.createServerLocatorWithHA(tcConfigs); } return locator; } static final class NodeIdListener implements ClusterTopologyListener { volatile boolean isNodePresent = false; private final SimpleString nodeId; private final String user; private final String password; private final CountDownLatch latch = new CountDownLatch(1); NodeIdListener(SimpleString nodeId, String user, String password) { this.nodeId = nodeId; this.user = user; this.password = password; } @Override public void nodeUP(TopologyMember topologyMember, boolean last) { boolean isOurNodeId = nodeId != null && nodeId.toString().equals(topologyMember.getNodeId()); if (isOurNodeId && isActive(topologyMember.getLive())) { isNodePresent = true; } if (isOurNodeId || last) { latch.countDown(); } } /** * In a cluster of replicated live/backup pairs if a backup crashes and then its live crashes the cluster will * retain the topology information of the live such that when the live server restarts it will check the * cluster to see if its nodeID is present (which it will be) and then it will activate as a backup rather than * a live. To prevent this situation an additional check is necessary to see if the server with the matching * nodeID is actually active or not which is done by attempting to make a connection to it. * * @param transportConfiguration * @return */ private boolean isActive(TransportConfiguration transportConfiguration) { boolean result = false; try (ServerLocator serverLocator = ActiveMQClient.createServerLocator(false, transportConfiguration); ClientSessionFactory clientSessionFactory = serverLocator.createSessionFactory(); ClientSession clientSession = clientSessionFactory.createSession(user, password, false, false, false, false, 0)) { result = true; } catch (Exception e) { if (logger.isDebugEnabled()) { logger.debug("isActive check failed", e); } } return result; } @Override public void nodeDown(long eventUID, String nodeID) { // no-op } } private TransportConfiguration[] connectorNameListToArray(final List<String> connectorNames) { return activeMQServer.getConfiguration().getTransportConfigurations(connectorNames); } }