/* * ToroDB * Copyright © 2014 8Kdata Technology (www.8kdata.com) * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package com.torodb.mongodb.repl.topology; import com.eightkdata.mongowp.ErrorCode; import com.eightkdata.mongowp.Status; import com.eightkdata.mongowp.client.core.MongoConnection.ErroneousRemoteCommandResponse; import com.eightkdata.mongowp.client.core.MongoConnection.FromExceptionRemoteCommandRequest; import com.eightkdata.mongowp.client.core.MongoConnection.RemoteCommandResponse; import com.eightkdata.mongowp.client.core.UnreachableMongoServerException; import com.eightkdata.mongowp.exceptions.InconsistentReplicaSetNamesException; import com.eightkdata.mongowp.exceptions.MongoException; import com.eightkdata.mongowp.server.api.MongoRuntimeException; import com.eightkdata.mongowp.server.api.tools.Empty; import com.google.common.net.HostAndPort; import com.torodb.common.util.CompletionExceptions; import com.torodb.core.services.IdleTorodbService; import com.torodb.mongodb.commands.pojos.ReplicaSetConfig; import com.torodb.mongodb.commands.signatures.internal.ReplSetHeartbeatCommand.ReplSetHeartbeatArgument; import com.torodb.mongodb.commands.signatures.internal.ReplSetHeartbeatReply; import com.torodb.mongodb.repl.guice.ReplSetName; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.jooq.lambda.UncheckedException; import java.time.Clock; import java.time.Duration; import java.time.Instant; import java.util.concurrent.CancellationException; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ThreadFactory; import javax.annotation.Nonnull; import javax.annotation.Nullable; import javax.annotation.concurrent.GuardedBy; import javax.inject.Inject; import javax.inject.Singleton; @Singleton public class TopologyHeartbeatHandler extends IdleTorodbService { private static final Logger LOGGER = LogManager.getLogger(TopologyHeartbeatHandler.class); private final HostAndPort seed; private final Clock clock; private final String replSetName; private final HeartbeatNetworkHandler networkHandler; private final TopologyExecutor executor; private final TopologyErrorHandler errorHandler; private final VersionChangeListener versionChangeListener; @GuardedBy("executor") private boolean stopped; @Inject public TopologyHeartbeatHandler(Clock clock, @ReplSetName String replSetName, HeartbeatNetworkHandler heartbeatSender, TopologyExecutor executor, TopologyErrorHandler errorHandler, ThreadFactory threadFactory, @RemoteSeed HostAndPort seed) { super(threadFactory); this.clock = clock; this.replSetName = replSetName; this.networkHandler = heartbeatSender; this.executor = executor; this.errorHandler = errorHandler; this.versionChangeListener = this::scheduleHeartbeats; this.seed = seed; } @Override protected final String serviceName() { return "Heartbeat handler"; } @Override protected void startUp() throws Exception { LOGGER.debug("Starting up {}", serviceName()); boolean finished = false; while (!finished) { finished = start(seed) .handle(this::checkHeartbeatStarted) .join(); if (!finished) { LOGGER.debug("Retrying to start heartbeats in 1 second"); Thread.sleep(1000); } } LOGGER.debug("{} has been started up", serviceName()); } @Override protected void shutDown() throws Exception { LOGGER.debug("Shutting down {}", serviceName()); executor.onAnyVersion() .consumeAsync(coord -> stopped = true) .join(); LOGGER.debug("{} has been shutted down", serviceName()); } @GuardedBy("any") private boolean checkHeartbeatStarted(Status<?> status, Throwable t) { if (t == null) { if (status.isOk()) { LOGGER.trace("Heartbeat started correctly"); return true; } else { LOGGER.debug("Heartbeat start failed: {}", status); switch (status.getErrorCode()) { case NO_REPLICATION_ENABLED: LOGGER.warn("The sync source {} is not running with " + "replication enabled", seed); break; case INCONSISTENT_REPLICA_SET_NAMES: default: LOGGER.warn(status.getErrorMsg()); break; } return false; } } else { Throwable usefulThrowable = CompletionExceptions .getFirstNonCompletionException(t); if (usefulThrowable instanceof UncheckedException) { usefulThrowable = usefulThrowable.getCause() != null ? usefulThrowable.getCause() : usefulThrowable; } LOGGER.warn("Heartbeat start failed (sync source: " + seed + "): " + usefulThrowable .getLocalizedMessage(), usefulThrowable); return false; } } CompletableFuture<Status<ReplicaSetConfig>> start(HostAndPort seed) { executor.addVersionChangeListener(versionChangeListener); return executor.onCurrentVersion().andThenApplyAsync( networkHandler.askForConfig( new RemoteCommandRequest<>(seed, "admin", Empty.getInstance()) ), (coord, remoteConfig) -> { Status<ReplicaSetConfig> result = remoteConfig.asStatus(); if (!result.isOk()) { return result; } ReplicaSetConfig replConfig = result.getResult(); try { checkRemoteReplSetConfig(replConfig); updateConfig(coord, replConfig); return result; } catch (InconsistentReplicaSetNamesException ex) { return Status.from(ex); } } ); } private void checkRemoteReplSetConfig(ReplicaSetConfig remoteConfig) throws InconsistentReplicaSetNamesException { //TODO(gortiz): DRY. Implement a better way to do that once the config //is validated String remoteReplSetName = remoteConfig.getReplSetName(); if (!replSetName.equals(remoteReplSetName)) { throw new InconsistentReplicaSetNamesException( "The remote replica set configuration is named as '" + remoteReplSetName + "', which differs with the local " + "replica set name '" + replSetName + "'"); } } @GuardedBy("executor") private void scheduleHeartbeats(TopologyCoordinator coord, ReplicaSetConfig oldConf) { LOGGER.debug("Scheduling new heartbeats to nodes on config {}", coord.getRsConfig().getConfigVersion()); coord.getRsConfig().getMembers().stream() .forEach(member -> scheduleHeartbeatToTarget( member.getHostAndPort(), Duration.ZERO )); } @GuardedBy("any") private CompletableFuture<?> scheduleHeartbeatToTarget(final HostAndPort target, Duration delay) { LOGGER.trace("Scheduling heartbeat to {} in {}", target, delay); return executor.onCurrentVersion() .scheduleOnce((coord) -> doHeartbeat(coord, target), delay); } @GuardedBy("executor") private void doHeartbeat(final TopologyCoordinator coord, final HostAndPort target) { if (stopped) { LOGGER.trace("Ignoring heartbeat to {} because the handler has " + "been stopped", target); return; } Instant start = clock.instant(); RemoteCommandRequest<ReplSetHeartbeatArgument> request = coord .prepareHeartbeatRequest(start, replSetName, target); CompletableFuture<RemoteCommandResponse<ReplSetHeartbeatReply>> hbHandle = networkHandler.sendHeartbeat(request) .exceptionally(t -> onNetworkError(t, target, start)); executor.onCurrentVersion() .andThenAcceptAsync( hbHandle, (coord2, response) -> handleHeartbeatResponse( coord2, target, request.getCmdObj(), response)); } /** * Called when a heartbeat request fails on the network handler. * * It is important to not call this method more than once per request, otherwise more than one * request can be scheduled to the target. * * @param t * @param target */ @GuardedBy("any") private RemoteCommandResponse<ReplSetHeartbeatReply> onNetworkError( Throwable t, HostAndPort target, Instant start) { Throwable cause = CompletionExceptions.getFirstNonCompletionException(t); while (cause.getCause() != cause && cause instanceof UncheckedException) { cause = cause.getCause(); } if (cause instanceof CancellationException) { LOGGER.trace("Heartbeat handling to {} has been cancelled " + "before execution: {}", target, cause.getMessage()); throw (CancellationException) cause; } else { LOGGER.debug("Error while on the heartbeat request sent to " + target, t); if (errorHandler.reciveHeartbeatError(cause)) { RemoteCommandResponse<ReplSetHeartbeatReply> response = handleHeartbeatError(cause, start); LOGGER.trace("Handled with a response with error {}", response.getErrorCode()); return response; } else { String msg = "Aborting execution as requested by the topology " + "supervisor"; LOGGER.trace(msg); stopAsync(); throw new CancellationException(msg); } } } @Nonnull private RemoteCommandResponse<ReplSetHeartbeatReply> handleHeartbeatError( Throwable t, Instant start) { Duration d = Duration.between(clock.instant(), start); ErrorCode errorCode; if (t instanceof MongoException) { return new FromExceptionRemoteCommandRequest((MongoException) t, d); } else if (t instanceof UnreachableMongoServerException) { errorCode = ErrorCode.HOST_UNREACHABLE; } else { if (!(t instanceof MongoRuntimeException) && !(t instanceof UnreachableMongoServerException)) { LOGGER.warn("Unexpected exception {} catched by the topology " + "heartbeat handler", t.getClass().getSimpleName()); } errorCode = ErrorCode.UNKNOWN_ERROR; } return new ErroneousRemoteCommandResponse<>( errorCode, t.getLocalizedMessage(), d ); } @GuardedBy("executor") private void handleHeartbeatResponse(TopologyCoordinator coord, HostAndPort target, ReplSetHeartbeatArgument request, RemoteCommandResponse<ReplSetHeartbeatReply> response) { boolean isUnauthorized = (response.getErrorCode() == ErrorCode.UNAUTHORIZED) || (response .getErrorCode() == ErrorCode.AUTHENTICATION_FAILED); Instant now = clock.instant(); Duration networkTime = Duration.ZERO; if (response.isOk()) { networkTime = response.getNetworkTime(); } else { LOGGER.warn("Error in heartbeat request to {}; {}", target, response.asStatus()); if (response.getBson() != null) { LOGGER.debug("heartbeat response: ", response.getBson()); } if (isUnauthorized) { networkTime = response.getNetworkTime(); } } HeartbeatResponseAction action = coord.processHeartbeatResponse(now, networkTime, target, response); ReplSetHeartbeatReply hbReply = response.getCommandReply().orElse(null); assert hbReply != null || !response.isOk() : "Recived a null hbReply when the request didn't fail"; scheduleHeartbeatToTarget(target, action.getNextHeartbeatDelay()); handleHeartbeatResponseAction(coord, action, hbReply, response.getErrorCode()); } @GuardedBy("executor") private void handleHeartbeatResponseAction(TopologyCoordinator coord, HeartbeatResponseAction action, @Nullable ReplSetHeartbeatReply reply, ErrorCode responseStatus) throws UnsupportedHeartbeatResponseActionException { switch (action.getAction()) { case NO_ACTION: break; case RECONFIG: assert reply != null; assert reply.getConfig().isPresent(); updateConfig(coord, reply.getConfig().get()); break; case START_ELECTION: case STEP_DOWN_SELF: case STEP_DOWN_REMOTE_PRIMARY: throw new UnsupportedHeartbeatResponseActionException(action, reply); default: LOGGER.error("Illegal heartbeat response action code {}", action.getAction()); throw new AssertionError(); } } @GuardedBy("executor") private void updateConfig(TopologyCoordinator coord, ReplicaSetConfig config) { validateConfig(coord, config); coord.updateConfig(config, clock.instant()); } @GuardedBy("executor") private void validateConfig(TopologyCoordinator coord, ReplicaSetConfig config) { LOGGER.debug("Accepting the new replica set config (version is {}) without validating it first " + "(not supported yet)", config.getConfigVersion()); } private static class UnsupportedHeartbeatResponseActionException extends RuntimeException { private static final long serialVersionUID = 8879568483145061898L; private final HeartbeatResponseAction action; @Nullable private final transient ReplSetHeartbeatReply reply; public UnsupportedHeartbeatResponseActionException(HeartbeatResponseAction action, ReplSetHeartbeatReply reply) { super("Heartbeat action " + action.getAction() + " is not supported"); this.action = action; this.reply = reply; } public HeartbeatResponseAction getAction() { return action; } @Nullable public ReplSetHeartbeatReply getReply() { return reply; } } }