/*
* ToroDB
* Copyright © 2014 8Kdata Technology (www.8kdata.com)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.torodb.mongodb.repl.topology;
import com.eightkdata.mongowp.ErrorCode;
import com.eightkdata.mongowp.OpTime;
import com.eightkdata.mongowp.client.core.MongoConnection.RemoteCommandResponse;
import com.eightkdata.mongowp.exceptions.HostUnreachableException;
import com.eightkdata.mongowp.exceptions.InvalidOptionsException;
import com.eightkdata.mongowp.exceptions.MongoException;
import com.eightkdata.mongowp.exceptions.NodeNotFoundException;
import com.eightkdata.mongowp.exceptions.ShutdownInProgressException;
import com.eightkdata.mongowp.exceptions.UnauthorizedException;
import com.google.common.base.Preconditions;
import com.google.common.net.HostAndPort;
import com.google.common.primitives.UnsignedInteger;
import com.torodb.mongodb.commands.pojos.MemberConfig;
import com.torodb.mongodb.commands.pojos.MemberHeartbeatData;
import com.torodb.mongodb.commands.pojos.MemberHeartbeatData.Health;
import com.torodb.mongodb.commands.pojos.MemberState;
import com.torodb.mongodb.commands.pojos.ReplSetProtocolVersion;
import com.torodb.mongodb.commands.pojos.ReplicaSetConfig;
import com.torodb.mongodb.commands.signatures.internal.ReplSetHeartbeatCommand.ReplSetHeartbeatArgument;
import com.torodb.mongodb.commands.signatures.internal.ReplSetHeartbeatReply;
import com.torodb.mongodb.commands.signatures.repl.ReplSetSyncFromCommand.ReplSetSyncFromReply;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.Set;
import java.util.WeakHashMap;
import java.util.stream.Stream;
import javax.annotation.Nonnegative;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.annotation.concurrent.NotThreadSafe;
/**
* Objects of this class are responsible for managing the topology of the cluster when the node is
* not a member of the replica set.
*
* Methods of this class should be non-blocking.
*/
@NotThreadSafe
@SuppressWarnings("checkstyle:MemberName")
class TopologyCoordinator {
private static final Logger LOGGER = LogManager.getLogger(TopologyCoordinator.class);
/**
* Maximum number of retries for a failed heartbeat.
*/
private static final int MAX_HEARTBEAT_RETRIES = 2;
/**
* Interval between the time the last heartbeat from a node was received successfully, or the time
* when we gave up retrying, and when the next heartbeat should be sent to a target.
* <p>
* It is also used as the heartbeat timeout period while no replica set config is installed.
*/
private static final Duration HEARTBEAT_INTERVAL = Duration.ofSeconds(2);
/**
* The index of the member we currently believe is primary, if one exists, otherwise -1.
*/
private int _currentPrimaryIndex;
/**
* The host and port we are currently syncing from or {@link Optional#empty()} if there is no sync
* source (we cannot connect to anyone yet).
*/
@Nonnull
private Optional<HostAndPort> _syncSource;
/**
* These members are not chosen as sync sources for a period of time, due to connection issues
* with them. Each host is mapped to the instant until which it remains blacklisted.
*/
private final Map<HostAndPort, Instant> _syncSourceBlacklist;
/**
* The index of the next sync source to be chosen, requested via a replSetSyncFrom command, or -1
* if no sync source has been requested.
*/
private int _forceSyncSourceIndex;
/**
* How far (in seconds) this node must fall behind before considering switching sync sources.
*/
private final int _maxSyncSourceLagSecs;
/**
* The current config, including a vector of MemberConfigs. It is null until a config is
* installed.
*/
private ReplicaSetConfig _rsConfig;
/**
* Heartbeat data for each member. It is guaranteed that this vector will be maintained in the
* same order as the MemberConfigs in {@link #_rsConfig}, therefore the member config index can
* be used to index into this vector as well. It is null until a config is installed.
*/
private List<MemberHeartbeatData> _hbdata;
/**
* Ping stats for each member by HostAndPort.
*/
private final Map<HostAndPort, PingStats> _pings;
/**
* Our own slave delay, in seconds.
*/
private final long slaveDelaySecs;
/**
* Listeners notified when the config is updated. The set is backed by a {@link WeakHashMap}, so
* it only keeps weak references to the registered listeners.
*/
private final Set<VersionChangeListener> versionListeners = Collections.newSetFromMap(
new WeakHashMap<>());
/**
*
* @param maxSyncSourceLagSecs
* @param slaveDelay our delay. It is rounded to seconds and must be non negative.
*/
public TopologyCoordinator(Duration maxSyncSourceLag, Duration slaveDelay) {
this._currentPrimaryIndex = -1;
this._syncSource = Optional.empty();
this._syncSourceBlacklist = new HashMap<>();
this._forceSyncSourceIndex = -1;
Preconditions.checkArgument(!maxSyncSourceLag.isNegative(),
"Negative max sync source lag is not accepted");
this._maxSyncSourceLagSecs = (int) maxSyncSourceLag.getSeconds();
this._pings = new HashMap<>();
this.slaveDelaySecs = slaveDelay.getSeconds();
Preconditions.checkArgument(slaveDelaySecs >= 0, "Slave delay must be "
+ "non negative, but %s was found", slaveDelay);
}
/**
 * Looks up the ping statistics of the given host, registering a fresh {@link PingStats} for it
 * when nothing has been recorded for that host so far.
 *
 * @param hostAndPort the host whose statistics are requested
 * @return the statistics associated with the host, never null
 */
@Nonnull
private PingStats getPingOrDefault(HostAndPort hostAndPort) {
  return _pings.computeIfAbsent(hostAndPort, unknownHost -> new PingStats());
}
/**
 * @return the config index of the member currently believed to be the primary, or -1 when no
 *         primary is known
 */
int getCurrentPrimaryIndex() {
  return this._currentPrimaryIndex;
}
/**
 * @return the replica set config currently installed, which is null when no config has been
 *         installed yet
 */
public ReplicaSetConfig getRsConfig() {
  return this._rsConfig;
}
/**
 * Registers a listener that will be called each time {@code updateConfig} is executed.
 *
 * @param listener the listener to register
 */
public void addVersionChangeListener(VersionChangeListener listener) {
  this.versionListeners.add(listener);
}
/**
 * Returns the address of the current sync source.
 *
 * @return the host and port of the current sync source, or {@link Optional#empty()} if there is
 *         no current sync source
 */
@Nonnull
Optional<HostAndPort> getSyncSourceAddress() {
  return this._syncSource;
}
/**
 * Retrieves the addresses of all the members that may be up, i.e. every member whose heartbeat
 * data does not say it is DOWN.
 *
 * @return a new list with the host and port of each member that may be up, in config order
 */
List<HostAndPort> getMaybeUpHostAndPorts() {
  List<HostAndPort> upHosts = new ArrayList<>(_hbdata.size());
  for (int i = 0; i < _hbdata.size(); i++) {
    MemberHeartbeatData it = _hbdata.get(i);
    // The original condition was inverted: it skipped the nodes that may be up and returned
    // the DOWN ones instead.
    if (!it.maybeUp()) {
      continue; // skip DOWN nodes
    }
    upHosts.add(_rsConfig.getMembers().get(i).getHostAndPort());
  }
  return upHosts;
}
/**
 * Sets the index into the config used when we next choose a sync source.
 *
 * @param index the config index of the member that must be chosen as the next sync source. It
 *              must be lower than the number of members on the current config
 */
void setForceSyncSourceIndex(int index) {
  // Validate the requested index, not the value that is about to be overwritten
  assert index < _rsConfig.getMembers().size();
  _forceSyncSourceIndex = index;
}
/**
 * Chooses and sets a new sync source, based on our current knowledge of the world.
 * <p>
 * The decision is taken in this order: a sync source requested with
 * {@link #setForceSyncSourceIndex(int)} is always honored; then, enough pings must have been
 * collected; if chaining is not allowed only the primary can be chosen; otherwise the candidate
 * with the lowest ping is looked for, first between the optimal candidates and then between all
 * of them.
 *
 * @param now           the current time, used to ignore blacklisted members
 * @param lastOpApplied the last optime applied by this node, if any
 * @return the new sync source or {@link Optional#empty()} if we cannot calculate a new sync
 *         source yet
 */
@Nonnull
Optional<HostAndPort> chooseNewSyncSource(Instant now, Optional<OpTime> lastOpApplied) {
  // a target explicitly requested wins over any other consideration
  if (_forceSyncSourceIndex != -1) {
    assert _forceSyncSourceIndex < _rsConfig.getMembers().size();
    HostAndPort forcedSource = _rsConfig.getMembers().get(_forceSyncSourceIndex).getHostAndPort();
    _syncSource = Optional.of(forcedSource);
    _forceSyncSourceIndex = -1;
    LOGGER.info("syncing from: " + forcedSource + " by request");
    return _syncSource;
  }

  if (_hbdata == null) { // there is no repl config yet
    assert _rsConfig == null;
    return Optional.empty();
  }

  // wait for 2N pings before choosing a sync target
  int missingPings = 2 * _hbdata.size() - getTotalPings();
  if (missingPings > 0) {
    LOGGER.info("Waiting for {} pings from other members before syncing", missingPings);
    _syncSource = Optional.empty();
    return _syncSource;
  }

  // when chaining is forbidden, the primary is the only acceptable sync source
  if (!_rsConfig.isChainingAllowed()) {
    if (_currentPrimaryIndex == -1) {
      LOGGER.warn("Cannot select sync source because chaining is not allowed and primary "
          + "is unknown/down");
      _syncSource = Optional.empty();
    } else if (isBlacklistedMember(getCurrentPrimaryMember(), now)) {
      LOGGER.warn("Cannot select sync source because chaining is not allowed and "
          + "primary is not currently accepting our updates");
      _syncSource = Optional.empty();
    } else {
      HostAndPort primary = _rsConfig.getMembers().get(_currentPrimaryIndex).getHostAndPort();
      _syncSource = Optional.of(primary);
      LOGGER.info("syncing from primary: " + primary);
    }
    return _syncSource;
  }

  // Look for the member with the lowest ping time that is ahead of us. Candidates that are more
  // than _maxSyncSourceLagSecs seconds behind the primary's oplog time are rejected.
  OpTime primaryOpTime;
  if (_currentPrimaryIndex == -1) {
    // no primary is seen, so a time that excludes no candidate is chosen
    primaryOpTime = OpTime.ofSeconds(_maxSyncSourceLagSecs);
  } else {
    primaryOpTime = _hbdata.get(_currentPrimaryIndex).getOpTime();
    assert primaryOpTime != null;
  }
  if (primaryOpTime.getSecs() < _maxSyncSourceLagSecs) {
    // probably there was just a new election and the new primary's optime is not known yet
    primaryOpTime = OpTime.ofSeconds(_maxSyncSourceLagSecs);
  }
  OpTime oldestSyncOpTime = OpTime.ofSeconds(primaryOpTime.getSecs() - _maxSyncSourceLagSecs);

  Optional<MemberConfig> chosenMember = lookForSyncSource(now, lastOpApplied, true,
      oldestSyncOpTime);
  if (!chosenMember.isPresent()) {
    chosenMember = lookForSyncSource(now, lastOpApplied, false, oldestSyncOpTime);
  }

  if (chosenMember.isPresent()) {
    _syncSource = Optional.of(chosenMember.get().getHostAndPort());
    LOGGER.info("syncing from: {}", _syncSource.get());
    return _syncSource;
  }

  // No member to sync from was found. It is only logged when we had a valid sync source before
  if (_syncSource.isPresent()) {
    LOGGER.info("could not find member to sync from");
  }
  _syncSource = Optional.empty();
  return _syncSource;
}
/**
 * Translates a heartbeat data entry into the member config stored at the same position.
 *
 * @param hbData an element of {@link #_hbdata}
 * @return the member config whose index is the index of the given heartbeat data
 * @throws IllegalArgumentException if the given heartbeat data is not found on {@link #_hbdata}
 */
private MemberConfig getMemberConfig(MemberHeartbeatData hbData) {
  final int position = _hbdata.indexOf(hbData);
  Preconditions.checkArgument(position >= 0, "Unknown hb data");
  return _rsConfig.getMembers().get(position);
}
/**
 * Looks for an optimal sync source to replicate from.
 * <p>
 * This method is meant to be called twice. On the first attempt (onlyOptimal is true), nodes
 * with a slave delay higher than our own, hidden nodes and nodes that are excessively lagged are
 * ignored. The second attempt (onlyOptimal is false) includes such nodes, in case those are the
 * only ones we can reach.
 * <p>
 * On both attempts candidates must be up, on a readable state, ahead of this node and not
 * blacklisted.
 *
 * @param now              the current time
 * @param lastOpAppliedOp  the last OpTime this node has applied. {@code OpTime.EPOCH} is used if
 *                         it is not present
 * @param onlyOptimal      if true, slaves with more delay than ourselves, hidden nodes or
 *                         excessively lagged nodes are ignored
 * @param oldestSyncOpTime the oldest optime considered not excessively lagged. Only used if
 *                         onlyOptimal is true.
 * @return the new optimal sync source, which is not {@link Optional#isPresent() present} if no
 *         one can be chosen
 */
private Optional<MemberConfig> lookForSyncSource(Instant now, Optional<OpTime> lastOpAppliedOp,
    boolean onlyOptimal, OpTime oldestSyncOpTime) {
  OpTime lastOpApplied = lastOpAppliedOp.orElse(OpTime.EPOCH);
  Stream<MemberHeartbeatData> hbCandidateStream = _hbdata.stream()
      // candidate must be up to be considered
      .filter(MemberHeartbeatData::isUp)
      // candidate must be PRIMARY or SECONDARY state to be considered.
      .filter(hbData -> hbData.getState().isReadable())
      // only consider candidates that are ahead of where we are
      .filter(hbData -> hbData.getOpTime().isAfter(lastOpApplied));
  if (onlyOptimal) {
    hbCandidateStream = hbCandidateStream
        // omit candidates that are excessively behind
        .filter(hbData -> hbData.getOpTime().isEqualOrAfter(oldestSyncOpTime));
  }

  Stream<MemberConfig> mcCandidateStream = hbCandidateStream.map(this::getMemberConfig)
      // omit candidates that are blacklisted
      .filter(mc -> !isBlacklistedMember(mc, now));
  if (onlyOptimal) {
    mcCandidateStream = mcCandidateStream
        // only candidates that are not hidden
        .filter(mc -> !mc.isHidden())
        // Only candidates whose slave delay is not higher than ours. The comparison must be
        // inclusive: with a strict one, nodes with the same delay as ours (like the usual case
        // where nobody is delayed) would never be accepted as optimal candidates.
        .filter(mc -> mc.getSlaveDelay() <= slaveDelaySecs);
  }

  // If there are several candidates, the one whose ping is lower is returned
  return mcCandidateStream.reduce((MemberConfig cand1, MemberConfig cand2) -> {
    long ping1 = getPing(cand1.getHostAndPort());
    long ping2 = getPing(cand2.getHostAndPort());
    if (ping1 < ping2) {
      return cand1;
    }
    return cand2;
  });
}
/**
 * Prevents the given host from being selected as sync source until the given instant.
 *
 * @param host  the host to be blacklisted
 * @param until the instant until which the host remains blacklisted
 */
void blacklistSyncSource(HostAndPort host, Instant until) {
  LOGGER.debug("blacklisting {} until {}", host, until);
  this._syncSourceBlacklist.put(host, until);
}
/**
 * Removes the given host from the sync source blacklist, but only if its blacklisting period is
 * already over at the given instant.
 *
 * @param host the host that is wanted to be removed from the black list
 * @param now  the current time. The host is removed only if it was blacklisted until an instant
 *             that is equal or previous to this one
 */
void unblacklistSyncSource(HostAndPort host, Instant now) {
  Instant blacklistedUntil = _syncSourceBlacklist.get(host);
  if (blacklistedUntil == null || now.isBefore(blacklistedUntil)) {
    // either it is not blacklisted or it must remain blacklisted for a while
    return;
  }
  LOGGER.debug("unblacklisting {}", host);
  _syncSourceBlacklist.remove(host);
}
/**
 * Forgets every blacklisted sync source, whatever its blacklisting period is.
 */
void clearSyncSourceBlacklist() {
  this._syncSourceBlacklist.clear();
}
/**
 * Determines if a new sync source should be chosen.
 * <p>
 * It returns true when a sync source change has been requested with
 * {@link #setForceSyncSourceIndex(int)}, when the current source is not a member of the current
 * config, or when there is a viable member (up, readable and not blacklisted) whose oplog has
 * reached an optime more than the max sync source lag later than the current source's.
 * <p>
 * It returns false if no heartbeat has been received from the current source yet, as there is
 * nothing to compare with.
 *
 * @param currentSource the sync source that is being used right now
 * @param now           is used to skip over currently blacklisted sync sources.
 * @return true iff a new sync source should be chosen
 */
boolean shouldChangeSyncSource(HostAndPort currentSource, Instant now) {
  // an explicit user request always triggers the change
  if (_forceSyncSourceIndex != -1) {
    return true;
  }

  OptionalInt currentMemberIndex = _rsConfig.findMemberIndexByHostAndPort(currentSource);
  if (!currentMemberIndex.isPresent()) {
    return true;
  }

  MemberHeartbeatData currentHbData = _hbdata.get(currentMemberIndex.getAsInt());
  assert currentHbData != null;
  OpTime currentOpTime = currentHbData.getOpTime();
  if (currentOpTime == null) {
    // No heartbeat has been received from the sync source yet, so it cannot be compared.
    return false;
  }

  long goalSecs = currentOpTime.getSecs() + _maxSyncSourceLagSecs;
  for (int i = 0; i < _hbdata.size(); i++) {
    MemberHeartbeatData candidateHbData = _hbdata.get(i);
    MemberConfig candidateConfig = _rsConfig.getMembers().get(i);
    OpTime candidateOpTime = candidateHbData.getOpTime();

    if (candidateOpTime == null || !candidateHbData.isUp()) {
      continue;
    }
    if (!candidateHbData.getState().isReadable() || isBlacklistedMember(candidateConfig, now)) {
      continue;
    }
    if (goalSecs >= candidateOpTime.getSecs()) {
      continue; // the candidate is not far enough ahead of the current source
    }

    LOGGER.info("changing sync target because current sync target's most recent OpTime "
        + "is {} which is more than {} seconds behind member {} whose most recent "
        + "OpTime is {} ", currentOpTime, _maxSyncSourceLagSecs,
        candidateConfig.getHostAndPort(), candidateOpTime);
    return true;
  }
  return false;
}
/**
 * Executes a replSetSyncFrom command, which requests the given target to be used as the next
 * sync source.
 * <p>
 * If the request is accepted, the target is stored as the forced sync source, so it will be
 * returned by the next call to
 * {@link #chooseNewSyncSource(java.time.Instant, java.util.Optional) chooseNewSyncSource}.
 *
 * @param status        the status of the callback that executes this request
 * @param target        the member requested as sync source
 * @param lastOpApplied the last optime applied by this node
 * @return the reply of the command, which may contain a warning if the target seems to be more
 *         than 10 seconds behind us
 * @throws ShutdownInProgressException if the status is {@link ErrorCode#CALLBACK_CANCELED}
 * @throws NodeNotFoundException       if the target is not a member of the current config
 * @throws InvalidOptionsException     if the target is an arbiter
 * @throws UnauthorizedException       if there is an auth issue with the target
 * @throws HostUnreachableException    if the target is unreachable
 */
ReplSetSyncFromReply executeReplSetSyncFrom(ErrorCode status, HostAndPort target,
    OpTime lastOpApplied)
    throws MongoException {
  if (status == ErrorCode.CALLBACK_CANCELED) {
    throw new ShutdownInProgressException("replication system is shutting down");
  }

  // look for the position of the target on the current config
  int targetIndex = 0;
  MemberConfig targetConfig = null;
  while (targetIndex < _rsConfig.getMembers().size()) {
    MemberConfig candidate = _rsConfig.getMembers().get(targetIndex);
    if (candidate.getHostAndPort().equals(target)) {
      targetConfig = candidate;
      break;
    }
    targetIndex++;
  }
  if (targetConfig == null) {
    throw new NodeNotFoundException("Could not find member \"" + target + "\" in replica set");
  }
  if (targetConfig.isArbiter()) {
    throw new InvalidOptionsException("Cannot sync from \"" + target
        + "\" because it is an arbiter");
  }

  MemberHeartbeatData targetHbData = _hbdata.get(targetIndex);
  if (targetHbData.isAuthIssue()) {
    throw new UnauthorizedException("not authorized to communicate with " + target);
  }
  if (targetHbData.getHealth() == Health.UNREACHABLE) {
    throw new HostUnreachableException("I cannot reach the requested member: " + target);
  }

  assert targetHbData.getOpTime() != null;
  String warning = null;
  if (targetHbData.getOpTime().getSecs() + 10 < lastOpApplied.getSecs()) {
    LOGGER.warn("attempting to sync from {}, but its latest opTime is {} and ours is {} "
        + "so this may not work", target, targetHbData.getOpTime().getSecs(),
        lastOpApplied.getSecs());
    warning = "requested member \"" + target + "\" is more than 10 seconds behind us";
  }

  // the previous sync source must be read before the new one is requested
  HostAndPort prevSyncSource = _syncSource.orElse(null);
  setForceSyncSourceIndex(targetIndex);
  return new ReplSetSyncFromReply(prevSyncSource, target, warning);
}
/**
 * Replaces the replica set configuration known by this topology coordinator.
 * <p>
 * The heartbeat data is adapted to the new config, the requested sync source and the known
 * primary are forgotten and, finally, all version listeners are notified.
 *
 * @param newConfig the new configuration. It should be not null except for testing purpose.
 * @param now       the current time
 */
void updateConfig(ReplicaSetConfig newConfig, Instant now) {
  final ReplicaSetConfig previousConfig = _rsConfig;

  // this must be done while _rsConfig still is the previous config
  updateHeartbeatDataForReconfig(newConfig, now);
  _rsConfig = newConfig;

  _currentPrimaryIndex = -1; // force secondaries to re-detect who the primary is
  _forceSyncSourceIndex = -1;

  for (VersionChangeListener listener : versionListeners) {
    listener.onVersionChange(this, previousConfig);
  }
}
/**
 * Rebuilds {@link #_hbdata} so it contains one entry for each member of the new config, in the
 * same order.
 * <p>
 * Members that are also present on {@link #_rsConfig} with the same id and the same host and
 * port keep the heartbeat data they had; the other ones start with fresh heartbeat data.
 * <p>
 * Nothing is done if the new config is null.
 *
 * @param newConfig the config that is going to be installed
 * @param now       the current time (currently unused)
 */
private void updateHeartbeatDataForReconfig(ReplicaSetConfig newConfig, Instant now) {
  if (newConfig == null) {
    return;
  }

  final List<MemberHeartbeatData> previousHbData = _hbdata;
  _hbdata = new ArrayList<>(newConfig.getMembers().size());

  for (int newIndex = 0; newIndex < newConfig.getMembers().size(); newIndex++) {
    MemberConfig newMember = newConfig.getMembers().get(newIndex);
    MemberHeartbeatData memberHbData = new MemberHeartbeatData();

    int oldMemberCount = _rsConfig == null ? 0 : _rsConfig.getMembers().size();
    for (int oldIndex = 0; oldIndex < oldMemberCount; oldIndex++) {
      MemberConfig oldMember = _rsConfig.getMembers().get(oldIndex);
      boolean sameMember = oldMember.getId() == newMember.getId()
          && oldMember.getHostAndPort().equals(newMember.getHostAndPort());
      if (sameMember) {
        // the member already existed with the same id and address: reuse its heartbeat data
        memberHbData = previousHbData.get(oldIndex);
        break;
      }
    }

    _hbdata.add(memberHbData);
  }
}
/**
 * Creates the heartbeat request that must be sent to "target", assuming the current time is
 * "now".
 * <p>
 * The returned request contains proper arguments for a replSetHeartbeat command and the amount
 * of time to wait for the response.
 * <p>
 * A "new" heartbeat is started when there is no config, when the retries have been exhausted or
 * when the heartbeat timeout has passed since the last start. Otherwise the request is a retry
 * and its timeout is the remaining part of the heartbeat timeout period.
 * <p>
 * This call should be paired (with intervening network communication) with a call to
 * processHeartbeatResponse for the same "target".
 *
 * @param now        our current time
 * @param ourSetName is used as the name for our replica set if the topology coordinator does not
 *                   have a valid configuration installed.
 * @param target     the target of the request to be created
 * @return the request to be sent to the target
 */
RemoteCommandRequest<ReplSetHeartbeatArgument> prepareHeartbeatRequest(
    Instant now, String ourSetName, HostAndPort target) {
  PingStats targetStats = getPingOrDefault(target);

  Instant lastStart = targetStats.getLastHeartbeatStartDate();
  Duration alreadyElapsed = Duration.between(lastStart != null ? lastStart : Instant.EPOCH, now);

  boolean startNewHeartbeat = _rsConfig == null
      || targetStats.getNumFailuresSinceLastStart() > MAX_HEARTBEAT_RETRIES
      || alreadyElapsed.toMillis() >= _rsConfig.getHeartbeatTimeoutPeriod();
  if (startNewHeartbeat) {
    // This is either the first request ever for "target", or the heartbeat timeout has
    // passed, so a "new" heartbeat is started.
    targetStats.start(now);
    alreadyElapsed = Duration.ZERO;
  }

  ReplSetHeartbeatArgument.Builder hbArgs = new ReplSetHeartbeatArgument.Builder(
      ReplSetProtocolVersion.V1)
      .setCheckEmpty(false);
  final Duration timeoutPeriod;
  if (_rsConfig == null) {
    hbArgs.setSetName(ourSetName);
    hbArgs.setConfigVersion(-2);
    timeoutPeriod = HEARTBEAT_INTERVAL;
  } else {
    hbArgs.setSetName(_rsConfig.getReplSetName());
    hbArgs.setConfigVersion(_rsConfig.getConfigVersion());
    timeoutPeriod = Duration.ofMillis(_rsConfig.getHeartbeatTimeoutPeriod());
  }

  Duration remainingTimeout = timeoutPeriod.minus(alreadyElapsed);
  return new RemoteCommandRequest<>(target, "admin", hbArgs.build(), remainingTimeout);
}
/**
* Processes a heartbeat response from "target" that arrived around "now", having spent
* "networkRoundTripTime" on the network.
* <p>
* Updates internal topology coordinator state (ping stats, heartbeat data and the known primary),
* and returns instructions about what action to take next. The returned action always carries the
* delay to wait before the next heartbeat is sent to the same target.
* <p>
* If the next action is {@link HeartbeatResponseAction#makeNoAction() "NoAction"} then nothing
* has to be done.
* <p>
* If the next action indicates {@link HeartbeatResponseAction#makeReconfigAction() "Reconfig"},
* the caller should verify the configuration in hbResponse is acceptable, perform any other
* reconfiguration actions it must, and call
* {@link #updateConfig(ReplicaSetConfig, java.time.Instant) updateConfig}
* with the appropriate arguments.
* <p>
* This call should be paired (with intervening network communication) with a call to
* prepareHeartbeatRequest for the same "target".
*
* @param now the approximate time when the response has been received
* @param networkRoundTripTime the time spent on network
* @param target the host that sent the response
* @param hbResponse the response sent by the target
* @return the action the caller must execute
* @throws IllegalStateException if no heartbeat has ever been started for the target
*/
HeartbeatResponseAction processHeartbeatResponse(
Instant now,
Duration networkRoundTripTime,
HostAndPort target,
RemoteCommandResponse<ReplSetHeartbeatReply> hbResponse) {
PingStats hbStats = getPingOrDefault(target);
Preconditions.checkState(hbStats.getLastHeartbeatStartDate() != null, "It seems that a hb "
+ "response has been recived before it has been prepared");
// record the result on the ping stats before anything else, as the failure count is read below
if (!hbResponse.isOk()) {
hbStats.miss();
} else {
hbStats.hit(networkRoundTripTime);
}
boolean isUnauthorized = (hbResponse.getErrorCode() == ErrorCode.UNAUTHORIZED) || (hbResponse
.getErrorCode() == ErrorCode.AUTHENTICATION_FAILED);
Duration alreadyElapsed = Duration.between(hbStats.getLastHeartbeatStartDate(), now);
Duration nextHeartbeatDelay;
// determine next start time: retries are sent immediately (unless there is an auth problem),
// anything else waits for the regular heartbeat interval
if (_rsConfig != null && (hbStats.getNumFailuresSinceLastStart() <= MAX_HEARTBEAT_RETRIES)
&& (alreadyElapsed.toMillis() < _rsConfig.getHeartbeatTimeoutPeriod())) {
if (isUnauthorized) {
nextHeartbeatDelay = HEARTBEAT_INTERVAL;
} else {
nextHeartbeatDelay = Duration.ZERO;
}
} else {
nextHeartbeatDelay = HEARTBEAT_INTERVAL;
}
Optional<ReplSetHeartbeatReply> commandReply = hbResponse.getCommandReply();
if (hbResponse.isOk() && commandReply.get().getConfig().isPresent()) {
// -2 is the same version sent on heartbeat requests when there is no config installed
long currentConfigVersion = _rsConfig != null ? _rsConfig.getConfigVersion() : -2;
ReplicaSetConfig newConfig = commandReply.get().getConfig().get();
assert newConfig != null;
if (newConfig.getConfigVersion() > currentConfigVersion) {
// the target knows a newer config: ask the caller to reconfigure
HeartbeatResponseAction nextAction = HeartbeatResponseAction.makeReconfigAction()
.setNextHeartbeatDelay(nextHeartbeatDelay);
return nextAction;
} else {
// Could be we got the newer version before we got the response, or the
// target erroneously sent us one, even though it isn't newer.
if (newConfig.getConfigVersion() < currentConfigVersion) {
LOGGER.debug("Config version from heartbeat was older than ours.");
LOGGER.trace("Current config: {}. Config from heartbeat: {}", _rsConfig, newConfig);
} else {
LOGGER.trace("Config from heartbeat response was same as ours.");
}
}
}
// Without a config there is no heartbeat data to update, so there's nothing left to do and we
// return early.
if (_rsConfig == null) {
HeartbeatResponseAction nextAction = HeartbeatResponseAction.makeNoAction();
nextAction.setNextHeartbeatDelay(nextHeartbeatDelay);
return nextAction;
}
// Check if the heartbeat target is in our config. If it isn't, there's nothing left to do,
// so return early.
OptionalInt memberIndexOpt = _rsConfig.findMemberIndexByHostAndPort(target);
if (!memberIndexOpt.isPresent()) {
LOGGER.debug("replset: Could not find {} in current config so ignoring --"
+ " current config: {}", target, _rsConfig);
HeartbeatResponseAction nextAction = HeartbeatResponseAction.makeNoAction();
nextAction.setNextHeartbeatDelay(nextHeartbeatDelay);
return nextAction;
}
assert memberIndexOpt.isPresent();
int memberIndex = memberIndexOpt.getAsInt();
MemberHeartbeatData hbData = _hbdata.get(memberIndex);
assert hbData != null;
MemberConfig member = _rsConfig.getMembers().get(memberIndex);
if (!hbResponse.isOk()) {
if (isUnauthorized) {
LOGGER.debug("setAuthIssue: heartbeat response failed due to authentication"
+ " issue for member _id: {}", member.getId());
hbData.setAuthIssue(now);
} else if (hbStats.getNumFailuresSinceLastStart() > MAX_HEARTBEAT_RETRIES || alreadyElapsed
.toMillis() >= _rsConfig.getHeartbeatTimeoutPeriod()) {
// retries exhausted or heartbeat timeout passed: the member is marked as down
LOGGER.debug("setDownValues: heartbeat response failed for member _id:{}"
+ ", msg: {}", member.getId(), hbResponse.getErrorDesc());
hbData.setDownValues(now, hbResponse.getErrorDesc());
} else {
// the failure can still be retried, so the heartbeat data is not modified
LOGGER.trace("Bad heartbeat response from {}; trying again; Retries left: {}; "
+ "{} ms have already elapsed",
target,
MAX_HEARTBEAT_RETRIES - hbStats.getNumFailuresSinceLastStart(),
alreadyElapsed.toMillis()
);
}
} else {
ReplSetHeartbeatReply nonNullReply = commandReply.get();
LOGGER.trace("setUpValues: heartbeat response good for member _id:{}, msg: {}",
member.getId(), nonNullReply.getHbmsg());
hbData.setUpValues(now, member.getHostAndPort(), nonNullReply);
}
// re-evaluate who the primary is with the updated heartbeat data
HeartbeatResponseAction nextAction = updateHeartbeatDataImpl(memberIndex, now);
nextAction.setNextHeartbeatDelay(nextHeartbeatDelay);
return nextAction;
}
/**
 * Updates {@link #_currentPrimaryIndex} once the heartbeat data of a member has been modified
 * by {@code processHeartbeatResponse}.
 *
 * @param updatedConfigIndex the config index of the member whose heartbeat data has changed
 * @param now                the current time
 * @return the action that must be executed, which is never null
 */
private HeartbeatResponseAction updateHeartbeatDataImpl(int updatedConfigIndex, Instant now) {
  // Phase 1: if the updated member is the one believed to be the primary, the new data must
  // support that notion. Otherwise our notion of who is primary is erased.
  if (updatedConfigIndex == _currentPrimaryIndex) {
    final MemberHeartbeatData updatedHbData = _hbdata.get(updatedConfigIndex);
    assert updatedHbData != null;
    boolean stillPrimary = updatedHbData.isUp()
        && updatedHbData.getState() == MemberState.RS_PRIMARY;
    if (!stillPrimary) {
      _currentPrimaryIndex = -1;
    }
  }

  HeartbeatResponseAction primaryCheckAction = ifTwoPrimariesChecks(now);
  if (primaryCheckAction != null) {
    return primaryCheckAction;
  }

  // At this point we do not believe that any remote is primary.
  assert _hbdata.stream().noneMatch(input ->
      input.isUp() && input.getState() == MemberState.RS_PRIMARY);
  assert _currentPrimaryIndex == -1;
  return HeartbeatResponseAction.makeNoAction();
}
/**
 * Scans the heartbeat data of all members looking for the ones that are up and say they are
 * primary, and updates {@link #_currentPrimaryIndex} if exactly one is found.
 * <p>
 * If two of them are found, nothing is changed, as it is expected to be a transient situation.
 *
 * @param now the current time
 * @return the action that must be executed or null if no action has to be executed, in which
 *         case it is guaranteed that there is no remote primary
 */
@Nullable
private HeartbeatResponseAction ifTwoPrimariesChecks(Instant now) {
  int remotePrimaryIndex = -1;
  for (int i = 0; i < _hbdata.size(); i++) {
    MemberHeartbeatData candidate = _hbdata.get(i);
    boolean claimsPrimary = candidate.getState() == MemberState.RS_PRIMARY && candidate.isUp();
    if (!claimsPrimary) {
      continue;
    }
    if (remotePrimaryIndex != -1) {
      // two other nodes think they are primary (asynchronously polled)
      // -- wait for things to settle down.
      LOGGER.info("replSet info two remote primaries (transiently)");
      return HeartbeatResponseAction.makeNoAction();
    }
    remotePrimaryIndex = i;
  }

  if (remotePrimaryIndex == -1) {
    return null;
  }
  // when it is the same primary as last time, this assignation changes nothing
  _currentPrimaryIndex = remotePrimaryIndex;
  return HeartbeatResponseAction.makeNoAction();
}
/**
 * @return the sum of the successful heartbeats counted on the ping stats of every known host
 */
private int getTotalPings() {
  int pingSum = 0;
  for (PingStats stats : _pings.values()) {
    pingSum += stats.getCount();
  }
  return pingSum;
}
/**
 * @param memberConfig the member to check
 * @param now          the current time
 * @return true iff the host of the given member is blacklisted until an instant that is after
 *         the given one
 */
private boolean isBlacklistedMember(MemberConfig memberConfig, Instant now) {
  Instant blacklistedUntil = _syncSourceBlacklist.get(memberConfig.getHostAndPort());
  if (blacklistedUntil == null) {
    return false;
  }
  return blacklistedUntil.isAfter(now);
}
/**
 * Returns the member config stored at {@link #_currentPrimaryIndex}.
 *
 * @return the MemberConfig of the primary node or null if there is no current primary
 */
@Nullable
private MemberConfig getCurrentPrimaryMember() {
  return _currentPrimaryIndex == -1
      ? null
      : _rsConfig.getMembers().get(_currentPrimaryIndex);
}
/**
 * @param hostAndPort the host whose ping is requested
 * @return the average round trip approximation stored on the ping stats of the given host.
 *         Fresh stats are registered for the host if it had none
 */
private long getPing(HostAndPort hostAndPort) {
  PingStats stats = getPingOrDefault(hostAndPort);
  return stats.getAvgRoundTripAproximation();
}
/**
 * Statistics about the heartbeats exchanged with a single host: how many of them succeeded, an
 * approximation of their average round trip time, when the last heartbeat was started and how
 * many failures have happened since then.
 */
private static class PingStats {

  /**
   * The round trip value used while there is no information, which is also the maximum round
   * trip value that can be stored.
   */
  private static final long MAX_ROUND_TRIP_MILLIS = UnsignedInteger.MAX_VALUE.longValue();

  @Nonnegative
  private long count = 0;
  @Nonnegative
  private long value = MAX_ROUND_TRIP_MILLIS;
  private Instant _lastHeartbeatStartDate = null;
  private int _numFailuresSinceLastStart = Integer.MAX_VALUE;

  /**
   * @return the number of {@link #hit(java.time.Duration) 'hit'} calls.
   */
  @Nonnegative
  public long getCount() {
    return count;
  }

  /**
   * Returns the weighted average round trip time (in millis) for heartbeat messages to the
   * target.
   *
   * If no information is yet stored, the maximum value of an unsigned 32 bits integer
   * ({@link UnsignedInteger#MAX_VALUE}) is returned.
   *
   * @return the weighted average round trip time for heartbeat messages to the target.
   */
  @Nonnegative
  public long getAvgRoundTripAproximation() {
    return value;
  }

  /**
   * @return the instant given to the last {@link #start(java.time.Instant) 'start'} call or null
   *         if it was never called
   */
  public Instant getLastHeartbeatStartDate() {
    return _lastHeartbeatStartDate;
  }

  /**
   * Gets the number of failures since {@link #start(java.time.Instant) 'start'} was last
   * called.
   * <p>
   * This value is incremented by calls to {@link #miss()} (never beyond
   * {@link Integer#MAX_VALUE}), cleared by calls to {@link #start(java.time.Instant) start()}
   * and set to {@link Integer#MAX_VALUE} by calls to {@link #hit(java.time.Duration) hit()}. It
   * is also {@link Integer#MAX_VALUE} until the first call to start().
   */
  public int getNumFailuresSinceLastStart() {
    return _numFailuresSinceLastStart;
  }

  /**
   * Records that a new heartbeat has been started at the given instant.
   */
  void start(Instant now) {
    _lastHeartbeatStartDate = now;
    _numFailuresSinceLastStart = 0;
  }

  /**
   * Records a failed heartbeat.
   */
  private void miss() {
    // The counter is Integer.MAX_VALUE before the first start and after each hit. Incrementing
    // it then would wrap to a negative value, which would look like "no failures at all".
    if (_numFailuresSinceLastStart < Integer.MAX_VALUE) {
      ++_numFailuresSinceLastStart;
    }
  }

  /**
   * Records a successful heartbeat that spent the given time on the network.
   */
  private void hit(Duration networkRoundTripTime) {
    _numFailuresSinceLastStart = Integer.MAX_VALUE;
    ++count;
    if (value == MAX_ROUND_TRIP_MILLIS) { //first hit
      value = networkRoundTripTime.toMillis();
    } else {
      value = calculateAvgRoundTripAprox(networkRoundTripTime);
    }
    // the stored value never exceeds the maximum round trip
    if (value > MAX_ROUND_TRIP_MILLIS) {
      value = MAX_ROUND_TRIP_MILLIS;
    }
  }

  /**
   * Mixes the stored average (80% of the weight) with the new measure (20% of the weight).
   */
  private long calculateAvgRoundTripAprox(Duration networkRoundTripTime) {
    return (long) ((value * 0.8) + (networkRoundTripTime.toMillis() * 0.2));
  }
}
}