/*
* Copyright 2014 WANdisco
*
* WANdisco licenses this file to you under the Apache License,
* version 2.0 (the "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/
package c5db.replication;
import c5db.ReplicatorConstants;
import c5db.interfaces.replication.IndexCommitNotice;
import c5db.interfaces.replication.QuorumConfiguration;
import c5db.interfaces.replication.Replicator;
import c5db.interfaces.replication.ReplicatorInstanceEvent;
import c5db.interfaces.replication.ReplicatorLog;
import c5db.interfaces.replication.ReplicatorReceipt;
import c5db.replication.generated.AppendEntries;
import c5db.replication.generated.AppendEntriesReply;
import c5db.replication.generated.LogEntry;
import c5db.replication.generated.PreElectionPoll;
import c5db.replication.generated.PreElectionReply;
import c5db.replication.generated.RequestVote;
import c5db.replication.generated.RequestVoteReply;
import c5db.replication.rpc.RpcReply;
import c5db.replication.rpc.RpcRequest;
import c5db.replication.rpc.RpcWireReply;
import c5db.replication.rpc.RpcWireRequest;
import c5db.util.C5Futures;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.SettableFuture;
import org.jetbrains.annotations.Nullable;
import org.jetlang.channels.AsyncRequest;
import org.jetlang.channels.Channel;
import org.jetlang.channels.ChannelSubscription;
import org.jetlang.channels.MemoryChannel;
import org.jetlang.channels.MemoryRequestChannel;
import org.jetlang.channels.Request;
import org.jetlang.channels.RequestChannel;
import org.jetlang.channels.Subscriber;
import org.jetlang.core.Disposable;
import org.jetlang.fibers.Fiber;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import static c5db.ReplicatorConstants.REPLICATOR_APPEND_RPC_TIMEOUT_MILLISECONDS;
import static c5db.ReplicatorConstants.REPLICATOR_VOTE_RPC_TIMEOUT_MILLISECONDS;
/**
* Single instantiation of a replicator / log / lease. This implementation's logic is based on the
* RAFT algorithm (see <a href="http://raftconsensus.github.io/">http://raftconsensus.github.io/</a>).
* <p>
* A ReplicatorInstance handles the consensus and replication for a single quorum, and communicates
* with the log package via {@link c5db.interfaces.replication.ReplicatorLog}.
*/
public class ReplicatorInstance implements Replicator {
// Publishes this replicator's state transitions (FOLLOWER / CANDIDATE / LEADER).
private final Channel<State> stateMemoryChannel = new MemoryChannel<>();
// Incoming RPCs (pre-election polls, vote requests, append entries) arrive on this channel.
private final RequestChannel<RpcWireRequest, RpcReply> incomingChannel = new MemoryRequestChannel<>();
// Outbound RPC channel supplied by the enclosing module; replies come back asynchronously.
private final RequestChannel<RpcRequest, RpcWireReply> sendRpcChannel;
// Lifecycle notifications: quorum start/failure, election timeouts, leader changes.
private final Channel<ReplicatorInstanceEvent> eventChannel;
// Commit notices; this instance also subscribes to its own notices (see constructor).
private final Channel<IndexCommitNotice> commitNoticeChannel;
private final Fiber fiber;
private final long myId;
private final String quorumId;
private final Logger logger;
private final ReplicatorLog log;
// Election timeout in ms, drawn once from the clock at construction time.
private final long myElectionTimeout;
/**
 * These fields are package-private rather than private for convenience of access by testing code.
 */
final ReplicatorClock clock;
final ReplicatorInfoPersistence persister;
/**
 * These fields are mutable state used by leader. All of these must only be accessed from the fiber.
 */
private final BlockingQueue<InternalReplicationRequest> logRequests =
    new ArrayBlockingQueue<>(ReplicatorConstants.REPLICATOR_MAXIMUM_SIMULTANEOUS_LOG_REQUESTS);
// this is the next index from our log we need to send to each peer, kept track of on a per-peer basis.
private final Map<Long, Long> peersNextIndex = new HashMap<>();
// The last successfully acked message from our peers. I also keep track of my own acked log messages in here.
private final Map<Long, Long> peersLastAckedIndex = new HashMap<>();
// Index of the first entry logged during this leadership; 0 until the first entry is assigned.
private long myFirstIndexAsLeader;
// Periodic task draining logRequests while leader; null whenever we are not leading.
private Disposable queueConsumer;
/**
 * These fields are mutable state used by everyone; volatiles should only be updated from the
 * fiber, but may be read not-on-the-fiber; for instance, several of these are included in
 * toString().
 */
private volatile long currentTerm;
private volatile long votedFor;
private volatile QuorumConfiguration quorumConfig = QuorumConfiguration.EMPTY;
private volatile long quorumConfigIndex = 0;
private volatile long lastCommittedIndex;
private volatile State myState = State.FOLLOWER;
// Clock timestamp of the last leader contact / vote grant; drives election timeout checks.
private volatile long lastRPC;
// Peer id of the leader we currently believe in; 0 when unknown.
private long whosLeader = 0;
/**
 * Builds a replicator for one quorum. Subscriptions and the election check are wired here,
 * but persistent state is loaded asynchronously on the fiber; callers must still invoke
 * {@link #start()} to begin fiber execution.
 *
 * @param fiber               execution context; all mutable state is confined to it
 * @param myId                this node's peer id
 * @param quorumId            identifier of the quorum this instance participates in
 * @param log                 persistent replicated log facade
 * @param clock               source of time and election-timing parameters
 * @param persister           storage for currentTerm / votedFor (Raft durable state)
 * @param sendRpcChannel      channel on which outbound RPC requests are issued
 * @param eventChannel        sink for lifecycle events
 * @param commitNoticeChannel sink (and, for our own notices, source) of commit notices
 * @param initialState        starting state; LEADER triggers becomeLeader() on startup
 */
public ReplicatorInstance(final Fiber fiber,
                          final long myId,
                          final String quorumId,
                          ReplicatorLog log,
                          ReplicatorClock clock,
                          ReplicatorInfoPersistence persister,
                          RequestChannel<RpcRequest, RpcWireReply> sendRpcChannel,
                          final Channel<ReplicatorInstanceEvent> eventChannel,
                          final Channel<IndexCommitNotice> commitNoticeChannel,
                          State initialState) {
  this.fiber = fiber;
  this.myId = myId;
  this.quorumId = quorumId;
  this.logger = getNewLogger();
  this.sendRpcChannel = sendRpcChannel;
  this.log = log;
  this.clock = clock;
  this.persister = persister;
  this.eventChannel = eventChannel;
  this.commitNoticeChannel = commitNoticeChannel;
  this.myElectionTimeout = clock.electionTimeout();
  this.lastRPC = clock.currentTimeMillis();
  // Listen only to our own commit notices for this quorum.
  commitNoticeChannel.subscribe(
      new ChannelSubscription<>(fiber, this::onCommit,
          (notice) ->
              notice.nodeId == myId
                  && notice.quorumId.equals(quorumId)));
  incomingChannel.subscribe(fiber, this::onIncomingMessage);
  // Periodically check whether the election timeout has elapsed.
  fiber.scheduleWithFixedDelay(this::checkOnElection, clock.electionCheckInterval(),
      clock.electionCheckInterval(), TimeUnit.MILLISECONDS);
  this.myState = initialState;
  // Deferred initialization on the fiber: load persisted term/vote and announce startup.
  fiber.execute(() -> {
    try {
      refreshQuorumConfigurationFromLog();
      readPersistentData();
      // indicate we are running!
      eventChannel.publish(
          new ReplicatorInstanceEvent(
              ReplicatorInstanceEvent.EventType.QUORUM_START,
              ReplicatorInstance.this,
              0,
              0,
              clock.currentTimeMillis(),
              null, null)
      );
      if (initialState == State.LEADER) {
        becomeLeader();
      }
    } catch (IOException e) {
      logger.error("error during persistent data init", e);
      failReplicatorInstance(e);
    }
  });
}
/**
* public API
*/
/** Returns the identifier of the quorum this replicator serves. */
@Override
public String getQuorumId() {
  return this.quorumId;
}
/**
 * Reads the current quorum configuration on the fiber, so the returned value is a
 * consistent snapshot rather than a racy read.
 */
@Override
public ListenableFuture<QuorumConfiguration> getQuorumConfiguration() {
  final SettableFuture<QuorumConfiguration> configSnapshot = SettableFuture.create();
  fiber.execute(() -> configSnapshot.set(quorumConfig));
  return configSnapshot;
}
/**
 * Requests a transition to a new peer set. Only the leader may initiate this; on a
 * non-leader the request is refused and null is returned.
 */
@Override
public ListenableFuture<ReplicatorReceipt> changeQuorum(Collection<Long> newPeers) throws InterruptedException {
  if (isLeader()) {
    return putQuorumChangeRequest(quorumConfig.getTransitionalConfiguration(newPeers));
  }
  logger.debug("attempted to changeQuorum on a non-leader");
  return null;
}
/**
 * Enqueues data to be replicated; only valid on the leader (returns null otherwise).
 * Blocks if the internal request queue is full.
 */
@Override
public ListenableFuture<ReplicatorReceipt> logData(List<ByteBuffer> data) throws InterruptedException {
  if (isLeader()) {
    final InternalReplicationRequest request = InternalReplicationRequest.toLogData(data);
    logRequests.put(request);
    return request.logReceiptFuture;
  }
  logger.debug("attempted to logData on a non-leader");
  return null;
}
/** Returns this node's peer id. */
@Override
public long getId() {
  return this.myId;
}
/** Diagnostic rendering of the volatile state; safe to call off the fiber. */
@Override
public String toString() {
  return new StringBuilder("ReplicatorInstance{")
      .append("myId=").append(myId)
      .append(", quorumId='").append(quorumId).append('\'')
      .append(", lastCommittedIndex=").append(lastCommittedIndex)
      .append(", myState=").append(myState)
      .append(", currentTerm=").append(currentTerm)
      .append(", votedFor=").append(votedFor)
      .append(", lastRPC=").append(lastRPC)
      .append(", quorumConfig=").append(quorumConfig)
      .append('}')
      .toString();
}
/** Channel on which state transitions (FOLLOWER/CANDIDATE/LEADER) are published. */
@Override
public Subscriber<State> getStateChannel() {
  return this.stateMemoryChannel;
}
/** Channel on which replicator lifecycle events are published. */
@Override
public Subscriber<ReplicatorInstanceEvent> getEventChannel() {
  return eventChannel;
}
/** Channel on which index-commit notices are published. */
@Override
public Subscriber<IndexCommitNotice> getCommitNoticeChannel() {
  return this.commitNoticeChannel;
}
/** Channel on which this replicator receives wire RPCs. */
public RequestChannel<RpcWireRequest, RpcReply> getIncomingChannel() {
  return this.incomingChannel;
}
/** Starts the fiber; no messages are processed until this is called. */
public void start() {
  logger.debug("started {} with election timeout {}", quorumId, myElectionTimeout);
  fiber.start();
}
/** Permanently shuts down this replicator by disposing its fiber. */
public void dispose() {
  this.fiber.dispose();
}
/**
 * Call this method on each replicator in a new quorum in order to establish the quorum
 * configuration and elect a leader.
 * <p>
 * A quorum cannot elect a leader until its peers know the quorum membership, which in
 * turn requires each to log a quorum configuration entry. This method logs that first
 * entry. Unlike normal logging it is not directed by a leader, and the entry cannot be
 * confirmed until a leader is chosen; invoke it on every peer of the quorum being
 * established (a majority must successfully log the entry for establishment to go
 * through).
 *
 * @param peerIds Collection of peers in the new quorum.
 * @return A future which will return when the peer has processed the bootstrap request.
 * The actual completion of the bootstrap will be signaled by a ReplicatorInstanceEvent
 * with the appropriate quorum configuration.
 */
public ListenableFuture<Void> bootstrapQuorum(Collection<Long> peerIds) {
  assert peerIds.size() > 0;
  final SettableFuture<Void> persistedFuture = SettableFuture.create();
  final long seqNum = 1;
  final QuorumConfiguration bootstrapConfig = QuorumConfiguration.of(peerIds);
  final LogEntry configEntry = InternalReplicationRequest.toChangeConfig(bootstrapConfig).getEntry(0, seqNum);
  final AppendEntries appendMessage = new AppendEntries(0, 0, 0, 0, Lists.newArrayList(configEntry), 0);
  final RpcWireRequest wireRequest = new RpcWireRequest(myId, quorumId, appendMessage);
  // Loop the append through our own incoming channel, so the configuration entry is
  // received and logged exactly like any other append.
  AsyncRequest.withOneReply(fiber, getIncomingChannel(), wireRequest, reply -> persistedFuture.set(null));
  return persistedFuture;
}
/**
 * Announces a fatal failure of this replicator and disposes the fiber permanently.
 *
 * @param e the cause of the failure, forwarded in the QUORUM_FAILURE event
 */
void failReplicatorInstance(Throwable e) {
  final ReplicatorInstanceEvent failureEvent =
      new ReplicatorInstanceEvent(
          ReplicatorInstanceEvent.EventType.QUORUM_FAILURE,
          this,
          0,
          0,
          clock.currentTimeMillis(),
          null,
          e);
  eventChannel.publish(failureEvent);
  fiber.dispose(); // kill us forever.
}
/** Creates a logger whose name embeds the quorum and node ids for traceability. */
private Logger getNewLogger() {
  final String loggerName = "(" + getClass().getSimpleName() + " - " + quorumId + " - " + myId + ")";
  return LoggerFactory.getLogger(loggerName);
}
/** Updates the current state and broadcasts the transition to subscribers. */
@FiberOnly
private void setState(State state) {
  this.myState = state;
  this.stateMemoryChannel.publish(state);
}
/**
 * Submits a request to change the quorum, blocking if necessary until the request
 * queue can accept the request.
 *
 * @return A future which will return the log index of the quorum configuration entry.
 */
private ListenableFuture<ReplicatorReceipt> putQuorumChangeRequest(QuorumConfiguration quorumConfig)
    throws InterruptedException {
  final InternalReplicationRequest changeRequest = InternalReplicationRequest.toChangeConfig(quorumConfig);
  logRequests.put(changeRequest);
  return changeRequest.logReceiptFuture;
}
/**
 * Submits a request to change the quorum, but only if it is possible to do so without blocking.
 *
 * @return A future which will return the log index of the quorum configuration entry,
 * or null if it was not possible to submit the request without blocking.
 */
@FiberOnly
@Nullable
private ListenableFuture<ReplicatorReceipt> offerQuorumChangeRequest(QuorumConfiguration quorumConfig) {
  if (this.quorumConfig.equals(quorumConfig)) {
    // Message fixed: it previously read "change quorum to but I'm already in that quorum config {} "
    // with a stray "to" and a misplaced placeholder.
    logger.warn("got a request to change quorum to {} but I'm already in that quorum config", quorumConfig);
    return null;
  }
  InternalReplicationRequest req = InternalReplicationRequest.toChangeConfig(quorumConfig);
  if (logRequests.offer(req)) {
    return req.logReceiptFuture;
  } else {
    logger.warn("change request could not be submitted because log request queue was full {}", quorumConfig);
    return null;
  }
}
/**
 * Loads the durable Raft state (current term and who we voted for) from persistence.
 *
 * @throws IOException if the persisted values cannot be read
 */
@FiberOnly
private void readPersistentData() throws IOException {
  currentTerm = persister.readCurrentTerm(quorumId);
  votedFor = persister.readVotedFor(quorumId);
}
/**
 * Dispatches an incoming wire RPC to the appropriate handler based on its payload type.
 * Unknown payloads are logged and dropped; exceptions are logged (with stack trace) and
 * rethrown so the fiber's error handling can see them.
 */
@FiberOnly
private void onIncomingMessage(Request<RpcWireRequest, RpcReply> message) {
  try {
    RpcWireRequest req = message.getRequest();
    if (req.isPreElectionPollMessage()) {
      doPreElectionPollMessage(message);
    } else if (req.isRequestVoteMessage()) {
      doRequestVote(message);
    } else if (req.isAppendMessage()) {
      doAppendMessage(message);
    } else {
      logger.warn("got a message of protobuf type I don't know: {}", req);
    }
  } catch (Exception e) {
    // Pass the throwable as the trailing argument with no matching placeholder so SLF4J
    // logs the full stack trace; previously a second '{}' consumed it as a formatting arg.
    logger.error("Uncaught exception while processing message {}", message, e);
    throw e;
  }
}
/**
 * Answers a pre-election poll: reports whether we would grant a vote if a real election
 * were held, without actually granting one or disturbing our election timer.
 */
@FiberOnly
private void doPreElectionPollMessage(Request<RpcWireRequest, RpcReply> message) {
  final RpcWireRequest request = message.getRequest();
  final PreElectionPoll poll = request.getPreElectionPollMessage();
  final long pollLastLogTerm = poll.getLastLogTerm();
  final long pollLastLogIndex = poll.getLastLogIndex();
  boolean wouldVote = false;
  if (poll.getTerm() >= currentTerm
      && atLeastAsUpToDateAsLocalLog(pollLastLogTerm, pollLastLogIndex)) {
    // Even an up-to-date candidate is refused if it is being phased out of a transitional config.
    wouldVote = !rejectPollFromOldConfiguration(request.from, pollLastLogTerm, pollLastLogIndex);
  }
  logger.debug("sending pre-election reply to {} wouldVote = {}", request.from, wouldVote);
  message.reply(new RpcReply(new PreElectionReply(currentTerm, wouldVote)));
}
/**
 * Special case for pre-election poll. If the sender's log exactly matches our log, but they
 * are being phased out by a transitional configuration (not part of the next peer set),
 * while we are a FOLLOWER continuing into the next configuration, reply false to their
 * poll. A phased-out peer therefore cannot initiate an election unless a continuing node
 * is already conducting one, or unless its log is strictly more advanced — in which case
 * there may be a legitimate need for it to start an election.
 *
 * @param from id of the peer who sent us a pre-election poll request
 * @param msgLastLogTerm the term of the last entry in that peer's log
 * @param msgLastLogIndex the index of the last entry in that peer's log
 * @return true if we should reply false to the pre-election poll request on the basis described.
 */
@FiberOnly
private boolean rejectPollFromOldConfiguration(long from, long msgLastLogTerm, long msgLastLogIndex) {
  if (!quorumConfig.isTransitional || myState != State.FOLLOWER) {
    return false;
  }
  final boolean logsMatchExactly =
      msgLastLogTerm == log.getLastTerm() && msgLastLogIndex == log.getLastIndex();
  if (!logsMatchExactly) {
    return false;
  }
  // Reject only when the sender is leaving the quorum and we are staying in it.
  return quorumConfig.nextPeers().contains(myId)
      && !quorumConfig.nextPeers().contains(from);
}
/**
 * Handles a RequestVote RPC per Raft sections 5.1, 5.2 and 5.4: rejects stale terms,
 * advances our term (stepping down if needed), and grants a vote when we have not yet
 * voted this term (or already voted for this candidate) and the candidate's log is at
 * least as complete as ours. Granting a vote resets the election timer.
 */
@FiberOnly
private void doRequestVote(Request<RpcWireRequest, RpcReply> message) {
  RequestVote msg = message.getRequest().getRequestVoteMessage();
  // 1. Return if term < currentTerm (sec 5.1)
  if (msg.getTerm() < currentTerm) {
    RequestVoteReply m = new RequestVoteReply(currentTerm, false);
    RpcReply reply = new RpcReply(m);
    message.reply(reply);
    return;
  }
  // 2. if term > currentTerm, currentTerm <- term
  if (msg.getTerm() > currentTerm) {
    logger.debug("RequestVote rpc, pushing forward currentTerm {} to {}", currentTerm, msg.getTerm());
    setCurrentTerm(msg.getTerm());
    // 2a. Step down if candidate or leader.
    if (myState != State.FOLLOWER) {
      logger.debug("stepping down to follower, currentTerm: {}", currentTerm);
      becomeFollower();
    }
  }
  // 3. if votedFor is null (0), or candidateId, and candidate's log
  // is at least as complete as local log (sec 5.2, 5.4), grant vote
  // and reset election timeout.
  boolean vote = false;
  if (atLeastAsUpToDateAsLocalLog(msg.getLastLogTerm(), msg.getLastLogIndex())) {
    // we can vote for this because the candidate's log is at least as
    // complete as the local log.
    if (votedFor == 0 || votedFor == message.getRequest().from) {
      setVotedFor(message.getRequest().from);
      lastRPC = clock.currentTimeMillis();
      vote = true;
    }
  }
  // Fixed label swap: the old message printed votedFor under the label "vote".
  logger.debug("sending vote reply to {} votedFor = {}, vote = {}", message.getRequest().from, votedFor, vote);
  RequestVoteReply m = new RequestVoteReply(currentTerm, vote);
  RpcReply reply = new RpcReply(m);
  message.reply(reply);
}
/**
 * Raft log-completeness comparison (sec 5.4.1): the remote log wins on a higher last
 * term, or on an equal term with an index at least as large as ours.
 */
@FiberOnly
private boolean atLeastAsUpToDateAsLocalLog(long msgLastLogTerm, long msgLastLogIndex) {
  final long localLastLogTerm = log.getLastTerm();
  final long localLastLogIndex = log.getLastIndex();
  if (msgLastLogTerm != localLastLogTerm) {
    return msgLastLogTerm > localLastLogTerm;
  }
  return msgLastLogIndex >= localLastLogIndex;
}
/**
 * Handles an AppendEntries RPC (heartbeat or entry replication) per Raft sec 5.1-5.3:
 * rejects stale terms, steps down if necessary, resets the election timer, checks log
 * matching, reconciles conflicting entries, and advances the commit index. The reply is
 * sent asynchronously once any required log operations complete.
 */
@FiberOnly
private void doAppendMessage(final Request<RpcWireRequest, RpcReply> request) {
  final AppendEntries appendMessage = request.getRequest().getAppendMessage();
  // 1. return if term < currentTerm (sec 5.1)
  if (appendMessage.getTerm() < currentTerm) {
    appendReply(request, false);
    return;
  }
  // 2. if term > currentTerm, set it (sec 5.1)
  if (appendMessage.getTerm() > currentTerm) {
    setCurrentTerm(appendMessage.getTerm());
  }
  // 3. Step down if we are a leader or a candidate (sec 5.2, 5.5)
  if (myState != State.FOLLOWER) {
    becomeFollower();
  }
  // 4. reset election timeout
  lastRPC = clock.currentTimeMillis();
  long theLeader = appendMessage.getLeaderId();
  if (whosLeader != theLeader) {
    // First contact from a new leader: record it and publish LEADER_ELECTED.
    updateFollowersKnowledgeOfCurrentLeader(theLeader);
  }
  // 5. return failure if log doesn't contain an entry at
  // prevLogIndex whose term matches prevLogTerm (sec 5.3)
  // if msgPrevLogIndex == 0 -> special case of starting the log!
  long msgPrevLogIndex = appendMessage.getPrevLogIndex();
  long msgPrevLogTerm = appendMessage.getPrevLogTerm();
  if (msgPrevLogIndex != 0 && log.getLogTerm(msgPrevLogIndex) != msgPrevLogTerm) {
    // Reply with a hint (our lastIndex + 1) so the leader can back up its nextIndex.
    AppendEntriesReply m = new AppendEntriesReply(currentTerm, false, log.getLastIndex() + 1);
    RpcReply reply = new RpcReply(m);
    request.reply(reply);
    return;
  }
  if (appendMessage.getEntriesList().isEmpty()) {
    // Pure heartbeat: ack immediately and advance the commit index, bounded by our log.
    appendReply(request, true);
    long newCommitIndex = Math.min(appendMessage.getCommitIndex(), log.getLastIndex());
    setLastCommittedIndex(newCommitIndex);
    return;
  }
  // 6. if existing entries conflict with new entries, delete all
  // existing entries starting with first conflicting entry (sec 5.3)
  // nb: The process in which we fix the local log may involve several async log operations, so that is entirely
  // hidden up in these futures. Note that the process can fail, so we handle that as well.
  List<ListenableFuture<Boolean>> logOperationFutures = reconcileAppendMessageWithLocalLog(appendMessage);
  ListenableFuture<List<Boolean>> bundledLogFuture = Futures.allAsList(logOperationFutures);
  // wait for the log to commit before returning message. But do so async.
  C5Futures.addCallback(bundledLogFuture,
      (resultList) -> {
        appendReply(request, true);
        // 8. Signal the client of the Replicator that it can apply newly committed entries to state machine
        long newCommitIndex = Math.min(appendMessage.getCommitIndex(), log.getLastIndex());
        setLastCommittedIndex(newCommitIndex);
      },
      (Throwable t) -> {
        // TODO A log commit failure is probably a fatal error. Quit the instance?
        // TODO better error reporting. A log commit failure will be a serious issue.
        logger.error("failure reconciling received entries with the local log", t);
        appendReply(request, false);
      }, fiber);
}
/** Sends an AppendEntries reply carrying our current term and the given success flag. */
@FiberOnly
private void appendReply(Request<RpcWireRequest, RpcReply> request, boolean success) {
  request.reply(new RpcReply(new AppendEntriesReply(currentTerm, success, 0)));
}
/** True when this replicator currently believes itself to be the quorum leader. */
@FiberOnly
private boolean isLeader() {
  return State.LEADER == myState;
}
/** Records a newly observed leader and announces it via a LEADER_ELECTED event. */
@FiberOnly
private void updateFollowersKnowledgeOfCurrentLeader(long theLeader) {
  logger.debug("discovered new leader: {}", theLeader);
  whosLeader = theLeader;
  final ReplicatorInstanceEvent leaderElectedEvent =
      new ReplicatorInstanceEvent(
          ReplicatorInstanceEvent.EventType.LEADER_ELECTED,
          this,
          whosLeader,
          currentTerm,
          clock.currentTimeMillis(),
          null, null);
  eventChannel.publish(leaderElectedEvent);
}
/**
 * Reconciles the entries carried by an AppendEntries message with the local log
 * (Raft sec 5.3): appends entries beyond our last index, truncates on term conflicts,
 * and rejects messages containing a sequence gap.
 *
 * @return futures for the issued log operations (truncate and/or append); a failed
 * future is returned immediately if a gap is detected.
 */
@FiberOnly
private List<ListenableFuture<Boolean>> reconcileAppendMessageWithLocalLog(AppendEntries appendMessage) {
  List<ListenableFuture<Boolean>> logOperationFutures = new ArrayList<>();
  // 6. if existing entries conflict with new entries, delete all
  // existing entries starting with first conflicting entry (sec 5.3)
  long nextIndex = log.getLastIndex() + 1;
  List<LogEntry> entriesFromMessage = appendMessage.getEntriesList();
  List<LogEntry> entriesToCommit = new ArrayList<>(entriesFromMessage.size());
  for (LogEntry entry : entriesFromMessage) {
    long entryIndex = entry.getIndex();
    if (entryIndex == nextIndex) {
      // Exactly the next expected entry: accept it.
      entriesToCommit.add(entry);
      nextIndex++;
      continue;
    }
    if (entryIndex > nextIndex) {
      // ok this entry is still beyond the LAST entry, so we have a problem:
      logger.warn("Received entry later in sequence than expected: expected {} but the next in the message is {}",
          nextIndex, entryIndex);
      logOperationFutures.add(Futures.immediateFailedFuture(
          new Exception("Unexpected log entry, or entry sequence gap in received entries")));
      return logOperationFutures;
    }
    // at this point entryIndex should be <= log.getLastIndex
    assert entryIndex < nextIndex;
    if (log.getLogTerm(entryIndex) != entry.getTerm()) {
      // This is generally expected to be fairly uncommon.
      // conflict:
      logger.debug("log conflict at idx {} my term: {} term from leader: {}, truncating log after this point",
          entryIndex, log.getLogTerm(entryIndex), entry.getTerm());
      // delete this and all subsequent entries from the local log.
      logOperationFutures.add(log.truncateLog(entryIndex));
      entriesToCommit.add(entry);
      nextIndex = entryIndex + 1;
    }
  }
  // 7. Append any new entries not already in the log.
  logOperationFutures.add(log.logEntries(entriesToCommit));
  // The appended entries may include a quorum configuration change; pick it up now.
  refreshQuorumConfigurationFromLog();
  return logOperationFutures;
}
/**
 * Periodic election-timeout check. If we are not the leader, we are a voting member,
 * and no leader contact has been seen within the timeout, kick off an election
 * (directly when already a candidate, otherwise via a pre-election poll).
 */
@FiberOnly
private void checkOnElection() {
  if (isLeader()) {
    logger.trace("leader during election check.");
    return;
  }
  final boolean timedOut = clock.currentTimeMillis() > lastRPC + myElectionTimeout;
  if (!timedOut || !quorumConfig.allPeers().contains(myId)) {
    return;
  }
  logger.trace("timed out checking on election, try new election");
  if (myState == State.CANDIDATE) {
    doElection();
  } else {
    doPreElection();
  }
}
/**
 * Initiates a pre-election poll: announces the election timeout, restarts the election
 * timer, and asks every peer whether it would grant us a vote — without disturbing any
 * live leader's term.
 */
@FiberOnly
private void doPreElection() {
  eventChannel.publish(
      new ReplicatorInstanceEvent(
          ReplicatorInstanceEvent.EventType.ELECTION_TIMEOUT,
          this,
          0,
          0,
          clock.currentTimeMillis(),
          null, null));
  // Start new election "timer".
  lastRPC = clock.currentTimeMillis();
  final PreElectionPoll poll = new PreElectionPoll(currentTerm, myId, log.getLastIndex(), log.getLastTerm());
  logger.debug("starting pre-election poll for currentTerm: {}", currentTerm);
  final long pollTerm = currentTerm;
  final Set<Long> wouldVotes = new HashSet<>();
  for (final long peer : quorumConfig.allPeers()) {
    sendPreElectionRequest(new RpcRequest(peer, myId, quorumId, poll), pollTerm, wouldVotes, lastRPC);
  }
}
/**
 * Sends one pre-election poll RPC. On reply, tallies the (hypothetical) vote; on
 * timeout, retries via handlePreElectionTimeout so long as the poll is still current.
 *
 * @param request          the poll RPC to send
 * @param termBeingVotedFor term the poll was started for; used to detect staleness
 * @param votes            shared accumulator of peers that would vote for us
 * @param startTime        lastRPC value when the poll began; a changed value means a newer poll superseded this one
 */
@FiberOnly
private void sendPreElectionRequest(RpcRequest request, long termBeingVotedFor, Set<Long> votes, long startTime) {
  AsyncRequest.withOneReply(fiber, sendRpcChannel, request,
      message -> handlePreElectionReply(message, termBeingVotedFor, votes, startTime),
      REPLICATOR_VOTE_RPC_TIMEOUT_MILLISECONDS, TimeUnit.MILLISECONDS,
      () -> handlePreElectionTimeout(request, termBeingVotedFor, votes, startTime));
}
/**
 * RPC-timeout handler for a pre-election poll: resends the poll unless it has become
 * stale (we left FOLLOWER, a newer poll started, or the term advanced).
 */
@FiberOnly
private void handlePreElectionTimeout(RpcRequest request, long termBeingVotedFor, Set<Long> votes, long startTime) {
  final boolean pollSuperseded = (myState != State.FOLLOWER) || (lastRPC != startTime);
  if (pollSuperseded) {
    return;
  }
  // Also if the term goes forward somehow, this is also out of date, and drop it.
  if (currentTerm > termBeingVotedFor) {
    logger.trace("pre-election poll timeout, current term has moved on, abandoning this request");
    return;
  }
  logger.trace("pre-election poll timeout to {}, resending RPC", request.to);
  sendPreElectionRequest(request, termBeingVotedFor, votes, startTime);
}
/**
 * Handles a reply to a pre-election poll. Stale replies (term advanced, state changed,
 * or a newer poll started) are ignored; otherwise the hypothetical vote is tallied, and
 * a real election begins once a majority say they would vote for us.
 */
@FiberOnly
private void handlePreElectionReply(RpcWireReply message, long termBeingVotedFor, Set<Long> votes, long startTime) {
  if (message == null) {
    logger.warn("got a NULL message reply, that's unfortunate");
    return;
  }
  if (currentTerm > termBeingVotedFor) {
    logger.warn("pre-election reply from {}, but currentTerm {} > vote term {}", message.from,
        currentTerm, termBeingVotedFor);
    return;
  }
  if (myState != State.FOLLOWER) {
    // we became not, ignore
    logger.warn("pre-election reply from {} ignored -> in state {}", message.from, myState);
    return;
  }
  if (lastRPC != startTime) {
    // Another timeout occurred, so this poll is moot.
    return;
  }
  PreElectionReply reply = message.getPreElectionReplyMessage();
  if (reply.getTerm() > currentTerm) {
    // Message fixed for consistency: this handler processes pre-election replies,
    // but the old text said "election reply".
    logger.warn("pre-election reply from {}, but term {} was not my term {}, updating currentTerm",
        message.from, reply.getTerm(), currentTerm);
    setCurrentTerm(reply.getTerm());
    return;
  }
  if (reply.getWouldVote()) {
    votes.add(message.from);
  }
  if (votesConstituteMajority(votes)) {
    doElection();
  }
}
/**
 * Starts a real election (Raft sec 5.2): announces it, resets the election timer,
 * increments the term, becomes CANDIDATE, and sends RequestVote to every peer with a
 * retrying timeout handler.
 */
@FiberOnly
private void doElection() {
  eventChannel.publish(
      new ReplicatorInstanceEvent(
          ReplicatorInstanceEvent.EventType.ELECTION_STARTED,
          this,
          0,
          0,
          clock.currentTimeMillis(),
          null, null)
  );
  // Start new election "timer".
  lastRPC = clock.currentTimeMillis();
  // increment term.
  setCurrentTerm(currentTerm + 1);
  setState(State.CANDIDATE);
  RequestVote msg = new RequestVote(currentTerm, myId, log.getLastIndex(), log.getLastTerm());
  logger.debug("starting election for currentTerm: {}", currentTerm);
  // Capture the term now so late replies for an older election can be detected.
  final long termBeingVotedFor = currentTerm;
  final Set<Long> votes = new HashSet<>();
  for (long peer : quorumConfig.allPeers()) {
    RpcRequest req = new RpcRequest(peer, myId, quorumId, msg);
    AsyncRequest.withOneReply(fiber, sendRpcChannel, req,
        message -> handleElectionReply0(message, termBeingVotedFor, votes),
        REPLICATOR_VOTE_RPC_TIMEOUT_MILLISECONDS, TimeUnit.MILLISECONDS,
        new RequestVoteTimeout(req, termBeingVotedFor, votes));
  }
}
/**
 * Timeout handler for a RequestVote RPC: re-sends the same request (installing itself
 * again as the timeout handler) for as long as the election it belongs to is current.
 */
private class RequestVoteTimeout implements Runnable {
  public final RpcRequest request;
  public final long termBeingVotedFor;
  public final Set<Long> votes;

  private RequestVoteTimeout(RpcRequest request, long termBeingVotedFor, Set<Long> votes) {
    this.request = request;
    this.termBeingVotedFor = termBeingVotedFor;
    this.votes = votes;
  }

  @Override
  @FiberOnly
  public void run() {
    // If we are no longer a candidate, retrying RequestVote is pointless.
    if (myState != State.CANDIDATE) {
      return;
    }
    // Also if the term goes forward somehow, this is also out of date, and drop it.
    if (currentTerm > termBeingVotedFor) {
      logger.trace("request vote timeout, current term has moved on, abandoning this request");
      return;
    }
    logger.trace("request vote timeout to {}, resending RPC", request.to);
    // Note we are using 'this' as the recursive timeout.
    AsyncRequest.withOneReply(fiber, sendRpcChannel, request,
        reply -> handleElectionReply0(reply, termBeingVotedFor, votes),
        REPLICATOR_VOTE_RPC_TIMEOUT_MILLISECONDS, TimeUnit.MILLISECONDS, this);
  }

  @Override
  public String toString() {
    return "RequestVoteTimeout{" +
        "request=" + request +
        ", termBeingVotedFor=" + termBeingVotedFor +
        ", votes=" + votes +
        '}';
  }
}
// RPC callback for timeouts
/**
 * Handles a RequestVote reply. Replies are discarded when stale (term advanced past the
 * election's term, or we are no longer CANDIDATE); a newer term in the reply advances
 * ours; granted votes are tallied, and we become leader on reaching a majority.
 */
@FiberOnly
private void handleElectionReply0(RpcWireReply message, long termBeingVotedFor, Set<Long> votes) {
  // if current term has advanced, these replies are stale and should be ignored:
  if (message == null) {
    logger.warn("got a NULL message reply, that's unfortunate");
    return;
  }
  if (currentTerm > termBeingVotedFor) {
    logger.warn("election reply from {}, but currentTerm {} > vote term {}", message.from,
        currentTerm, termBeingVotedFor);
    return;
  }
  // if we are no longer a Candidate, election was over, these replies are stale.
  if (myState != State.CANDIDATE) {
    // we became not, ignore
    logger.warn("election reply from {} ignored -> in state {}", message.from, myState);
    return;
  }
  RequestVoteReply reply = message.getRequestVoteReplyMessage();
  if (reply.getTerm() > currentTerm) {
    // A higher term means we lost; adopt it and abandon the election.
    logger.warn("election reply from {}, but term {} was not my term {}, updating currentTerm",
        message.from, reply.getTerm(), currentTerm);
    setCurrentTerm(reply.getTerm());
    return;
  } else if (reply.getTerm() < currentTerm) {
    // huh weird.
    logger.warn("election reply from {}, their term {} < currentTerm {}",
        message.from, reply.getTerm(), currentTerm);
  }
  // did you vote for me?
  if (reply.getVoteGranted()) {
    // yes!
    votes.add(message.from);
  }
  if (votesConstituteMajority(votes)) {
    becomeLeader();
  }
}
/** True when the given voter set forms a majority under the current quorum configuration. */
@FiberOnly
private boolean votesConstituteMajority(Set<Long> votes) {
  return this.quorumConfig.setContainsMajority(votes);
}
/**
 * Transitions to FOLLOWER, publishing LEADER_DEPOSED if we were the leader, and stops
 * the leader-only queue-consumer task.
 */
@FiberOnly
private void becomeFollower() {
  final boolean steppingDownFromLeadership = (myState == State.LEADER);
  setState(State.FOLLOWER);
  if (steppingDownFromLeadership) {
    eventChannel.publish(
        new ReplicatorInstanceEvent(
            ReplicatorInstanceEvent.EventType.LEADER_DEPOSED,
            this,
            0,
            0,
            clock.currentTimeMillis(),
            null, null));
  }
  stopQueueConsumer();
}
/** Cancels the leader's queue-consumer task, if one is running. */
@FiberOnly
private void stopQueueConsumer() {
  if (queueConsumer == null) {
    return;
  }
  queueConsumer.dispose();
  queueConsumer = null;
}
/**
 * Transitions to LEADER: reinitializes per-peer replication bookkeeping from the end of
 * our own log (Raft paper page 7, para 5), announces the election result, and begins
 * consuming the log-request queue.
 */
@FiberOnly
private void becomeLeader() {
  logger.warn("I AM THE LEADER NOW, commence AppendEntries RPCs term = {}", currentTerm);
  setState(State.LEADER);
  final long nextIndexForPeers = log.getLastIndex() + 1;
  peersLastAckedIndex.clear();
  peersNextIndex.clear();
  for (final long peer : allPeersExceptMe()) {
    peersNextIndex.put(peer, nextIndexForPeers);
  }
  // No entries logged under this leadership yet.
  myFirstIndexAsLeader = 0;
  eventChannel.publish(
      new ReplicatorInstanceEvent(
          ReplicatorInstanceEvent.EventType.LEADER_ELECTED,
          this,
          myId,
          currentTerm,
          clock.currentTimeMillis(),
          null, null));
  startQueueConsumer();
}
/**
 * Schedules the leader's periodic work: draining the log-request queue (which also
 * produces heartbeats) and progressing any pending quorum change. Any throwable fails
 * the instance.
 */
@FiberOnly
private void startQueueConsumer() {
  final Runnable leaderWork = () -> {
    try {
      consumeQueue();
      checkOnQuorumChange();
    } catch (Throwable t) {
      logger.error("Exception in consumeQueue: ", t);
      failReplicatorInstance(t);
    }
  };
  queueConsumer = fiber.scheduleAtFixedRate(
      leaderWork, 0, clock.leaderLogRequestsProcessingInterval(), TimeUnit.MILLISECONDS);
}
/**
 * Leader-side main loop body: drains pending replication requests, logs the resulting
 * entries locally, and sends AppendEntries to every peer. When the queue is empty this
 * degenerates into heartbeats. Peers that are behind get extra entries fetched from
 * the log asynchronously.
 */
@FiberOnly
private void consumeQueue() {
  // retrieve as many items as possible. send rpc.
  // If there are no pending log requests, this method sends out heartbeats.
  final List<InternalReplicationRequest> reqs = new ArrayList<>();
  logger.trace("queue consuming");
  while (logRequests.peek() != null) {
    reqs.add(logRequests.poll());
  }
  logger.trace("{} queue items to commit", reqs.size());
  final long firstIndexInList = log.getLastIndex() + 1;
  final long lastIndexInList = firstIndexInList + reqs.size() - 1;
  List<LogEntry> newLogEntries = createLogEntriesFromIntRequests(reqs, firstIndexInList);
  leaderLogNewEntries(newLogEntries, lastIndexInList);
  // New entries may carry a quorum configuration change; pick it up now.
  refreshQuorumConfigurationFromLog();
  assert lastIndexInList == log.getLastIndex();
  for (final long peer : allPeersExceptMe()) {
    // for each peer, figure out how many "back messages" should I send:
    final long peerNextIdx = this.peersNextIndex.getOrDefault(peer, firstIndexInList);
    if (peerNextIdx < firstIndexInList) {
      final long moreCount = firstIndexInList - peerNextIdx;
      logger.debug("sending {} more log entries to peer {}", moreCount, peer);
      // TODO check moreCount is reasonable, and available in log. Otherwise do alternative peer catch up
      // TODO alternative peer catchup is by a different process, send message to that then skip sending AppendRpc
      // TODO allow for smaller 'catch up' messages so we don't try to create a 400GB sized message.
      // TODO cache these extra LogEntry objects so we don't recreate too many of them.
      ListenableFuture<List<LogEntry>> peerEntriesFuture = log.getLogEntries(peerNextIdx, firstIndexInList);
      C5Futures.addCallback(peerEntriesFuture,
          (entriesFromLog) -> {
            // TODO make sure the lists splice neatly together.
            assert entriesFromLog.size() == moreCount;
            if (peerNextIdx != peersNextIndex.get(peer) ||
                myState != State.LEADER) {
              // These were the same when we started checking the log, but they're not now -- that means
              // things happened while the log was retrieving, so discard this result. This is safe because
              // the next (or concurrent) run of consumeQueue has better information.
              return;
            }
            List<LogEntry> entriesToAppend = new ArrayList<>((int) (newLogEntries.size() + moreCount));
            entriesToAppend.addAll(entriesFromLog);
            entriesToAppend.addAll(newLogEntries);
            sendAppendEntries(peer, peerNextIdx, lastIndexInList, entriesToAppend);
          },
          (Throwable t) -> {
            // TODO is this situation ever recoverable?
            logger.error("failed to retrieve from log", t);
            failReplicatorInstance(t);
          }, fiber);
    } else {
      // Peer is caught up to (or beyond) the new entries: send just those (or a heartbeat).
      sendAppendEntries(peer, peerNextIdx, lastIndexInList, newLogEntries);
    }
  }
}
@FiberOnly
private void checkOnQuorumChange() {
  // Nothing to do until the quorum configuration entry itself has been committed.
  if (lastCommittedIndex < quorumConfigIndex) {
    return;
  }

  if (quorumConfig.isTransitional) {
    // The transitional configuration is committed, so begin replicating its successor.
    // If this offer fails because the log request queue is full, a later pass retries.
    offerQuorumChangeRequest(quorumConfig.getCompletedConfiguration());
  } else if (!quorumConfig.allPeers().contains(myId)) {
    // A committed, stable configuration that excludes this node: resign leadership.
    // TODO should there be a special event for this? Should the replicator shut itself down completely?
    becomeFollower();
  }
}
@FiberOnly
private List<LogEntry> createLogEntriesFromIntRequests(List<InternalReplicationRequest> requests, long firstEntryIndex) {
  // Translate queued internal requests into LogEntry objects with consecutive indexes
  // starting at firstEntryIndex, completing each request's receipt future as we go.
  final List<LogEntry> builtEntries = new ArrayList<>(requests.size());
  long nextIndex = firstEntryIndex;

  for (InternalReplicationRequest request : requests) {
    builtEntries.add(request.getEntry(currentTerm, nextIndex));

    if (myFirstIndexAsLeader == 0) {
      // Remember the first index written during this leadership; commit decisions
      // (see checkIfMajorityCanCommit) are gated on it.
      myFirstIndexAsLeader = nextIndex;
      logger.debug("my first index as leader is: {}", myFirstIndexAsLeader);
    }

    // let the client know what our id is
    request.logReceiptFuture.set(new ReplicatorReceipt(currentTerm, nextIndex));
    nextIndex++;
  }

  return builtEntries;
}
@FiberOnly
private void leaderLogNewEntries(List<LogEntry> newLogEntries, long lastIndexInList) {
  // As leader, append the freshly created entries to the local log. Once the local write
  // succeeds, count this node as having acked everything up through lastIndexInList.
  if (newLogEntries.isEmpty()) {
    return;
  }

  final ListenableFuture<Boolean> logWriteFuture = log.logEntries(newLogEntries);

  // TODO this callback and some others should have timeouts in case the log hangs somehow
  C5Futures.addCallback(logWriteFuture,
      (success) -> {
        // The leader acks its own entries, then checks whether a majority now has them.
        peersLastAckedIndex.put(myId, lastIndexInList);
        checkIfMajorityCanCommit(lastIndexInList);
      },
      (Throwable t) -> {
        logger.error("As a leader, failed to write new entries to local log", t);
        failReplicatorInstance(t);
      }, fiber);
}
@FiberOnly
private void sendAppendEntries(long peer, long peerNextIdx, long lastIndexSent, final List<LogEntry> entries) {
// Send one AppendEntries RPC to a single peer and handle its asynchronous reply.
// The entries list may be empty, in which case the message serves as a heartbeat.
assert (entries.size() == 0) || (entries.get(0).getIndex() == peerNextIdx);
assert (entries.size() == 0) || (entries.get(entries.size() - 1).getIndex() == lastIndexSent);
// prevLogIndex/prevLogTerm let the follower check that its log agrees with ours
// immediately before the new entries; index 0 means "before the first entry".
final long prevLogIndex = peerNextIdx - 1;
final long prevLogTerm;
if (prevLogIndex == 0) {
prevLogTerm = 0;
} else {
prevLogTerm = log.getLogTerm(prevLogIndex);
}
// catch them up so the next RPC wont over-send old junk.
// Optimistically advance nextIndex now; a failed reply walks it back below.
peersNextIndex.put(peer, lastIndexSent + 1);
AppendEntries msg = new AppendEntries(
currentTerm, myId, prevLogIndex, prevLogTerm,
entries,
lastCommittedIndex
);
RpcRequest request = new RpcRequest(peer, myId, quorumId, msg);
AsyncRequest.withOneReply(fiber, sendRpcChannel, request, message -> {
logger.trace("got a reply {}", message);
boolean wasSuccessful = message.getAppendReplyMessage().getSuccess();
if (!wasSuccessful) {
// This is per Page 7, paragraph 5. "After a rejection, the leader decrements nextIndex and retries"
// If the follower reported its actual next log entry (nonzero), jump straight there
// rather than stepping back one index at a time.
if (message.getAppendReplyMessage().getMyNextLogEntry() != 0) {
peersNextIndex.put(peer, message.getAppendReplyMessage().getMyNextLogEntry());
} else {
peersNextIndex.put(peer, Math.max(peerNextIdx - 1, 1));
}
} else {
// we have been successfully acked up to this point.
logger.trace("peer {} acked for {}", peer, lastIndexSent);
peersLastAckedIndex.put(peer, lastIndexSent);
checkIfMajorityCanCommit(lastIndexSent);
}
}, REPLICATOR_APPEND_RPC_TIMEOUT_MILLISECONDS, TimeUnit.MILLISECONDS, () ->
// Do nothing -> let next timeout handle things.
// This timeout exists just so that we can cancel and clean up stuff in jetlang.
logger.trace("peer {} timed out", peer));
}
/**
 * Determine if a majority of the current peers have replicated entries up to some index,
 * and thus it is safe to commit up to that index (aka declare it visible); if so, actually
 * commit. The argument is the last-updated index, but the actual index found to be able to
 * be committed by a majority could be less than or equal to that index.
 */
@FiberOnly
private void checkIfMajorityCanCommit(long lastAckedIndex) {
  if (lastAckedIndex <= lastCommittedIndex) {
    return; // already committed this entry
  }

  if (myFirstIndexAsLeader == 0) {
    return; // can't declare new visible yet until we have a first index as the leader.
  }

  if (quorumConfig.isEmpty()) {
    return; // there are no peers in the quorum, so nothing can be committed
  }

  /*
   * Goal: find an N such that N > lastCommittedIndex, a majority of peersLastAckedIndex[peerId] >= N
   * (considering peers within the current peer configuration) and N >= myFirstIndexAsLeader. If found,
   * commit up to the greatest such N. (sec 5.3, sec 5.4).
   */
  final long newCommitIndex = quorumConfig.calculateCommittedIndex(peersLastAckedIndex);

  if (newCommitIndex <= lastCommittedIndex) {
    // No progress: a majority has not acked anything beyond what is already committed.
    return;
  }

  if (newCommitIndex < myFirstIndexAsLeader) {
    // A leader may only commit by counting replicas for entries from its own term;
    // earlier-term entries become committed implicitly once such an entry commits.
    logger.info("Found newCommitIndex {} but my first index as leader was {}, can't declare visible yet",
        newCommitIndex, myFirstIndexAsLeader);
    return;
  }

  // NOTE(review): a (newCommitIndex < lastCommittedIndex) check was removed here as
  // unreachable -- the (newCommitIndex <= lastCommittedIndex) guard above already returns
  // in that case.
  setLastCommittedIndex(newCommitIndex);
  logger.trace("discovered new visible entry {}", lastCommittedIndex);
}
@FiberOnly
private void setLastCommittedIndex(long newLastCommittedIndex) {
  // Advance the commit index monotonically; attempted regressions are logged and ignored.
  if (newLastCommittedIndex < lastCommittedIndex) {
    logger.info("Requested to update to commit index {} but it is smaller than my current commit index {}; ignoring",
        newLastCommittedIndex, lastCommittedIndex);
    return;
  }
  if (newLastCommittedIndex == lastCommittedIndex) {
    return; // no change; nothing to announce
  }

  final long previousCommitIndex = lastCommittedIndex;
  lastCommittedIndex = newLastCommittedIndex;
  // Tell listeners about every index that just became visible.
  issueCommitNotifications(previousCommitIndex);
}
@FiberOnly
private void issueCommitNotifications(long oldLastCommittedIndex) {
// Publish IndexCommitNotices covering every newly committed index in the range
// (oldLastCommittedIndex, lastCommittedIndex], coalescing contiguous runs of indexes
// that share the same log term into a single notice per run.
// TODO inefficient, because it calls getLogTerm once for every index. Possible optimization here.
final long firstCommittedIndex = oldLastCommittedIndex + 1;
long firstIndexOfTerm = firstCommittedIndex;
long nextTerm = log.getLogTerm(firstCommittedIndex);
for (long index = firstCommittedIndex; index <= lastCommittedIndex; index++) {
long currentTerm = nextTerm;
// Emit a notice at the end of the range, or when the following index's term differs.
// (The assignment to nextTerm inside the condition is intentional, and relies on
// short-circuit evaluation to avoid reading past lastCommittedIndex.)
if (index == lastCommittedIndex
|| (nextTerm = log.getLogTerm(index + 1)) != currentTerm) {
commitNoticeChannel.publish(new IndexCommitNotice(quorumId, myId, firstIndexOfTerm, index, currentTerm));
firstIndexOfTerm = index + 1;
}
}
}
@FiberOnly
private void onCommit(IndexCommitNotice notice) {
  // When the committed range covers the index of the current quorum configuration entry,
  // announce that the configuration itself has been committed.
  final boolean coversConfigEntry =
      notice.firstIndex <= quorumConfigIndex && quorumConfigIndex <= notice.lastIndex;
  if (!coversConfigEntry) {
    return;
  }

  eventChannel.publish(
      new ReplicatorInstanceEvent(
          ReplicatorInstanceEvent.EventType.QUORUM_CONFIGURATION_COMMITTED,
          this,
          0,
          0,
          clock.currentTimeMillis(),
          quorumConfig,
          null));
}
/**
 * Durably record the peer this node voted for in the current term, then update the
 * in-memory copy. Raft requires a vote to be persisted before it takes effect, so on
 * persistence failure the replicator instance is failed and the in-memory vote is left
 * untouched -- previously it was still updated, letting volatile and durable state diverge.
 *
 * @param votedFor id of the peer receiving this node's vote
 */
@FiberOnly
private void setVotedFor(long votedFor) {
  try {
    persister.writeCurrentTermAndVotedFor(quorumId, currentTerm, votedFor);
  } catch (IOException e) {
    failReplicatorInstance(e);
    return; // do not apply in-memory state that was never made durable
  }

  this.votedFor = votedFor;
}
/**
 * Durably record a new current term (with the vote cleared), then update the in-memory
 * copies. The term must be persisted before it takes effect, so on persistence failure
 * the replicator instance is failed and the in-memory term/vote are left untouched --
 * previously they were still updated, letting volatile and durable state diverge.
 *
 * @param newTerm the term to adopt; a new term always starts with no vote cast
 */
@FiberOnly
private void setCurrentTerm(long newTerm) {
  try {
    persister.writeCurrentTermAndVotedFor(quorumId, newTerm, 0);
  } catch (IOException e) {
    failReplicatorInstance(e);
    return; // do not apply in-memory state that was never made durable
  }

  this.currentTerm = newTerm;
  this.votedFor = 0;
}
@FiberOnly
private Set<Long> allPeersExceptMe() {
  // A (live) view of the current quorum's peers with this node's own id filtered out.
  final Set<Long> justMe = Sets.newHashSet(myId);
  return Sets.difference(quorumConfig.allPeers(), justMe);
}
private void refreshQuorumConfigurationFromLog() {
// Re-read the most recent quorum configuration, and the log index it was recorded at,
// from the log's own tracking of configuration entries.
quorumConfig = log.getLastConfiguration();
quorumConfigIndex = log.getLastConfigurationIndex();
}
}