/**
* Licensed to the zk1931 under one or more contributor license
* agreements. See the NOTICE file distributed with this work
* for additional information regarding copyright ownership.
* The ASF licenses this file to you under the Apache License,
* Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the
* License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.github.zk1931.jzab;
import com.github.zk1931.jzab.proto.ZabMessage;
import com.github.zk1931.jzab.proto.ZabMessage.Message;
import com.github.zk1931.jzab.proto.ZabMessage.Message.MessageType;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeoutException;
import java.util.HashMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Fast leader election implementation. Fast leader election makes a best
 * effort to elect the server with the "best" history as the leader, in order
 * to minimize the synchronization cost.
 */
class FastLeaderElection implements Election {
// Logger for the election.
static final Logger LOG = LoggerFactory.getLogger(FastLeaderElection.class);
// Fast leader election needs transport to exchange vote information. It is
// used to broadcast votes, to reply to queries and to clear disconnected
// peers.
final Transport transport;
// The queue for incoming messages (both queries and replies). The election
// reads from it through the filter below.
final BlockingQueue<MessageTuple> messageQueue;
// The last vote of this server. It stays null until electLeader() or
// specifyLeader() assigns it; reply() does not answer while it is null.
private VoteInfo voteInfo = null;
// Round number. Incremented every time electLeader() is entered and moved
// forward when a peer's vote carries a larger round.
private long round = 0;
// Persistent state. Read at the start of electLeader() for the last seen
// configuration, the latest zxid of the log and the acknowledged epoch.
private PersistentState persistence;
// Message queue filter for fast leader election. It returns only
// ELECTION_INFO messages to the election loop.
private final ElectioneerFilter filter;
/**
 * Creates the election object.
 *
 * @param persistence the persistent state that is read when an election
 *     starts.
 * @param transport the transport used to send votes to other servers.
 * @param messageQueue the queue of incoming messages; it is wrapped in an
 *     ElectioneerFilter from which the election reads its messages.
 */
FastLeaderElection(PersistentState persistence,
                   Transport transport,
                   BlockingQueue<MessageTuple> messageQueue) {
  this.persistence = persistence;
  this.transport = transport;
  this.messageQueue = messageQueue;
  // The filter is created last, once the other fields are in place.
  this.filter = new ElectioneerFilter(messageQueue);
}
/**
 * Runs one election and blocks until a leader has been picked.
 *
 * <p>The server starts a new round, votes for itself and broadcasts that
 * vote. It then processes incoming votes until one of the following holds:
 * <ul>
 *   <li>a wait times out while votes from at least a quorum of servers in
 *       the same round have been collected;</li>
 *   <li>same-round votes have been collected from as many servers as the
 *       configuration has peers;</li>
 *   <li>a server of the current configuration reports a vote that is no
 *       longer in the electing phase, in which case that vote is
 *       adopted.</li>
 * </ul>
 *
 * @return the server id stored in the final vote.
 * @throws Exception any exception raised by the calls made during the
 *     election, such as the InterruptedException declared by the message
 *     filter.
 */
@Override
public String electLeader() throws Exception {
  ClusterConfiguration clusterConfig = persistence.getLastSeenConfig();
  Zxid lastZxid = persistence.getLog().getLatestZxid();
  String serverId = clusterConfig.getServerId();
  long ackEpoch = persistence.getAckEpoch();
  // Votes collected from servers that are in the same round as this server.
  HashMap<String, VoteInfo> sameRoundVotes = new HashMap<String, VoteInfo>();
  // Each election begins a new round, and the initial vote is for itself.
  this.round++;
  this.voteInfo = new VoteInfo(serverId, ackEpoch, lastZxid, round, true);
  final int maxTimeoutMs = 1600;
  int timeoutMs = 100;
  // Announce the initial vote before waiting for anything.
  broadcast(clusterConfig);
  while (true) {
    MessageTuple incoming;
    try {
      incoming = filter.getMessage(timeoutMs);
    } catch (TimeoutException ex) {
      // Nothing arrived within the timeout.
      if (sameRoundVotes.size() >= clusterConfig.getQuorumSize()) {
        // A quorum of same-round servers has already been heard from, so
        // the current vote is assumed to name the server with the "best"
        // history.
        this.voteInfo.electing = false;
        return this.voteInfo.vote;
      }
      // Otherwise announce the vote again and back off, doubling the wait
      // up to the maximum.
      broadcast(clusterConfig);
      timeoutMs *= 2;
      if (timeoutMs > maxTimeoutMs) {
        timeoutMs = maxTimeoutMs;
      }
      continue;
    }
    VoteInfo peerVote = VoteInfo.fromMessage(incoming.getMessage());
    String source = incoming.getServerId();
    if (!clusterConfig.contains(source)) {
      // Votes from servers outside the current configuration are dropped.
      LOG.debug("The vote is from server {} who is not in current " +
                "configuration, ignores it.", source);
      continue;
    }
    if (!peerVote.electing) {
      // The peer is not in the electing phase: adopt its vote as is.
      this.voteInfo = peerVote;
      this.voteInfo.electing = false;
      return this.voteInfo.vote;
    }
    // From here on the peer is electing as well.
    if (peerVote.round < this.voteInfo.round) {
      // The peer is behind in an older round, its vote does not count.
      continue;
    }
    if (peerVote.round > this.voteInfo.round) {
      LOG.debug("The round of peer's vote {} is larger than itself {}",
                peerVote.round, this.voteInfo.round);
      // Catch up with the peer's round. Votes gathered so far belong to the
      // old round and are discarded.
      this.round = peerVote.round;
      this.voteInfo.round = peerVote.round;
      sameRoundVotes.clear();
      if (this.voteInfo.compareTo(peerVote) < 0) {
        // The peer's vote is better, switch to it.
        this.voteInfo = peerVote;
      }
      broadcast(clusterConfig);
    } else {
      // Same round: a differing vote triggers a broadcast, and a better
      // one is adopted first.
      int order = this.voteInfo.compareTo(peerVote);
      if (order != 0) {
        if (order < 0) {
          this.voteInfo = peerVote;
        }
        broadcast(clusterConfig);
      }
    }
    // Record the vote and stop once every peer has been heard from.
    sameRoundVotes.put(source, peerVote);
    if (sameRoundVotes.size() == clusterConfig.getPeers().size()) {
      this.voteInfo.electing = false;
      return this.voteInfo.vote;
    }
  }
}
/**
 * Answers an election query with the current vote of this server.
 *
 * <p>Nothing is sent when the querier's message is not marked as electing,
 * or when this server has no vote yet.
 *
 * @param tuple the received query together with the id of its sender.
 */
@Override
public void reply(MessageTuple tuple) {
  boolean querierElecting =
      tuple.getMessage().getElectionInfo().getIsElecting();
  if (!querierElecting || this.voteInfo == null) {
    // A server that joins a cluster for the first time does not initialize
    // its vote until the first synchronization from the leader is done, so
    // the vote may still be null here. In that case the querier gets no
    // reply.
    return;
  }
  String querier = tuple.getServerId();
  LOG.debug("Replies to {} with leader info : {}", querier, voteInfo.vote);
  this.transport.send(querier, this.voteInfo.toMessage());
}
/**
 * Sets the current vote to the given leader without running an election.
 *
 * <p>The stored vote is marked as non-electing and carries placeholder
 * values: -1 for the acknowledged epoch and the round, and
 * Zxid.ZXID_NOT_EXIST for the zxid.
 *
 * @param leader the id of the server to record as the leader.
 */
@Override
public void specifyLeader(String leader) {
  VoteInfo fixedVote =
      new VoteInfo(leader, -1, Zxid.ZXID_NOT_EXIST, -1, false);
  this.voteInfo = fixedVote;
}
/**
 * Sends the current vote to every peer listed in the given configuration.
 * The vote is converted to a message once and the same message is sent to
 * all peers.
 *
 * @param config the configuration whose peers receive the vote.
 */
void broadcast(ClusterConfiguration config) {
  Message ballot = voteInfo.toMessage();
  for (String peer : config.getPeers()) {
    this.transport.send(peer, ballot);
  }
}
/**
 * The information of a vote: the server voted for, the acknowledged epoch
 * and zxid attached to the vote, the round the vote was cast in and whether
 * the voter is still electing.
 */
static class VoteInfo implements Comparable<VoteInfo> {
  // Id of the server this vote is for.
  final String vote;
  // Acknowledged epoch attached to the vote.
  final long ackEpoch;
  // Zxid attached to the vote.
  final Zxid zxid;
  // Round of the vote. Not final: the election updates it in place.
  long round;
  // Whether the voter is still in the electing phase. Not final: the
  // election clears it once a leader has been picked.
  boolean electing;

  VoteInfo(String vote,
           long ackEpoch,
           Zxid zxid,
           long round,
           boolean electing) {
    this.vote = vote;
    this.ackEpoch = ackEpoch;
    this.zxid = zxid;
    this.round = round;
    this.electing = electing;
  }

  /**
   * Converts this vote to an election info message.
   *
   * @return the message built by MessageBuilder.buildElectionInfo.
   */
  Message toMessage() {
    return MessageBuilder.buildElectionInfo(vote, zxid, ackEpoch, round,
                                            electing);
  }

  /**
   * Compares two votes. The order of the comparison is :
   * ackEpoch -> lastZxid -> serverId. The round and the electing flag do not
   * take part in the comparison.
   *
   * @param vi the vote to compare with.
   * @return a negative value, zero or a positive value if this vote is
   *     worse than, equal to or better than the given vote.
   */
  @Override
  public int compareTo(VoteInfo vi) {
    if (this.ackEpoch != vi.ackEpoch) {
      // Compare explicitly. Casting the long difference to int can truncate
      // it to zero or flip its sign when the epochs are far apart.
      return (this.ackEpoch < vi.ackEpoch) ? -1 : 1;
    }
    if (!this.zxid.equals(vi.zxid)) {
      return this.zxid.compareTo(vi.zxid);
    }
    return this.vote.compareTo(vi.vote);
  }

  /**
   * Two votes are equal when compareTo reports no difference between them.
   */
  @Override
  public boolean equals(Object o) {
    // instanceof is false for null, so no separate null check is needed.
    if (!(o instanceof VoteInfo)) {
      return false;
    }
    return compareTo((VoteInfo)o) == 0;
  }

  /**
   * Hashes the acknowledged epoch only. Votes that are equal always have
   * the same acknowledged epoch, so equal votes get equal hash codes.
   */
  @Override
  public int hashCode() {
    return (int)(ackEpoch ^ (ackEpoch >>> 32));
  }

  /**
   * Builds a vote from the election info carried by the given message.
   *
   * @param msg the message to read the election info from.
   * @return the vote described by the message.
   */
  static VoteInfo fromMessage(Message msg) {
    ZabMessage.ElectionInfo info = msg.getElectionInfo();
    return new VoteInfo(info.getVote(),
                        info.getAckEpoch(),
                        MessageBuilder.fromProtoZxid(info.getZxid()),
                        info.getRound(),
                        info.getIsElecting());
  }
}
/**
 * This filter filters any message except the ELECTION_INFO message.
 * DISCONNECTED messages additionally make the transport clear the
 * disconnected server; every other message type is dropped.
 */
class ElectioneerFilter extends MessageQueueFilter {
  ElectioneerFilter(BlockingQueue<MessageTuple> msgQueue) {
    super(msgQueue);
  }

  /**
   * Waits for the next ELECTION_INFO message.
   *
   * <p>The timeout covers the whole call: time spent on messages of other
   * types is subtracted from the wait for the next message.
   *
   * @param timeoutMs the maximum time to wait, in milliseconds.
   * @return the first ELECTION_INFO message taken from the queue.
   * @throws InterruptedException declared by the parent filter's
   *     getMessage.
   * @throws TimeoutException declared by the parent filter's getMessage;
   *     presumably raised when no message arrives within the remaining
   *     time -- confirm against MessageQueueFilter.
   */
  @Override
  protected MessageTuple getMessage(int timeoutMs)
      throws InterruptedException, TimeoutException {
    // Keep the clock reading as a long. Elapsed time is the difference of
    // two nanoTime() readings, so nothing is narrowed before subtracting.
    final long startNs = System.nanoTime();
    while (true) {
      long elapsedMs = (System.nanoTime() - startNs) / 1000000L;
      long remaining = timeoutMs - elapsedMs;
      // A positive remainder is bounded by timeoutMs, so the cast below
      // cannot overflow. An exhausted budget is passed on as zero.
      int remainMs = (remaining > 0) ? (int)remaining : 0;
      MessageTuple tuple = super.getMessage(remainMs);
      Message msg = tuple.getMessage();
      if (msg.getType() == MessageType.ELECTION_INFO) {
        // Got what we want, return it to caller.
        return tuple;
      } else if (msg.getType() == MessageType.DISCONNECTED) {
        // Let the transport clear the server that got disconnected.
        transport.clear(msg.getDisconnected().getServerId());
      }
    }
  }
}
}