/**
 * Licensed to the zk1931 under one or more contributor license
 * agreements. See the NOTICE file distributed with this work
 * for additional information regarding copyright ownership.
 * The ASF licenses this file to you under the Apache License,
 * Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the
 * License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.github.zk1931.jzab;

import com.google.protobuf.TextFormat;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeoutException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import com.github.zk1931.jzab.proto.ZabMessage;
import com.github.zk1931.jzab.proto.ZabMessage.Message;
import com.github.zk1931.jzab.proto.ZabMessage.Message.MessageType;
import com.github.zk1931.jzab.proto.ZabMessage.Proposal.ProposalType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;

/**
 * Leader.
 */
class Leader extends Participant {

  private final Map<String, PeerHandler> quorumMap =
      new ConcurrentHashMap<String, PeerHandler>();

  // Stores all the newly joined/recovered servers that are waiting for the
  // synchronization to end.
  private final Map<String, PeerHandler> pendingPeers =
      new HashMap<String, PeerHandler>();

  /**
   * The established epoch for this leader.
   */
  private long establishedEpoch = -1;

  /**
   * The zxid of the pending COP, or null if there's no pending COP.
   */
  private Zxid pendingCopZxid = null;

  /**
   * PreProcessor converts requests to idempotent transactions.
   */
  private PreProcessor preProcessor = null;

  /**
   * Determines which transactions can be committed.
   */
  private AckProcessor ackProcessor = null;

  /**
   * The leader needs to maintain its own last proposed zxid, its own last
   * acknowledged zxid, and the last committed zxid it has received. The last
   * two records are not necessarily up to date, since the events happen in
   * different threads and are passed along the message pipeline; the leader
   * only has the chance to observe them in the main thread. But we can
   * guarantee that in the main thread no zxid after lastProposedZxid has
   * been proposed, and that the last acknowledged and committed zxids have
   * actually been acknowledged/committed.
   */
  private Zxid lastProposedZxid;
  private Zxid lastAckedZxid;
  private Zxid lastCommittedZxid;

  private static final Logger LOG = LoggerFactory.getLogger(Leader.class);
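  // The constructor below records this server as the elected leader
  // (electedLeader is set to its own id) and installs a LeaderFilter so
  // DISCONNECTED notifications are intercepted before normal message
  // handling (see the LeaderFilter inner class at the bottom of this file).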
  public Leader(ParticipantState participantState,
                StateMachine stateMachine,
                ZabConfig config) {
    super(participantState, stateMachine, config);
    this.electedLeader = participantState.getServerId();
    filter = new LeaderFilter(messageQueue, election);
    MDC.put("state", "leading");
  }

  @Override
  protected void changePhase(Phase phase)
      throws IOException, InterruptedException, ExecutionException {
    this.currentPhase = phase;
    if (phase == Phase.DISCOVERING) {
      MDC.put("phase", "discovering");
      if (stateChangeCallback != null) {
        stateChangeCallback.leaderDiscovering(this.serverId);
      }
      if (failCallback != null) {
        failCallback.leaderDiscovering();
      }
    } else if (phase == Phase.SYNCHRONIZING) {
      MDC.put("phase", "synchronizing");
      if (stateChangeCallback != null) {
        stateChangeCallback.leaderSynchronizing(persistence.getProposedEpoch());
      }
      if (failCallback != null) {
        failCallback.leaderSynchronizing();
      }
    } else if (phase == Phase.BROADCASTING) {
      MDC.put("phase", "broadcasting");
      if (failCallback != null) {
        failCallback.leaderBroadcasting();
      }
      if (stateChangeCallback != null) {
        stateChangeCallback.leaderBroadcasting(persistence.getAckEpoch(),
                                               getAllTxns(),
                                               persistence.getLastSeenConfig());
      }
    } else if (phase == Phase.FINALIZING) {
      MDC.put("phase", "finalizing");
      stateMachine.recovering(pendings);
      if (persistence.isInStateTransfer()) {
        // If the participant goes back to the recovering phase while in
        // state transfer mode, we need to explicitly undo the state
        // transfer.
        persistence.undoStateTransfer();
      }
      // Shuts down all the handlers of the followers.
      for (PeerHandler ph : this.quorumMap.values()) {
        ph.shutdown();
        this.quorumMap.remove(ph.getServerId());
      }
    }
  }
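  // The two entry points below correspond to the two ways a leader can come
  // up: join() bootstraps a brand-new single-server cluster and goes
  // straight to the broadcasting phase, while lead() runs the full Zab
  // recovery protocol (discovery, then synchronization) before broadcasting.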
  /**
   * Starts by joining the leader itself.
   *
   * @param peer must be the same as the serverId of the leader.
   * @throws Exception in case something goes wrong.
   */
  @Override
  public void join(String peer) throws Exception {
    try {
      // Initializes the persistent variables.
      List<String> peers = new ArrayList<String>();
      peers.add(this.serverId);
      ClusterConfiguration cnf =
          new ClusterConfiguration(new Zxid(0, 0), peers, this.serverId);
      persistence.setLastSeenConfig(cnf);
      ByteBuffer cop = cnf.toByteBuffer();
      Transaction txn =
          new Transaction(cnf.getVersion(), ProposalType.COP_VALUE, cop);
      // We also need to append the initial configuration to the log.
      persistence.getLog().append(txn);
      persistence.setProposedEpoch(0);
      persistence.setAckEpoch(0);

      /* -- Broadcasting phase -- */
      // Initializes the vote for leader election.
      this.election.specifyLeader(this.serverId);
      changePhase(Phase.BROADCASTING);
      broadcasting();
    } catch (InterruptedException e) {
      LOG.debug("Participant is canceled by user.");
      throw e;
    } catch (TimeoutException e) {
      LOG.debug("Didn't hear message from peers for {} milliseconds. Going" +
          " back to leader election.", this.config.getTimeoutMs());
    } catch (BackToElectionException e) {
      LOG.debug("Got GO_BACK message from queue, going back to electing.");
    } catch (LeftCluster e) {
      LOG.debug("Exit running : {}", e.getMessage());
      throw e;
    } catch (Exception e) {
      LOG.error("Caught exception", e);
      throw e;
    } finally {
      changePhase(Phase.FINALIZING);
    }
  }

  public void lead() throws Exception {
    try {
      /* -- Discovering phase -- */
      changePhase(Phase.DISCOVERING);
      waitProposedEpochFromQuorum();
      proposeNewEpoch();
      waitEpochAckFromQuorum();
      LOG.debug("Established new epoch {}", persistence.getProposedEpoch());
      // Finds the one who has the "best" history.
      String peerId = selectSyncHistoryOwner();
      LOG.debug("Chose {} to pull its history.", peerId);

      /* -- Synchronizing phase -- */
      LOG.debug("Synchronizing...");
      changePhase(Phase.SYNCHRONIZING);
      if (!peerId.equals(this.serverId)) {
        // Pulls the history from the follower.
        synchronizeFromFollower(peerId);
      }
      // Updates the ACK EPOCH of the leader.
      persistence.setAckEpoch(persistence.getProposedEpoch());
      // Starts synchronizing.
      long st = System.nanoTime();
      beginSynchronizing();
      waitNewLeaderAckFromQuorum();
      long syncTime = System.nanoTime() - st;
      // Adjusts the sync timeout based on this synchronization time.
      adjustSyncTimeout((int)(syncTime / 1000000));
      // After receiving ACKs from all the peers in the quorum, broadcasts
      // the COMMIT message to all the peers in the quorum map.
      broadcastCommitMessage();
      // Sees if it can be restored from the snapshot file.
      restoreFromSnapshot();
      // Delivers all the txns in the log before entering the broadcasting
      // phase.
      deliverUndeliveredTxns();

      /* -- Broadcasting phase -- */
      changePhase(Phase.BROADCASTING);
      for (PeerHandler ph : this.quorumMap.values()) {
        ph.startBroadcastingTask();
        ph.updateHeartbeatTime();
      }
      broadcasting();
    } catch (InterruptedException e) {
      LOG.debug("Participant is canceled by user.");
      throw e;
    } catch (TimeoutException e) {
      LOG.debug("Didn't hear message from peers for {} milliseconds. Going" +
          " back to leader election.", this.config.getTimeoutMs());
    } catch (BackToElectionException e) {
      LOG.debug("Got GO_BACK message from queue, going back to electing.");
    } catch (Zab.SimulatedException e) {
      LOG.debug("Got SimulatedException, go back to leader election.");
    } catch (LeftCluster e) {
      LOG.debug("Exit running : {}", e.getMessage());
      throw e;
    } catch (Exception e) {
      LOG.error("Caught exception", e);
      throw e;
    } finally {
      if (this.currentPhase == Phase.SYNCHRONIZING) {
        incSyncTimeout();
        LOG.debug("Go back to recovery in synchronization phase, increase " +
            "sync timeout to {} milliseconds.", getSyncTimeoutMs());
      }
      changePhase(Phase.FINALIZING);
    }
  }

  /**
   * Gets the minimal quorum size.
   *
   * @return the minimal quorum size
   * @throws IOException in case of IO failures.
   */
  public int getQuorumSize() throws IOException {
    return persistence.getLastSeenConfig().getQuorumSize();
  }
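  // Discovery phase: the methods below implement the leader's side of epoch
  // establishment. The leader first collects proposed epochs from a quorum,
  // then proposes an epoch larger than every epoch it has seen, and finally
  // waits for the quorum to acknowledge the new epoch.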
  /**
   * Waits until it receives the CEPOCH (PROPOSED_EPOCH) message from a
   * quorum.
   *
   * @throws InterruptedException if it is interrupted.
   * @throws TimeoutException in case of timeout.
   * @throws IOException in case of IO failures.
   */
  void waitProposedEpochFromQuorum()
      throws InterruptedException, TimeoutException, IOException {
    ClusterConfiguration currentConfig = persistence.getLastSeenConfig();
    long acknowledgedEpoch = persistence.getAckEpoch();
    // Waits for PROPOSED_EPOCH from a quorum of peers in the current
    // configuration.
    while (this.quorumMap.size() < getQuorumSize() - 1) {
      MessageTuple tuple =
          filter.getExpectedMessage(MessageType.PROPOSED_EPOCH, null,
                                    config.getTimeoutMs());
      Message msg = tuple.getMessage();
      String source = tuple.getServerId();
      ZabMessage.ProposedEpoch epoch = msg.getProposedEpoch();
      ClusterConfiguration peerConfig =
          ClusterConfiguration.fromProto(epoch.getConfig(), source);
      long peerProposedEpoch = epoch.getProposedEpoch();
      long peerAckedEpoch = epoch.getCurrentEpoch();
      int syncTimeoutMs = epoch.getSyncTimeout();
      Zxid peerVersion = peerConfig.getVersion();
      Zxid selfVersion = currentConfig.getVersion();
      // If the peer's config version doesn't match the leader's config
      // version, we'll check if the peer has the more likely "correct"
      // configuration.
      if (!peerVersion.equals(selfVersion)) {
        LOG.debug("{}'s config version {} is different from leader's {}",
                  source, peerVersion, selfVersion);
        if (peerAckedEpoch > acknowledgedEpoch ||
            (peerAckedEpoch == acknowledgedEpoch &&
             peerVersion.compareTo(selfVersion) > 0)) {
          LOG.debug("{} probably has the right configuration, go back to " +
              "leader election.", source);
          // TODO : currently we just go back to leader election, but
          // probably we want to select the peer as leader.
          throw new BackToElectionException();
        }
      }
      // Rejects peers that are not in the current configuration.
      if (!currentConfig.contains(source)) {
        LOG.debug("Current configuration doesn't contain {}, ignores it.",
                  source);
        continue;
      }
      if (this.quorumMap.containsKey(source)) {
        throw new RuntimeException("Quorum set already contains " + source +
            ", probably a bug?");
      }
      LOG.debug("Got PROPOSED_EPOCH from {}", source);
      PeerHandler ph = new PeerHandler(source, transport,
                                       config.getTimeoutMs() / 3);
      ph.setLastProposedEpoch(peerProposedEpoch);
      ph.setSyncTimeoutMs(syncTimeoutMs);
      this.quorumMap.put(source, ph);
    }
    LOG.debug("Got proposed epoch from a quorum.");
  }

  /**
   * Finds an epoch number which is higher than any proposed epoch in the
   * quorum set and proposes the epoch to them.
   *
   * @throws IOException in case of IO failure.
   */
  void proposeNewEpoch() throws IOException {
    long maxEpoch = persistence.getProposedEpoch();
    int maxSyncTimeoutMs = getSyncTimeoutMs();
    for (PeerHandler ph : this.quorumMap.values()) {
      if (ph.getLastProposedEpoch() > maxEpoch) {
        maxEpoch = ph.getLastProposedEpoch();
      }
      if (ph.getSyncTimeoutMs() > maxSyncTimeoutMs) {
        maxSyncTimeoutMs = ph.getSyncTimeoutMs();
      }
    }
    // The new epoch number should be larger than any follower's epoch.
    long newEpoch = maxEpoch + 1;
    // Updates the leader's last proposed epoch.
    persistence.setProposedEpoch(newEpoch);
    // Updates the leader's sync timeout to the largest timeout found in the
    // quorum.
    setSyncTimeoutMs(maxSyncTimeoutMs);
    LOG.debug("Begins proposing new epoch {} with sync timeout {} ms",
              newEpoch, getSyncTimeoutMs());
    // Sends the new epoch message to the quorum.
    broadcast(this.quorumMap.keySet().iterator(),
              MessageBuilder.buildNewEpochMessage(newEpoch,
                                                  getSyncTimeoutMs()));
  }

  /**
   * Broadcasts the message to all the peers.
   *
   * @param peers the destination peers.
   * @param message the message to be broadcast.
   */
  void broadcast(Iterator<String> peers, Message message) {
    transport.broadcast(peers, message);
  }
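  // In Zab's design, once a quorum has acknowledged the NEW_EPOCH message,
  // the epoch is established: the members of that quorum promise not to
  // accept proposals from any earlier epoch.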
  /**
   * Waits until the new epoch is established.
   *
   * @throws InterruptedException if it is interrupted.
   * @throws TimeoutException in case of timeout.
   */
  void waitEpochAckFromQuorum()
      throws InterruptedException, TimeoutException {
    int ackCount = 0;
    // Waits for the ACK from all the other peers in the quorum set.
    while (ackCount < this.quorumMap.size()) {
      MessageTuple tuple = filter.getExpectedMessage(MessageType.ACK_EPOCH,
                                                     null,
                                                     config.getTimeoutMs());
      Message msg = tuple.getMessage();
      String source = tuple.getServerId();
      if (!this.quorumMap.containsKey(source)) {
        LOG.warn("The Epoch ACK comes from {} who is not in quorum set, " +
            "possibly from previous epoch?", source);
        continue;
      }
      ackCount++;
      ZabMessage.AckEpoch ackEpoch = msg.getAckEpoch();
      ZabMessage.Zxid zxid = ackEpoch.getLastZxid();
      // Updates the follower's f.a and lastZxid.
      PeerHandler ph = this.quorumMap.get(source);
      ph.setLastAckedEpoch(ackEpoch.getAcknowledgedEpoch());
      ph.setLastZxid(MessageBuilder.fromProtoZxid(zxid));
    }
    LOG.debug("Received ACKs from the quorum set of size {}.",
              this.quorumMap.size() + 1);
  }

  /**
   * Finds the server who has the largest acknowledged epoch and the longest
   * history.
   *
   * @return the id of the server
   * @throws IOException in case of IO failures.
   */
  String selectSyncHistoryOwner() throws IOException {
    // L.1.2 Select the history of a follower f to be the initial history
    // of the new epoch. Follower f is such that for every f' in the quorum,
    // f'.a < f.a or (f'.a == f.a && f'.zxid <= f.zxid).
    long ackEpoch = persistence.getAckEpoch();
    Zxid zxid = persistence.getLatestZxid();
    String peerId = this.serverId;
    Iterator<Map.Entry<String, PeerHandler>> iter;
    iter = this.quorumMap.entrySet().iterator();
    while (iter.hasNext()) {
      Map.Entry<String, PeerHandler> entry = iter.next();
      long fEpoch = entry.getValue().getLastAckedEpoch();
      Zxid fZxid = entry.getValue().getLastZxid();
      if (fEpoch > ackEpoch ||
          (fEpoch == ackEpoch && fZxid.compareTo(zxid) > 0)) {
        ackEpoch = fEpoch;
        zxid = fZxid;
        peerId = entry.getKey();
      }
    }
    LOG.debug("{} has largest acknowledged epoch {} and longest history {}",
              peerId, ackEpoch, zxid);
    if (this.stateChangeCallback != null) {
      this.stateChangeCallback.initialHistoryOwner(peerId, ackEpoch, zxid);
    }
    return peerId;
  }

  /**
   * Pulls the history from the server who has the "best" history.
   *
   * @param peerId the id of the server whose history is selected.
   */
  void synchronizeFromFollower(String peerId)
      throws IOException, TimeoutException, InterruptedException {
    LOG.debug("Begins synchronizing from follower {}.", peerId);
    Zxid lastZxid = persistence.getLatestZxid();
    Message pullTxn = MessageBuilder.buildPullTxnReq(lastZxid);
    LOG.debug("Last zxid of {} is {}", this.serverId, lastZxid);
    sendMessage(peerId, pullTxn);
    // Waits until the synchronization is finished.
    waitForSync(peerId);
  }
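  // Synchronization phase: after the leader's own history has been brought
  // up to date (possibly by pulling from the follower with the best
  // history), every follower in the quorum is synchronized to the leader's
  // log, and the leader waits for their ACKs using the longer sync timeout
  // rather than the normal heartbeat timeout.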
  /**
   * Waits for the synchronization to the followers to complete.
   *
   * @throws TimeoutException in case of timeout.
   * @throws InterruptedException in case of interrupt.
   * @throws IOException in case of IO failures.
   */
  void waitNewLeaderAckFromQuorum()
      throws TimeoutException, InterruptedException, IOException {
    LOG.debug("Waiting for the synchronization to the followers to complete.");
    int completeCount = 0;
    Zxid lastZxid = persistence.getLatestZxid();
    while (completeCount < this.quorumMap.size()) {
      // Here we should use the sync timeout.
      MessageTuple tuple = filter.getExpectedMessage(MessageType.ACK, null,
                                                     getSyncTimeoutMs());
      ZabMessage.Ack ack = tuple.getMessage().getAck();
      String source = tuple.getServerId();
      Zxid zxid = MessageBuilder.fromProtoZxid(ack.getZxid());
      if (!this.quorumMap.containsKey(source)) {
        LOG.warn("Quorum set doesn't contain {}, a bug?", source);
        continue;
      }
      if (zxid.compareTo(lastZxid) != 0) {
        LOG.error("The follower {} is not correctly synchronized.", source);
        throw new RuntimeException("The synchronized follower's last zxid " +
            "doesn't match the last zxid of the current leader.");
      }
      PeerHandler ph = this.quorumMap.get(source);
      ph.setLastAckedZxid(zxid);
      completeCount++;
    }
  }

  void broadcastCommitMessage() throws IOException {
    // Broadcasts the commit message.
    Zxid zxid = persistence.getLatestZxid();
    Message commit = MessageBuilder.buildCommit(zxid);
    broadcast(this.quorumMap.keySet().iterator(), commit);
    for (PeerHandler ph : this.quorumMap.values()) {
      ph.setLastCommittedZxid(zxid);
    }
  }

  /**
   * Starts synchronizing the peers in background threads.
   *
   * @throws IOException in case of IO failure.
   */
  void beginSynchronizing() throws IOException {
    // The synchronization is performed in other threads.
    Zxid lastZxid = persistence.getLatestZxid();
    ClusterConfiguration clusterConfig = persistence.getLastSeenConfig();
    long proposedEpoch = persistence.getProposedEpoch();
    for (PeerHandler ph : this.quorumMap.values()) {
      ph.setSyncTask(new SyncPeerTask(ph.getServerId(), ph.getLastZxid(),
                                      lastZxid, clusterConfig),
                     proposedEpoch);
      ph.startSynchronizingTask();
    }
  }

  void broadcastingInit() throws IOException {
    Zxid lastZxid = persistence.getLatestZxid();
    this.establishedEpoch = persistence.getAckEpoch();
    // Gets the initial configuration at the beginning of broadcasting.
    clusterConfig = persistence.getLastSeenConfig();
    // Adds the leader itself to quorumMap.
    PeerHandler lh =
        new PeerHandler(serverId, transport, config.getTimeoutMs() / 3);
    lh.setLastAckedZxid(lastZxid);
    lh.setLastCommittedZxid(lastZxid);
    lh.startBroadcastingTask();
    quorumMap.put(this.serverId, lh);
    this.preProcessor =
        new PreProcessor(stateMachine, quorumMap, clusterConfig);
    this.ackProcessor = new AckProcessor(quorumMap, clusterConfig);
    this.syncProcessor =
        new SyncProposalProcessor(persistence, transport, maxBatchSize);
    this.commitProcessor =
        new CommitProcessor(stateMachine, lastDeliveredZxid, serverId,
                            transport, quorumMap.keySet(), clusterConfig,
                            electedLeader, pendings);
    this.snapProcessor =
        new SnapshotProcessor(stateMachine, persistence, serverId, transport);
    // Notifies the client of the active members and the cluster
    // configuration for the first time.
    stateMachine.leading(new HashSet<String>(quorumMap.keySet()),
                         new HashSet<String>(clusterConfig.getPeers()));
    this.lastCommittedZxid = lastZxid;
    this.lastProposedZxid = lastZxid;
    this.lastAckedZxid = lastZxid;
  }
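  // Broadcasting phase: the leader drives a pipeline of processors.
  // Roughly, PreProcessor turns client requests into idempotent proposals,
  // SyncProposalProcessor logs and acknowledges them, AckProcessor counts
  // ACKs from a quorum to decide what can be committed, and CommitProcessor
  // delivers committed transactions to the state machine.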
  /**
   * After entering the broadcasting phase, the leader broadcasts proposals
   * to the followers.
   *
   * @throws InterruptedException if it's interrupted.
   * @throws TimeoutException in case of timeout.
   * @throws IOException in case of IO failure.
   * @throws ExecutionException in case of exception from executors.
   */
  void broadcasting()
      throws TimeoutException, InterruptedException, IOException,
             ExecutionException {
    // Initialization.
    broadcastingInit();
    try {
      while (this.quorumMap.size() >= clusterConfig.getQuorumSize()) {
        MessageTuple tuple = filter.getMessage(config.getTimeoutMs());
        Message msg = tuple.getMessage();
        String source = tuple.getServerId();
        // Checks if it's a DISCONNECTED message.
        if (msg.getType() == MessageType.DISCONNECTED) {
          String peerId = msg.getDisconnected().getServerId();
          if (quorumMap.containsKey(peerId)) {
            onDisconnected(tuple);
          } else {
            this.transport.clear(peerId);
          }
          continue;
        }
        if (!quorumMap.containsKey(source)) {
          // Received a message sent from a peer who is outside the quorum.
          handleMessageOutsideQuorum(tuple);
        } else {
          // Received a message sent from a peer who is in the quorum.
          handleMessageFromQuorum(tuple);
          PeerHandler ph = quorumMap.get(source);
          if (ph != null) {
            ph.updateHeartbeatTime();
          }
          checkFollowerLiveness();
        }
      }
      LOG.debug("Detects the size of the ensemble is less than the " +
          "quorum size {}, goes back to electing phase.", getQuorumSize());
    } finally {
      ackProcessor.shutdown();
      preProcessor.shutdown();
      commitProcessor.shutdown();
      syncProcessor.shutdown();
      snapProcessor.shutdown();
      this.lastDeliveredZxid = commitProcessor.getLastDeliveredZxid();
      this.participantState.updateLastDeliveredZxid(this.lastDeliveredZxid);
    }
  }

  void handleMessageOutsideQuorum(MessageTuple tuple) throws IOException {
    Message msg = tuple.getMessage();
    String source = tuple.getServerId();
    if (msg.getType() == MessageType.PROPOSED_EPOCH) {
      // The follower is in the recovering phase while the leader is in the
      // broadcasting phase; the leader just sends its current epoch as a
      // NEW_EPOCH message to the follower directly. The synchronization to
      // the recovered follower will begin after receiving its ACK_EPOCH
      // message.
      LOG.debug("Got PROPOSED_EPOCH from {}.", source);
      ClusterConfiguration cnf = persistence.getLastSeenConfig();
      if (!cnf.contains(source)) {
        // Only allows servers who are in the current config to join.
        LOG.warn("Got PROPOSED_EPOCH from {} who is not in config, " +
            "ignores it.", source);
        return;
      }
      int syncTimeoutMs = msg.getProposedEpoch().getSyncTimeout();
      if (syncTimeoutMs > getSyncTimeoutMs()) {
        // Updates the leader's sync timeout if the peer's timeout is larger.
        setSyncTimeoutMs(syncTimeoutMs);
      }
      Message newEpoch =
          MessageBuilder.buildNewEpochMessage(establishedEpoch,
                                              getSyncTimeoutMs());
      sendMessage(source, newEpoch);
    } else if (msg.getType() == MessageType.ACK_EPOCH) {
      LOG.debug("Got ACK_EPOCH from {}", source);
      // Got the ACK_EPOCH message; the leader will start synchronizing the
      // follower up to the last proposed zxid.
      tuple.setZxid(lastProposedZxid);
      onAckEpoch(tuple);
    } else if (msg.getType() == MessageType.QUERY_LEADER) {
      LOG.debug("Got QUERY_LEADER from {}", source);
      Message reply = MessageBuilder.buildQueryReply(this.serverId);
      sendMessage(source, reply);
    } else if (msg.getType() == MessageType.SYNC_HISTORY) {
      // The new joiner will issue the SYNC_HISTORY message first to get its
      // history synchronized.
      onSyncHistory(tuple);
    } else if (msg.getType() == MessageType.ELECTION_INFO) {
      this.election.reply(tuple);
    } else {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Got unexpected message {} from {}.",
                 TextFormat.shortDebugString(msg), source);
      }
    }
  }
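  // Messages from peers inside the quorum are dispatched below. Note that
  // REQUEST, JOIN, and REMOVE are stamped with the next proposed zxid in
  // the main thread before being handed to the processors, which is what
  // keeps the leader's view of lastProposedZxid consistent.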
  void handleMessageFromQuorum(MessageTuple tuple)
      throws ExecutionException, InterruptedException, IOException {
    Message msg = tuple.getMessage();
    String source = tuple.getServerId();
    if (msg.getType() != MessageType.HEARTBEAT && LOG.isDebugEnabled()) {
      LOG.debug("Got message {} from {}",
                TextFormat.shortDebugString(msg), source);
    }
    if (msg.getType() == MessageType.ACK) {
      onAck(tuple);
    } else if (msg.getType() == MessageType.REQUEST) {
      Zxid proposedZxid = getNextProposedZxid();
      // Updates the last proposed zxid for this peer. The FLUSH
      // message needs this zxid to determine when it's safe
      // to deliver the FLUSH request back to the user.
      quorumMap.get(source).setLastProposedZxid(proposedZxid);
      tuple.setZxid(proposedZxid);
      preProcessor.processRequest(tuple);
    } else if (msg.getType() == MessageType.FLUSH_REQ) {
      onFlushReq(tuple);
    } else if (msg.getType() == MessageType.FLUSH) {
      onFlush(tuple);
    } else if (msg.getType() == MessageType.HEARTBEAT) {
      LOG.trace("Got HEARTBEAT reply from {}", source);
    } else if (msg.getType() == MessageType.PROPOSAL) {
      onProposal(tuple);
    } else if (msg.getType() == MessageType.COMMIT) {
      onCommit(tuple);
    } else if (msg.getType() == MessageType.REMOVE) {
      if (pendingCopZxid != null) {
        LOG.warn("There's a pending reconfiguration still in progress.");
        return;
      }
      pendingCopZxid = getNextProposedZxid();
      tuple.setZxid(pendingCopZxid);
      onRemove(tuple);
    } else if (msg.getType() == MessageType.DELIVERED) {
      onDelivered(msg);
    } else if (msg.getType() == MessageType.JOIN) {
      LOG.debug("Got JOIN from {}", source);
      if (pendingCopZxid != null) {
        LOG.warn("There's a pending reconfiguration still in progress.");
        return;
      }
      pendingCopZxid = getNextProposedZxid();
      tuple.setZxid(pendingCopZxid);
      onJoin(tuple);
    } else if (msg.getType() == MessageType.SNAPSHOT) {
      snapProcessor.processRequest(tuple);
    } else if (msg.getType() == MessageType.SNAPSHOT_DONE) {
      commitProcessor.processRequest(tuple);
    } else {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Unexpected message : {} from {}",
                 TextFormat.shortDebugString(msg), source);
      }
    }
  }

  void onJoin(MessageTuple tuple) throws IOException {
    // For the JOIN message, we simply create a PeerHandler and add it to the
    // quorum set of the main thread and to the pending set. Then we pass the
    // JOIN message to PreProcessor and AckProcessor. PreProcessor will
    // convert the JOIN request to a COP proposal.
    Zxid lastZxid =
        MessageBuilder.fromProtoZxid(tuple.getMessage().getJoin()
                                          .getLastZxid());
    String source = tuple.getServerId();
    PeerHandler ph =
        new PeerHandler(source, transport, config.getTimeoutMs() / 3);
    // For the joiner, its history must be empty.
    ph.setLastZxid(lastZxid);
    // We'll synchronize the joiner up to the last proposed zxid of the
    // leader.
    ph.setLastSyncedZxid(tuple.getZxid());
    clusterConfig.addPeer(source);
    quorumMap.put(source, ph);
    pendingPeers.put(source, ph);
    preProcessor.processRequest(tuple);
    ackProcessor.processRequest(tuple);
    commitProcessor.processRequest(tuple);
  }
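  // Reconfiguration (COP) handling: only one membership change may be in
  // flight at a time, which is why JOIN and REMOVE both bail out when
  // pendingCopZxid is non-null. onCommit below clears pendingCopZxid once
  // the COP's zxid has been committed, allowing the next reconfiguration.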
  void onCommit(MessageTuple tuple) {
    Message msg = tuple.getMessage();
    this.lastCommittedZxid =
        MessageBuilder.fromProtoZxid(msg.getCommit().getZxid());
    // If there's a pending COP, we need to find out whether it can be
    // committed.
    if (pendingCopZxid != null &&
        lastCommittedZxid.compareTo(this.pendingCopZxid) >= 0) {
      LOG.debug("COP of {} has been committed.", pendingCopZxid);
      // Resets it to null to allow the next reconfiguration.
      pendingCopZxid = null;
      if (stateChangeCallback != null) {
        stateChangeCallback.commitCop();
      }
    }
    if (!pendingPeers.isEmpty()) {
      // If there are any pending followers waiting for the synchronization
      // to complete, we need to check whether we can send out the first
      // COMMIT message to them.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Got message {} and there're pending peers.",
                  TextFormat.shortDebugString(msg));
      }
      Zxid zxidCommit = this.lastCommittedZxid;
      Iterator<Map.Entry<String, PeerHandler>> iter =
          pendingPeers.entrySet().iterator();
      while (iter.hasNext()) {
        PeerHandler ph = iter.next().getValue();
        Zxid ackZxid = ph.getLastAckedZxid();
        if (ackZxid != null && zxidCommit.compareTo(ackZxid) >= 0) {
          LOG.debug("COMMIT >= last acked zxid {} of pending peer. " +
              "Send COMMIT.", ackZxid);
          Message commit = MessageBuilder.buildCommit(ackZxid);
          sendMessage(ph.getServerId(), commit);
          ph.setLastCommittedZxid(ackZxid);
          ph.startBroadcastingTask();
          iter.remove();
        }
      }
    }
    commitProcessor.processRequest(tuple);
  }

  void onDisconnected(MessageTuple tuple)
      throws InterruptedException, ExecutionException {
    Message msg = tuple.getMessage();
    String peerId = msg.getDisconnected().getServerId();
    // Removes it if it's in pendingPeers.
    pendingPeers.remove(peerId);
    PeerHandler ph = this.quorumMap.get(peerId);
    // Before calling shutdown, we need to disable the PeerHandler first to
    // prevent sending obsolete messages in AckProcessor and PreProcessor.
    // Because once we call shutdown(), a new connection from the peer is
    // allowed, and AckProcessor and PreProcessor should not send obsolete
    // messages over the new connection.
    ph.disableSending();
    // Stops the PeerHandler thread and clears the transport.
    ph.shutdown();
    quorumMap.remove(peerId);
    preProcessor.processRequest(tuple);
    ackProcessor.processRequest(tuple);
    commitProcessor.processRequest(tuple);
  }

  Zxid onAck(MessageTuple tuple) throws IOException {
    String source = tuple.getServerId();
    Message msg = tuple.getMessage();
    ZabMessage.Ack ack = msg.getAck();
    Zxid ackZxid = MessageBuilder.fromProtoZxid(ack.getZxid());
    PeerHandler peer = pendingPeers.get(source);
    if (peer != null) {
      // This is the ACK sent from the peer at the end of the
      // synchronization.
      if (LOG.isDebugEnabled()) {
        LOG.debug("Got first {} from pending peer {}.",
                  TextFormat.shortDebugString(msg), source);
      }
      peer.setLastAckedZxid(ackZxid);
      if (lastCommittedZxid.compareTo(ackZxid) >= 0) {
        // If the zxid of the ACK is already committed, sends the first
        // COMMIT to the follower and starts the broadcasting task. Otherwise
        // we need to wait until the zxid of the ACK is committed.
        LOG.debug("The ACK of pending peer is committed already, send " +
            "COMMIT.");
        Message commit = MessageBuilder.buildCommit(ackZxid);
        sendMessage(peer.getServerId(), commit);
        peer.setLastCommittedZxid(ackZxid);
        // After sending out the first COMMIT message, we can start the
        // broadcasting task.
        peer.startBroadcastingTask();
        pendingPeers.remove(source);
      }
    }
    if (source.equals(this.serverId)) {
      // If the ACK comes from the leader, we need to update the last
      // acknowledged zxid of the leader.
      this.lastAckedZxid = ackZxid;
      if (!pendingPeers.isEmpty()) {
        // If there's any pending peer, we also need to check whether any of
        // them is waiting for a given proposal to be synchronized to the log
        // before its synchronization can be started.
        for (PeerHandler ph : pendingPeers.values()) {
          if (!ph.isSyncStarted() &&
              ackZxid.compareTo(ph.getLastSyncedZxid()) >= 0) {
            // Gets the last configuration which is <= the last synced zxid.
            ClusterConfiguration cnf =
                persistence.getLastConfigWithin(ph.getLastSyncedZxid());
            ph.setSyncTask(new SyncPeerTask(ph.getServerId(),
                                            ph.getLastZxid(),
                                            ph.getLastSyncedZxid(),
                                            cnf),
                           establishedEpoch);
            ph.startSynchronizingTask();
          }
        }
      }
    }
    ackProcessor.processRequest(tuple);
    return ackZxid;
  }
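  // A recovered follower (ACK_EPOCH) differs from a brand-new joiner
  // (SYNC_HISTORY) in one respect: the recovered follower is synchronized
  // up to the leader's last proposed zxid, so its sync task may have to
  // wait until the leader itself has acknowledged that proposal (see onAck
  // above), while the joiner is synchronized only up to the last
  // acknowledged zxid, which is already guaranteed to be in the leader's
  // log.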
  void onAckEpoch(MessageTuple tuple) throws IOException {
    String source = tuple.getServerId();
    Message msg = tuple.getMessage();
    // This is the last proposed zxid from the leader; we'll make it the last
    // transaction of the synchronization.
    Zxid lastZxidOfSync = tuple.getZxid();
    ZabMessage.AckEpoch ackEpoch = msg.getAckEpoch();
    // The last zxid of the peer/follower.
    Zxid lastPeerZxid = MessageBuilder.fromProtoZxid(ackEpoch.getLastZxid());
    PeerHandler ph =
        new PeerHandler(source, transport, config.getTimeoutMs() / 3);
    ph.setLastZxid(lastPeerZxid);
    ph.setLastSyncedZxid(lastZxidOfSync);
    // Adds it to the quorum set of the main thread.
    this.quorumMap.put(source, ph);
    // Adds it to the pending set as well.
    this.pendingPeers.put(source, ph);
    // Adds the newly recovered follower to PreProcessor.
    preProcessor.processRequest(tuple);
    // Adds the newly recovered follower to AckProcessor.
    ackProcessor.processRequest(tuple);
    // Also asks CommitProcessor to notify the clients of membership changes.
    commitProcessor.processRequest(tuple);
    // Before starting the synchronization, we need to guarantee that the
    // last proposed zxid appears in the leader's log. The way we guarantee
    // this is that the leader must have already acknowledged this proposal.
    // Otherwise we can only start synchronizing after receiving the
    // acknowledgment of the proposal from the leader.
    if (lastAckedZxid.compareTo(lastZxidOfSync) >= 0) {
      // Great, the leader has already synchronized the last proposed
      // transaction to its log; we can start the synchronization right now.
      ClusterConfiguration cnf =
          persistence.getLastConfigWithin(ph.getLastSyncedZxid());
      ph.setSyncTask(new SyncPeerTask(ph.getServerId(), ph.getLastZxid(),
                                      ph.getLastSyncedZxid(), cnf),
                     this.establishedEpoch);
      ph.startSynchronizingTask();
    }
  }

  void onRemove(MessageTuple tuple) throws IOException {
    // NOTE : For the REMOVE message, we shouldn't remove the server from
    // quorumMap here; the leaving server will close the transport once the
    // COP gets committed, and then we'll remove it like a normal
    // DISCONNECTED server.
    //   this.quorumMap.remove(server);
    // But we still need to remove the server from PreProcessor, since
    // logically all the proposals after the COP are no longer the
    // responsibility of the removed server.
    Message msg = tuple.getMessage();
    clusterConfig.removePeer(msg.getRemove().getServerId());
    preProcessor.processRequest(tuple);
    ackProcessor.processRequest(tuple);
  }

  void onFlushReq(MessageTuple tuple) {
    Message msg = tuple.getMessage();
    String source = tuple.getServerId();
    PeerHandler ph = quorumMap.get(source);
    Zxid zxid = ph.getLastProposedZxid();
    ZabMessage.FlushRequest req = msg.getFlushRequest();
    msg = MessageBuilder.buildFlush(zxid,
                                    req.getBody().asReadOnlyByteBuffer());
    sendMessage(source, msg);
  }

  void onSyncHistory(MessageTuple tuple) throws IOException {
    String source = tuple.getServerId();
    PeerHandler ph =
        new PeerHandler(source, transport, config.getTimeoutMs() / 3);
    Zxid lastZxid = MessageBuilder.fromProtoZxid(tuple.getMessage()
                                                      .getSyncHistory()
                                                      .getLastZxid());
    // The new joiner will issue the SYNC_HISTORY message first. At this
    // point the joiner's log must be empty.
    ph.setLastZxid(lastZxid);
    // We'll synchronize the peer up to the last zxid that is guaranteed to
    // be in the leader's log.
    ph.setLastSyncedZxid(this.lastAckedZxid);
    ClusterConfiguration cnf =
        persistence.getLastConfigWithin(ph.getLastSyncedZxid());
    ph.setSyncTask(new SyncPeerTask(ph.getServerId(), ph.getLastZxid(),
                                    ph.getLastSyncedZxid(), cnf),
                   establishedEpoch);
    Message reply = MessageBuilder.buildSyncHistoryReply(getSyncTimeoutMs());
    // Sends the reply to tell the new joiner the sync timeout.
    sendMessage(source, reply);
    ph.startSynchronizingTask();
    // Adds it to quorumMap.
    this.quorumMap.put(source, ph);
  }
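  // Failure detection: the broadcasting loop calls checkFollowerLiveness()
  // on every iteration. A peer whose heartbeat is older than the timeout is
  // not removed directly; instead a DISCONNECTED message is enqueued so that
  // the removal goes through the same path as a real transport
  // disconnection, and the peer is marked to avoid duplicate checks.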
  void checkFollowerLiveness() {
    long currentTime = System.nanoTime();
    long timeoutNs;
    for (PeerHandler ph : this.quorumMap.values()) {
      if (ph.getServerId().equals(this.serverId)) {
        continue;
      }
      if (ph.isDisconnected()) {
        // If it has already been marked as disconnected, skips the check.
        continue;
      }
      // Uses a different timeout depending on whether the peer is
      // synchronizing.
      if (!ph.isSynchronizing()) {
        timeoutNs = this.config.getTimeoutMs() * (long)1000000;
      } else {
        timeoutNs = getSyncTimeoutMs() * (long)1000000;
      }
      if (currentTime - ph.getLastHeartbeatTime() >= timeoutNs) {
        // Removes the peer who is likely to be dead.
        String peerId = ph.getServerId();
        LOG.debug("{} is likely to be dead, enqueue a DISCONNECTED message.",
                  peerId);
        // Enqueues a DISCONNECTED message.
        Message disconnected = MessageBuilder.buildDisconnected(peerId);
        this.messageQueue.add(new MessageTuple(this.serverId, disconnected));
        // Marks it as disconnected. Avoids duplicate checks before the
        // DISCONNECTED message is received.
        ph.markDisconnected();
        if (ph.isSynchronizing()) {
          LOG.debug("Can't get heartbeat reply from {} in synchronizing " +
              "phase.", peerId);
          incSyncTimeout();
          LOG.debug("Adjusts sync timeout to {} ms", getSyncTimeoutMs());
        }
      }
    }
  }

  /**
   * Gets the next proposed zxid for the leader in the broadcasting phase.
   *
   * @return the zxid of the next proposed transaction.
   */
  private Zxid getNextProposedZxid() {
    if (lastProposedZxid.getEpoch() != establishedEpoch) {
      lastProposedZxid = new Zxid(establishedEpoch, -1);
    }
    lastProposedZxid = new Zxid(establishedEpoch,
                                lastProposedZxid.getXid() + 1);
    return lastProposedZxid;
  }

  /**
   * A message filter for the leader that extends ElectionMessageFilter. It
   * filters and handles DISCONNECTED messages.
   */
  class LeaderFilter extends ElectionMessageFilter {
    LeaderFilter(BlockingQueue<MessageTuple> msgQueue, Election election) {
      super(msgQueue, election);
    }

    @Override
    protected MessageTuple getMessage(int timeoutMs)
        throws InterruptedException, TimeoutException {
      int startMs = (int)(System.nanoTime() / 1000000);
      while (true) {
        int nowMs = (int)(System.nanoTime() / 1000000);
        int remainMs = timeoutMs - (nowMs - startMs);
        if (remainMs < 0) {
          remainMs = 0;
        }
        MessageTuple tuple = super.getMessage(remainMs);
        if (tuple.getMessage().getType() == MessageType.DISCONNECTED) {
          // Got the DISCONNECTED message enqueued by the onDisconnected
          // callback.
          Message msg = tuple.getMessage();
          String peerId = msg.getDisconnected().getServerId();
          if (quorumMap.containsKey(peerId)) {
            if (currentPhase != Phase.BROADCASTING) {
              // If you lose someone in your quorumMap before the
              // broadcasting phase, you certainly no longer have a quorum of
              // followers; just go back to leader election. The clearance of
              // the transport happens in the exception handlers of the
              // lead/join functions.
              LOG.debug("Lost follower {} in the quorumMap in recovering.",
                        peerId);
              throw new BackToElectionException();
            } else {
              // Lost someone who is in the quorumMap in the broadcasting
              // phase; returns this message to the caller and lets it handle
              // the disconnection.
              LOG.debug("Lost follower {} in the quorumMap in broadcasting.",
                        peerId);
              return tuple;
            }
          } else {
            // Just lost someone we don't care about; clears the transport so
            // it can rejoin later.
            LOG.debug("Lost follower {} outside quorumMap.", peerId);
            transport.clear(peerId);
          }
        } else {
          return tuple;
        }
      }
    }
  }
}