package com.techq.available.quorum; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.techq.available.connector.ElectionCnxManager; import com.techq.available.connector.impl.ElectionCnxManagerImpl; /** * * @author CHQ * 2012-2-3 */ public class LeaderElection implements Election { static Logger LOG = LoggerFactory.getLogger(LeaderElection.class); AtomicInteger msgCount = new AtomicInteger(0); LinkedBlockingQueue<Notification> sendqueue = new LinkedBlockingQueue<Notification>(); LinkedBlockingQueue<Notification> recvqueue = new LinkedBlockingQueue<Notification>(); LinkedBlockingQueue<Notification> pingQueue = new LinkedBlockingQueue<Notification>(); HashMap<Long, Vote> recvVotes = new HashMap<Long, Vote>(); HashMap<Long, Vote> outOfVotes = new HashMap<Long, Vote>(); QuorumPeer self; boolean isRunning = true; boolean isAlreadyInited = false; ElectionCnxManager manager = null; public LeaderElection(QuorumPeer peer) { self = peer; manager = new ElectionCnxManagerImpl(peer); } public void startListen() { if (manager != null) manager.startListen(); else LOG.error("i can't bind to the port for ElectionCnxManager is null"); } public void halt() { } @Override public Vote lookForLeader() throws InterruptedException { LOG.info("start looking for leader"); int notTimeout = 200; int finalizeWait = 200; int maxNotificationInterval = 60000; self.startTime = System.currentTimeMillis(); self.logicalClock++; if (!isAlreadyInited) { isAlreadyInited = true; startListen(); Messenger messenger = new Messenger(manager, self, sendqueue, recvqueue, pingQueue, self.curVote); messenger.start(); } resetPeer(); /** * start propse */ sendNotifications(Notification.mType.PROPOSE, this.self.getId()); /** * start running */ while (self.getPeerState() == ServerState.LOOKING && isRunning()) { Notification n = recvqueue.poll(notTimeout, TimeUnit.MILLISECONDS); if (n == null) { LOG.debug("======================Get nothing from poll==================="); if (manager.haveDelivered()) { sendNotifications(Notification.mType.PROPOSE, this.self.getId()); } else { LOG.debug("manager: connected all"); manager.connectAll(); } /* * Exponential backoff */ int tmpTimeOut = notTimeout * 2; notTimeout = (tmpTimeOut < maxNotificationInterval ? tmpTimeOut : maxNotificationInterval); LOG.info("Notification time out: " + notTimeout); } else if (self.getVotingViews().containsKey(n.sid)) { switch (n.state) { case LOOKING: if (n.logicalClock > self.logicalClock) { self.logicalClock = n.logicalClock; self.curVote.logicalclock = n.logicalClock; recvVotes.clear(); if (shouldUpdate(new Vote(n.sid, n.zxid, n.logicalClock, n.state))) { updateProposal(n.leader, n.zxid); } else { //clear all the votes and reset my proposed vote updateProposal(self.getId(), self.getZxid()); } sendNotifications(Notification.mType.ACCEPT, this.self.getId()); } else if (n.logicalClock < self.logicalClock) { if (LOG.isDebugEnabled()) { LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = " + n.logicalClock + ", Logical clock" + self.curVote.logicalclock); } //directly response to it long toId = n.from; //let remote peer know mine Notification not = new Notification( Notification.mType.DISAGREE, self.curVote.proposedLeader, self.curVote.proposedZxid, self.logicalClock, self.getPeerState(), toId, //msg to this guy self.getId()// from me ); sendqueue.offer(not); //do not put this vote break; } else if (shouldUpdate(new Vote(n.sid, n.zxid, n.logicalClock,n.state))) { updateProposal(n.leader, n.zxid); sendNotifications(Notification.mType.ACCEPT, this.self.getId()); } if (LOG.isDebugEnabled()) { LOG.debug("Adding vote: From = " + n.sid + ", Proposed leader = " + n.leader + ", Proposed zxid = " + n.zxid + ", Proposed election epoch = " + n.logicalClock); } Vote v = new Vote(n.leader, n.zxid, n.logicalClock,n.state); recvVotes.put(n.sid, v); //waitAmoment(n); // check if it has quorum if (containsQuorum(recvVotes, v)) { // wait a moment and see if any changed while ((n = recvqueue.poll(finalizeWait, TimeUnit.MILLISECONDS)) != null) { if (shouldUpdate(new Vote(n.leader, n.zxid, n.logicalClock,n.state))) { recvqueue.put(n); break; } } if (n == null) { self.setPeerState((v.getId() == self.getId()) ? ServerState.LEADING : ServerState.FOLLOWING); self.curVote.state = self.getPeerState(); self.curVote.proposedLeader = v.getId(); Vote endVote = new Vote(self.curVote); leaveInstance(endVote); return endVote; } } break; case FOLLOWING: case LEADING: LOG.trace("============Received notification, n.electionEpoch is " + n.logicalClock + ", current logicalclock is " + self.curVote.logicalclock + "========================"); // is same election epoch? if (isSameEpoch(n.logicalClock)) { recvVotes.put(n.sid, new Vote(n.leader, n.zxid, n.logicalClock, n.state)); if (shouldUpdate(new Vote(n.leader, n.zxid, n.logicalClock, n.state))) { updateProposal(n.leader, n.zxid); sendNotifications(Notification.mType.ACCEPT, this.self.getId()); // avoid the instant livelock if (checkLeader(recvVotes, n.leader, n.logicalClock)) { self.setPeerState((n.leader == self.getId()) ? ServerState.LEADING : ServerState.FOLLOWING); LOG.debug("=====================recvset, leader is " + n.leader + "======================="); Vote endVote = new Vote(n.leader, n.zxid, n.logicalClock, n.state); leaveInstance(endVote); return endVote; } } } Vote vote = new Vote(n.leader, n.zxid, n.logicalClock, n.state); outOfVotes.put(n.sid, vote); LOG.info("add to out of votes:" + n + ", now size is:" + outOfVotes.size()); if (containsQuorum(outOfVotes, vote) && checkLeader(outOfVotes, n.leader, n.logicalClock)) { self.logicalClock = n.logicalClock; self.curVote.proposedLeader = n.leader; self.setPeerState((n.leader == self.getId()) ? ServerState.LEADING : ServerState.FOLLOWING); self.curVote.state = self.getPeerState(); LOG.info("=====================recvset, leader is " + n.leader + "======================="); Vote endVote = new Vote(n.leader, n.zxid, n.logicalClock, n.state); leaveInstance(endVote); return endVote; } break; default: LOG.warn("Notification state unrecoginized: " + n.state + " (n.state), " + n.sid + " (n.sid)"); break; } } else { LOG.warn("Ignoring notification from non-cluster member " + n.sid); } } return null; } private void resetPeer() { updateProposal(self.getId(), self.getZxid()); self.curVote = new ProposalVote(self.logicalClock, self.getId(), self.getZxid(), ServerState.LOOKING); } private void waitAmoment(Notification n) { if (n != null) msgCount.addAndGet(1); try { LOG.info("wait a moment, current votes is "+recvVotes.size()+", collect msg:" + msgCount.get() + ", msg:" + n); TimeUnit.SECONDS.sleep(10); } catch (InterruptedException e) { e.printStackTrace(); } } private void waitAmoment() { try { LOG.info("wait a moment"); TimeUnit.SECONDS.sleep(10); } catch (InterruptedException e) { e.printStackTrace(); } } private boolean isSameEpoch(long electionEpoch) { return self.getLogicalClock() == electionEpoch; } private boolean checkLeader(HashMap<Long, Vote> votes, long leader, long electionEpoch) { boolean predicate = true; /* * If everyone else thinks I'm the leader, I must be the leader. The * other two checks are just for the case in which I'm not the leader. * If I'm not the leader and I haven't received a message from leader * stating that it is leading, then predicate is false. */ if (leader != self.getId()) { if (votes.get(leader) == null) { predicate = false; } else if (votes.get(leader).getState() != ServerState.LEADING) { predicate = false; } } return predicate; } private void leaveInstance(Vote v) { if (LOG.isDebugEnabled()) { LOG.debug("About to leave FLE instance: Leader= " + v.getId() + ", Zxid = " + v.getZxid() + ", My id = " + self.getId() + ", My state = " + self.getPeerState()); } recvqueue.clear(); recvVotes.clear(); outOfVotes.clear(); } /** * * @param votes * @return * boolean */ boolean containsQuorum(HashMap<Long, Vote> votes, Vote vote) { HashSet<Long> set = new HashSet<Long>(); /* * First make the views consistent. Sometimes peers will have different * zxids for a server depending on timing. */ for (Map.Entry<Long, Vote> entry : votes.entrySet()) { if (vote.equals(entry.getValue())) { set.add(entry.getKey()); } } return (set.size() * 2) > self.getVotingViews().size(); } private boolean isRunning() { return isRunning; } private void updateProposal(long id, long zxid) { LOG.info("update proposal leader to id:" + id); self.curVote.proposedLeader = id;// self.getId(); self.curVote.proposedZxid = zxid;// self.getZxid(); } /** * check if succeeds our current vote * * @param newVote * @return boolean */ private boolean shouldUpdate(Vote newVote) { long remoteZxid = newVote.getZxid(); long remoteId = newVote.getId(); return (((remoteZxid > self.getZxid())) || ((remoteZxid == self .getZxid()) && (remoteId > self.curVote.proposedLeader))); } private void sendNotifications(Notification.mType type, long from) { for (QuorumPeer.QuorumServer server : self.getVotingViews().values()) { long sid = server.id; Notification notmsg = new Notification(type, self.curVote.proposedLeader, self.curVote.proposedZxid, self.logicalClock, self.curVote.state, sid, from); if (LOG.isDebugEnabled()) { LOG.debug("Sending Notification: " + notmsg); } sendqueue.offer(notmsg); } } @Override public long whoIsLeader() { return -1; } @Override public Notification pollConfirm(long timeout, TimeUnit unit) throws InterruptedException { return pingQueue.poll(timeout, unit); } @Override public void offerACK(Notification n) { sendqueue.offer(n); } @Override public void pushback(Notification n) throws InterruptedException { pingQueue.offer(n); } @Override public Notification pollPing(long timeout, TimeUnit unit) throws InterruptedException { return pingQueue.poll(timeout, unit); } @Override public void offerPING(Notification n) throws InterruptedException { LOG.info("send type:" + n.getType() + " to sid:" + n.sid); sendqueue.offer(n); } @Override public void offerAgree(Notification n) { LOG.info("send type:" + n.getType() + " to sid:" + n.sid); sendqueue.offer(n); } }