package lsr.paxos;
import static lsr.common.ProcessDescriptor.processDescriptor;
import java.util.BitSet;
import lsr.paxos.messages.Alive;
import lsr.paxos.messages.Message;
import lsr.paxos.messages.MessageType;
import lsr.paxos.network.MessageHandler;
import lsr.paxos.network.Network;
import lsr.paxos.storage.Storage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Represents the failure detector thread. If the current process is the leader,
* this class is responsible for sending an <code>ALIVE</code> message at regular
* intervals. Otherwise it is responsible for suspecting the leader: if no
* message is received from the leader within the suspect timeout, the leader is
* suspected to have crashed, and the registered
* <code>FailureDetectorListener</code> is notified about this event.
*/
public final class ActiveFailureDetector implements Runnable, FailureDetector {

    private static final Logger logger = LoggerFactory.getLogger(ActiveFailureDetector.class);

    /** How long to wait until suspecting the leader. In milliseconds */
    private final int suspectTimeout;

    /** How long the leader waits until sending heartbeats. In milliseconds */
    private final int sendTimeout;

    private final Network network;
    private final MessageHandler innerListener;
    private final Storage storage;
    private final Thread thread;

    /**
     * The view this detector currently works in. It is written while holding
     * this object's monitor (in {@link #start(int)} and by the view change
     * listener). It is volatile because {@link InnerMessageHandler} reads it
     * without holding the monitor; without volatile those reads could observe
     * a stale view.
     */
    private volatile int view;

    /** Follower role: reception time of the last heartbeat from the leader */
    private volatile long lastHeartbeatRcvdTS;

    /** Leader role: time when the last message or heartbeat was sent to all */
    private volatile long lastHeartbeatSentTS;

    private final FailureDetectorListener fdListener;

    /**
     * Initializes new instance of <code>FailureDetector</code>.
     *
     * The detector thread is created (as a daemon thread) but not started;
     * see {@link #start(int)}. The view change listener is registered with the
     * storage here.
     *
     * @param fdListener - the listener which should be notified about
     *            suspecting the leader
     * @param network - used to send and receive messages
     * @param storage - storage containing all data about paxos
     */
    public ActiveFailureDetector(FailureDetectorListener fdListener, Network network,
                                 Storage storage) {
        this.fdListener = fdListener;
        this.network = network;
        this.storage = storage;
        suspectTimeout = processDescriptor.fdSuspectTimeout;
        sendTimeout = processDescriptor.fdSendTimeout;
        thread = new Thread(this, "FailureDetector");
        thread.setDaemon(true);
        innerListener = new InnerMessageHandler();
        storage.addViewChangeListener(viewCahngeListener);
    }

    /**
     * Starts failure detector.
     *
     * Records the initial view and starts the detector thread, then registers
     * the inner listener for both received and sent messages.
     *
     * @param initialView - the view the detector starts in
     */
    public void start(int initialView) {
        synchronized (this) {
            view = initialView;
            thread.start();
        }
        // Any message received from the leader serves also as an ALIVE message.
        Network.addMessageListener(MessageType.ANY, innerListener);
        // Sent messages used when in leader role: also count as ALIVE message
        // so don't reset sending timeout.
        Network.addMessageListener(MessageType.SENT, innerListener);
    }

    /**
     * Stops failure detector.
     *
     * Only the message listeners are unregistered here; this method does not
     * terminate or interrupt the detector thread.
     */
    public void stop() {
        Network.removeMessageListener(MessageType.ANY, innerListener);
        Network.removeMessageListener(MessageType.SENT, innerListener);
    }

    /**
     * Updates state of failure detector, due to view change.
     *
     * On every view change the listener stores the new view, resets the
     * reception time of the last heartbeat and wakes up the detector thread so
     * that it re-evaluates its role. The <code>newLeader</code> argument of the
     * callback is not used.
     */
    protected Storage.ViewChangeListener viewCahngeListener = new Storage.ViewChangeListener() {
        public void viewChanged(int newView, int newLeader) {
            synchronized (ActiveFailureDetector.this) {
                logger.debug("FD has been informed about view {}", newView);
                view = newView;
                lastHeartbeatRcvdTS = getTime();
                ActiveFailureDetector.this.notify();
            }
        }
    };

    /**
     * Main loop of the detector thread. In the leader role it sends an
     * <code>Alive</code> message and sleeps until the next one is due; in the
     * follower role it sleeps until the suspect timeout expires and then raises
     * a suspicion through the listener.
     *
     * If the thread is interrupted, the interrupt status is restored and a
     * <code>RuntimeException</code> wrapping the interruption is thrown.
     */
    @Override
    public void run() {
        logger.info("Starting failure detector");
        try {
            // Warning for maintainers: Deadlock danger!!
            // The code below calls several methods in other classes while
            // holding the this lock.
            // If the methods called acquire locks and then try to call into
            // this failure detector, there is the danger of deadlock.
            // Therefore, always ensure that the methods called below do not
            // themselves obtain locks.
            synchronized (this) {
                while (true) {
                    long now = getTime();
                    if (processDescriptor.isLocalProcessLeader(view)) {
                        // Leader role: send a heartbeat
                        Alive alive = new Alive(view, storage.getLog().getNextId());
                        network.sendToOthers(alive);
                        lastHeartbeatSentTS = now;
                        long nextSend = lastHeartbeatSentTS + sendTimeout;
                        while (now < nextSend && processDescriptor.isLocalProcessLeader(view)) {
                            if (logger.isTraceEnabled()) {
                                logger.trace("Sending next Alive in {} ms", nextSend - now);
                            }
                            wait(nextSend - now);
                            // Recompute the state. lastHeartbeatSentTS might
                            // have changed (a message sent to all resets it).
                            now = getTime();
                            nextSend = lastHeartbeatSentTS + sendTimeout;
                        }
                        // Either no longer the leader or it is time to send
                        // a heartbeat
                    } else {
                        // Follower role
                        lastHeartbeatRcvdTS = now;
                        long suspectTime = lastHeartbeatRcvdTS + suspectTimeout;
                        // Loop until either this process becomes the leader or
                        // until it is time to suspect the leader
                        while (now < suspectTime && !processDescriptor.isLocalProcessLeader(view)) {
                            if (logger.isTraceEnabled()) {
                                logger.trace("Suspecting leader ({}) in {} ms",
                                        processDescriptor.getLeaderOfView(view), suspectTime - now);
                            }
                            wait(suspectTime - now);
                            // Recompute the state. lastHeartbeatRcvdTS might
                            // have changed (a message from the leader or a
                            // view change resets it).
                            now = getTime();
                            suspectTime = lastHeartbeatRcvdTS + suspectTimeout;
                        }
                        if (!processDescriptor.isLocalProcessLeader(view)) {
                            // Raise the suspicion. A suspect task will be
                            // queued for execution on the Protocol thread.
                            fdListener.suspect(view);
                            // The view change is done asynchronously as seen
                            // from this thread. To avoid raising multiple
                            // suspicions, this thread suspends until the view
                            // change completes. When that happens, the view
                            // change listener will notify() this monitor,
                            // thereby unlocking this thread.
                            int oldView = view;
                            while (oldView == view) {
                                logger.debug("FD is waiting for view change from {}", oldView);
                                wait();
                            }
                            logger.debug("FD now knows about new view");
                        }
                    }
                }
            }
        } catch (InterruptedException ex) {
            // Restore the interrupt status so that code observing this thread
            // can still see that it was interrupted.
            Thread.currentThread().interrupt();
            throw new RuntimeException(ex);
        }
    }

    /**
     * Intercepts any message sent or received, used to reset the timeouts for
     * sending and receiving ALIVE messages.
     *
     * These methods are called by the Network thread.
     *
     * @author Nuno Santos (LSR)
     */
    final class InnerMessageHandler implements MessageHandler {

        /**
         * Follower role: treats a message coming from the leader of the
         * current view as a heartbeat and records its reception time.
         *
         * @param message - the received message (its content is not inspected)
         * @param sender - id of the process that sent the message
         */
        public void onMessageReceived(Message message, int sender) {
            // Read the view once so both checks below use the same value.
            final int currentView = view;
            // followers only.
            if (processDescriptor.isLocalProcessLeader(currentView)) {
                return;
            }
            // Use the message as heartbeat if the local process is
            // a follower and the sender is the leader of the current view
            if (sender == processDescriptor.getLeaderOfView(currentView)) {
                lastHeartbeatRcvdTS = getTime();
            }
        }

        /**
         * Leader role: treats a non-Alive message sent to all other replicas
         * as a heartbeat and records the time it was sent.
         *
         * @param message - the message that was sent
         * @param destinations - set of process ids the message was sent to
         */
        public void onMessageSent(Message message, BitSet destinations) {
            // leader only.
            if (!processDescriptor.isLocalProcessLeader(view)) {
                return;
            }
            // Ignore Alive messages, the clock was already reset when the
            // message was sent.
            if (message.getType() == MessageType.Alive) {
                return;
            }
            // If the message is not sent to all, ignore it as it is not useful
            // as a heartbeat. Use n-1 because a process does not send to self
            if (destinations.cardinality() < processDescriptor.numReplicas - 1) {
                return;
            }
            // Check if comment above is true
            assert !destinations.get(processDescriptor.localId) : message;
            // This process just sent a message to all. Reset the timeout.
            lastHeartbeatSentTS = getTime();
        }
    }

    /**
     * Returns the current value of {@link System#nanoTime()} converted to
     * milliseconds. The value is only meaningful for computing differences
     * between two calls; it is not wall-clock time.
     *
     * @return a timestamp in milliseconds
     */
    static long getTime() {
        return System.nanoTime() / 1000000L;
    }
}