package org.jgroups.protocols;

import org.jgroups.*;
import org.jgroups.annotations.*;
import org.jgroups.stack.Protocol;
import org.jgroups.util.BoundedList;
import org.jgroups.util.MessageBatch;
import org.jgroups.util.TimeScheduler;
import org.jgroups.util.Util;

import java.io.DataInput;
import java.io.DataOutput;
import java.util.*;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Supplier;


/**
 * Failure detection based on a simple heartbeat protocol. Regularly polls members
 * for liveness. Multicasts SUSPECT messages when a member is not reachable. The
 * simple algorithm works as follows: the membership is known and ordered. Each
 * HB protocol periodically sends an 'are-you-alive' message to its *neighbor*.
 * A neighbor is the next in rank in the membership list, which is recomputed
 * upon a view change. When a response hasn't been received for n milliseconds
 * and m tries, the corresponding member is suspected (and eventually excluded
 * if faulty).
 * <p>
 * FD starts when it detects (in a view change notification) that there are at
 * least 2 members in the group. It stops running when the membership drops
 * below 2.
 * <p>
 * When a message is received from the monitored neighbor member, it causes the
 * pinger thread to 'skip' sending the next are-you-alive message. Thus, traffic
 * is reduced.
 *
 * @author Bela Ban
 */
@MBean(description="Failure detection based on simple heartbeat protocol")
public class FD extends Protocol {

    /* -----------------------------------------    Properties     -------------------------------------------------- */

    @Property(description="Timeout to suspect a node P if neither a heartbeat nor data were received from P.")
    protected long                       timeout=3000;

    @Property(description="Number of times to send an are-you-alive message")
    protected int                        max_tries=5;

    @Property(description="Treat messages received from members as heartbeats. Note that this means we're updating " +
      "a value in a hashmap every time a message is passing up the stack through FD, which is costly.")
    boolean                              msg_counts_as_heartbeat=true;


    /* ---------------------------------------------   JMX      ------------------------------------------------------ */

    protected int                        num_heartbeats;

    protected int                        num_suspect_events;

    protected final BoundedList<String>  suspect_history=new BoundedList<>(20);


    /* --------------------------------------------- Fields ------------------------------------------------------ */

    protected Address                    local_addr;

    /** Value of System.nanoTime() when ping_dest was last heard from (or when monitoring was last reset) */
    protected volatile long              last_ack=System.nanoTime();

    /** Number of the current try; reset to 1 whenever ping_dest is heard from or changes */
    protected final AtomicInteger        num_tries=new AtomicInteger(0);

    protected final Lock                 lock=new ReentrantLock();

    @GuardedBy("lock")
    protected volatile Address           ping_dest;

    @GuardedBy("lock")
    protected final List<Address>        members=new ArrayList<>();

    /** Members from which we select ping_dest. Copy of {@link #members} minus the suspected members */
    @GuardedBy("lock")
    protected final List<Address>        pingable_mbrs=new ArrayList<>();

    protected TimeScheduler              timer;

    // task that performs the actual monitoring for failure detection
    @GuardedBy("lock")
    protected Future<?>                  monitor_future=null;

    /** Transmits SUSPECT message until view change or UNSUSPECT is received */
    protected final Broadcaster          bcast_task=new Broadcaster();


    @ManagedAttribute(description="Member address")
    public String getLocalAddress() {return local_addr != null? local_addr.toString() : "null";}

    @ManagedAttribute(description="List of cluster members")
    public String getMembers() {return printUnderLock(members);}

    @ManagedAttribute(description="List of pingable members of a cluster")
    public String getPingableMembers() {return printUnderLock(pingable_mbrs);}

    @ManagedAttribute(description="Ping destination")
    public String getPingDest() {
        Address dest=ping_dest; // single read of the volatile field: it may be changed concurrently
        return dest != null? dest.toString() : "null";
    }

    @ManagedAttribute(description="Number of heartbeats sent")
    public int getNumberOfHeartbeatsSent() {return num_heartbeats;}

    @ManagedAttribute(description="Number of suspect events received")
    public int getNumSuspectEventsGenerated() {return num_suspect_events;}

    public long getTimeout()                  {return timeout;}
    public void setTimeout(long timeout)      {this.timeout=timeout;}
    public int  getMaxTries()                 {return max_tries;}
    public void setMaxTries(int max_tries)    {this.max_tries=max_tries;}
    public int  getCurrentNumTries()          {return num_tries.get();}


    @ManagedOperation(description="Print suspect history")
    public String printSuspectHistory() {
        StringBuilder sb=new StringBuilder();
        for(String addr: suspect_history)
            sb.append(addr).append("\n");
        return sb.toString();
    }

    /** Resets the heartbeat and suspect counters and clears the suspect history */
    public void resetStats() {
        num_heartbeats=num_suspect_events=0;
        suspect_history.clear();
    }

    /**
     * Fetches the timer from the transport.
     * @throws Exception if the transport returns no timer
     */
    public void init() throws Exception {
        timer=getTransport().getTimer();
        if(timer == null)
            throw new Exception("timer cannot be retrieved");
    }

    /** Clears ping_dest and cancels the monitor task */
    public void stop() {
        lock.lock();
        try {
            ping_dest=null;
            stopMonitor();
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Returns the successor of local_addr in mbrs, wrapping around to the first element if local_addr is last.
     * @param mbrs The list to pick from
     * @return The neighbor, or null if mbrs is null, has fewer than 2 elements, local_addr is null or
     *         local_addr is not contained in mbrs
     */
    protected Address getPingDest(List<Address> mbrs) {
        if(mbrs == null || mbrs.size() < 2 || local_addr == null)
            return null;
        int index=mbrs.indexOf(local_addr);
        if(index < 0)
            return null;
        return mbrs.get((index + 1) % mbrs.size());
    }

    @ManagedOperation(description="Stops checking for crashed members")
    public void stopFailureDetection() {
        lock.lock(); // stopMonitor() requires the lock to be held
        try {
            stopMonitor();
        }
        finally {
            lock.unlock();
        }
    }

    @ManagedOperation(description="Resumes checking for crashed members")
    public void startFailureDetection() {
        lock.lock(); // startMonitor() requires the lock to be held
        try {
            startMonitor();
        }
        finally {
            lock.unlock();
        }
    }

    /**
     * Schedules a new {@link Monitor} (every timeout ms) unless one is already running, and resets
     * last_ack and num_tries. Requires lock to be held by caller
     */
    @GuardedBy("lock")
    protected void startMonitor() {
        if(monitor_future == null || monitor_future.isDone()) {
            last_ack=System.nanoTime();  // start from scratch
            monitor_future=timer.scheduleWithFixedDelay(new Monitor(), timeout, timeout, TimeUnit.MILLISECONDS, false);
            num_tries.set(1);
        }
    }

    /** Cancels the monitor task if there is one. Requires lock to be held by caller */
    @GuardedBy("lock")
    protected void stopMonitor() {
        if(monitor_future != null) {
            monitor_future.cancel(true);
            monitor_future=null;
        }
    }

    @ManagedAttribute(description="Whether the failure detection monitor is running")
    public boolean isMonitorRunning() {
        // Read the field once: this may be called without the lock (e.g. via JMX) and a concurrent
        // stopMonitor() could set the field to null between a null check and isDone()
        Future<?> future=monitor_future;
        return future != null && !future.isDone();
    }


    /**
     * Handles a single message. Messages without an FD header are passed up (optionally counting as a
     * heartbeat from their sender); FD messages are consumed here.
     * @return The result of up_prot.up(msg) for non-FD messages, null for FD messages
     */
    public Object up(Message msg) {
        FdHeader hdr=msg.getHeader(this.id);
        if(hdr == null) {
            if(msg_counts_as_heartbeat)
                updateTimestamp(msg.getSrc());
            return up_prot.up(msg); // message did not originate from FD layer, just pass up
        }

        switch(hdr.type) {
            case FdHeader.HEARTBEAT:                       // heartbeat request; send heartbeat ack
                Address hb_sender=msg.getSrc();
                log.trace("%s: received are-you-alive from %s, sending response", local_addr, hb_sender);
                sendHeartbeatResponse(hb_sender);
                break;                                     // don't pass up !

            case FdHeader.HEARTBEAT_ACK:                   // heartbeat ack
                updateTimestamp(hdr.from);
                break;

            case FdHeader.SUSPECT:
                if(hdr.mbrs == null)
                    return null;
                log.trace("%s: received suspect message: %s", local_addr, hdr);
                for(Address mbr: hdr.mbrs) {
                    if(local_addr != null && mbr.equals(local_addr)) {
                        log.warn("%s: I was suspected by %s; ignoring the SUSPECT message and sending back a HEARTBEAT_ACK",
                                 local_addr, msg.src());
                        sendHeartbeatResponse(msg.getSrc());
                        continue;
                    }
                    lock.lock();
                    try {
                        computePingDest(mbr); // the suspect is no longer pingable: pick the next neighbor
                    }
                    finally {
                        lock.unlock();
                    }
                    up_prot.up(new Event(Event.SUSPECT, mbr));
                    down_prot.down(new Event(Event.SUSPECT, mbr));
                }
                break;

            case FdHeader.UNSUSPECT:
                if(hdr.mbrs == null)
                    return null;
                log.trace("%s: received unsuspect message: %s", local_addr, hdr);
                hdr.mbrs.forEach(this::unsuspect);
                break;
        }
        return null;
    }

    /**
     * Handles a message batch: FD messages are removed from the batch and processed here, the remaining
     * messages (if any) are passed up. Heartbeat acks (and, if msg_counts_as_heartbeat is set, any batch
     * with a non-null sender) update the timestamp of the batch's sender.
     */
    public void up(MessageBatch batch) {
        Collection<Message> msgs=batch.getMatchingMessages(id, true);
        boolean updated=false;
        if(msgs != null) {
            for(Message msg: msgs) {
                FdHeader hdr=msg.getHeader(id); // header is not null at this point
                if(hdr.type == FdHeader.HEARTBEAT_ACK)
                    updated=true;
                else
                    up(msg); // SUSPECT and HEARTBEAT
            }
        }

        if(updated || (msg_counts_as_heartbeat && batch.sender() != null))
            updateTimestamp(batch.sender());

        if(!batch.isEmpty())
            up_prot.up(batch);
    }

    /**
     * Handles events travelling down the stack: VIEW_CHANGE (recomputes membership and ping_dest, starts
     * or stops the monitor), UNSUSPECT (broadcasts an UNSUSPECT message) and SET_LOCAL_ADDRESS.
     */
    public Object down(Event evt) {
        switch(evt.getType()) {
            case Event.VIEW_CHANGE:
                Object retval=down_prot.down(evt);
                View view=evt.getArg();
                lock.lock();
                try {
                    members.clear();
                    members.addAll(view.getMembers());
                    bcast_task.adjustSuspectedMembers(members);
                    computePingDest(null);
                    if(view.size() <= 1)
                        stopMonitor();
                    else if(!isMonitorRunning())
                        startMonitor();
                }
                finally {
                    lock.unlock();
                }
                return retval;

            case Event.UNSUSPECT:
                FdHeader hdr=new FdHeader(FdHeader.UNSUSPECT);
                hdr.mbrs=new ArrayList<>();
                hdr.mbrs.add(evt.getArg());
                hdr.from=local_addr;
                Message unsuspect_msg=new Message().setFlag(Message.Flag.INTERNAL).putHeader(id, hdr);
                log.trace("%s: broadcasting UNSUSPECT message (mbrs=%s)", local_addr, hdr.mbrs);
                down_prot.down(unsuspect_msg);
                break;

            case Event.SET_LOCAL_ADDRESS:
                local_addr=evt.getArg();
                break;
        }
        return down_prot.down(evt);
    }


    /** Sends a HEARTBEAT_ACK (with from=local_addr) to dest */
    protected void sendHeartbeatResponse(Address dest) {
        Message hb_ack=new Message(dest).setFlag(Message.Flag.INTERNAL);
        FdHeader tmp_hdr=new FdHeader(FdHeader.HEARTBEAT_ACK);
        tmp_hdr.from=local_addr;
        hb_ack.putHeader(this.id, tmp_hdr);
        down_prot.down(hb_ack);
    }

    /**
     * Removes mbr from the suspected members and recomputes pingable_mbrs and ping_dest. Acquires
     * {@link #lock} itself, so the caller does not need to hold it.
     */
    protected void unsuspect(Address mbr) {
        lock.lock();
        try {
            bcast_task.removeSuspectedMember(mbr);
            computePingDest(null);
        }
        finally {
            lock.unlock();
        }
    }

    /** Resets last_ack and num_tries if sender is the current ping_dest */
    protected void updateTimestamp(Address sender) {
        if(Objects.equals(sender, ping_dest)) {
            last_ack=System.nanoTime();
            num_tries.set(1);
        }
    }

    /**
     * Computes pingable_mbrs (based on the current membership and the suspected members) and ping_dest.
     * If ping_dest changed, num_tries and last_ack are reset. Requires lock to be held by caller
     * @param remove The member to be removed from pingable_mbrs. If null, pingable_mbrs is recomputed
     *               from scratch as members minus the suspected members
     */
    @GuardedBy("lock")
    protected void computePingDest(Address remove) {
        if(remove != null)
            pingable_mbrs.remove(remove);
        else {
            pingable_mbrs.clear();
            pingable_mbrs.addAll(members);
            pingable_mbrs.removeAll(bcast_task.getSuspectedMembers());
        }
        Address old_ping_dest=ping_dest;
        ping_dest=getPingDest(pingable_mbrs);
        if(Util.different(old_ping_dest, ping_dest)) {
            num_tries.set(1);
            last_ack=System.nanoTime();
        }
    }

    /** Returns list.toString(), computed while holding {@link #lock} so the list cannot change meanwhile */
    private String printUnderLock(List<Address> list) {
        lock.lock();
        try {
            return list.toString();
        }
        finally {
            lock.unlock();
        }
    }

    /** Returns true if mbr is in {@link #members}; the check is done while holding {@link #lock} */
    private boolean isMember(Address mbr) {
        lock.lock();
        try {
            return members.contains(mbr);
        }
        finally {
            lock.unlock();
        }
    }



    /** Header carried by all FD messages: a type, an optional list of members and the originator */
    public static class FdHeader extends Header {
        public static final byte HEARTBEAT     = 0;
        public static final byte HEARTBEAT_ACK = 1;
        public static final byte SUSPECT       = 2;
        public static final byte UNSUSPECT     = 3;

        protected byte                type=HEARTBEAT;
        protected Collection<Address> mbrs;
        protected Address             from;  // member who detected that suspected_mbr has failed


        public FdHeader() {
        }

        public FdHeader(byte type) {
            this.type=type;
        }

        public FdHeader(byte type, Collection<Address> mbrs, Address from) {
            this(type);
            this.mbrs=mbrs;
            this.from=from;
        }

        public short getMagicId() {return 50;}

        public Supplier<? extends Header> create() {
            return FdHeader::new;
        }

        public String toString() {
            switch(type) {
                case HEARTBEAT:
                    return "heartbeat";
                case HEARTBEAT_ACK:
                    return "heartbeat ack";
                case SUSPECT:
                    return "SUSPECT (suspected_mbrs=" + mbrs + "), from=" + from;
                case UNSUSPECT:
                    return "UNSUSPECT (mbrs=" + mbrs + "), from=" + from;
                default:
                    return "unknown type (" + type + ")";
            }
        }

        public int serializedSize() {
            int retval=Global.BYTE_SIZE; // type
            retval+=Util.size(mbrs);
            retval+=Util.size(from);
            return retval;
        }

        /** Writes type, mbrs and from (in this order) */
        public void writeTo(DataOutput out) throws Exception {
            out.writeByte(type);
            Util.writeAddresses(mbrs, out);
            Util.writeAddress(from, out);
        }

        /** Reads type, mbrs and from (in this order) */
        public void readFrom(DataInput in) throws Exception {
            type=in.readByte();
            mbrs=(Collection<Address>)Util.readAddresses(in, ArrayList.class);
            from=Util.readAddress(in);
        }
    }


    /**
     * Task which periodically sends a heartbeat request to ping_dest and checks if the last_ack from
     * ping_dest exceeded timeout and - if yes, and max_tries has been reached - broadcasts a SUSPECT message
     */
    protected class Monitor implements Runnable {

        public void run() {
            Address dest=ping_dest; // work on a snapshot: ping_dest may change while we're running
            if(dest == null) {
                log.trace("%s: ping_dest is null, skipping timeout check: members=%s, pingable_mbrs=%s",
                          local_addr, members, pingable_mbrs);
                return;
            }

            // 1. send heartbeat request
            Message hb_req=new Message(dest).setFlag(Message.Flag.INTERNAL).putHeader(id, new FdHeader(FdHeader.HEARTBEAT));
            log.trace("%s: sending are-you-alive msg to %s", local_addr, dest);
            down_prot.down(hb_req);
            num_heartbeats++;

            // 2. If the time of the last heartbeat is > timeout and max_tries heartbeat messages have not been
            // received, then broadcast a SUSPECT message. Will be handled by coordinator, which may install
            // a new view

            // time in msecs we haven't heard from ping_dest
            long not_heard_from=TimeUnit.MILLISECONDS.convert(System.nanoTime() - last_ack, TimeUnit.NANOSECONDS);

            // quick & dirty fix: increase timeout by 500 ms to allow for latency (bela June 27 2003)
            if(not_heard_from > timeout + 500) { // no heartbeat ack for more than timeout msecs
                int tmp_tries=num_tries.get();
                if(tmp_tries >= max_tries) {
                    if(!dest.equals(ping_dest)) // ping_dest was changed meanwhile...
                        return;

                    log.debug("%s: received no heartbeat from %s for %d times (%d milliseconds), suspecting it",
                              local_addr, dest, tmp_tries, tmp_tries * timeout);

                    // broadcast a SUSPECT message to all members - loop until unsuspect or view change is received
                    bcast_task.addSuspectedMember(dest);
                    num_tries.set(1);
                    if(stats) {
                        num_suspect_events++;
                        suspect_history.add(String.format("%s: %s", new Date(), dest));
                    }
                }
                else {
                    log.debug("%s: heartbeat missing from %s (number=%d)", local_addr, dest, tmp_tries);
                    num_tries.incrementAndGet();
                }
            }
        }

        public String toString() {
            return FD.class.getSimpleName() + ": Monitor (timeout=" + timeout + "ms)";
        }
    }


    /**
     * Maintains the list of suspected members and the task that periodically broadcasts it to the group.
     * Goal is not to lose a SUSPECT message: since these are bcast unreliably, they might get dropped.
     * The BroadcastTask makes sure they are retransmitted until a view has been received which doesn't
     * contain the suspected members any longer. Then the task terminates.
     * <p>
     * Lock order: the monitor of suspected_mbrs is always acquired before bcast_lock.
     */
    protected final class Broadcaster {
        protected final List<Address> suspected_mbrs=new ArrayList<>(7);
        protected final Lock          bcast_lock=new ReentrantLock();
        @GuardedBy("bcast_lock")
        protected Future<?>           bcast_future=null;
        @GuardedBy("bcast_lock")
        protected BroadcastTask       task;


        /** Returns the live (not copied) list of suspected members */
        protected List<Address> getSuspectedMembers() {
            return suspected_mbrs;
        }

        /**
         * Starts a new task, or - if already running - adds the argument to the running task.
         * @param suspect The member to be added to the task's list of suspects
         */
        protected void startBroadcastTask(Address suspect) {
            bcast_lock.lock();
            try {
                if(bcast_future == null || bcast_future.isDone()) {
                    task=new BroadcastTask(suspected_mbrs);
                    task.addSuspectedMember(suspect);
                    bcast_future=timer.scheduleWithFixedDelay(task,
                                                              0, // run immediately the first time
                                                              timeout, // then every timeout milliseconds, until cancelled
                                                              TimeUnit.MILLISECONDS,
                                                              getTransport() instanceof TCP);
                }
                else {
                    task.addSuspectedMember(suspect);
                }
            }
            finally {
                bcast_lock.unlock();
            }
        }

        /** Cancels the broadcast task (if any) */
        protected void stopBroadcastTask() {
            bcast_lock.lock();
            try {
                if(bcast_future != null) {
                    bcast_future.cancel(true);
                    bcast_future=null;
                    task=null;
                }
            }
            finally {
                bcast_lock.unlock();
            }
        }

        /**
         * Removes from the running task (if any) all suspects which are no longer in suspected_mbrs.
         * The task works on its own copy of the list, so without this step a member that was unsuspected
         * (or left the view) while other suspects remain would continue to be broadcast as suspected.
         * Must be called with the monitor of suspected_mbrs held.
         */
        private void pruneRunningTask() {
            bcast_lock.lock();
            try {
                if(task != null)
                    task.retainSuspectedMembers(suspected_mbrs);
            }
            finally {
                bcast_lock.unlock();
            }
        }

        /** Adds a suspected member. Starts the task if not yet running. Non-members and null are ignored */
        protected void addSuspectedMember(Address mbr) {
            if(mbr == null) return;
            // membership is checked under FD.lock; that lock is released again before suspected_mbrs is locked
            if(!isMember(mbr)) return;
            synchronized(suspected_mbrs) {
                if(!suspected_mbrs.contains(mbr)) {
                    suspected_mbrs.add(mbr);
                    startBroadcastTask(mbr);
                }
            }
        }

        /** Removes a suspected member. Stops the task if no suspects are left */
        void removeSuspectedMember(Address suspected_mbr) {
            if(suspected_mbr == null) return;
            synchronized(suspected_mbrs) {
                suspected_mbrs.remove(suspected_mbr);
                if(suspected_mbrs.isEmpty())
                    stopBroadcastTask();
                else
                    pruneRunningTask();
            }
        }

        /** Removes all elements from suspected_mbrs that are <em>not</em> in the new membership */
        void adjustSuspectedMembers(List<Address> new_mbrship) {
            if(new_mbrship == null || new_mbrship.isEmpty()) return;
            synchronized(suspected_mbrs) {
                suspected_mbrs.retainAll(new_mbrship);
                if(suspected_mbrs.isEmpty())
                    stopBroadcastTask();
                else
                    pruneRunningTask();
            }
        }
    }


    /**
     * Task that broadcasts a SUSPECT message containing its list of suspected members each time it is
     * run. All accesses to suspected_members are synchronized on that list.
     */
    protected final class BroadcastTask implements Runnable {
        protected final List<Address> suspected_members=new ArrayList<>();


        BroadcastTask(List<Address> suspected_members) {
            this.suspected_members.addAll(suspected_members);
        }

        /** Clears the list of suspects; subsequent runs are no-ops until a suspect is added again */
        public void stop() {
            synchronized(suspected_members) {
                suspected_members.clear();
            }
        }

        public void run() {
            FD.FdHeader hdr;
            synchronized(suspected_members) {
                if(suspected_members.isEmpty()) {
                    stop();
                    return;
                }
                hdr=new FdHeader(FdHeader.SUSPECT);
                hdr.mbrs=new ArrayList<>(suspected_members); // snapshot: the message is sent outside the monitor
                hdr.from=local_addr;
            }
            Message suspect_msg=new Message().setFlag(Message.Flag.INTERNAL).putHeader(id, hdr);
            // log the snapshot rather than the live list, which may be modified concurrently
            log.trace("%s: broadcasting SUSPECT message (suspects=%s)", local_addr, hdr.mbrs);
            down_prot.down(suspect_msg);
        }

        /** Adds suspect to the list unless it is null or already present */
        public void addSuspectedMember(Address suspect) {
            if(suspect == null)
                return;
            synchronized(suspected_members) {
                if(!suspected_members.contains(suspect))
                    suspected_members.add(suspect);
            }
        }

        /** Removes all suspects which are not contained in current */
        void retainSuspectedMembers(Collection<Address> current) {
            synchronized(suspected_members) {
                suspected_members.retainAll(current);
            }
        }

        public String toString() {
            synchronized(suspected_members) {
                return "BroadcastTask (" + suspected_members.size() + " suspected mbrs)";
            }
        }
    }

}