package org.jgroups.protocols;
import org.jgroups.*;
import org.jgroups.annotations.MBean;
import org.jgroups.annotations.ManagedAttribute;
import org.jgroups.annotations.ManagedOperation;
import org.jgroups.annotations.Property;
import org.jgroups.stack.Protocol;
import org.jgroups.util.Promise;
import org.jgroups.util.Util;
import java.io.DataInput;
import java.io.DataOutput;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
/**
* Implementation of total order protocol using a sequencer.
* Consult <a href="https://github.com/belaban/JGroups/blob/master/doc/design/SEQUENCER.txt">SEQUENCER.txt</a> for details
* @author Bela Ban
*/
@MBean(description="Implementation of total order protocol using a sequencer")
public class SEQUENCER extends Protocol {
protected Address local_addr;
protected volatile Address coord;
protected volatile View view;
protected volatile boolean is_coord=false;
protected final AtomicLong seqno=new AtomicLong(0);
/** Maintains messages forwarded to the coord which which no ack has been received yet.
* Needs to be sorted so we resend them in the right order
*/
protected final NavigableMap<Long,byte[]> forward_table=new ConcurrentSkipListMap<Long,byte[]>();
protected final Lock send_lock=new ReentrantLock();
protected final Condition send_cond=send_lock.newCondition();
/** When ack_mode is set, we need to wait for an ack for each forwarded message until we can send the next one */
protected volatile boolean ack_mode=true;
/** Set when we block all sending threads to resend all messages from forward_table */
protected volatile boolean flushing=false;
protected volatile boolean running=true;
/** Keeps track of the threads sending messages */
protected final AtomicInteger in_flight_sends=new AtomicInteger(0);
// Maintains received seqnos, so we can weed out dupes
protected final ConcurrentMap<Address,NavigableSet<Long>> delivery_table=Util.createConcurrentMap();
protected volatile Flusher flusher;
/** Used for each resent message to wait until the message has been received */
protected final Promise<Long> ack_promise=new Promise<Long>();
@Property(description="Size of the set to store received seqnos (for duplicate checking)")
protected int delivery_table_max_size=2000;
@Property(description="Number of acks needed before going from ack-mode to normal mode. " +
"0 disables this, which means that ack-mode is always on")
protected int threshold=10;
protected int num_acks=0;
protected long forwarded_msgs=0;
protected long bcast_msgs=0;
protected long received_forwards=0;
protected long received_bcasts=0;
protected long delivered_bcasts=0;
@ManagedAttribute
public boolean isCoordinator() {return is_coord;}
public Address getCoordinator() {return coord;}
public Address getLocalAddress() {return local_addr;}
@ManagedAttribute
public long getForwarded() {return forwarded_msgs;}
@ManagedAttribute
public long getBroadcast() {return bcast_msgs;}
@ManagedAttribute
public long getReceivedForwards() {return received_forwards;}
@ManagedAttribute
public long getReceivedBroadcasts() {return received_bcasts;}
@ManagedAttribute(description="Number of messages in the forward-table")
public int getForwardTableSize() {return forward_table.size();}
public void setThreshold(int new_threshold) {this.threshold=new_threshold;}
public void setDeliveryTableMaxSize(int size) {delivery_table_max_size=size;}
@ManagedOperation
public void resetStats() {
forwarded_msgs=bcast_msgs=received_forwards=received_bcasts=delivered_bcasts=0L;
}
@ManagedOperation
public Map<String,Object> dumpStats() {
Map<String,Object> m=super.dumpStats();
m.put("forwarded",forwarded_msgs);
m.put("broadcast",bcast_msgs);
m.put("received_forwards", received_forwards);
m.put("received_bcasts", received_bcasts);
m.put("delivered_bcasts", delivered_bcasts);
return m;
}
@ManagedOperation
public String printStats() {
return dumpStats().toString();
}
public void start() throws Exception {
super.start();
running=true;
ack_mode=true;
}
public void stop() {
running=false;
unblockAll();
stopFlusher();
super.stop();
}
public Object down(Event evt) {
switch(evt.getType()) {
case Event.MSG:
Message msg=(Message)evt.getArg();
if(msg.getDest() != null || msg.isFlagSet(Message.NO_TOTAL_ORDER) || msg.isFlagSet(Message.OOB))
break;
if(msg.getSrc() == null)
msg.setSrc(local_addr);
if(flushing)
block();
// A seqno is not used to establish ordering, but only to weed out duplicates; next_seqno doesn't need
// to increase monotonically, but only to be unique (https://issues.jboss.org/browse/JGRP-1461) !
long next_seqno=seqno.incrementAndGet();
in_flight_sends.incrementAndGet();
try {
SequencerHeader hdr=new SequencerHeader(is_coord? SequencerHeader.BCAST : SequencerHeader.WRAPPED_BCAST, next_seqno);
msg.putHeader(this.id, hdr);
if(is_coord)
broadcast(msg, false, msg.getSrc(), next_seqno, false); // don't copy, just use the message passed as argument
else {
byte[] marshalled_msg=Util.objectToByteBuffer(msg);
if(log.isTraceEnabled())
log.trace("[" + local_addr + "]: forwarding " + local_addr + "::" + seqno + " to coord " + coord);
forwardToCoord(marshalled_msg, next_seqno);
}
}
catch(Exception ex) {
log.error("failed sending message", ex);
}
finally {
in_flight_sends.decrementAndGet();
}
return null; // don't pass down
case Event.VIEW_CHANGE:
handleViewChange((View)evt.getArg());
break;
case Event.TMP_VIEW:
handleTmpView((View)evt.getArg());
break;
case Event.SET_LOCAL_ADDRESS:
local_addr=(Address)evt.getArg();
break;
}
return down_prot.down(evt);
}
public Object up(Event evt) {
Message msg;
SequencerHeader hdr;
switch(evt.getType()) {
case Event.MSG:
msg=(Message)evt.getArg();
if(msg.isFlagSet(Message.NO_TOTAL_ORDER) || msg.isFlagSet(Message.OOB))
break;
hdr=(SequencerHeader)msg.getHeader(this.id);
if(hdr == null)
break; // pass up
switch(hdr.type) {
case SequencerHeader.FORWARD:
case SequencerHeader.FLUSH:
if(!is_coord) {
if(log.isErrorEnabled())
log.error(local_addr + ": non-coord; dropping FORWARD request from " + msg.getSrc());
return null;
}
Address sender=msg.getSrc();
if(view != null && !view.containsMember(sender)) {
if(log.isErrorEnabled())
log.error(local_addr + ": dropping FORWARD request from non-member " + sender +
"; view=" + view);
return null;
}
broadcast(msg, true, msg.getSrc(), hdr.seqno, hdr.type == SequencerHeader.FLUSH); // do copy the message
received_forwards++;
return null;
case SequencerHeader.BCAST:
deliver(msg, evt, hdr);
received_bcasts++;
return null;
case SequencerHeader.WRAPPED_BCAST:
unwrapAndDeliver(msg, hdr.flush_ack); // unwrap the original message (in the payload) and deliver it
received_bcasts++;
return null;
}
break;
case Event.VIEW_CHANGE:
Object retval=up_prot.up(evt);
handleViewChange((View)evt.getArg());
return retval;
case Event.TMP_VIEW:
handleTmpView((View)evt.getArg());
break;
}
return up_prot.up(evt);
}
/* --------------------------------- Private Methods ----------------------------------- */
protected void handleViewChange(View v) {
List<Address> mbrs=v.getMembers();
if(mbrs.isEmpty()) return;
if(view == null || view.compareTo(v) < 0)
view=v;
else
return;
delivery_table.keySet().retainAll(mbrs);
Address existing_coord=coord, new_coord=mbrs.get(0);
boolean coord_changed=existing_coord == null || !existing_coord.equals(new_coord);
if(coord_changed && new_coord != null) {
stopFlusher();
startFlusher(new_coord); // needs to be done in the background, to prevent blocking if down() would block
}
}
protected void flush(final Address new_coord) throws InterruptedException {
// wait until all threads currently sending messages have returned (new threads after flushing=true) will block
// flushing is set to true in startFlusher()
while(flushing && running) {
if(in_flight_sends.get() == 0)
break;
Thread.sleep(100);
}
send_lock.lockInterruptibly();
try {
if(log.isTraceEnabled())
log.trace(local_addr + ": coord changed from " + coord + " to " + new_coord);
coord=new_coord;
is_coord=local_addr != null && local_addr.equals(coord);
flushMessagesInForwardTable();
}
finally {
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing completed");
flushing=false;
ack_mode=true; // go to ack-mode after flushing
num_acks=0;
send_cond.signalAll();
send_lock.unlock();
}
}
// If we're becoming coordinator, we need to handle TMP_VIEW as
// an immediate change of view. See JGRP-1452.
private void handleTmpView(View v) {
List<Address> mbrs=v.getMembers();
if(mbrs.isEmpty()) return;
Address new_coord=mbrs.get(0);
if(!new_coord.equals(coord) && local_addr != null && local_addr.equals(new_coord))
handleViewChange(v);
}
/**
* Sends all messages currently in forward_table to the new coordinator (changing the dest field).
* This needs to be done, so the underlying reliable unicast protocol (e.g. UNICAST) adds these messages
* to its retransmission mechanism<br/>
* Note that we need to resend the messages in order of their seqnos ! We also need to prevent other message
* from being inserted until we're done, that's why there's synchronization.<br/>
* Access to the forward_table doesn't need to be synchronized as there won't be any insertions during flushing
* (all down-threads are blocked)
*/
protected void flushMessagesInForwardTable() {
if(is_coord) {
for(Map.Entry<Long,byte[]> entry: forward_table.entrySet()) {
Long key=entry.getKey();
byte[] val=entry.getValue();
Message forward_msg=new Message(null, val);
SequencerHeader hdr=new SequencerHeader(SequencerHeader.WRAPPED_BCAST, key);
forward_msg.putHeader(this.id,hdr);
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing (broadcasting) " + local_addr + "::" + key);
down_prot.down(new Event(Event.MSG, forward_msg));
}
return;
}
// for forwarded messages, we need to receive the forwarded message from the coordinator, to prevent this case:
// - V1={A,B,C}
// - A crashes
// - C installs V2={B,C}
// - C forwards messages 3 and 4 to B (the new coord)
// - B drops 3 because its view is still V1
// - B installs V2
// - B receives message 4 and broadcasts it
// ==> C's message 4 is delivered *before* message 3 !
// ==> By resending 3 until it is received, then resending 4 until it is received, we make sure this won't happen
// (see https://issues.jboss.org/browse/JGRP-1449)
while(flushing && running && !forward_table.isEmpty()) {
Map.Entry<Long,byte[]> entry=forward_table.firstEntry();
final Long key=entry.getKey();
byte[] val=entry.getValue();
while(flushing && running && !forward_table.isEmpty()) {
Message forward_msg=new Message(coord, val);
SequencerHeader hdr=new SequencerHeader(SequencerHeader.FLUSH, key);
forward_msg.putHeader(this.id,hdr);
forward_msg.setFlag(Message.Flag.DONT_BUNDLE);
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing (forwarding) " + local_addr + "::" + key + " to coord " + coord);
ack_promise.reset();
down_prot.down(new Event(Event.MSG, forward_msg));
Long ack=ack_promise.getResult(500);
if((ack != null && ack.equals(key)) || !forward_table.containsKey(key))
break;
}
}
}
protected void forwardToCoord(final byte[] marshalled_msg, long seqno) {
if(!running || flushing) {
forward_table.put(seqno,marshalled_msg);
return;
}
if(!ack_mode) {
forward_table.put(seqno, marshalled_msg);
forward(marshalled_msg, seqno, false);
return;
}
send_lock.lock();
try {
forward_table.put(seqno, marshalled_msg);
while(running && !flushing) {
ack_promise.reset();
forward(marshalled_msg, seqno, true);
if(!ack_mode || !running || flushing)
break;
Long ack=ack_promise.getResult(500);
if((ack != null && ack.equals(seqno)) || !forward_table.containsKey(seqno))
break;
}
}
finally {
send_lock.unlock();
}
}
protected void forward(final byte[] marshalled_msg, long seqno, boolean flush) {
Address target=coord;
if(target == null)
return;
Message forward_msg=new Message(target, marshalled_msg);
byte type=flush? SequencerHeader.FLUSH : SequencerHeader.FORWARD;
SequencerHeader hdr=new SequencerHeader(type, seqno);
forward_msg.putHeader(this.id,hdr);
down_prot.down(new Event(Event.MSG, forward_msg));
forwarded_msgs++;
}
protected void broadcast(final Message msg, boolean copy, Address original_sender, long seqno, boolean resend) {
Message bcast_msg=null;
if(!copy) {
bcast_msg=msg; // no need to add a header, message already has one
}
else {
bcast_msg=new Message(null, msg.getRawBuffer(), msg.getOffset(), msg.getLength());
SequencerHeader new_hdr=new SequencerHeader(SequencerHeader.WRAPPED_BCAST, seqno);
bcast_msg.putHeader(this.id, new_hdr);
if(resend) {
new_hdr.flush_ack=true;
bcast_msg.setFlag(Message.Flag.DONT_BUNDLE);
}
}
if(log.isTraceEnabled())
log.trace(local_addr + ": broadcasting " + original_sender + "::" + seqno);
down_prot.down(new Event(Event.MSG,bcast_msg));
bcast_msgs++;
}
/**
* Unmarshal the original message (in the payload) and then pass it up (unless already delivered)
* @param msg
*/
protected void unwrapAndDeliver(final Message msg, boolean flush_ack) {
try {
Message msg_to_deliver=(Message)Util.objectFromByteBuffer(msg.getRawBuffer(), msg.getOffset(), msg.getLength());
SequencerHeader hdr=(SequencerHeader)msg_to_deliver.getHeader(this.id);
if(flush_ack)
hdr.flush_ack=true;
deliver(msg_to_deliver, new Event(Event.MSG, msg_to_deliver), hdr);
}
catch(Exception ex) {
log.error("failure unmarshalling buffer", ex);
}
}
protected void deliver(Message msg, Event evt, SequencerHeader hdr) {
Address sender=msg.getSrc();
if(sender == null) {
if(log.isErrorEnabled())
log.error(local_addr + ": sender is null, cannot deliver " + "::" + hdr.getSeqno());
return;
}
long msg_seqno=hdr.getSeqno();
if(sender.equals(local_addr)) {
forward_table.remove(msg_seqno);
if(hdr.flush_ack) {
ack_promise.setResult(msg_seqno);
if(ack_mode && !flushing && threshold > 0 && ++num_acks >= threshold) {
ack_mode=false;
num_acks=0;
}
}
}
if(!canDeliver(sender, msg_seqno)) {
if(log.isWarnEnabled())
log.warn(local_addr + ": dropped duplicate message " + sender + "::" + msg_seqno);
return;
}
if(log.isTraceEnabled())
log.trace(local_addr + ": delivering " + sender + "::" + msg_seqno);
up_prot.up(evt);
delivered_bcasts++;
}
/**
* Checks if seqno has already been received from sender. This weeds out duplicates.
* Note that this method is never called concurrently for the same sender, as the sender in NAKACK will always be
* the coordinator.
*/
protected boolean canDeliver(Address sender, long seqno) {
NavigableSet<Long> seqno_set=delivery_table.get(sender);
if(seqno_set == null) {
seqno_set=new ConcurrentSkipListSet<Long>();
NavigableSet<Long> existing=delivery_table.put(sender,seqno_set);
if(existing != null)
seqno_set=existing;
}
boolean added=seqno_set.add(seqno);
int size=seqno_set.size();
if(size > delivery_table_max_size) {
// trim the seqno_set to delivery_table_max_size elements by removing the first N seqnos
for(int i=0; i < size - delivery_table_max_size; i++) {
if(seqno_set.pollFirst() == null)
break;
}
}
return added;
}
protected void block() {
send_lock.lock();
try {
while(flushing && running) {
try {
send_cond.await();
}
catch(InterruptedException e) {
}
}
}
finally {
send_lock.unlock();
}
}
protected void unblockAll() {
flushing=false;
send_lock.lock();
try {
send_cond.signalAll();
ack_promise.setResult(null);
}
finally {
send_lock.unlock();
}
}
protected synchronized void startFlusher(final Address new_coord) {
if(flusher == null || !flusher.isAlive()) {
if(log.isTraceEnabled())
log.trace(local_addr + ": flushing started");
// causes subsequent message sends (broadcasts and forwards) to block (https://issues.jboss.org/browse/JGRP-1495)
flushing=true;
flusher=new Flusher(new_coord);
flusher.setName("Flusher");
flusher.start();
}
}
protected void stopFlusher() {
flushing=false;
Thread tmp=flusher;
while(tmp != null && tmp.isAlive()) {
tmp.interrupt();
ack_promise.setResult(null);
try {
tmp.join();
}
catch(InterruptedException e) {
}
}
}
/* ----------------------------- End of Private Methods -------------------------------- */
protected class Flusher extends Thread {
protected final Address new_coord;
public Flusher(Address new_coord) {
this.new_coord=new_coord;
}
public void run() {
try {
flush(new_coord);
}
catch (InterruptedException e) {
}
}
}
public static class SequencerHeader extends Header {
protected static final byte FORWARD = 1;
protected static final byte FLUSH = 2;
protected static final byte BCAST = 3;
protected static final byte WRAPPED_BCAST = 4;
protected byte type=-1;
protected long seqno=-1;
protected boolean flush_ack;
public SequencerHeader() {
}
public SequencerHeader(byte type) {
this.type=type;
}
public SequencerHeader(byte type, long seqno) {
this(type);
this.seqno=seqno;
}
public long getSeqno() {
return seqno;
}
public String toString() {
StringBuilder sb=new StringBuilder(64);
sb.append(printType());
if(seqno >= 0)
sb.append(" seqno=" + seqno);
if(flush_ack)
sb.append(" (flush_ack)");
return sb.toString();
}
protected final String printType() {
switch(type) {
case FORWARD: return "FORWARD";
case FLUSH: return "FLUSH";
case BCAST: return "BCAST";
case WRAPPED_BCAST: return "WRAPPED_BCAST";
default: return "n/a";
}
}
public void writeTo(DataOutput out) throws Exception {
out.writeByte(type);
Util.writeLong(seqno, out);
out.writeBoolean(flush_ack);
}
public void readFrom(DataInput in) throws Exception {
type=in.readByte();
seqno=Util.readLong(in);
flush_ack=in.readBoolean();
}
public int size() {
return Global.BYTE_SIZE + Util.size(seqno) + Global.BYTE_SIZE; // type + seqno + flush_ack
}
}
}