package lsr.paxos.core;

import static lsr.common.ProcessDescriptor.processDescriptor;

import java.io.IOException;
import java.util.BitSet;
import java.util.concurrent.Future;

import lsr.common.RequestType;
import lsr.common.SingleThreadDispatcher;
import lsr.paxos.ActiveFailureDetector;
import lsr.paxos.Batcher;
import lsr.paxos.FailureDetector;
import lsr.paxos.NewPassiveBatcher;
import lsr.paxos.Snapshot;
import lsr.paxos.SnapshotMaintainer;
import lsr.paxos.SnapshotProvider;
import lsr.paxos.core.Proposer.ProposerState;
import lsr.paxos.messages.Accept;
import lsr.paxos.messages.Alive;
import lsr.paxos.messages.Message;
import lsr.paxos.messages.MessageType;
import lsr.paxos.messages.Prepare;
import lsr.paxos.messages.PrepareOK;
import lsr.paxos.messages.Propose;
import lsr.paxos.network.GenericNetwork;
import lsr.paxos.network.MessageHandler;
import lsr.paxos.network.MulticastNetwork;
import lsr.paxos.network.Network;
import lsr.paxos.network.NioNetwork;
import lsr.paxos.network.TcpNetwork;
import lsr.paxos.network.UdpNetwork;
import lsr.paxos.replica.ClientRequestManager;
import lsr.paxos.replica.DecideCallback;
import lsr.paxos.storage.ConsensusInstance;
import lsr.paxos.storage.ConsensusInstance.LogEntryState;
import lsr.paxos.storage.Log;
import lsr.paxos.storage.Storage;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Implements state machine replication. It keeps a replicated log internally
 * and informs the listener of decisions using callbacks. This implementation is
 * monolithic, in the sense that leader election/view change are integrated on
 * the paxos protocol.
 *
 * <p>
 * The first consensus instance is 0. Decisions might not be reached in sequence
 * number order.
 * </p>
 */
public class Paxos implements FailureDetector.FailureDetectorListener {

    private final ProposerImpl proposer;
    private final Acceptor acceptor;
    private final Learner learner;

    /** Notified whenever an instance is decided. Set after construction via setDecideCallback(). */
    private DecideCallback decideCallback;

    /**
     * Threading model - This class uses an event-driven threading model. It
     * starts a Dispatcher thread that is responsible for executing the
     * replication protocol and has exclusive access to the internal data
     * structures. The Dispatcher receives work using the pendingEvents queue.
     */
    /**
     * The Dispatcher thread executes the replication protocol. It receives and
     * executes events placed on the pendingEvents queue: messages from other
     * processes or proposals from the local process.
     *
     * Only this thread is allowed to access the state of the replication
     * protocol. Therefore, there is no need for synchronization when accessing
     * this state. The synchronization is handled by the
     * <code>pendingEvents</code> queue.
     */
    private final SingleThreadDispatcher dispatcher;

    private final Storage storage;

    // Can be a udp, tcp or generic network.
    private final Network network;

    private final FailureDetector failureDetector;
    private final CatchUp catchUp;
    private final SnapshotMaintainer snapshotMaintainer;

    /** Receives, queues and creates batches with client requests. */
    private final Batcher batcher;

    // True once startActivePaxos() has run; see isActive().
    protected boolean active = false;

    /**
     * Initializes new instance of {@link Paxos}.
     *
     * Builds the network stack selected by the {@code paxos.properties}
     * configuration, wires up catch-up, failure detection and the
     * proposer/acceptor/learner roles, then starts the network and the
     * protocol dispatcher thread. Note that the failure detector falls back
     * to a dedicated UDP network when the main network is not suitable for it.
     *
     * @param snapshotProvider - source of snapshots; may be null, in which
     *            case snapshotting is disabled entirely
     * @param storage - the state of the paxos protocol
     *
     * @throws IOException if an I/O error occurs while creating the networks
     * @throws IllegalArgumentException if the configured network type is unknown
     */
    public Paxos(SnapshotProvider snapshotProvider, Storage storage) throws IOException {
        this.storage = storage;
        this.dispatcher = new SingleThreadDispatcher("Protocol");

        if (snapshotProvider != null) {
            logger.info("Starting snapshot maintainer");
            snapshotMaintainer = new SnapshotMaintainer(this.storage, dispatcher, snapshotProvider);
            storage.getLog().addLogListener(snapshotMaintainer);
        } else {
            logger.error("!!! No snapshot support !!!");
            snapshotMaintainer = null;
        }

        // udpNetwork stays null when the main network can carry the failure
        // detector traffic itself (the pure-UDP and Multicast configurations).
        UdpNetwork udpNetwork = null;
        if (processDescriptor.network.equals("TCP")) {
            network = new TcpNetwork();
            // for FD
            udpNetwork = new UdpNetwork();
        } else if (processDescriptor.network.equals("NIO")) {
            network = new NioNetwork();
            // for FD
            udpNetwork = new UdpNetwork();
        } else if (processDescriptor.network.equals("UDP")) {
            network = new UdpNetwork();
        } else if (processDescriptor.network.equals("Multicast")) {
            // for unicast messages, still using TCP
            TcpNetwork tcpNetwork = new TcpNetwork();
            network = new MulticastNetwork(tcpNetwork, storage.getRunUniqueId());
        } else if (processDescriptor.network.equals("Generic")) {
            TcpNetwork tcpNetwork = new TcpNetwork();
            udpNetwork = new UdpNetwork();
            network = new GenericNetwork(tcpNetwork, udpNetwork);
        } else {
            throw new IllegalArgumentException("Unknown network type: " +
                                               processDescriptor.network +
                                               ". Check paxos.properties configuration.");
        }
        logger.info("Network: {}", network.getClass().getCanonicalName());

        catchUp = new CatchUp(snapshotProvider, this, this.storage, network);

        // If the network is not suitable for FD, udpNetwork is created
        failureDetector = new ActiveFailureDetector(this,
                udpNetwork == null ? network : udpNetwork,
                this.storage);

        // create proposer, acceptor and learner
        proposer = new ProposerImpl(this, network, this.storage, processDescriptor.crashModel);
        acceptor = new Acceptor(this, this.storage, network);
        learner = new Learner(this, this.storage);

        batcher = new NewPassiveBatcher(this);

        if (udpNetwork != null)
            udpNetwork.start();
        network.start();
        dispatcher.start();
    }

    /**
     * Registers the decision listener on this protocol instance and on the
     * batcher. Must be called before starting paxos (see the asserts in
     * startActivePaxos / startPassivePaxos).
     *
     * @param decideCallback - the class that should be notified about decisions
     */
    public void setDecideCallback(DecideCallback decideCallback) {
        this.decideCallback = decideCallback;
        batcher.setDecideCallback(decideCallback);
    }

    /** Forwards the client request manager to the proposer. */
    public void setClientRequestManager(ClientRequestManager requestManager) {
        proposer.setClientRequestManager(requestManager);
    }

    /**
     * Joins this process to the paxos protocol. The catch-up and failure
     * detector mechanisms are started and message handlers are registered.
     */
    public void startActivePaxos() {
        assert decideCallback != null : "Cannot start with null DecideCallback";

        logger.info("start active Paxos");

        // Starts the threads on the child modules. Should be done after
        // all the dependencies are established, ie. listeners registered.
        batcher.start();
        proposer.start();
        failureDetector.start(storage.getView());
        active = true;
        // Immediately suspect the leader of view 0 to kick off leader
        // election; harmless if the view already advanced (see suspect()).
        suspect(0);
    }

    /**
     * Joins this process to the paxos protocol. The catch-up and failure
     * detector mechanisms are started and message handlers are registered.
     */
    public void startPassivePaxos() {
        assert decideCallback != null : "Cannot start with null DecideCallback";

        logger.info("starting passive Paxos");

        MessageHandler handler = new MessageHandlerImpl();
        Network.addMessageListener(MessageType.Alive, handler);
        Network.addMessageListener(MessageType.Propose, handler);
        Network.addMessageListener(MessageType.Prepare, handler);
        Network.addMessageListener(MessageType.PrepareOK, handler);
        Network.addMessageListener(MessageType.Accept, handler);
    }

    /**
     * Proposes new value to paxos protocol.
     *
     * This process has to be a leader to call this method. If the process is
     * not a leader, exception is thrown.
     *
     * @param request - the value to propose
     */
    public void enqueueRequest(RequestType request) {
        // called by one of the Selector threads.
        batcher.enqueueClientRequest(request);
    }

    public Batcher getBatcher() {
        return batcher;
    }

    public byte[] requestBatch() {
        return batcher.requestBatch();
    }

    /**
     * Makes the local proposer attempt to become leader by preparing the next
     * view. Must run on the dispatcher thread and only while the proposer is
     * inactive.
     */
    public void startProposer() {
        assert dispatcher.amIInDispatcher() : "Incorrect thread: " + Thread.currentThread();
        assert proposer.getState() == ProposerState.INACTIVE : "Already in proposer role.";
        proposer.prepareNextView();
    }

    /**
     * Is this process on the role of leader?
     *
     * @return <code>true</code> if current process is the leader;
     *         <code>false</code> otherwise
     */
    public boolean isLeader() {
        return processDescriptor.isLocalProcessLeader(storage.getView());
    }

    /**
     * Gets the id of the replica which is currently the leader.
     *
     * @return id of replica which is leader
     */
    public int getLeaderId() {
        return processDescriptor.getLeaderOfView(storage.getView());
    }

    /**
     * Gets the dispatcher used by paxos to avoid concurrency in handling
     * events.
     *
     * @return current dispatcher object
     */
    public SingleThreadDispatcher getDispatcher() {
        return dispatcher;
    }

    /**
     * Changes state of specified consensus instance to <code>DECIDED</code>.
     *
     * @param instanceId - the id of instance that has been decided
     */
    public void decide(int instanceId) {
        assert dispatcher.amIInDispatcher() : "Incorrect thread: " + Thread.currentThread();

        ConsensusInstance ci = storage.getLog().getInstance(instanceId);
        assert ci != null : "Deciding on instance already removed from logs";
        assert ci.getState() != LogEntryState.DECIDED : "Deciding on already decided instance";

        ci.setDecided();

        logger.info(processDescriptor.logMark_OldBenchmark, "Decided {}", instanceId);

        storage.updateFirstUncommitted();

        if (isLeader()) {
            proposer.stopPropose(instanceId);
            proposer.ballotFinished();
        } else {
            // not leader. Should we start the catch-up?
            if (ci.getId() > storage.getFirstUncommitted() + processDescriptor.windowSize) {
                // The last uncommitted value was already decided, since
                // the decision just reached is outside the ordering window
                // So start catch-up.
                catchUp.forceCatchup();
            }
        }

        decideCallback.onRequestOrdered(instanceId, ci);
    }

    /**
     * Increases the view of this process to specified value. The new view has
     * to be greater than the current one.
     *
     * This method is executed when this replica receives a message from a
     * higher view, so the replica is not the leader of newView.
     *
     * This may be called before the view is prepared.
     *
     * @param newView - the new view number
     */
    public void advanceView(int newView) {
        assert dispatcher.amIInDispatcher();
        int oldView = storage.getView();
        assert newView > oldView : "Can't advance to the same or lower view";

        if (logger.isInfoEnabled()) {
            // NOTE(review): leader computed here as newView % numReplicas;
            // presumably equivalent to getLeaderOfView(newView) — verify.
            logger.info("Advancing to view {} from {}, Leader={}",
                    newView, oldView, (newView % processDescriptor.numReplicas));
        }

        // If we were leading the old view, step down cleanly first.
        if (isLeader()) {
            batcher.suspendBatcher();
            proposer.stopProposer();
        }

        storage.setView(newView);

        // line above changed the leader
        assert !isLeader() : "Cannot advance to a view where process is leader by receiving a message.";
    }

    @Override
    public void suspect(final int view) {
        logger.warn(processDescriptor.logMark_Benchmark, "Suspecting {} on view {}",
                processDescriptor.getLeaderOfView(view), view);
        // Called by the Failure detector thread. Dispatch to the protocol
        // thread
        dispatcher.submit(new Runnable() {
            @Override
            public void run() {
                // The view may have changed since this task was scheduled.
                // If so, ignore this suspicion.
                if (view == storage.getView()) {
                    startProposer();
                } else {
                    logger.info("Ignoring suspicion for view {}. Current view: {}",
                            view, storage.getView());
                }
            }
        });
    }

    // *****************
    // Auxiliary classes
    // *****************

    /**
     * Receives messages from other processes and stores them on the
     * pendingEvents queue for processing by the Dispatcher thread.
     */
    private class MessageHandlerImpl implements MessageHandler {
        public void onMessageReceived(Message msg, int sender) {
            logger.debug("Msg rcv by Paxos class: {}", msg);
            MessageEvent event = new MessageEvent(msg, sender);
            Future<?> f = dispatcher.submit(event);
            logger.trace("Msg dispatched to Paxos class: {} as {}", msg, f);
        }

        public void onMessageSent(Message message, BitSet destinations) {
            // Empty
        }
    }

    /**
     * A protocol message paired with its sender, executed on the dispatcher
     * thread. Filters stale-view messages, advances the view on higher-view
     * messages, then routes the message to the matching protocol role.
     */
    private final class MessageEvent implements Runnable {
        private final Message msg;
        private final int sender;

        public MessageEvent(Message msg, int sender) {
            this.msg = msg;
            this.sender = sender;
        }

        public void run() {
            try {
                logger.trace("MessageEvent for {} handled by Paxos", msg);
                /*
                 * Ignore any message with a lower view. Pass alive, it contains
                 * log size; may be useful and is harmless
                 */
                if (msg.getView() < storage.getView() && !(msg instanceof Alive)) {
                    // FIX: format string was missing the second placeholder,
                    // so the ignored message was dropped from the log output.
                    logger.debug("Ignoring message. Current view: {}, Message: {}",
                            storage.getView(), msg);
                    return;
                }

                if (msg.getView() > storage.getView()) {
                    logger.debug("Got message with higher view: {} (current {})",
                            msg, storage.getView());
                    if (msg.getType() == MessageType.PrepareOK) {
                        logger.error("Theoretically it can happen. If you ever see this message, tell JK");
                        return;
                    }
                    advanceView(msg.getView());
                }

                switch (msg.getType()) {
                    case Prepare:
                        acceptor.onPrepare((Prepare) msg, sender);
                        break;

                    case PrepareOK:
                        if (proposer.getState() == ProposerState.INACTIVE) {
                            logger.debug("Not in proposer role. Ignoring message {}", msg);
                        } else {
                            proposer.onPrepareOK((PrepareOK) msg, sender);
                        }
                        break;

                    case Propose:
                        acceptor.onPropose((Propose) msg, sender);
                        if (!storage.isInWindow(((Propose) msg).getInstanceId())) {
                            activateCatchup();
                        }
                        break;

                    case Accept:
                        learner.onAccept((Accept) msg, sender);
                        break;

                    case Alive:
                        if (!isLeader() && checkIfCatchUpNeeded(((Alive) msg).getLogNextId())) {
                            activateCatchup();
                        }
                        break;

                    default:
                        logger.error("Unknown message type: {}", msg);
                        assert false : msg;
                }
            } catch (Throwable t) {
                // Propagate to the dispatcher; protocol state may be corrupt.
                throw new RuntimeException(t);
            }
        }

        /**
         * After getting an alive message, we need to check whether we're up to
         * date.
         *
         * @param aliveNextId - the actual size of the log
         */
        private boolean checkIfCatchUpNeeded(int aliveNextId) {
            Log log = storage.getLog();

            if (log.getNextId() < aliveNextId) {
                // If we got information, that a newer instance exists, we can
                // create it
                log.getInstance(aliveNextId - 1);
            }

            // We check if all ballots outside the window finished
            int i = storage.getFirstUncommitted();
            for (; i < log.getNextId() - processDescriptor.windowSize; i++) {
                if (log.getInstance(i).getState() != LogEntryState.DECIDED) {
                    return true;
                }
            }
            return false;
        }

        private void activateCatchup() {
            catchUp.forceCatchup();
        }
    }

    public void onSnapshotMade(Snapshot snapshot) {
        snapshotMaintainer.onSnapshotMade(snapshot);
    }

    /**
     * Returns the storage with the current state of paxos protocol.
     *
     * @return the storage
     */
    public Storage getStorage() {
        return storage;
    }

    public Network getNetwork() {
        return network;
    }

    /**
     * Returns the catch-up mechanism used by paxos protocol.
     *
     * @return the catch-up mechanism
     */
    public CatchUp getCatchup() {
        return catchUp;
    }

    public Proposer getProposer() {
        return proposer;
    }

    /** Called once a view is prepared; resumes batching from the next instance id. */
    public void onViewPrepared(int nextInstanceId) {
        batcher.resumeBatcher(nextInstanceId);
    }

    public boolean isActive() {
        return active;
    }

    private final static Logger logger = LoggerFactory.getLogger(Paxos.class);
}