package lsr.paxos.replica;

import static lsr.common.ProcessDescriptor.processDescriptor;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;

import lsr.common.ClientRequest;
import lsr.common.Configuration;
import lsr.common.CrashModel;
import lsr.common.ProcessDescriptor;
import lsr.common.Reply;
import lsr.common.RequestId;
import lsr.common.SingleThreadDispatcher;
import lsr.paxos.Batcher;
import lsr.paxos.Snapshot;
import lsr.paxos.SnapshotProvider;
import lsr.paxos.core.Paxos;
import lsr.paxos.events.AfterCatchupSnapshotEvent;
import lsr.paxos.recovery.CrashStopRecovery;
import lsr.paxos.recovery.EpochSSRecovery;
import lsr.paxos.recovery.FullSSRecovery;
import lsr.paxos.recovery.RecoveryAlgorithm;
import lsr.paxos.recovery.RecoveryListener;
import lsr.paxos.recovery.ViewSSRecovery;
import lsr.paxos.storage.ClientBatchStore;
import lsr.paxos.storage.ConsensusInstance;
import lsr.paxos.storage.ConsensusInstance.LogEntryState;
import lsr.paxos.storage.Storage;
import lsr.service.Service;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Manages replication of a service. Receives requests from the client, orders
 * them using Paxos, executes the ordered requests and sends the reply back to
 * the client.
 * <p>
 * Example of usage:
 * <p>
 * <blockquote>
 *
 * <pre>
 * public static void main(String[] args) throws IOException {
 *     int localId = Integer.parseInt(args[0]);
 *     Configuration config = loadConfiguration(); // application-specific
 *     Replica replica = new Replica(config, localId, new MapService());
 *     replica.start();
 * }
 * </pre>
 *
 * </blockquote>
 */
public class Replica {

    private static final Logger logger = LoggerFactory.getLogger(Replica.class);

    // // // // // // // // // // // // // // // // //
    // External modules accessed by the replica.
    // // // // // // // // // // // // // // // // //

    private Paxos paxos;
    private NioClientManager clientManager;
    private ClientRequestManager requestManager;

    /** Represent the deterministic state machine (service) itself */
    private final ServiceProxy serviceProxy;
    private DecideCallbackImpl decideCallback;

    /** Client set exposed to the world */
    private InternalClient intCli = null;
    private Batcher batcher;

    // // // // // // // // // // // // // //
    // Internal modules of the replica.
    // // // // // // // // // // // // // //

    private final SnapshotListener2 innerSnapshotListener2;
    private final SnapshotProvider innerSnapshotProvider;

    // // // // // // // // // // //
    // Miscellaneous variables
    // // // // // // // // // // //

    /** Location for files that should survive crashes */
    private String stableStoragePath;

    /** Next request to be executed. */
    private int executeUB = 0;

    /** Thread for handling events connected to the replica */
    private final SingleThreadDispatcher replicaDispatcher;

    // // // // // // // // // // // // // // // // // // // // //
    // Cached replies and past replies for snapshot creation
    // // // // // // // // // // // // // // // // // // // // //

    /**
     * For each client (keyed by client id), keeps the reply to the last
     * request executed from that client.
     *
     * This is accessed by the Selector threads, so it must be thread-safe
     */
    private final Map<Long, Reply> executedRequests =
            new ConcurrentHashMap<Long, Reply>(8192, (float) 0.75, 8);

    /** View on executedDifference row for current instance */
    private List<Reply> cache;

    /** caches responses for clients, maps instance ID to sent responses */
    private final Map<Integer, List<Reply>> executedDifference =
            new HashMap<Integer, List<Reply>>();

    /**
     * State of the {@link #executedRequests} from the moment when previous
     * snapshot has been created. Used to 'replay' the requests in order to add
     * them as part of new snapshot state
     */
    private final Map<Long, Reply> previousSnapshotExecutedRequests = new HashMap<Long, Reply>();

    private ClientBatchManager batchManager;
    private ClientRequestForwarder requestForwarder;

    /*
     * Description of the above variables on an example:
     * (previousSnapshot)executedRequests -> maps clientId to lastReply
     * executedDifference -> maps instances to replies
     *
     * Example state (in one moment):
     *
     * last snapshot instance - 3, next instance - 6
     *
     * previousSnapshotExecutedRequests - (1,1#2) (3,3#1)
     *
     * (client 1 has reply for second request cached, client 3 has reply for
     * first request cached)
     *
     * executedDifference - ((4: 3#2, 2#1), (5: 3#3, 4#1))
     *
     * (in instance 4 client 3 received reply for second request and client 2
     * received reply for first request, in instance 5 ...)
     *
     * executedRequests - (1,1#2) (2,2#1) (3,3#3) (4,4#1)
     *
     * If a client comes with a request, its ID is checked against
     * executedRequests.
     *
     * If the service makes a snapshot (e.g. after request 2#1), the snapshot
     * must be extended with the requests executed up to that time, that is
     * (previousSnapshotExecutedRequests + part of executedDifference)
     */

    /*
     * TODO: the executedRequests map grows and is NEVER cleared!
     *
     * For theoretical correctness, it must stay so. In practical approach, give
     * me unbounded storage, limit the overall client count or simply let eat
     * some stale client requests.
     *
     * Bad solution keeping correctness: record time stamp from client, his
     * request will only be valid for 5 minutes, after that time - go away. If
     * client resends it after 5 minutes, we ignore request. If client changes
     * the time stamp, it's already a new request, right? Client with broken
     * clocks will have bad luck.
     */

    // // // // // // // //
    // Public methods
    // // // // // // // //

    /**
     * Initializes new instance of <code>Replica</code> class.
     * <p>
     * This constructor doesn't start the replica and Paxos protocol. In order
     * to run it the {@link #start()} method should be called.
     *
     * @param config - the configuration of the replica
     * @param localId - the id of replica to create
     * @param service - the state machine to execute request on
     */
    public Replica(Configuration config, int localId, Service service) {
        ProcessDescriptor.initialize(config, localId);

        stableStoragePath = processDescriptor.logPath + '/' + localId;

        innerSnapshotListener2 = new InnerSnapshotListener2();
        innerSnapshotProvider = new InnerSnapshotProvider();
        replicaDispatcher = new SingleThreadDispatcher("Replica");

        serviceProxy = new ServiceProxy(service, executedDifference, replicaDispatcher);
        serviceProxy.addSnapshotListener(innerSnapshotListener2);

        // Open the reply row for the first instance to be executed.
        cache = new ArrayList<Reply>(2048);
        executedDifference.put(executeUB, cache);
    }

    /**
     * Starts the replica.
     *
     * First the recovery phase is started and after that the replica joins the
     * Paxos protocol and starts the client manager and the underlying service.
     *
     * @throws IOException if some I/O error occurs
     */
    public void start() throws IOException {
        logger.info(processDescriptor.logMark_Benchmark, "Recovery phase started.");

        replicaDispatcher.start();

        RecoveryAlgorithm recovery = createRecoveryAlgorithm(processDescriptor.crashModel);

        paxos = recovery.getPaxos();

        decideCallback = new DecideCallbackImpl(this, executeUB);
        paxos.setDecideCallback(decideCallback);

        batcher = paxos.getBatcher();

        // Exactly one of batchManager / requestForwarder is non-null,
        // depending on whether indirect consensus is enabled.
        if (processDescriptor.indirectConsensus) {
            batchManager = new ClientBatchManager(paxos, this);
            batchManager.start();
            requestForwarder = null;
            ClientBatchStore.instance.setClientBatchManager(batchManager);
        } else {
            batchManager = null;
            requestForwarder = new ClientRequestForwarder(paxos);
            requestForwarder.start();
        }

        paxos.startPassivePaxos();

        recovery.addRecoveryListener(new InnerRecoveryListener());
        recovery.start();
    }

    /**
     * Creates the recovery algorithm matching the given crash model.
     *
     * @param crashModel - the crash model selected for this replica
     * @return the recovery algorithm instance for that crash model
     * @throws IOException declared for the recovery algorithm constructors
     * @throws RuntimeException if the crash model is not one of the handled
     *             values
     */
    private RecoveryAlgorithm createRecoveryAlgorithm(CrashModel crashModel) throws IOException {
        switch (crashModel) {
            case CrashStop:
                return new CrashStopRecovery(innerSnapshotProvider);
            case FullSS:
                return new FullSSRecovery(innerSnapshotProvider, stableStoragePath);
            case EpochSS:
                return new EpochSSRecovery(innerSnapshotProvider, stableStoragePath);
            case ViewSS:
                return new ViewSSRecovery(innerSnapshotProvider, stableStoragePath);
            default:
                throw new RuntimeException("Unknown crash model: " + crashModel);
        }
    }

    /**
     * Stops the replica dispatcher immediately. Other modules are currently
     * not shut down here.
     */
    public void forceExit() {
        // TODO (JK) hm... implement this?
        replicaDispatcher.shutdownNow();
    }

    /**
     * Sets the path to directory where all stable storage logs will be saved.
     *
     * @param path to directory where the stable storage logs will be saved
     */
    public void setStableStoragePath(String path) {
        stableStoragePath = path;
    }

    /**
     * Gets the path to directory where all stable storage logs will be saved.
     *
     * @return path
     */
    public String getStableStoragePath() {
        return stableStoragePath;
    }

    /**
     * Returns a read-only view of the map from client id to the last reply
     * sent to that client. The view reflects later changes of the underlying
     * map.
     *
     * @return unmodifiable view of the executed requests map
     */
    public Map<Long, Reply> getExecutedRequestsMap() {
        return Collections.unmodifiableMap(executedRequests);
    }

    /**
     * Adds the request to the set of requests to be executed. If called e(A)
     * e(B), the delivery will be either d(A) d(B) or d(B) d(A).
     *
     * If the replica crashes before the request is delivered, the request may
     * get lost.
     *
     * @param requestValue - the exact request that will be delivered to the
     *            Service execute method
     * @throws IllegalStateException if the method is called before the recovery
     *             has finished
     */
    public void executeNonFifo(byte[] requestValue) throws IllegalStateException {
        if (intCli == null) {
            throw new IllegalStateException(
                    "Request cannot be executed before recovery has finished");
        }
        intCli.executeNonFifo(requestValue);
    }

    /**
     * Returns the current view.
     *
     * @throws IllegalStateException if the replica has not been started
     */
    public int getView() {
        if (paxos == null) {
            throw new IllegalStateException("Replica must be started prior to this call");
        }
        return paxos.getStorage().getView();
    }

    /**
     * Returns the ID of current leader.
     *
     * @throws IllegalStateException if the replica has not been started
     */
    public int getLeader() {
        if (paxos == null) {
            throw new IllegalStateException("Replica must be started prior to this call");
        }
        return paxos.getLeaderId();
    }

    /**
     * Adds a listener for leader changes. Allowed after the replica has been
     * started.
     *
     * @param listener - the listener to register, must not be null
     * @return the value returned by the storage when registering the listener
     * @throws IllegalArgumentException if the listener is null
     * @throws IllegalStateException if the replica has not been started
     */
    public boolean addViewChangeListener(Storage.ViewChangeListener listener) {
        if (listener == null) {
            throw new IllegalArgumentException("The listener cannot be null");
        }
        if (paxos == null) {
            throw new IllegalStateException("Replica must be started prior to adding a listener");
        }
        return paxos.getStorage().addViewChangeListener(listener);
    }

    /**
     * Removes a listener previously added by
     * {@link #addViewChangeListener(Storage.ViewChangeListener)}
     *
     * @param listener - the listener to unregister
     * @return the value returned by the storage when removing the listener
     * @throws IllegalStateException if the replica has not been started
     */
    public boolean removeViewChangeListener(Storage.ViewChangeListener listener) {
        // Same precondition as addViewChangeListener: fail with a clear
        // message instead of a bare NullPointerException.
        if (paxos == null) {
            throw new IllegalStateException("Replica must be started prior to removing a listener");
        }
        return paxos.getStorage().removeViewChangeListener(listener);
    }

    // // // // // // // // // // // // //
    // Callback's for JPaxos modules
    // // // // // // // // // // // // //

    /**
     * Called when an instance is NOP. Currently this only logs the current
     * value of {@code executeUB}; the {@code nextInstance} argument is not
     * used.
     */
    /* package access */void executeNopInstance(final int nextInstance) {
        logger.warn("Executing a nop request. Instance: {}", executeUB);
    }

    /**
     * Executes the given batch of client requests on the replica dispatcher
     * thread and blocks until the execution is done.
     *
     * @param instance - the consensus instance the requests belong to
     * @param requests - the requests to execute, in order
     */
    /* package access */void executeClientBatchAndWait(final int instance,
                                                       final ClientRequest[] requests) {
        replicaDispatcher.executeAndWait(new Runnable() {
            @Override
            public void run() {
                innerExecuteClientBatch(instance, requests);
            }
        });
    }

    /**
     * Marks the instance as fully executed on the replica dispatcher thread
     * and blocks until this is done.
     *
     * @param instance - the consensus instance that has been executed
     * @param requests - the requests contained in that instance
     */
    /* package access */void instanceExecuted(final int instance,
                                              final ClientRequest[] requests) {
        replicaDispatcher.executeAndWait(new Runnable() {
            @Override
            public void run() {
                innerInstanceExecuted(instance, requests);
            }
        });
    }

    /* package access */SingleThreadDispatcher getReplicaDispatcher() {
        return replicaDispatcher;
    }

    // // // // // // // // // // // // //
    // Internal methods and classes.
    // // // // // // // // // // // // //

    /**
     * Called by the RequestManager when it has the ClientRequest that should be
     * executed next. Requests whose sequence number is not newer than the last
     * one executed for the same client are skipped; for an exact duplicate the
     * cached reply is sent again.
     *
     * @param instance - the consensus instance the requests belong to
     * @param requests - the requests to execute, in order
     */
    private void innerExecuteClientBatch(int instance, ClientRequest[] requests) {
        assert replicaDispatcher.amIInDispatcher() : "Wrong thread: " +
                                                     Thread.currentThread().getName();

        for (ClientRequest cRequest : requests) {
            RequestId rID = cRequest.getRequestId();
            Reply lastReply = executedRequests.get(rID.getClientId());
            if (lastReply != null) {
                int lastSequenceNumberFromClient = lastReply.getRequestId().getSeqNumber();

                // Do not execute the same request several times.
                if (rID.getSeqNumber() <= lastSequenceNumberFromClient) {
                    logger.warn(
                            "Request ordered multiple times. inst: {}, req: {}, lastSequenceNumberFromClient: {}",
                            instance, cRequest, lastSequenceNumberFromClient);

                    // (JK) FIXME: investigate if the client could get the
                    // response multiple times here.

                    // Send the cached reply back to the client
                    if (rID.getSeqNumber() == lastSequenceNumberFromClient) {
                        // req manager can be null on fullss disk read
                        if (requestManager != null) {
                            requestManager.onRequestExecuted(cRequest, lastReply);
                        }
                    }
                    continue;
                }
                // else there is a cached reply, but for a past request only.
            }

            // Executing the request (at last!)
            // Here the replica thread is given to Service.
            byte[] result = serviceProxy.execute(cRequest);

            Reply reply = new Reply(cRequest.getRequestId(), result);

            // add request to executed history
            cache.add(reply);

            executedRequests.put(rID.getClientId(), reply);

            // req manager can be null on fullss disk read
            if (requestManager != null) {
                requestManager.onRequestExecuted(cRequest, reply);
            }
        }
    }

    /**
     * Closes the reply row of the finished instance, opens a fresh row for the
     * next one and notifies the service proxy and the batcher.
     *
     * @param instance - the instance that has just been executed; expected to
     *            equal the current {@code executeUB}
     * @param requests - the requests contained in that instance
     */
    private void innerInstanceExecuted(final int instance, final ClientRequest[] requests) {
        assert executeUB == instance : executeUB + " " + instance;
        // TODO (JK) get rid of unnecessary instance parameter
        logger.info("Instance finished: {}", instance);

        cache = new ArrayList<Reply>(2048);
        executeUB = instance + 1;
        executedDifference.put(executeUB, cache);

        serviceProxy.instanceExecuted(instance);
        batcher.instanceExecuted(instance, requests);
    }

    /**
     * Listener called after recovery algorithm is finished and paxos can be
     * started.
     */
    private class InnerRecoveryListener implements RecoveryListener {

        public void recoveryFinished() {
            if (CrashModel.FullSS.equals(processDescriptor.crashModel)) {
                paxos.getDispatcher().executeAndWait(new Runnable() {
                    @Override
                    public void run() {
                        recoverReplicaFromStorage();
                    }
                });
            }

            ClientRequestBatcher.generateUniqueRunId(paxos.getStorage());

            if (processDescriptor.indirectConsensus) {
                requestManager = new ClientRequestManager(Replica.this, decideCallback,
                        executedRequests, batchManager, paxos);
                paxos.setClientRequestManager(requestManager);
            } else {
                requestManager = new ClientRequestManager(Replica.this, decideCallback,
                        executedRequests, requestForwarder, paxos);
                requestForwarder.setClientRequestManager(requestManager);
            }

            intCli = new InternalClient(replicaDispatcher, requestManager);

            try {
                NioClientProxy.createIdGenerator(paxos.getStorage());
                clientManager = new NioClientManager(requestManager);
                clientManager.start();
            } catch (IOException e) {
                // Keep the original exception as the cause so the reason for
                // the failure is not lost.
                throw new RuntimeException("Could not prepare the socket for clients! Aborting.",
                        e);
            }

            logger.info(processDescriptor.logMark_Benchmark,
                    "Recovery phase finished. Starting paxos protocol.");
            paxos.startActivePaxos();

            replicaDispatcher.execute(new Runnable() {
                @Override
                public void run() {
                    serviceProxy.recoveryFinished();
                }
            });
        }

        /**
         * Replays storage. Needed in FullSS only, other algorithms do not have
         * storage to replay.
         */
        private void recoverReplicaFromStorage() {
            Storage storage = paxos.getStorage();

            // we need a read-write copy of the map
            SortedMap<Integer, ConsensusInstance> instances =
                    new TreeMap<Integer, ConsensusInstance>();
            instances.putAll(storage.getLog().getInstanceMap());

            // We take the snapshot
            Snapshot snapshot = storage.getLastSnapshot();
            if (snapshot != null) {
                innerSnapshotProvider.handleSnapshot(snapshot);
                // Only instances not covered by the snapshot are replayed.
                instances = instances.tailMap(snapshot.getNextInstanceId());
            }

            for (ConsensusInstance instance : instances.values()) {
                if (instance.getState() == LogEntryState.DECIDED) {
                    decideCallback.onRequestOrdered(instance.getId(), instance);
                }
            }
            storage.updateFirstUncommitted();
        }
    }

    /**
     * Receives snapshots made by the service, attaches the per-client last
     * replies valid at the snapshot point and hands the snapshot to Paxos.
     */
    private class InnerSnapshotListener2 implements SnapshotListener2 {

        public void onSnapshotMade(final Snapshot snapshot) {
            replicaDispatcher.checkInDispatcher();

            if (snapshot.getValue() == null) {
                throw new RuntimeException("Received a null snapshot!");
            }

            // Get previous snapshot next instance id
            int prevSnapshotNextInstId;
            Snapshot lastSnapshot = paxos.getStorage().getLastSnapshot();
            if (lastSnapshot != null) {
                prevSnapshotNextInstId = lastSnapshot.getNextInstanceId();
            } else {
                prevSnapshotNextInstId = 0;
            }

            // shift previousSnapshotExecutedRequests to moment of snapshot
            for (int i = prevSnapshotNextInstId; i < snapshot.getNextInstanceId(); ++i) {
                List<Reply> ides = executedDifference.remove(i);

                // this is null only when NoOp
                if (ides == null) {
                    continue;
                }

                for (Reply reply : ides) {
                    previousSnapshotExecutedRequests.put(reply.getRequestId().getClientId(), reply);
                }
            }

            // Shallow copy, so later updates of previousSnapshotExecutedRequests
            // do not leak into the snapshot.
            Map<Long, Reply> clone = new HashMap<Long, Reply>(previousSnapshotExecutedRequests);
            snapshot.setLastReplyForClient(clone);

            paxos.onSnapshotMade(snapshot);
        }
    }

    /**
     * Snapshot provider handed to the recovery algorithms; every call is
     * forwarded to the replica dispatcher thread.
     */
    private class InnerSnapshotProvider implements SnapshotProvider {

        public void handleSnapshot(final Snapshot snapshot) {
            logger.info("New snapshot received");
            replicaDispatcher.execute(new Runnable() {
                @Override
                public void run() {
                    handleSnapshotInternal(snapshot);
                }
            });
        }

        public void askForSnapshot() {
            replicaDispatcher.execute(new Runnable() {
                @Override
                public void run() {
                    serviceProxy.askForSnapshot();
                }
            });
        }

        public void forceSnapshot() {
            replicaDispatcher.execute(new Runnable() {
                @Override
                public void run() {
                    serviceProxy.forceSnapshot();
                }
            });
        }

        /**
         * Restoring state from a snapshot. A snapshot older than the current
         * execution point is rejected with an error log entry.
         *
         * @param snapshot - the snapshot to restore from, must not be null
         */
        private void handleSnapshotInternal(Snapshot snapshot) {
            assert replicaDispatcher.amIInDispatcher();
            assert snapshot != null : "Snapshot is null";

            if (snapshot.getNextInstanceId() < executeUB) {
                logger.error("Received snapshot is older than current state. {}, executeUB: {}",
                        snapshot.getNextInstanceId(), executeUB);
                return;
            }

            logger.warn("Updating machine state from {}", snapshot);
            serviceProxy.updateToSnapshot(snapshot);

            decideCallback.atRestoringStateFromSnapshot(snapshot.getNextInstanceId());

            // Reset the reply caches to the state stored in the snapshot.
            executedRequests.clear();
            executedDifference.clear();
            executedRequests.putAll(snapshot.getLastReplyForClient());
            previousSnapshotExecutedRequests.clear();
            previousSnapshotExecutedRequests.putAll(snapshot.getLastReplyForClient());
            executeUB = snapshot.getNextInstanceId();

            cache = new ArrayList<Reply>(2048);
            executedDifference.put(executeUB, cache);

            // Block this thread until the event submitted to the Paxos
            // dispatcher reports that it has finished.
            final Object snapshotLock = new Object();

            synchronized (snapshotLock) {
                AfterCatchupSnapshotEvent event = new AfterCatchupSnapshotEvent(snapshot,
                        paxos.getStorage(), snapshotLock);
                paxos.getDispatcher().submit(event);

                try {
                    while (!event.isFinished()) {
                        snapshotLock.wait();
                    }
                } catch (InterruptedException e) {
                    // Restore the interrupt flag and stop waiting.
                    Thread.currentThread().interrupt();
                }
            }
        }
    }

    /**
     * Checks whether at least one of the given requests has not been executed
     * yet, judging by the last reply cached for its client.
     *
     * @param requests - the requests to check
     * @return true if some request has no cached reply for its client or has a
     *         sequence number greater than the last executed one; false
     *         otherwise (also for an empty array)
     */
    /* package access */boolean hasUnexecutedRequests(ClientRequest[] requests) {
        for (ClientRequest req : requests) {
            RequestId reqId = req.getRequestId();
            Reply prevReply = executedRequests.get(reqId.getClientId());
            if (prevReply == null) {
                return true;
            }
            if (prevReply.getRequestId().getSeqNumber() < reqId.getSeqNumber()) {
                return true;
            }
        }
        return false;
    }
}