package lsr.paxos.core; import static lsr.common.ProcessDescriptor.processDescriptor; import java.util.ArrayList; import java.util.BitSet; import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.SortedMap; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import lsr.common.Configuration; import lsr.common.MovingAverage; import lsr.common.Pair; import lsr.common.Range; import lsr.common.SingleThreadDispatcher; import lsr.paxos.CatchUpListener; import lsr.paxos.Snapshot; import lsr.paxos.SnapshotProvider; import lsr.paxos.messages.CatchUpQuery; import lsr.paxos.messages.CatchUpResponse; import lsr.paxos.messages.CatchUpSnapshot; import lsr.paxos.messages.Message; import lsr.paxos.messages.MessageType; import lsr.paxos.network.MessageHandler; import lsr.paxos.network.Network; import lsr.paxos.replica.ClientBatchManager; import lsr.paxos.replica.ClientBatchManager.FwdBatchRetransmitter; import lsr.paxos.storage.ClientBatchStore; import lsr.paxos.storage.ConsensusInstance; import lsr.paxos.storage.ConsensusInstance.LogEntryState; import lsr.paxos.storage.Storage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class CatchUp { private Storage storage; private SnapshotProvider snapshotProvider; private Network network; private Paxos paxos; private SingleThreadDispatcher paxosDispatcher; /** moving average factor used for changing timeout */ private static final double convergenceFactor = 0.2; /** Initial, conservative value. Updated as a moving average. */ private static final long INITIAL_RETRANSMIT_TIMEOUT = processDescriptor.retransmitTimeout; /** Can send queries no faster than... */ private static final long MIN_CATCHUP_QUERY_RESEND_TIMEOUT_MS = 250; /** On no response, how often should catch up query be sent. */ private MovingAverage resendTimeout = new MovingAverage(convergenceFactor, INITIAL_RETRANSMIT_TIMEOUT); private final Object switchingCatchUpTaskLock = new Object(); private ScheduledFuture<?> catchUpTask = null; long lastQuerySent = 0; /** * Replica rating rules for catch-up: * <ol> * <li>each replica starts with rating 0 * <li>as we send the request, we decrease the value by number of instances * requested * <li>as we receive the request, we increase the value by number of * instances acquired * <li>if the sendMessage function throws an exception, the rating is set to * negative value * <li>An empty, not periodic response reaches us, we decrees down by 5 the * value (but not under 0) * </ol> * * Selecting best replica: * <ol> * <li>We check, if we were not asked to select the leader (happens if other * replica answered it has no value for us) * <li>We select replicas that is neither me not leader * <li>the replica with best rating is chosen * <li>if the best rating is negative, we're catching up with the leader, * and we reset all ratings; otherwise we catch up with the selected one * </ol> */ /** holds replica rating for choosing best replica for catch-up */ private int[] replicaRating; /** Holds all listeners that want to know about catch-up state change */ HashSet<CatchUpListener> listeners = new HashSet<CatchUpListener>(); public CatchUp(SnapshotProvider snapshotProvider, Paxos paxos, Storage storage, Network network) { this.snapshotProvider = snapshotProvider; this.network = network; this.paxosDispatcher = paxos.getDispatcher(); MessageHandler handler = new InnerMessageHandler(); Network.addMessageListener(MessageType.CatchUpQuery, handler); Network.addMessageListener(MessageType.CatchUpResponse, handler); Network.addMessageListener(MessageType.CatchUpSnapshot, handler); this.paxos = paxos; this.storage = storage; replicaRating = new int[processDescriptor.numReplicas]; } /** * Forces the catch-up to send query */ public void forceCatchup() { synchronized (switchingCatchUpTaskLock) { if (catchUpTask != null) // already catching up. Ignore. return; assert !paxos.isLeader(); logger.info("Starting normal catchup"); catchUpTask = paxosDispatcher.scheduleAtFixedRate(new Runnable() { public void run() { sendQuery(); } }, 0, (long) resendTimeout.get(), TimeUnit.MILLISECONDS); } } private void finished() { synchronized (switchingCatchUpTaskLock) { if (catchUpTask == null) // normal catch-up is not working, ignore return; catchUpTask.cancel(false); catchUpTask = null; } } private void dispatchCatchUp(boolean immediatly) { synchronized (switchingCatchUpTaskLock) { if (catchUpTask == null) { logger.info("Catch-up interrupted by suspect-induced view change."); return; } catchUpTask.cancel(false); long initialDelay = immediatly ? 0 : (long) resendTimeout.get(); long now = System.currentTimeMillis(); long timeToNextQuery = now + initialDelay; long nextAllowedTime = lastQuerySent + MIN_CATCHUP_QUERY_RESEND_TIMEOUT_MS; initialDelay = Math.max(timeToNextQuery, nextAllowedTime) - now; catchUpTask = paxosDispatcher.scheduleAtFixedRate(new Runnable() { public void run() { sendQuery(); } }, initialDelay, (long) resendTimeout.get(), TimeUnit.MILLISECONDS); } } /** * Main catch-up process: we're creating (and later updating) the list of * undecided instances, and sending it. * * We're trying to reach best replica possible, and as we get the needed * information, we exit. */ private void sendQuery() { assert paxosDispatcher.amIInDispatcher() : "Must be running on the Protocol thread"; /* * A follower may submit a catch-up task for execution and then become * leader before the task runs. As the leader never needs to catch-up * (the view change ensures that it becomes up-to-date), we ignore the * catch-up. */ if (paxos.isLeader()) { logger.info("Interrupting catchup. Replica is in leader role"); finished(); return; } int target = getBestContactReplica(); assert target != processDescriptor.localId : "Selected self for catch-up"; CatchUpQuery query = new CatchUpQuery(storage.getView(), new int[0], new Range[0]); int requestedInstanceCount = fillUnknownList(query); network.sendMessage(query, target); lastQuerySent = System.currentTimeMillis(); // Modifying the rating of replica we're catching up with // We don't count the additional logSize+1 number requested replicaRating[target] -= requestedInstanceCount; logger.debug(processDescriptor.logMark_Benchmark, "Sent {} to [p{}]", query, target); } /** * the predicate being true when the catch-up mechanism should finish */ private boolean assumeSucceded() { // the current is OK for catch-up caused by the window violation, and // not for periodical one. Consider writing a method giving more sense return storage.isInWindow(storage.getLog().getNextId() - 1); } /** * If a replica responds that it has even not a single value for us, we're * trying to catch-up with the leader */ private boolean askLeader = false; /** * Should return the ID to replica best suiting for catch-up; may change * during catching-up */ private int getBestContactReplica() { // TODO: verify code changing replica ratings if (askLeader) { askLeader = false; return paxos.getLeaderId(); } // BitSet candidates has all processes without his and the leader BitSet candidates = new BitSet(processDescriptor.numReplicas); candidates.set(0, processDescriptor.numReplicas); candidates.clear(processDescriptor.localId); candidates.clear(paxos.getLeaderId()); // replica with greatest rating is used int i = candidates.nextSetBit(0); int bestReplica = i; for (; i >= 0; i = candidates.nextSetBit(i + 1)) { if (replicaRating[i] > replicaRating[bestReplica]) { bestReplica = i; } } // If a replica has negative rating, we catch-up with the leader if (replicaRating[bestReplica] < 0) { bestReplica = paxos.getLeaderId(); // For all but leader the value is set to zero for (int j = 0; j < replicaRating.length; ++j) { if (j != bestReplica) { replicaRating[j] = 0; } } } return bestReplica; } /** * Generates (ascending) list of instance numbers, which we consider for * undecided yet on basis of the log, and adds the next instance number on * the end of this list * * @param query * @return count of instances embedded */ private int fillUnknownList(CatchUpQuery query) { List<Integer> unknownList = new ArrayList<Integer>(); List<Range> unknownRange = new ArrayList<Range>(); SortedMap<Integer, ConsensusInstance> log = storage.getLog().getInstanceMap(); if (log.isEmpty()) { query.setInstanceIdList(Collections.singletonList(storage.getLog().getNextId())); return 0; } int begin = -1; boolean previous = false; int lastKey = log.lastKey(); int count = 1; ConsensusInstance instance; for (int i = Math.max(storage.getFirstUncommitted(), log.firstKey()); i <= lastKey; ++i) { instance = log.get(i); if (instance == null) { continue; } if (!instance.isDecidable()) { count++; if (!previous) { begin = i; previous = true; } } else if (previous) { assert begin != -1 : "Problem in unknown list creation 1"; if (begin == i - 1) { unknownList.add(begin); } else { unknownRange.add(new Range(begin, i - 1)); } previous = false; } } if (previous) { assert begin != -1 : "Problem in unknown list creation 2"; if (begin == lastKey) { unknownList.add(begin); } else { unknownRange.add(new Range(begin, lastKey)); } } unknownList.add(lastKey + 1); query.setInstanceIdList(unknownList); query.setInstanceIdRangeList(unknownRange); return count; } private void handleSnapshot(CatchUpSnapshot msg, int sender) { Snapshot snapshot = msg.getSnapshot(); logger.info("Catch-up snapshot from [p{}] : {}", sender, msg); replicaRating[sender] = Math.max(replicaRating[sender], 5); snapshotProvider.handleSnapshot(snapshot); } /** * Procedure handling the catch-up response - if it's empty, it's dropped, * otherwise we're adding proper event for the dispatcher */ private void handleResponse(CatchUpResponse response, int sender) { logger.info("Catch-up from [p{}] : ", sender, response.toString()); List<ConsensusInstance> logFragment = response.getDecided(); if (logFragment.isEmpty()) { // decrease the rating of a replica, that has no value at all replicaRating[sender] = Math.max(0, replicaRating[sender] - 5); askLeader = true; return; } replicaRating[sender] += 2 * logFragment.size(); long processingTime = System.currentTimeMillis() - response.getRequestTime(); resendTimeout.add(3 * processingTime); if (resendTimeout.get() < Configuration.CATCHUP_MIN_RESEND_TIMEOUT) resendTimeout.reset(Configuration.CATCHUP_MIN_RESEND_TIMEOUT); logger.debug("Changing resend timeout for Catch-Up to {}", resendTimeout); handleCatchUpEvent(logFragment); } /** * Main procedure for the replica responding to query. * * The replica checks if the requested instances are decided by it, and if * yes - it appends them to response. */ private void handleQuery(CatchUpQuery query, int sender) { logger.info("Got {} from [p{}]", query, sender); SortedMap<Integer, ConsensusInstance> log = storage.getLog().getInstanceMap(); if (log.isEmpty()) { if (storage.getLastSnapshot() != null) { sendSnapshotResponse(query, sender); } else { logger.error("Log empty, no snapshot, yet a catch-up query received?"); } return; } assert query.getInstanceIdArray().length > 0; // check if the lowest single ID is available if (query.getInstanceIdArray()[0] < log.firstKey()) { // if no, send snapshot sendSnapshotResponse(query, sender); return; } Integer lastKey = log.lastKey(); ResponseSender responseSender = new ResponseSender(query.getSentTime(), sender); // Adding instances from the requested ranges pairs : for (Pair<Integer, Integer> range : query.getInstanceIdRangeArray()) { for (int i = range.key(); i <= range.value() && i <= lastKey; ++i) { if (i > lastKey) { break pairs; } ConsensusInstance consensusInstance = log.get(i); if (consensusInstance == null) { sendSnapshotResponse(query, sender); return; } if (consensusInstance.getState() == LogEntryState.DECIDED) { responseSender.add(consensusInstance); } } } // We check for requested id's for (int instanceId : query.getInstanceIdArray()) { if (instanceId > lastKey) { continue; } ConsensusInstance consensusInstance = log.get(instanceId); if (consensusInstance == null) { sendSnapshotResponse(query, sender); return; } if (consensusInstance.getState() == LogEntryState.DECIDED) { responseSender.add(consensusInstance); } } responseSender.finish(); } private void sendSnapshotResponse(CatchUpQuery query, int sender) { Snapshot lastSnapshot = storage.getLastSnapshot(); assert lastSnapshot != null; Message m = new CatchUpSnapshot(storage.getView(), query.getSentTime(), lastSnapshot); logger.debug("Sending snapshot {} to [p{}]", m, sender); network.sendMessage(m, sender); CatchUpQuery newQuery = trimQuery(query, lastSnapshot.getNextInstanceId()); handleQuery(newQuery, sender); } /** * To re-use code for sending response, this function trims catch-up query * below the just sent snapshot */ private CatchUpQuery trimQuery(CatchUpQuery query, int nextInstanceId) { List<Integer> iil = query.getInstanceIdList(); while (!iil.isEmpty() && iil.get(0) < nextInstanceId) { iil.remove(0); } List<Range> iirl = query.getInstanceIdRangeList(); while (!iirl.isEmpty() && iirl.get(0).getValue() < nextInstanceId) { iirl.remove(0); } if (!iirl.isEmpty() && iirl.get(0).getKey() < nextInstanceId) { if (iirl.get(0).getValue() == nextInstanceId) { iil.add(0, nextInstanceId); iirl.remove(0); } else iirl.get(0).setKey(nextInstanceId); } int iilMax = iil.isEmpty() ? -1 : iil.get(iil.size() - 1); int iirlMax = iirl.isEmpty() ? -1 : iirl.get(iirl.size() - 1).getValue(); int max = Math.max(Math.max(iilMax, iirlMax), nextInstanceId); if (max + 1 < storage.getFirstUncommitted() - 1) { iirl.add(new Range(max + 1, storage.getFirstUncommitted() - 1)); } else if (max + 1 == storage.getFirstUncommitted() - 1) { iil.add(max + 1); } iil.add(storage.getFirstUncommitted()); return new CatchUpQuery(query.getView(), iil, iirl); } /** * updates the storage and request deciding the caught-up instances */ private void handleCatchUpEvent(List<ConsensusInstance> logFragment) { paxosDispatcher.checkInDispatcher(); for (final ConsensusInstance newInstance : logFragment) { final ConsensusInstance localInstance = storage.getLog().getInstance( newInstance.getId()); // A snapshot && log truncate took place; must have been decided if (localInstance == null) { continue; } if (localInstance.getState() == LogEntryState.DECIDED) { continue; } if (!processDescriptor.indirectConsensus || ClientBatchStore.instance.hasAllBatches(newInstance.getClientBatchIds())) { localInstance.updateStateFromDecision(newInstance.getView(), newInstance.getValue()); paxos.decide(localInstance.getId()); } else { localInstance.setDecidable(true); final BatchFetchingTimeoutTask bftt = new BatchFetchingTimeoutTask( localInstance.getId()); final ScheduledFuture<?> sf = paxosDispatcher.schedule(bftt, processDescriptor.maxBatchFetchingTimeoutMs, TimeUnit.MILLISECONDS); FwdBatchRetransmitter fbr = ClientBatchStore.instance.getClientBatchManager().fetchMissingBatches( newInstance.getClientBatchIds(), new ClientBatchManager.Hook() { public void hook() { paxosDispatcher.execute(new Runnable() { public void run() { localInstance.updateStateFromDecision( newInstance.getView(), newInstance.getValue()); if (!LogEntryState.DECIDED.equals(localInstance.getState())) paxos.decide(localInstance.getId()); paxosDispatcher.remove(bftt); sf.cancel(false); checkCatchupSucceded(false); } }); } }, true); bftt.setFwdBatchForwarder(fbr); localInstance.setFwdBatchForwarder(fbr); } } } /** * If fetching the batch contents failed, this method re-triggers normal * catch-up for the instance. */ protected class BatchFetchingTimeoutTask implements Runnable { private final int instanceId; private FwdBatchRetransmitter fbr = null; public BatchFetchingTimeoutTask(int instanceId) { this.instanceId = instanceId; } void setFwdBatchForwarder(FwdBatchRetransmitter fbr) { this.fbr = fbr; } public void run() { assert fbr != null; ClientBatchStore.instance.getClientBatchManager().removeTask(fbr); ConsensusInstance ci = storage.getLog().getInstance(instanceId); if (ci != null) // snapshot protection ci.setDecidable(false); checkCatchupSucceded(true); } } /** Needed to be notified about messages for catch-up */ private class InnerMessageHandler implements MessageHandler { public void onMessageReceived(final Message msg, final int sender) { paxosDispatcher.submit(new Runnable() { public void run() { /* * catch-up messages are not parsed anywhere else, so one * must check here if these do not contain a more recent * view */ if (msg.getView() > storage.getView()) paxosDispatcher.execute(new Runnable() { public void run() { paxos.advanceView(msg.getView()); } }); logger.debug(processDescriptor.logMark_Benchmark, "Received {}", msg); // Handle the message itself switch (msg.getType()) { case CatchUpResponse: handleResponse((CatchUpResponse) msg, sender); checkCatchupSucceded(((CatchUpResponse) msg).isLastPart()); break; case CatchUpQuery: handleQuery((CatchUpQuery) msg, sender); break; case CatchUpSnapshot: handleSnapshot((CatchUpSnapshot) msg, sender); checkCatchupSucceded(false); break; default: assert false : "Unexpected message type: " + msg.getType(); } } }); } public void onMessageSent(Message message, BitSet destinations) { // Empty } } private void checkCatchupSucceded(boolean restartImmediatlyIfNot) { for (CatchUpListener listener : listeners) { listener.catchUpAdvanced(); } if (assumeSucceded()) { logger.info("Catch-up succeedd"); finished(); } else { dispatchCatchUp(restartImmediatlyIfNot); } } private final static long EMPTY_RESPONSE_SIZE = (new CatchUpResponse(0, 0, new ArrayList<ConsensusInstance>())).toByteArray().length; private static final int MAX_RESPONSE_SIZE = processDescriptor.maxUdpPacketSize; /** * Collects instances to be sent */ private class ResponseSender { private final long querySentTime; private final int sender; private long currentSize; /** * Contains instances matching requested id's that we have marked as * decided */ private final List<ConsensusInstance> availableInstances; public ResponseSender(long querySentTime, int sender) { this.querySentTime = querySentTime; this.sender = sender; availableInstances = new ArrayList<ConsensusInstance>(); currentSize = EMPTY_RESPONSE_SIZE; } public void add(ConsensusInstance instance) { long instanceSize = instance.byteSize(); if (currentSize + instanceSize > MAX_RESPONSE_SIZE) { sendAvailablePart(); currentSize = EMPTY_RESPONSE_SIZE; } currentSize += instanceSize; availableInstances.add(instance); } public void finish() { // if nothing has been sent, we need to sent empty message. // if anything has been sent, it had LastPart=false set, so we // need to send a message with lastPart=true CatchUpResponse response = new CatchUpResponse( storage.getView(), querySentTime, availableInstances); logger.debug("Sending to {}: {} ", sender, response); network.sendMessage(response, sender); } private void sendAvailablePart() { CatchUpResponse response = new CatchUpResponse( storage.getView(), querySentTime, availableInstances); response.setLastPart(false); logger.debug("Sending to {}: {} ", sender, response); network.sendMessage(response, sender); availableInstances.clear(); } } /** Adds the listener, returns if succeeded */ public boolean addListener(CatchUpListener listener) { return listeners.add(listener); } /** Removes the listener, returns if succeeded */ public boolean removeListener(CatchUpListener listener) { return listeners.remove(listener); } private final static Logger logger = LoggerFactory.getLogger(CatchUp.class); }