/* This file is part of VoltDB. * Copyright (C) 2008-2017 VoltDB Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with VoltDB. If not, see <http://www.gnu.org/licenses/>. */ package org.voltdb.iv2; import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Queue; import java.util.TreeMap; import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import org.voltcore.logging.VoltLogger; import org.voltcore.messaging.HostMessenger; import org.voltcore.messaging.TransactionInfoBaseMessage; import org.voltcore.messaging.VoltMessage; import org.voltcore.utils.CoreUtils; import org.voltdb.ClientResponseImpl; import org.voltdb.CommandLog; import org.voltdb.CommandLog.DurabilityListener; import org.voltdb.Consistency; import org.voltdb.Consistency.ReadLevel; import org.voltdb.RealVoltDB; import org.voltdb.SnapshotCompletionInterest; import org.voltdb.SnapshotCompletionMonitor; import org.voltdb.SystemProcedureCatalog; import org.voltdb.VoltDB; import org.voltdb.VoltTable; import org.voltdb.client.ClientResponse; import org.voltdb.dtxn.TransactionState; import org.voltdb.iv2.SiteTasker.SiteTaskerRunnable; import org.voltdb.messaging.BorrowTaskMessage; import org.voltdb.messaging.CompleteTransactionMessage; import org.voltdb.messaging.CompleteTransactionResponseMessage; import org.voltdb.messaging.DummyTransactionResponseMessage; import org.voltdb.messaging.DummyTransactionTaskMessage; import org.voltdb.messaging.DumpMessage; import org.voltdb.messaging.FragmentResponseMessage; import org.voltdb.messaging.FragmentTaskMessage; import org.voltdb.messaging.InitiateResponseMessage; import org.voltdb.messaging.Iv2InitiateTaskMessage; import org.voltdb.messaging.Iv2LogFaultMessage; import org.voltdb.messaging.MultiPartitionParticipantMessage; import org.voltdb.messaging.RepairLogTruncationMessage; import org.voltdb.utils.MiscUtils; import org.voltdb.utils.VoltTrace; import com.google_voltpatches.common.primitives.Ints; import com.google_voltpatches.common.primitives.Longs; import com.google_voltpatches.common.util.concurrent.ListenableFuture; import com.google_voltpatches.common.util.concurrent.SettableFuture; public class SpScheduler extends Scheduler implements SnapshotCompletionInterest { static final VoltLogger tmLog = new VoltLogger("TM"); static class DuplicateCounterKey implements Comparable<DuplicateCounterKey> { private final long m_txnId; private final long m_spHandle; DuplicateCounterKey(long txnId, long spHandle) { m_txnId = txnId; m_spHandle = spHandle; } @Override public boolean equals(Object o) { try { DuplicateCounterKey other = (DuplicateCounterKey) o; return (m_txnId == other.m_txnId && m_spHandle == other.m_spHandle); } catch (Exception e) { return false; } } // Only care about comparing TXN ID part for sorting in 
// updateReplicas
        @Override
        public int compareTo(DuplicateCounterKey o) {
            if (m_txnId < o.m_txnId) {
                return -1;
            } else if (m_txnId > o.m_txnId) {
                return 1;
            } else {
                if (m_spHandle < o.m_spHandle) {
                    return -1;
                } else if (m_spHandle > o.m_spHandle) {
                    return 1;
                } else {
                    return 0;
                }
            }
        }

        @Override
        public int hashCode() {
            assert(false) : "Hashing this is unsafe as it can't promise no collisions.";
            throw new UnsupportedOperationException(
                    "Hashing this is unsafe as it can't promise no collisions.");
        }

        @Override
        public String toString() {
            return "<" + TxnEgo.txnIdToString(m_txnId) + ", " + TxnEgo.txnIdToString(m_spHandle) + ">";
        }
    };

    public interface DurableUniqueIdListener {
        /**
         * Notify listener of the last durable single-part and multi-part uniqueIds.
         */
        public void lastUniqueIdsMadeDurable(long spUniqueId, long mpUniqueId);
    }

    List<Long> m_replicaHSIds = new ArrayList<Long>();
    long[] m_sendToHSIds = new long[0];

    private final TransactionTaskQueue m_pendingTasks;
    private final Map<Long, TransactionState> m_outstandingTxns =
            new HashMap<Long, TransactionState>();
    private final Map<DuplicateCounterKey, DuplicateCounter> m_duplicateCounters =
            new TreeMap<DuplicateCounterKey, DuplicateCounter>();
    // MP fragment tasks or completion tasks pending durability
    private final Map<Long, Queue<TransactionTask>> m_mpsPendingDurability =
            new HashMap<Long, Queue<TransactionTask>>();
    private CommandLog m_cl;
    private final SnapshotCompletionMonitor m_snapMonitor;

    // Used to decide if we should shortcut reads
    private Consistency.ReadLevel m_defaultConsistencyReadLevel;
    private BufferedReadLog m_bufferedReadLog = null;

    // Need to track when command log replay is complete (even if not performed) so that
    // we know when we can start writing viable replay sets to the fault log.
    boolean m_replayComplete = false;
    // The DurabilityListener is not thread-safe. Access it only on the Site thread.
    private final DurabilityListener m_durabilityListener;
    // Generator of pre-IV2-ish, timestamp-based unique IDs
    private final UniqueIdGenerator m_uniqueIdGenerator;

    // The current point in the repair log below which entries are no longer needed.
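    // Roughly: everything at or below this handle has been acknowledged by all live replicas of this
    // partition (or completed locally when there are none), so the repair log can drop it and buffered
    // SAFE reads up to it can be released (see setRepairLogTruncationHandle below).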
long m_repairLogTruncationHandle = Long.MIN_VALUE; // the truncation handle last sent to the replicas long m_lastSentTruncationHandle = Long.MIN_VALUE; // the max schedule transaction sphandle, multi-fragments mp txn counts one long m_maxScheduledTxnSpHandle = Long.MIN_VALUE; SpScheduler(int partitionId, SiteTaskerQueue taskQueue, SnapshotCompletionMonitor snapMonitor) { super(partitionId, taskQueue); m_pendingTasks = new TransactionTaskQueue(m_tasks); m_snapMonitor = snapMonitor; m_durabilityListener = new SpDurabilityListener(this, m_pendingTasks); m_uniqueIdGenerator = new UniqueIdGenerator(partitionId, 0); // try to get the global default setting for read consistency, but fall back to SAFE m_defaultConsistencyReadLevel = VoltDB.Configuration.getDefaultReadConsistencyLevel(); if (m_defaultConsistencyReadLevel == ReadLevel.SAFE) { m_bufferedReadLog = new BufferedReadLog(); } m_repairLogTruncationHandle = getCurrentTxnId(); // initialized as current txn id in order to release the initial reads into the system m_maxScheduledTxnSpHandle = getCurrentTxnId(); } @Override public void setLeaderState(boolean isLeader) { super.setLeaderState(isLeader); m_snapMonitor.addInterest(this); } @Override public void setMaxSeenTxnId(long maxSeenTxnId) { super.setMaxSeenTxnId(maxSeenTxnId); writeIv2ViableReplayEntry(); } @Override public void setDurableUniqueIdListener(final DurableUniqueIdListener listener) { m_tasks.offer(new SiteTaskerRunnable() { @Override void run() { m_durabilityListener.setUniqueIdListener(listener); } }); } @Override public void shutdown() { m_tasks.offer(m_nullTask); } // This is going to run in the BabySitter's thread. This and deliver are synchronized by // virtue of both being called on InitiatorMailbox and not directly called. // (That is, InitiatorMailbox's API, used by BabySitter, is synchronized on the same // lock deliver() is synchronized on.) @Override public void updateReplicas(List<Long> replicas, Map<Integer, Long> partitionMasters) { // First - correct the official replica set. m_replicaHSIds = replicas; // Update the list of remote replicas that we'll need to send to List<Long> sendToHSIds = new ArrayList<Long>(m_replicaHSIds); sendToHSIds.remove(m_mailbox.getHSId()); m_sendToHSIds = Longs.toArray(sendToHSIds); // Cleanup duplicate counters and collect DONE counters // in this list for further processing. List<DuplicateCounterKey> doneCounters = new LinkedList<DuplicateCounterKey>(); for (Entry<DuplicateCounterKey, DuplicateCounter> entry : m_duplicateCounters.entrySet()) { DuplicateCounter counter = entry.getValue(); int result = counter.updateReplicas(m_replicaHSIds); if (result == DuplicateCounter.DONE) { doneCounters.add(entry.getKey()); } } // Maintain the CI invariant that responses arrive in txnid order. Collections.sort(doneCounters); for (DuplicateCounterKey key : doneCounters) { DuplicateCounter counter = m_duplicateCounters.remove(key); final TransactionState txn = m_outstandingTxns.get(key.m_txnId); if (txn == null || txn.isDone()) { m_outstandingTxns.remove(key.m_txnId); // for MP write txns, we should use it's first SpHandle in the TransactionState // for SP write txns, we can just use the SpHandle from the DuplicateCounterKey long m_safeSpHandle = txn == null ? key.m_spHandle: txn.m_spHandle; setRepairLogTruncationHandle(m_safeSpHandle); } VoltMessage resp = counter.getLastResponse(); if (resp != null) { // MPI is tracking deps per partition HSID. 
We need to make // sure we write ours into the message getting sent to the MPI if (resp instanceof FragmentResponseMessage) { FragmentResponseMessage fresp = (FragmentResponseMessage)resp; fresp.setExecutorSiteId(m_mailbox.getHSId()); } m_mailbox.send(counter.m_destinationId, resp); } else { hostLog.warn("TXN " + counter.getTxnId() + " lost all replicas and " + "had no responses. This should be impossible?"); } } SettableFuture<Boolean> written = writeIv2ViableReplayEntry(); // Get the fault log status here to ensure the leader has written it to disk // before initiating transactions again. blockFaultLogWriteStatus(written); } /** * Poll the replay sequencer and process the messages until it returns null */ private void deliverReadyTxns() { // First, pull all the sequenced messages, if any. VoltMessage m = m_replaySequencer.poll(); while(m != null) { deliver(m); m = m_replaySequencer.poll(); } // Then, try to pull all the drainable messages, if any. m = m_replaySequencer.drain(); while (m != null) { if (m instanceof Iv2InitiateTaskMessage) { // Send IGNORED response for all SPs Iv2InitiateTaskMessage task = (Iv2InitiateTaskMessage) m; final InitiateResponseMessage response = new InitiateResponseMessage(task); response.setResults(new ClientResponseImpl(ClientResponse.UNEXPECTED_FAILURE, new VoltTable[0], ClientResponseImpl.IGNORED_TRANSACTION)); m_mailbox.send(response.getInitiatorHSId(), response); } m = m_replaySequencer.drain(); } } /** * Sequence the message for replay if it's for CL or DR. * * @param message * @return true if the message can be delivered directly to the scheduler, * false if the message is queued */ @Override public boolean sequenceForReplay(VoltMessage message) { boolean canDeliver = false; long sequenceWithUniqueId = Long.MIN_VALUE; boolean commandLog = (message instanceof TransactionInfoBaseMessage && (((TransactionInfoBaseMessage)message).isForReplay())); boolean sentinel = message instanceof MultiPartitionParticipantMessage; boolean replay = commandLog || sentinel; boolean sequenceForReplay = m_isLeader && replay; if (replay) { sequenceWithUniqueId = ((TransactionInfoBaseMessage)message).getUniqueId(); } if (sequenceForReplay) { InitiateResponseMessage dupe = m_replaySequencer.dedupe(sequenceWithUniqueId, (TransactionInfoBaseMessage) message); if (dupe != null) { // Duplicate initiate task message, send response m_mailbox.send(dupe.getInitiatorHSId(), dupe); } else if (!m_replaySequencer.offer(sequenceWithUniqueId, (TransactionInfoBaseMessage) message)) { canDeliver = true; } else { deliverReadyTxns(); } // If it's a DR sentinel, send an acknowledgement if (sentinel && !commandLog) { MultiPartitionParticipantMessage mppm = (MultiPartitionParticipantMessage) message; final InitiateResponseMessage response = new InitiateResponseMessage(mppm); ClientResponseImpl clientResponse = new ClientResponseImpl(ClientResponseImpl.UNEXPECTED_FAILURE, new VoltTable[0], ClientResponseImpl.IGNORED_TRANSACTION); response.setResults(clientResponse); m_mailbox.send(response.getInitiatorHSId(), response); } } else { if (replay) { // Update last seen and last polled uniqueId for replicas m_replaySequencer.updateLastSeenUniqueId(sequenceWithUniqueId, (TransactionInfoBaseMessage) message); m_replaySequencer.updateLastPolledUniqueId(sequenceWithUniqueId, (TransactionInfoBaseMessage) message); } canDeliver = true; } return canDeliver; } // SpInitiators will see every message type. 
The Responses currently come // from local work, but will come from replicas when replication is // implemented @Override public void deliver(VoltMessage message) { if (message instanceof Iv2InitiateTaskMessage) { handleIv2InitiateTaskMessage((Iv2InitiateTaskMessage)message); } else if (message instanceof InitiateResponseMessage) { handleInitiateResponseMessage((InitiateResponseMessage)message); } else if (message instanceof FragmentTaskMessage) { handleFragmentTaskMessage((FragmentTaskMessage)message); } else if (message instanceof FragmentResponseMessage) { handleFragmentResponseMessage((FragmentResponseMessage)message); } else if (message instanceof CompleteTransactionMessage) { handleCompleteTransactionMessage((CompleteTransactionMessage)message); } else if (message instanceof CompleteTransactionResponseMessage) { handleCompleteTransactionResponseMessage((CompleteTransactionResponseMessage) message); } else if (message instanceof BorrowTaskMessage) { handleBorrowTaskMessage((BorrowTaskMessage)message); } else if (message instanceof Iv2LogFaultMessage) { handleIv2LogFaultMessage((Iv2LogFaultMessage)message); } else if (message instanceof DumpMessage) { handleDumpMessage(); } else if (message instanceof DummyTransactionTaskMessage) { handleDummyTransactionTaskMessage((DummyTransactionTaskMessage) message); } else if (message instanceof DummyTransactionResponseMessage) { handleDummyTransactionResponseMessage((DummyTransactionResponseMessage)message); } else { throw new RuntimeException("UNKNOWN MESSAGE TYPE, BOOM!"); } } // SpScheduler expects to see InitiateTaskMessages corresponding to single-partition // procedures only. private void handleIv2InitiateTaskMessage(Iv2InitiateTaskMessage message) { if (!message.isSinglePartition()) { throw new RuntimeException("SpScheduler.handleIv2InitiateTaskMessage " + "should never receive multi-partition initiations."); } final String procedureName = message.getStoredProcedureName(); long newSpHandle; long uniqueId = Long.MIN_VALUE; Iv2InitiateTaskMessage msg = message; if (m_isLeader || message.isReadOnly()) { /* * A short circuit read is a read where the client interface is local to * this node. The CI will let a replica perform a read in this case and * it does looser tracking of client handles since it can't be * partitioned from the local replica. */ if (!m_isLeader && CoreUtils.getHostIdFromHSId(msg.getInitiatorHSId()) != CoreUtils.getHostIdFromHSId(m_mailbox.getHSId())) { VoltDB.crashLocalVoltDB("Only allowed to do short circuit reads locally", true, null); } /* * If this is for CL replay or DR, update the unique ID generator */ if (message.isForReplay()) { uniqueId = message.getUniqueId(); try { m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(uniqueId); } catch (Exception e) { hostLog.fatal(e.getMessage()); hostLog.fatal("Invocation: " + message); VoltDB.crashLocalVoltDB(e.getMessage(), true, e); } } /* * If this is CL replay use the txnid from the CL and also * update the txnid to match the one from the CL */ if (message.isForReplay()) { TxnEgo ego = advanceTxnEgo(); newSpHandle = ego.getTxnId(); updateMaxScheduledTransactionSpHandle(newSpHandle); } else if (m_isLeader && !message.isReadOnly()) { TxnEgo ego = advanceTxnEgo(); newSpHandle = ego.getTxnId(); updateMaxScheduledTransactionSpHandle(newSpHandle); uniqueId = m_uniqueIdGenerator.getNextUniqueId(); } else { /* * The SPI read or the short circuit read case. Since we are read only, * do not create new transaction IDs but reuse the last seen * txnid. 
For a timestamp, might as well give a reasonable one * for a read heavy workload so time isn't bursty. */ uniqueId = UniqueIdGenerator.makeIdFromComponents( Math.max(System.currentTimeMillis(), m_uniqueIdGenerator.lastUsedTime), 0, m_uniqueIdGenerator.partitionId); newSpHandle = getMaxScheduledTxnSpHandle(); } // Need to set the SP handle on the received message // Need to copy this or the other local sites handling // the same initiate task message will overwrite each // other's memory -- the message isn't copied on delivery // to other local mailboxes. msg = new Iv2InitiateTaskMessage( message.getInitiatorHSId(), message.getCoordinatorHSId(), getRepairLogTruncationHandleForReplicas(), message.getTxnId(), message.getUniqueId(), message.isReadOnly(), message.isSinglePartition(), message.getStoredProcedureInvocation(), message.getClientInterfaceHandle(), message.getConnectionId(), message.isForReplay()); msg.setSpHandle(newSpHandle); // Also, if this is a vanilla single-part procedure, make the TXNID // be the SpHandle (for now) // Only system procedures are every-site, so we'll check through the SystemProcedureCatalog if (SystemProcedureCatalog.listing.get(procedureName) == null || !SystemProcedureCatalog.listing.get(procedureName).getEverysite()) { msg.setTxnId(newSpHandle); msg.setUniqueId(uniqueId); } // The leader will be responsible to replicate messages to replicas. // Don't replicate reads, not matter FAST or SAFE. if (m_isLeader && (!msg.isReadOnly()) && (m_sendToHSIds.length > 0)) { for (long hsId : m_sendToHSIds) { Iv2InitiateTaskMessage finalMsg = msg; final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.beginAsync("replicateSP", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), hsId, finalMsg.getSpHandle(), finalMsg.getClientInterfaceHandle()), "txnId", TxnEgo.txnIdToString(finalMsg.getTxnId()), "dest", CoreUtils.hsIdToString(hsId))); } } Iv2InitiateTaskMessage replmsg = new Iv2InitiateTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), getRepairLogTruncationHandleForReplicas(), msg.getTxnId(), msg.getUniqueId(), msg.isReadOnly(), msg.isSinglePartition(), msg.getStoredProcedureInvocation(), msg.getClientInterfaceHandle(), msg.getConnectionId(), msg.isForReplay()); // Update the handle in the copy since the constructor doesn't set it replmsg.setSpHandle(newSpHandle); m_mailbox.send(m_sendToHSIds, replmsg); DuplicateCounter counter = new DuplicateCounter( msg.getInitiatorHSId(), msg.getTxnId(), m_replicaHSIds, msg); safeAddToDuplicateCounterMap(new DuplicateCounterKey(msg.getTxnId(), newSpHandle), counter); } } else { setMaxSeenTxnId(msg.getSpHandle()); newSpHandle = msg.getSpHandle(); // Don't update the uniqueID if this is a run-everywhere txn, because it has an MPI unique ID. if (UniqueIdGenerator.getPartitionIdFromUniqueId(msg.getUniqueId()) == m_partitionId) { m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(msg.getUniqueId()); } } Iv2Trace.logIv2InitiateTaskMessage(message, m_mailbox.getHSId(), msg.getTxnId(), newSpHandle); doLocalInitiateOffer(msg); return; } /** * Do the work necessary to turn the Iv2InitiateTaskMessage into a * TransactionTask which can be queued to the TransactionTaskQueue. * This is reused by both the normal message handling path and the repair * path, and assumes that the caller has dealt with or ensured that the * necessary ID, SpHandles, and replication issues are resolved. 
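     * In short: an SpProcedureTask is built for the invocation; FAST reads are offered straight to the
     * task queue, while writes and SAFE reads are first handed to the command log, which either offers
     * the task with a durability backpressure future (async logging) or re-delivers it once durable
     * (sync logging).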
*/ private void doLocalInitiateOffer(Iv2InitiateTaskMessage msg) { final String threadName = Thread.currentThread().getName(); // Thread name has to be materialized here final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.meta("process_name", "name", CoreUtils.getHostnameOrAddress())) .add(() -> VoltTrace.meta("thread_name", "name", threadName)) .add(() -> VoltTrace.meta("thread_sort_index", "sort_index", Integer.toString(10000))) .add(() -> VoltTrace.beginAsync("initsp", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), m_mailbox.getHSId(), msg.getSpHandle(), msg.getClientInterfaceHandle()), "ciHandle", msg.getClientInterfaceHandle(), "txnId", TxnEgo.txnIdToString(msg.getTxnId()), "partition", m_partitionId, "read", msg.isReadOnly(), "name", msg.getStoredProcedureName(), "hsId", CoreUtils.hsIdToString(m_mailbox.getHSId()))); } /** * A shortcut read is a read operation sent to any replica and completed with no * confirmation or communication with other replicas. In a partition scenario, it's * possible to read an unconfirmed transaction's writes that will be lost. */ final boolean shortcutRead = msg.isReadOnly() && (m_defaultConsistencyReadLevel == ReadLevel.FAST); final String procedureName = msg.getStoredProcedureName(); final SpProcedureTask task = new SpProcedureTask(m_mailbox, procedureName, m_pendingTasks, msg); if (!shortcutRead) { ListenableFuture<Object> durabilityBackpressureFuture = m_cl.log(msg, msg.getSpHandle(), null, m_durabilityListener, task); if (traceLog != null && durabilityBackpressureFuture != null) { traceLog.add(() -> VoltTrace.beginAsync("durability", MiscUtils.hsIdTxnIdToString(m_mailbox.getHSId(), msg.getSpHandle()), "txnId", TxnEgo.txnIdToString(msg.getTxnId()), "partition", Integer.toString(m_partitionId))); } //Durability future is always null for sync command logging //the transaction will be delivered again by the CL for execution once durable //Async command logging has to offer the task immediately with a Future for backpressure if (m_cl.canOfferTask()) { m_pendingTasks.offer(task.setDurabilityBackpressureFuture(durabilityBackpressureFuture)); } } else { m_pendingTasks.offer(task); } } @Override public void handleMessageRepair(List<Long> needsRepair, VoltMessage message) { if (message instanceof Iv2InitiateTaskMessage) { handleIv2InitiateTaskMessageRepair(needsRepair, (Iv2InitiateTaskMessage)message); } else if (message instanceof FragmentTaskMessage) { handleFragmentTaskMessageRepair(needsRepair, (FragmentTaskMessage)message); } else if (message instanceof CompleteTransactionMessage) { // It should be safe to just send CompleteTransactionMessages to everyone. handleCompleteTransactionMessage((CompleteTransactionMessage)message); } else { throw new RuntimeException("SpScheduler.handleMessageRepair received unexpected message type: " + message); } } private void handleIv2InitiateTaskMessageRepair(List<Long> needsRepair, Iv2InitiateTaskMessage message) { if (!message.isSinglePartition()) { throw new RuntimeException("SpScheduler.handleIv2InitiateTaskMessageRepair " + "should never receive multi-partition initiations."); } // set up duplicate counter. expect exactly the responses corresponding // to needsRepair. These may, or may not, include the local site. // We currently send the final response into the ether, since we don't // have the original ClientInterface HSID stored. It would be more // useful to have the original ClienInterface HSId somewhere handy. 
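        // In outline: register a duplicate counter keyed on (txnId, spHandle) that expects a response
        // from every HSId in needsRepair, re-offer the work locally if this site is in the repair set,
        // and forward a copy to the remaining replicas. The aggregated response is addressed to
        // VALHALLA because the original ClientInterface HSId is unknown here.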
List<Long> expectedHSIds = new ArrayList<Long>(needsRepair); DuplicateCounter counter = new DuplicateCounter( HostMessenger.VALHALLA, message.getTxnId(), expectedHSIds, message); safeAddToDuplicateCounterMap(new DuplicateCounterKey(message.getTxnId(), message.getSpHandle()), counter); m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(message.getUniqueId()); // is local repair necessary? if (needsRepair.contains(m_mailbox.getHSId())) { needsRepair.remove(m_mailbox.getHSId()); // make a copy because handleIv2 non-repair case does? Iv2InitiateTaskMessage localWork = new Iv2InitiateTaskMessage(message.getInitiatorHSId(), message.getCoordinatorHSId(), message); doLocalInitiateOffer(localWork); } // is remote repair necessary? if (!needsRepair.isEmpty()) { Iv2InitiateTaskMessage replmsg = new Iv2InitiateTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), message); m_mailbox.send(com.google_voltpatches.common.primitives.Longs.toArray(needsRepair), replmsg); } } private void handleFragmentTaskMessageRepair(List<Long> needsRepair, FragmentTaskMessage message) { // set up duplicate counter. expect exactly the responses corresponding // to needsRepair. These may, or may not, include the local site. List<Long> expectedHSIds = new ArrayList<Long>(needsRepair); DuplicateCounter counter = new DuplicateCounter( message.getCoordinatorHSId(), // Assume that the MPI's HSID hasn't changed message.getTxnId(), expectedHSIds, message); safeAddToDuplicateCounterMap(new DuplicateCounterKey(message.getTxnId(), message.getSpHandle()), counter); // is local repair necessary? if (needsRepair.contains(m_mailbox.getHSId())) { // Sanity check that we really need repair. if (m_outstandingTxns.get(message.getTxnId()) != null) { hostLog.warn("SPI repair attempted to repair a fragment which it has already seen. " + "This shouldn't be possible."); // Not sure what to do in this event. Crash for now throw new RuntimeException("Attempted to repair with a fragment we've already seen."); } needsRepair.remove(m_mailbox.getHSId()); // make a copy because handleIv2 non-repair case does? FragmentTaskMessage localWork = new FragmentTaskMessage(message.getInitiatorHSId(), message.getCoordinatorHSId(), message); doLocalFragmentOffer(localWork); } // is remote repair necessary? if (!needsRepair.isEmpty()) { FragmentTaskMessage replmsg = new FragmentTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), message); m_mailbox.send(com.google_voltpatches.common.primitives.Longs.toArray(needsRepair), replmsg); } } // Pass a response through the duplicate counters. private void handleInitiateResponseMessage(InitiateResponseMessage message) { /** * A shortcut read is a read operation sent to any replica and completed with no * confirmation or communication with other replicas. In a partition scenario, it's * possible to read an unconfirmed transaction's writes that will be lost. */ final long spHandle = message.getSpHandle(); final DuplicateCounterKey dcKey = new DuplicateCounterKey(message.getTxnId(), spHandle); DuplicateCounter counter = m_duplicateCounters.get(dcKey); final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); // All reads will have no duplicate counter. // Avoid all the lookup below. // Also, don't update the truncation handle, since it won't have meaning for anyone. 
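        // Read-only responses take one of two paths: FAST reads are returned to the ClientInterface
        // right away, while SAFE reads (routed to the leader only) are buffered in m_bufferedReadLog
        // until the repair log truncation handle catches up to them.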
if (message.isReadOnly()) { if (traceLog != null) { traceLog.add(() -> VoltTrace.endAsync("initsp", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), message.m_sourceHSId, message.getSpHandle(), message.getClientInterfaceHandle()))); } if (m_defaultConsistencyReadLevel == ReadLevel.FAST) { // the initiatorHSId is the ClientInterface mailbox. m_mailbox.send(message.getInitiatorHSId(), message); return; } if (m_defaultConsistencyReadLevel == ReadLevel.SAFE) { // InvocationDispatcher routes SAFE reads to SPI only assert(m_isLeader); assert(m_bufferedReadLog != null); m_bufferedReadLog.offer(m_mailbox, message, m_repairLogTruncationHandle); return; } } if (counter != null) { String traceName = "initsp"; if (message.m_sourceHSId != m_mailbox.getHSId()) { traceName = "replicatesp"; } String finalTraceName = traceName; if (traceLog != null) { traceLog.add(() -> VoltTrace.endAsync(finalTraceName, MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), message.m_sourceHSId, message.getSpHandle(), message.getClientInterfaceHandle()), "hash", message.getClientResponseData().getHashes()[0])); } int result = counter.offer(message); if (result == DuplicateCounter.DONE) { m_duplicateCounters.remove(dcKey); setRepairLogTruncationHandle(spHandle); m_mailbox.send(counter.m_destinationId, counter.getLastResponse()); } else if (result == DuplicateCounter.MISMATCH) { RealVoltDB.printDiagnosticInformation(VoltDB.instance().getCatalogContext(), counter.getStoredProcedureName(), m_procSet); VoltDB.crashGlobalVoltDB("HASH MISMATCH: replicas produced different results.", true, null); } else if (result == DuplicateCounter.ABORT) { RealVoltDB.printDiagnosticInformation(VoltDB.instance().getCatalogContext(), counter.getStoredProcedureName(), m_procSet); VoltDB.crashGlobalVoltDB("PARTIAL ROLLBACK/ABORT: transaction succeeded on one replica but failed on another replica.", true, null); } } else { if (traceLog != null) { traceLog.add(() -> VoltTrace.endAsync("initsp", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), message.m_sourceHSId, message.getSpHandle(), message.getClientInterfaceHandle()))); } // the initiatorHSId is the ClientInterface mailbox. // this will be on SPI without k-safety or replica only with k-safety assert(!message.isReadOnly()); setRepairLogTruncationHandle(spHandle); m_mailbox.send(message.getInitiatorHSId(), message); } } // BorrowTaskMessages encapsulate a FragmentTaskMessage along with // input dependency tables. The MPI issues borrows to a local site // to perform replicated reads or aggregation fragment work. private void handleBorrowTaskMessage(BorrowTaskMessage message) { // borrows do not advance the sp handle. The handle would // move backwards anyway once the next message is received // from the SP leader. 
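        // Instead, reuse the highest spHandle scheduled so far; it is only used for trace logging and
        // for the transient BorrowTransactionState created below.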
long newSpHandle = getMaxScheduledTxnSpHandle(); Iv2Trace.logFragmentTaskMessage(message.getFragmentTaskMessage(), m_mailbox.getHSId(), newSpHandle, true); final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.beginAsync("recvfragment", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), m_mailbox.getHSId(), newSpHandle, 0), "txnId", TxnEgo.txnIdToString(message.getTxnId()), "partition", m_partitionId, "hsId", CoreUtils.hsIdToString(m_mailbox.getHSId()))); } TransactionState txn = m_outstandingTxns.get(message.getTxnId()); if (txn == null) { // If the borrow is the first fragment for a transaction, run it as // a single partition fragment; Must not engage/pause this // site on a MP transaction before the SP instructs to do so. // Do not track the borrow task as outstanding - it completes // immediately and is not a valid transaction state for // full MP participation (it claims everything can run as SP). txn = new BorrowTransactionState(newSpHandle, message); } // BorrowTask is a read only task embedded in a MP transaction // and its response (FragmentResponseMessage) should not be buffered if (message.getFragmentTaskMessage().isSysProcTask()) { final SysprocFragmentTask task = new SysprocFragmentTask(m_mailbox, (ParticipantTransactionState)txn, m_pendingTasks, message.getFragmentTaskMessage(), message.getInputDepMap()); task.setResponseNotBufferable(); m_pendingTasks.offer(task); } else { final FragmentTask task = new FragmentTask(m_mailbox, (ParticipantTransactionState)txn, m_pendingTasks, message.getFragmentTaskMessage(), message.getInputDepMap()); task.setResponseNotBufferable(); m_pendingTasks.offer(task); } } // SpSchedulers will see FragmentTaskMessage for: // - The scatter fragment(s) of a multi-part transaction (normal or sysproc) // - Borrow tasks to do the local fragment work if this partition is the // buddy of the MPI. Borrow tasks may include input dependency tables for // aggregation fragments, or not, if it's a replicated table read. // For multi-batch MP transactions, we'll need to look up the transaction state // that gets created when the first batch arrives. // During command log replay a new SP handle is going to be generated, but it really // doesn't matter, it isn't going to be used for anything. void handleFragmentTaskMessage(FragmentTaskMessage message) { FragmentTaskMessage msg = message; long newSpHandle; if (m_isLeader) { // Quick hack to make progress...we need to copy the FragmentTaskMessage // before we start mucking with its state (SPHANDLE). We need to revisit // all the messaging mess at some point. msg = new FragmentTaskMessage(message.getInitiatorHSId(), message.getCoordinatorHSId(), message); //Not going to use the timestamp from the new Ego because the multi-part timestamp is what should be used if (!message.isReadOnly()) { TxnEgo ego = advanceTxnEgo(); newSpHandle = ego.getTxnId(); if (m_outstandingTxns.get(msg.getTxnId()) == null) { updateMaxScheduledTransactionSpHandle(newSpHandle); } } else { newSpHandle = getMaxScheduledTxnSpHandle(); } msg.setSpHandle(newSpHandle); if (msg.getInitiateTask() != null) { msg.getInitiateTask().setSpHandle(newSpHandle);//set the handle //Trigger reserialization so the new handle is used msg.setStateForDurability(msg.getInitiateTask(), msg.getInvolvedPartitions()); } /* * If there a replicas to send it to, forward it! * Unless... it's read only AND not a sysproc. Read only sysprocs may expect to be sent * everywhere. 
* In that case don't propagate it to avoid a determinism check and extra messaging overhead */ if (m_sendToHSIds.length > 0 && (!message.isReadOnly() || msg.isSysProcTask())) { for (long hsId : m_sendToHSIds) { FragmentTaskMessage finalMsg = msg; final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.beginAsync("replicatefragment", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), hsId, finalMsg.getSpHandle(), finalMsg.getTxnId()), "txnId", TxnEgo.txnIdToString(finalMsg.getTxnId()), "dest", CoreUtils.hsIdToString(hsId))); } } FragmentTaskMessage replmsg = new FragmentTaskMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), msg); m_mailbox.send(m_sendToHSIds, replmsg); DuplicateCounter counter; /* * Non-determinism should be impossible to happen with MP fragments. * if you see "MP_DETERMINISM_ERROR" as procedure name in the crash logs * something has horribly gone wrong. */ if (message.getFragmentTaskType() != FragmentTaskMessage.SYS_PROC_PER_SITE) { counter = new DuplicateCounter( msg.getCoordinatorHSId(), msg.getTxnId(), m_replicaHSIds, message); } else { counter = new SysProcDuplicateCounter( msg.getCoordinatorHSId(), msg.getTxnId(), m_replicaHSIds, message); } safeAddToDuplicateCounterMap(new DuplicateCounterKey(message.getTxnId(), newSpHandle), counter); } } else { newSpHandle = msg.getSpHandle(); setMaxSeenTxnId(newSpHandle); } Iv2Trace.logFragmentTaskMessage(message, m_mailbox.getHSId(), newSpHandle, false); doLocalFragmentOffer(msg); } /** * Do the work necessary to turn the FragmentTaskMessage into a * TransactionTask which can be queued to the TransactionTaskQueue. * This is reused by both the normal message handling path and the repair * path, and assumes that the caller has dealt with or ensured that the * necessary ID, SpHandles, and replication issues are resolved. */ private void doLocalFragmentOffer(FragmentTaskMessage msg) { final String threadName = Thread.currentThread().getName(); // Thread name has to be materialized here final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.meta("process_name", "name", CoreUtils.getHostnameOrAddress())) .add(() -> VoltTrace.meta("thread_name", "name", threadName)) .add(() -> VoltTrace.meta("thread_sort_index", "sort_index", Integer.toString(10000))) .add(() -> VoltTrace.beginAsync("recvfragment", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), m_mailbox.getHSId(), msg.getSpHandle(), msg.getTxnId()), "txnId", TxnEgo.txnIdToString(msg.getTxnId()), "partition", m_partitionId, "hsId", CoreUtils.hsIdToString(m_mailbox.getHSId()), "final", msg.isFinalTask())); } TransactionState txn = m_outstandingTxns.get(msg.getTxnId()); boolean logThis = false; // bit of a hack...we will probably not want to create and // offer FragmentTasks for txn ids that don't match if we have // something in progress already if (txn == null) { txn = new ParticipantTransactionState(msg.getSpHandle(), msg, msg.isReadOnly()); m_outstandingTxns.put(msg.getTxnId(), txn); // Only want to send things to the command log if it satisfies this predicate // AND we've never seen anything for this transaction before. We can't // actually log until we create a TransactionTask, though, so just keep track // of whether it needs to be done. // Like SP, we should log writes and safe reads. // Fast reads can be directly put on the task queue. 
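        // If the fragment must be logged but synchronous command logging cannot accept the task yet,
        // an empty queue is registered in m_mpsPendingDurability below so that later fragments and the
        // completion for this MP txn wait for the first fragment's durability
        // (see queueOrOfferMPTask and offerPendingMPTasks).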
boolean shortcutRead = msg.isReadOnly() && (m_defaultConsistencyReadLevel == ReadLevel.FAST); logThis = !shortcutRead; } // Check to see if this is the final task for this txn, and if so, if we can close it out early // Right now, this just means read-only. // NOTE: this overlaps slightly with CompleteTransactionMessage handling completion. It's so tiny // that for now, meh, but if this scope grows then it should get refactored out if (msg.isFinalTask() && txn.isReadOnly()) { m_outstandingTxns.remove(msg.getTxnId()); } TransactionTask task; if (msg.isSysProcTask()) { task = new SysprocFragmentTask(m_mailbox, (ParticipantTransactionState)txn, m_pendingTasks, msg, null); } else { task = new FragmentTask(m_mailbox, (ParticipantTransactionState)txn, m_pendingTasks, msg, null); } if (logThis) { ListenableFuture<Object> durabilityBackpressureFuture = m_cl.log(msg.getInitiateTask(), msg.getSpHandle(), Ints.toArray(msg.getInvolvedPartitions()), m_durabilityListener, task); if (traceLog != null && durabilityBackpressureFuture != null) { traceLog.add(() -> VoltTrace.beginAsync("durability", MiscUtils.hsIdTxnIdToString(m_mailbox.getHSId(), msg.getSpHandle()), "txnId", TxnEgo.txnIdToString(msg.getTxnId()), "partition", Integer.toString(m_partitionId))); } //Durability future is always null for sync command logging //the transaction will be delivered again by the CL for execution once durable //Async command logging has to offer the task immediately with a Future for backpressure if (m_cl.canOfferTask()) { m_pendingTasks.offer(task.setDurabilityBackpressureFuture(durabilityBackpressureFuture)); } else { /* Getting here means that the task is the first fragment of an MP txn and * synchronous command logging is on, so create a backlog for future tasks of * this MP arrived before it's marked durable. * * This is important for synchronous command logging and MP txn restart. Without * this, a restarted MP txn may not be gated by logging of the first fragment. */ assert !m_mpsPendingDurability.containsKey(task.getTxnId()); m_mpsPendingDurability.put(task.getTxnId(), new ArrayDeque<TransactionTask>()); } } else { queueOrOfferMPTask(task); } } /** * Offer all fragment tasks and complete transaction tasks queued for durability for the given * MP transaction, and remove the entry from the pending map so that future ones won't be * queued. * * @param txnId The MP transaction ID. */ public void offerPendingMPTasks(long txnId) { Queue<TransactionTask> pendingTasks = m_mpsPendingDurability.get(txnId); if (pendingTasks != null) { for (TransactionTask task : pendingTasks) { if (task instanceof SpProcedureTask) { final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.endAsync("durability", MiscUtils.hsIdTxnIdToString(m_mailbox.getHSId(), task.getSpHandle()))); } } else if (task instanceof FragmentTask) { final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.endAsync("durability", MiscUtils.hsIdTxnIdToString(m_mailbox.getHSId(), ((FragmentTask) task).m_fragmentMsg.getSpHandle()))); } } m_pendingTasks.offer(task); } m_mpsPendingDurability.remove(txnId); } } /** * Check if the MP task has to be queued because the first fragment is still being logged * synchronously to the command log. If not, offer it to the transaction task queue. 
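     * Tasks queued behind a pending first fragment are later drained, in arrival order, by
     * offerPendingMPTasks(long), which is expected to run once that first fragment becomes durable.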
* * @param task A fragment task or a complete transaction task */ private void queueOrOfferMPTask(TransactionTask task) { // The pending map will only have an entry for the transaction if the first fragment is // still pending durability. Queue<TransactionTask> pendingTasks = m_mpsPendingDurability.get(task.getTxnId()); if (pendingTasks != null) { pendingTasks.offer(task); } else { m_pendingTasks.offer(task); } } // Eventually, the master for a partition set will need to be able to dedupe // FragmentResponses from its replicas. private void handleFragmentResponseMessage(FragmentResponseMessage message) { final TransactionState txnState = m_outstandingTxns.get(message.getTxnId()); final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); // Send the message to the duplicate counter, if any DuplicateCounter counter = m_duplicateCounters.get(new DuplicateCounterKey(message.getTxnId(), message.getSpHandle())); final TransactionState txn = m_outstandingTxns.get(message.getTxnId()); if (counter != null) { String traceName = "recvfragment"; if (message.m_sourceHSId != m_mailbox.getHSId()) { traceName = "replicatefragment"; } String finalTraceName = traceName; if (traceLog != null) { traceLog.add(() -> VoltTrace.endAsync(finalTraceName, MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), message.m_sourceHSId, message.getSpHandle(), message.getTxnId()), "status", message.getStatusCode())); } int result = counter.offer(message); if (result == DuplicateCounter.DONE) { if (txn != null && txn.isDone()) { setRepairLogTruncationHandle(txn.m_spHandle); } m_duplicateCounters.remove(new DuplicateCounterKey(message.getTxnId(), message.getSpHandle())); FragmentResponseMessage resp = (FragmentResponseMessage)counter.getLastResponse(); // MPI is tracking deps per partition HSID. We need to make // sure we write ours into the message getting sent to the MPI resp.setExecutorSiteId(m_mailbox.getHSId()); m_mailbox.send(counter.m_destinationId, resp); } else if (result == DuplicateCounter.MISMATCH) { VoltDB.crashGlobalVoltDB("HASH MISMATCH running multi-part procedure.", true, null); } else if (result == DuplicateCounter.ABORT) { VoltDB.crashGlobalVoltDB("PARTIAL ROLLBACK/ABORT running multi-part procedure.", true, null); } // doing duplicate suppression: all done. return; } // No k-safety means no replica: read/write queries on master. // K-safety: read-only queries (on master) or write queries (on replica). if (m_defaultConsistencyReadLevel == ReadLevel.SAFE && m_isLeader && m_sendToHSIds.length > 0 && message.getRespBufferable() && (txn == null || txn.isReadOnly()) ) { // on k-safety leader with safe reads configuration: one shot reads + normal multi-fragments MP reads // we will have to buffer these reads until previous writes acked in the cluster. long readTxnId = txn == null ? 
message.getSpHandle() : txn.m_spHandle; m_bufferedReadLog.offer(m_mailbox, message, readTxnId, m_repairLogTruncationHandle); return; } // for complete writes txn, we will advance the transaction point if (txn != null && !txn.isReadOnly() && txn.isDone()) { setRepairLogTruncationHandle(txn.m_spHandle); } if (traceLog != null) { traceLog.add(() -> VoltTrace.endAsync("recvfragment", MiscUtils.hsIdPairTxnIdToString(m_mailbox.getHSId(), message.m_sourceHSId, message.getSpHandle(), message.getTxnId()), "status", message.getStatusCode())); } m_mailbox.send(message.getDestinationSiteId(), message); } private void handleCompleteTransactionMessage(CompleteTransactionMessage message) { CompleteTransactionMessage msg = message; if (m_isLeader) { msg = new CompleteTransactionMessage(m_mailbox.getHSId(), m_mailbox.getHSId(), message); // Set the spHandle so that on repair the new master will set the max seen spHandle // correctly advanceTxnEgo(); msg.setSpHandle(getCurrentTxnId()); if (m_sendToHSIds.length > 0 && !msg.isReadOnly()) { m_mailbox.send(m_sendToHSIds, msg); } } else { setMaxSeenTxnId(msg.getSpHandle()); } TransactionState txn = m_outstandingTxns.get(msg.getTxnId()); // We can currently receive CompleteTransactionMessages for multipart procedures // which only use the buddy site (replicated table read). Ignore them for // now, fix that later. if (txn != null) { final FragmentTaskMessage frag = (FragmentTaskMessage) txn.getNotice(); CompleteTransactionMessage finalMsg = msg; final VoltTrace.TraceEventBatch traceLog = VoltTrace.log(VoltTrace.Category.SPI); if (traceLog != null) { traceLog.add(() -> VoltTrace.instant("recvCompleteTxn", "txnId", TxnEgo.txnIdToString(finalMsg.getTxnId()), "partition", Integer.toString(m_partitionId), "hsId", CoreUtils.hsIdToString(m_mailbox.getHSId()))); } final boolean isSysproc = ((FragmentTaskMessage) txn.getNotice()).isSysProcTask(); if (m_sendToHSIds.length > 0 && !msg.isRestart() && (!msg.isReadOnly() || isSysproc)) { DuplicateCounter counter; counter = new DuplicateCounter(msg.getCoordinatorHSId(), msg.getTxnId(), m_replicaHSIds, msg); safeAddToDuplicateCounterMap(new DuplicateCounterKey(msg.getTxnId(), msg.getSpHandle()), counter); } Iv2Trace.logCompleteTransactionMessage(msg, m_mailbox.getHSId()); final CompleteTransactionTask task = new CompleteTransactionTask(m_mailbox, txn, m_pendingTasks, msg); queueOrOfferMPTask(task); } else { // Generate a dummy response message when this site has not seen previous FragmentTaskMessage, // the leader may have started to wait for replicas' response messages. // This can happen in the early phase of site rejoin before replica receiving the snapshot initiation, // it also means this CompleteTransactionMessage message will be dropped because it's after snapshot. 
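            // The fabricated response below is run through handleCompleteTransactionResponseMessage so
            // that the leader's duplicate counter for this completion can still reach DONE even though
            // no CompleteTransactionTask executes on this site.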
final CompleteTransactionResponseMessage resp = new CompleteTransactionResponseMessage(msg); resp.m_sourceHSId = m_mailbox.getHSId(); handleCompleteTransactionResponseMessage(resp); } } private void handleCompleteTransactionResponseMessage(CompleteTransactionResponseMessage msg) { final DuplicateCounterKey duplicateCounterKey = new DuplicateCounterKey(msg.getTxnId(), msg.getSpHandle()); DuplicateCounter counter = m_duplicateCounters.get(duplicateCounterKey); boolean txnDone = true; if (msg.isRestart()) { // Don't mark txn done for restarts txnDone = false; } if (counter != null) { txnDone = counter.offer(msg) == DuplicateCounter.DONE; } if (txnDone) { assert !msg.isRestart(); final TransactionState txn = m_outstandingTxns.remove(msg.getTxnId()); m_duplicateCounters.remove(duplicateCounterKey); if (txn != null) { // Set the truncation handle here instead of when processing // FragmentResponseMessage to avoid letting replicas think a // fragment is done before the MP txn is fully committed. assert txn.isDone() : "Counter " + counter + ", leader " + m_isLeader + ", " + msg; setRepairLogTruncationHandle(txn.m_spHandle); } } // The CompleteTransactionResponseMessage ends at the SPI. It is not // sent to the MPI because it doesn't care about it. // // The SPI uses this response message to track if all replicas have // committed the transaction. if (!m_isLeader) { m_mailbox.send(msg.getSPIHSId(), msg); } } /** * Should only receive these messages at replicas, when told by the leader */ private void handleIv2LogFaultMessage(Iv2LogFaultMessage message) { //call the internal log write with the provided SP handle and wait for the fault log IO to complete SettableFuture<Boolean> written = writeIv2ViableReplayEntryInternal(message.getSpHandle()); // Get the Fault Log Status here to ensure the replica completes the log fault task is finished before // it starts processing transactions again blockFaultLogWriteStatus(written); setMaxSeenTxnId(message.getSpHandle()); // Also initialize the unique ID generator and the last durable unique ID using // the value sent by the master m_uniqueIdGenerator.updateMostRecentlyGeneratedUniqueId(message.getSpUniqueId()); m_cl.initializeLastDurableUniqueId(m_durabilityListener, m_uniqueIdGenerator.getLastUniqueId()); } /** * Wait to get the status of a fault log write */ private void blockFaultLogWriteStatus(SettableFuture<Boolean> written) { boolean logWritten = false; if (written != null) { try { logWritten = written.get(); } catch (InterruptedException e) { } catch (ExecutionException e) { if (tmLog.isDebugEnabled()) { tmLog.debug("Could not determine fault log state for partition: " + m_partitionId, e); } } if (!logWritten) { tmLog.warn("Attempted fault log not written for partition: " + m_partitionId); } } } private void handleDumpMessage() { String who = CoreUtils.hsIdToString(m_mailbox.getHSId()); hostLog.warn("State dump for site: " + who); hostLog.warn(who + ": partition: " + m_partitionId + ", isLeader: " + m_isLeader); if (m_isLeader) { hostLog.warn(who + ": replicas: " + CoreUtils.hsIdCollectionToString(m_replicaHSIds)); if (m_sendToHSIds.length > 0) { m_mailbox.send(m_sendToHSIds, new DumpMessage()); } } hostLog.warn(who + ": most recent SP handle: " + TxnEgo.txnIdToString(getCurrentTxnId())); hostLog.warn(who + ": outstanding txns: " + m_outstandingTxns.keySet() + " " + TxnEgo.txnIdCollectionToString(m_outstandingTxns.keySet())); hostLog.warn(who + ": TransactionTaskQueue: " + m_pendingTasks.toString()); if (m_duplicateCounters.size() > 0) { 
hostLog.warn(who + ": duplicate counters: "); for (Entry<DuplicateCounterKey, DuplicateCounter> e : m_duplicateCounters.entrySet()) { hostLog.warn("\t" + who + ": " + e.getKey().toString() + ": " + e.getValue().toString()); } } } private void handleDummyTransactionTaskMessage(DummyTransactionTaskMessage message) { DummyTransactionTaskMessage msg = message; if (m_isLeader) { TxnEgo ego = advanceTxnEgo(); long newSpHandle = ego.getTxnId(); updateMaxScheduledTransactionSpHandle(newSpHandle); // this uniqueId is needed as the command log tracks it (uniqueId has to advance) long uniqueId = m_uniqueIdGenerator.getNextUniqueId(); msg = new DummyTransactionTaskMessage(m_mailbox.getHSId(), newSpHandle, uniqueId); if (m_sendToHSIds.length > 0) { m_mailbox.send(m_sendToHSIds, msg); DuplicateCounter counter = new DuplicateCounter( HostMessenger.VALHALLA, msg.getTxnId(), m_replicaHSIds, msg); safeAddToDuplicateCounterMap(new DuplicateCounterKey(msg.getTxnId(), newSpHandle), counter); } } else { setMaxSeenTxnId(msg.getSpHandle()); } Iv2Trace.logDummyTransactionTaskMessage(msg, m_mailbox.getHSId()); DummyTransactionTask task = new DummyTransactionTask(m_mailbox, new SpTransactionState(msg), m_pendingTasks); // This read only DummyTransactionTask is to help flushing the task queue, // including tasks in command log queue as well. ListenableFuture<Object> durabilityBackpressureFuture = m_cl.log(null, msg.getSpHandle(), null, m_durabilityListener, task); // Durability future is always null for sync command logging // the transaction will be delivered again by the CL for execution once durable // Async command logging has to offer the task immediately with a Future for backpressure if (m_cl.canOfferTask()) { m_pendingTasks.offer(task.setDurabilityBackpressureFuture(durabilityBackpressureFuture)); } } private void handleDummyTransactionResponseMessage(DummyTransactionResponseMessage message) { final long spHandle = message.getSpHandle(); final DuplicateCounterKey dcKey = new DuplicateCounterKey(message.getTxnId(), spHandle); DuplicateCounter counter = m_duplicateCounters.get(dcKey); if (counter == null) { // this will be on SPI without k-safety or replica only with k-safety setRepairLogTruncationHandle(spHandle); if (!m_isLeader) { m_mailbox.send(message.getSPIHSId(), message); } return; } int result = counter.offer(message); if (result == DuplicateCounter.DONE) { // DummyTransactionResponseMessage ends on SPI m_duplicateCounters.remove(dcKey); setRepairLogTruncationHandle(spHandle); } } @Override public void setCommandLog(CommandLog cl) { m_cl = cl; m_durabilityListener.createFirstCompletionCheck(cl.isSynchronous(), cl.isEnabled()); m_cl.registerDurabilityListener(m_durabilityListener); } @Override public void enableWritingIv2FaultLog() { m_replayComplete = true; writeIv2ViableReplayEntry(); } /** * If appropriate, cause the initiator to write the viable replay set to the command log * Use when it's unclear whether the caller is the leader or a replica; the right thing will happen. * * This will return a future to block on for the write on the fault log. If the attempt to write * the replay entry was never followed through due to conditions, it will be null. If the attempt * to write the replay entry went through but could not be done internally, the future will be false. 
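     * Callers that must not initiate transactions before the entry is on disk (e.g. updateReplicas)
     * pass the returned future to blockFaultLogWriteStatus.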
*/ SettableFuture<Boolean> writeIv2ViableReplayEntry() { SettableFuture<Boolean> written = null; if (m_replayComplete) { if (m_isLeader) { // write the viable set locally long faultSpHandle = advanceTxnEgo().getTxnId(); written = writeIv2ViableReplayEntryInternal(faultSpHandle); // Generate Iv2LogFault message and send it to replicas Iv2LogFaultMessage faultMsg = new Iv2LogFaultMessage(faultSpHandle, m_uniqueIdGenerator.getLastUniqueId()); m_mailbox.send(m_sendToHSIds, faultMsg); } } return written; } /** * Write the viable replay set to the command log with the provided SP Handle. * Pass back the future that is set after the fault log is written to disk. */ SettableFuture<Boolean> writeIv2ViableReplayEntryInternal(long spHandle) { SettableFuture<Boolean> written = null; if (m_replayComplete) { written = m_cl.logIv2Fault(m_mailbox.getHSId(), new HashSet<Long>(m_replicaHSIds), m_partitionId, spHandle); } return written; } @Override public CountDownLatch snapshotCompleted(SnapshotCompletionEvent event) { if (event.truncationSnapshot && event.didSucceed) { synchronized(m_lock) { writeIv2ViableReplayEntry(); } } return new CountDownLatch(0); } public void processDurabilityChecks(final CommandLog.CompletionChecks currentChecks) { final SiteTaskerRunnable r = new SiteTasker.SiteTaskerRunnable() { @Override void run() { assert(currentChecks != null); synchronized (m_lock) { currentChecks.processChecks(); } } }; if (InitiatorMailbox.SCHEDULE_IN_SITE_THREAD) { m_tasks.offer(r); } else { r.run(); } } /** * Just using "put" on the dup counter map is unsafe. * It won't detect the case where keys collide from two different transactions. */ void safeAddToDuplicateCounterMap(DuplicateCounterKey dpKey, DuplicateCounter counter) { DuplicateCounter existingDC = m_duplicateCounters.get(dpKey); if (existingDC != null) { // this is a collision and is bad existingDC.logWithCollidingDuplicateCounters(counter); VoltDB.crashGlobalVoltDB("DUPLICATE COUNTER MISMATCH: two duplicate counter keys collided.", true, null); } else { m_duplicateCounters.put(dpKey, counter); } } @Override public void dump() { m_replaySequencer.dump(m_mailbox.getHSId()); tmLog.info(String.format("%s: %s", CoreUtils.hsIdToString(m_mailbox.getHSId()), m_pendingTasks)); if (m_defaultConsistencyReadLevel == ReadLevel.SAFE) { tmLog.info("[dump] current truncation handle: " + TxnEgo.txnIdToString(m_repairLogTruncationHandle) + " " + (m_defaultConsistencyReadLevel == Consistency.ReadLevel.SAFE ? m_bufferedReadLog.toString() : "")); } } // This is for test only public void setConsistentReadLevelForTestOnly(ReadLevel readLevel) { m_defaultConsistencyReadLevel = readLevel; if (m_defaultConsistencyReadLevel == ReadLevel.SAFE) { m_bufferedReadLog = new BufferedReadLog(); } } private void updateMaxScheduledTransactionSpHandle(long newSpHandle) { m_maxScheduledTxnSpHandle = Math.max(m_maxScheduledTxnSpHandle, newSpHandle); } private long getMaxScheduledTxnSpHandle() { return m_maxScheduledTxnSpHandle; } private long getRepairLogTruncationHandleForReplicas() { m_lastSentTruncationHandle = m_repairLogTruncationHandle; return m_repairLogTruncationHandle; } private void setRepairLogTruncationHandle(long newHandle) { if (newHandle > m_repairLogTruncationHandle) { m_repairLogTruncationHandle = newHandle; // We have to advance the local truncation point on the replica. It's important for // node promotion when there are no missing repair log transactions on the replica. 
            // This matters because we still want to release the reads even if no further writes
            // arrive at this replica.
            if (! m_isLeader) {
                return;
            }
            if (m_defaultConsistencyReadLevel == ReadLevel.SAFE) {
                m_bufferedReadLog.releaseBufferedReads(m_mailbox, m_repairLogTruncationHandle);
            }
            scheduleRepairLogTruncateMsg();
        } else {
            // There are cases that can move the truncation handle backwards.
            // These include node failures (promotion phase) and node rejoin (early rejoin phase).
            if (tmLog.isDebugEnabled()) {
                tmLog.debug("Updating truncation point from " + TxnEgo.txnIdToString(m_repairLogTruncationHandle) +
                        " to " + TxnEgo.txnIdToString(newHandle));
            }
        }
    }

    /**
     * Schedules a task to be run on the site to send the latest truncation
     * handle to the replicas. This should be called whenever the local
     * truncation handle advances on the leader to guarantee that the replicas
     * will hear about the new handle in case there are no more transactions to
     * carry the information over.
     *
     * The truncation handle is not sent immediately when this method is called,
     * to avoid sending a message for every committed transaction. In most cases,
     * when there is sufficient load on the system, there will always be a new
     * transaction that this information can piggy-back on. In that case, by the
     * time this task runs on the site, the last sent truncation handle has
     * already advanced, so there is no need to send the message. This has the
     * benefit of sending more truncation messages when the throughput is low,
     * which makes the replicas see committed transactions faster.
     */
    private void scheduleRepairLogTruncateMsg() {
        if (m_sendToHSIds.length == 0) {
            return;
        }

        m_tasks.offer(new SiteTaskerRunnable() {
            @Override
            void run() {
                synchronized (m_lock) {
                    if (m_lastSentTruncationHandle < m_repairLogTruncationHandle) {
                        m_lastSentTruncationHandle = m_repairLogTruncationHandle;
                        final RepairLogTruncationMessage truncMsg = new RepairLogTruncationMessage(m_repairLogTruncationHandle);
                        // Also keep the local repair log's truncation point up-to-date
                        // so that it can trigger the callbacks.
                        m_mailbox.deliver(truncMsg);
                        m_mailbox.send(m_sendToHSIds, truncMsg);
                    }
                }
            }
        });
    }
}