/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.iv2;
import java.util.Deque;
import java.util.LinkedList;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.TransactionInfoBaseMessage;
import org.voltcore.messaging.VoltMessage;
import org.voltcore.utils.CoreUtils;
import org.voltdb.ClientResponseImpl;
import org.voltdb.StoredProcedureInvocation;
import org.voltdb.VoltTable;
import org.voltdb.messaging.CompleteTransactionMessage;
import org.voltdb.messaging.FragmentTaskMessage;
import org.voltdb.messaging.InitiateResponseMessage;
import org.voltdb.messaging.Iv2EndOfLogMessage;
import org.voltdb.messaging.Iv2InitiateTaskMessage;
import org.voltdb.messaging.MultiPartitionParticipantMessage;
/**
 * Orders work for command log replay - where fragment tasks can show up before
 * or after the partition-wise sentinels that record the correct location of a
 * multi-partition work in the partition's transaction sequence.
 *
 * Offer a message to the replay sequencer. If the sequencer rejects this
 * message, it is already correctly sequenced. Callers must check the return
 * code of <code>offer</code>. If offering makes other messages available, they
 * must be retrieved by calling poll() until it returns null.
 *
 * End of log handling: There is no per-partition end of log message any more;
 * only the MPI sends an end of log message. If the MPI reaches end of log, and there is
 * an outstanding sentinel in the sequencer, then all SPs blocked after this
 * sentinel will be drained. There cannot be any fragments in
 * the replay sequencer when the MPI EOL arrives, because the MPI will only send
 * EOLs when it has finished all previous MP work.
 *
 * NOTE: messages are sequenced according to the transactionId passed in to the
 * offer() method. This transaction id may differ from the value stored in the
 * ReplayEntry.m_firstFragment in the case of DR fragment tasks. The
 * ReplaySequencer MUST do all txnId comparisons on the value passed to offer
 * (which becomes a key in the m_replayEntries tree map).
 *
 * Drainable: When poll() should make no more progress, we need to switch to drain().
 * These conditions are only applicable to command log replay and not DR.
 * - If we're blocked on a sentinel with no matching fragment and we've seen the MP EOL condition,
 *   then we know that we're never going to be able to order anything later than that position in
 *   the log, and we need to drain any outstanding invocations so we can respond IGNORING for them
 *   to complete command log replay.
 * - If we're blocked on a fragment with no matching sentinel and we've seen the SP EOL condition,
 *   then we know that we're never going to be able to order anything later than that position in
 *   the log. This is currently defect ENG-4218. We will need to do drain, plus we'll
 *   probably want to respond to any outstanding FragmentTasks with an error response of some kind to abort
 *   those transactions.
 */
public class ReplaySequencer
{
    static final VoltLogger tmLog = new VoltLogger("TM");

    /**
     * Placeholder that associates a sentinel, the first fragment and the
     * work that follows them in the transaction sequence.
     *
     * Declared static because it never reads the enclosing sequencer's state,
     * so it does not need a reference to the outer instance.
     */
    private static class ReplayEntry {
        Long m_sentinelUniqueId = null;
        FragmentTaskMessage m_firstFragment = null;

        /**
         * If this entry is either missing the sentinel or the first fragment, queue up all following Iv2 messages
         * in this queue until the entry has both sentinel and first fragment.
         */
        private final Deque<VoltMessage> m_queuedMessages = new LinkedList<VoltMessage>();
        // true once m_firstFragment has been handed out by poll() or drain()
        private boolean m_servedFragment = false;

        /** @return true when both the sentinel and the first fragment have arrived. */
        boolean isReady()
        {
            return m_sentinelUniqueId != null && m_firstFragment != null;
        }

        /** @return true when the sentinel has been recorded for this entry. */
        boolean hasSentinel()
        {
            return m_sentinelUniqueId != null;
        }

        /** Appends a message behind everything already queued on this entry. */
        void addQueuedMessage(VoltMessage m)
        {
            m_queuedMessages.addLast(m);
        }

        /**
         * Hands out the next message of a ready entry: the first fragment
         * once, then the queued messages in arrival order.
         *
         * @return the next message, or null if the entry is not ready or
         *         has nothing left queued.
         */
        VoltMessage poll()
        {
            if (!isReady()) {
                return null;
            }
            if (m_servedFragment) {
                return m_queuedMessages.poll();
            }
            m_servedFragment = true;
            return m_firstFragment;
        }

        /**
         * Like poll(), but does not require the entry to be ready: an
         * unserved first fragment (if any) comes out first, then the queue.
         *
         * @return the next message, or null when nothing is left.
         */
        VoltMessage drain()
        {
            if (!m_servedFragment && m_firstFragment != null) {
                m_servedFragment = true;
                return m_firstFragment;
            }
            return m_queuedMessages.poll();
        }

        /**
         * @return true when the entry is ready, its first fragment has been
         *         served and no queued messages remain.
         */
        boolean isEmpty() {
            return isReady() && m_servedFragment && m_queuedMessages.isEmpty();
        }

        @Override
        public String toString()
        {
            final String sentinelText = (m_sentinelUniqueId != null)
                    ? UniqueIdGenerator.toString(m_sentinelUniqueId)
                    : "";
            final String servedText = m_servedFragment ? "SERVED FRAGMENT" : "";
            return String.format("(SENTINEL UNIQUEID: %d (%s), %d QUEUED MESSAGES, %s)\n%s",
                    m_sentinelUniqueId,
                    sentinelText,
                    m_queuedMessages.size(),
                    servedText,
                    m_firstFragment);
        }
    }

    // queued entries ordered by unique id (the id passed to offer()).
    TreeMap<Long, ReplayEntry> m_replayEntries = new TreeMap<Long, ReplayEntry>();

    // lastPolledFragmentUniqueId tracks released MP transactions; new fragments
    // for released transactions do not need further sequencing.
    long m_lastPolledFragmentUniqueId = Long.MIN_VALUE;

    // lastSeenUniqueId tracks the last seen uniqueId for this partition
    long m_lastSeenUniqueId = Long.MIN_VALUE;

    // has reached end of log for the MPI, no more fragments or SPs will come,
    // release all txns.
    boolean m_mpiEOLReached = false;

    // some combination of conditions has occurred which will result in no
    // further sequence-able transactions. All remaining invocations in the
    // sequencer must be removed using drain()
    boolean m_mustDrain = false;

    /**
     * Dedupe initiate task messages. Check if the initiate task message is seen before.
     *
     * @param inUniqueId The uniqueId of the message
     * @param in The initiate task message
     * @return A client response to return if it's a duplicate, otherwise null.
     */
    public InitiateResponseMessage dedupe(long inUniqueId, TransactionInfoBaseMessage in)
    {
        if (!(in instanceof Iv2InitiateTaskMessage)) {
            return null;
        }
        final Iv2InitiateTaskMessage init = (Iv2InitiateTaskMessage) in;
        final StoredProcedureInvocation invocation = init.getStoredProcedureInvocation();
        final String procName = invocation.getProcName();

        /*
         * Ning - @LoadSinglepartitionTable and @LoadMultipartitionTable always have the same txnId
         * which is the txnId of the snapshot, so they are never treated as duplicates.
         */
        final boolean isTableLoad = procName.equalsIgnoreCase("@LoadSinglepartitionTable") ||
                                    procName.equalsIgnoreCase("@LoadMultipartitionTable");
        if (!isTableLoad && inUniqueId <= m_lastSeenUniqueId) {
            // already sequenced
            final InitiateResponseMessage resp = new InitiateResponseMessage(init);
            resp.setResults(new ClientResponseImpl(ClientResponseImpl.UNEXPECTED_FAILURE,
                                                   new VoltTable[0],
                                                   ClientResponseImpl.IGNORED_TRANSACTION));
            return resp;
        }
        return null;
    }

    /**
     * Update the last seen uniqueId for this partition if it's an initiate task message.
     * The stored value only ever moves forward.
     *
     * @param inUniqueId The uniqueId of the message
     * @param in The message; ignored unless it is an Iv2InitiateTaskMessage
     */
    public void updateLastSeenUniqueId(long inUniqueId, TransactionInfoBaseMessage in)
    {
        if (in instanceof Iv2InitiateTaskMessage && inUniqueId > m_lastSeenUniqueId) {
            m_lastSeenUniqueId = inUniqueId;
        }
    }

    /**
     * Update the last polled uniqueId for this partition if it's a fragment task message.
     *
     * @param inUniqueId The uniqueId the message was sequenced under
     * @param in The message; ignored (including null) unless it is a FragmentTaskMessage
     */
    public void updateLastPolledUniqueId(long inUniqueId, TransactionInfoBaseMessage in)
    {
        if (in instanceof FragmentTaskMessage) {
            m_lastPolledFragmentUniqueId = inUniqueId;
        }
    }

    /**
     * Return the next correctly sequenced message or null if none exists.
     * Always returns null once the sequencer has switched to drain mode.
     */
    public VoltMessage poll()
    {
        if (m_mustDrain || m_replayEntries.isEmpty()) {
            return null;
        }
        // Discard a head entry that has been completely served.
        if (m_replayEntries.firstEntry().getValue().isEmpty()) {
            m_replayEntries.pollFirstEntry();
        }
        // All the drain conditions depend on being blocked, which
        // we will only really know for sure when we try to poll().
        checkDrainCondition();
        if (m_mustDrain || m_replayEntries.isEmpty()) {
            return null;
        }
        final Entry<Long, ReplayEntry> head = m_replayEntries.firstEntry();
        final VoltMessage m = head.getValue().poll();
        // m may be null (head blocked or exhausted); the update ignores null.
        updateLastPolledUniqueId(head.getKey(), (TransactionInfoBaseMessage) m);
        return m;
    }

    /**
     * Pull the next message that needs an IGNORING response. Once this
     * starts returning messages, poll() will always return null.
     *
     * @return the next drained message, or null if the sequencer is not in
     *         drain mode or has nothing left.
     */
    public VoltMessage drain()
    {
        if (!m_mustDrain || m_replayEntries.isEmpty()) {
            return null;
        }
        VoltMessage head = m_replayEntries.firstEntry().getValue().drain();
        while (head == null) {
            // The current head entry is exhausted; drop it and move on.
            m_replayEntries.pollFirstEntry();
            if (m_replayEntries.isEmpty()) {
                break;
            }
            // This will end up null if the next ReplayEntry was just a sentinel.
            // We'll keep going.
            head = m_replayEntries.firstEntry().getValue().drain();
        }
        return head;
    }

    /**
     * Switches the sequencer into drain mode when the head entry is blocked
     * on a sentinel whose fragment can no longer arrive (MPI EOL was seen).
     */
    private void checkDrainCondition()
    {
        // Don't ever go backwards once the drain decision is made, and
        // there is nothing to check when nothing is being sequenced.
        if (m_mustDrain || m_replayEntries.isEmpty()) {
            return;
        }
        final ReplayEntry head = m_replayEntries.firstEntry().getValue();
        // if we're blocked, see if we have a sentinel or a fragment.
        // we know we have one or the other but not both. Neither
        // means we wouldn't exist, and both would make us ready.
        // if it's the sentinel, see if the MPI's command log is done
        if (!head.isReady() && head.hasSentinel() && m_mpiEOLReached) {
            m_mustDrain = true;
        }
    }

    /**
     * Offer a new message.
     *
     * @param inUniqueId The uniqueId to sequence the message under
     * @param in The offered message
     * @return false if the sequencer rejects the message, meaning it can be
     *         run immediately by the caller; true if the sequencer took it
     *         (queued it, recorded it, or dropped it as a duplicate).
     */
    public boolean offer(long inUniqueId, TransactionInfoBaseMessage in)
    {
        if (in instanceof Iv2EndOfLogMessage) {
            m_mpiEOLReached = true;
            return true;
        }

        final ReplayEntry found = m_replayEntries.get(inUniqueId);

        if (in instanceof MultiPartitionParticipantMessage) {
            //--------------------------------------------
            // DRv1 path, mark for future removal
            /*
             * DR sends multiple @LoadMultipartitionTable proc calls with the
             * same txnId, which is the snapshot txnId. For each partition,
             * there is a sentinel paired with the @LoadMultipartitionTable
             * call. Dedupe the sentinels the same way as we dedupe fragments,
             * so that no sentinels end up in the sequencer whose
             * matching fragments were deduped.
             */
            if (inUniqueId <= m_lastPolledFragmentUniqueId) {
                return true;
            }
            //--------------------------------------------
            if (found == null) {
                ReplayEntry newEntry = new ReplayEntry();
                newEntry.m_sentinelUniqueId = inUniqueId;
                m_replayEntries.put(inUniqueId, newEntry);
            }
            else {
                // An existing entry can only have been created by a fragment.
                found.m_sentinelUniqueId = inUniqueId;
                assert(found.isReady());
            }
        }
        else if (in instanceof FragmentTaskMessage) {
            // already sequenced
            if (inUniqueId <= m_lastPolledFragmentUniqueId) {
                return false;
            }
            FragmentTaskMessage ftm = (FragmentTaskMessage)in;
            if (found == null) {
                ReplayEntry newEntry = new ReplayEntry();
                newEntry.m_firstFragment = ftm;
                m_replayEntries.put(inUniqueId, newEntry);
            }
            else if (found.m_firstFragment == null) {
                // The sentinel arrived first; this fragment completes the entry.
                found.m_firstFragment = ftm;
                assert(found.isReady());
            }
            else {
                found.addQueuedMessage(ftm);
            }
        }
        else if (in instanceof CompleteTransactionMessage) {
            // don't sequence CompleteTransactionMessage, throw them to scheduler directly
            return false;
        }
        else {
            //--------------------------------------------
            // DRv1 path, mark for future removal
            if (dedupe(inUniqueId, in) != null) {
                // Ignore an already seen txn
                return true;
            }
            //--------------------------------------------
            updateLastSeenUniqueId(inUniqueId, in);
            if (m_replayEntries.isEmpty() || !m_replayEntries.lastEntry().getValue().hasSentinel()) {
                // not-blocked work; rejected and not queued.
                return false;
            }
            else {
                // queued the message with the newest replayEntry
                m_replayEntries.lastEntry().getValue().addQueuedMessage(in);
            }
        }
        return true;
    }

    /**
     * Logs the sequencer's state and every outstanding replay entry to the
     * "TM" logger at info level.
     *
     * @param hsId Site id used to label each log line
     */
    public void dump(long hsId)
    {
        final String who = CoreUtils.hsIdToString(hsId);
        tmLog.info(String.format("%s: REPLAY SEQUENCER DUMP, LAST POLLED FRAGMENT %d (%s), LAST SEEN TXNID %d (%s), %s%s",
                                 who,
                                 m_lastPolledFragmentUniqueId, TxnEgo.txnIdToString(m_lastPolledFragmentUniqueId),
                                 m_lastSeenUniqueId, TxnEgo.txnIdToString(m_lastSeenUniqueId),
                                 m_mpiEOLReached ? "MPI EOL, " : "",
                                 m_mustDrain ? "MUST DRAIN" : ""));
        for (Entry<Long, ReplayEntry> e : m_replayEntries.entrySet()) {
            tmLog.info(String.format("%s: REPLAY ENTRY %s: %s", who, e.getKey(), e.getValue()));
        }
    }
}