/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltcore.agreement;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.PriorityQueue;
import org.apache.zookeeper_voltpatches.ZooDefs.OpCode;
import org.voltcore.TransactionIdManager;
import org.voltcore.agreement.AgreementSite.AgreementTransactionState;
import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.HeartbeatResponseMessage;
import org.voltcore.messaging.Mailbox;
import org.voltcore.utils.CoreUtils;
/**
* <p>Extends a PriorityQueue such that it only stores transaction state
* objects, and it only releases them (to a poll() call) if they are
* ready to be processed.</p>
*
* <p>In this case, ready to be processed is determined by storing the
* most recent transaction id from each initiator. The smallest transaction
* id across all initiators is safe to run, as are any older
* transactions.</p>
*
* <p>This class manages all that state.</p>
*/
public class RestrictedPriorityQueue extends PriorityQueue<OrderableTransaction> {
    private static final long serialVersionUID = 1L;

    // Static so the logger is created once and is not part of the instance
    // state of this (Serializable, via PriorityQueue) class.
    private static final VoltLogger recoveryLog = new VoltLogger("RECOVERY");

    /**
     * Release state of the queue, as computed by {@link #updateQueueState()}.
     */
    public enum QueueState {
        /** The head of the queue may be handed out by peek()/poll(). */
        UNBLOCKED,
        /** The queue holds no transactions. */
        BLOCKED_EMPTY,
        /** The head's txnId is newer than the newest candidate transaction. */
        BLOCKED_ORDERING,
        /** The head's txnId is newer than its initiator's last safe txnId. */
        BLOCKED_SAFETY,
        /** Terminal state: once set, updateQueueState() never leaves it. */
        BLOCKED_CLOSED;
    }

    /**
     * Per-initiator scoreboard entry: the highest txnId seen from the
     * initiator and the highest txnId it has reported as safe. Both start
     * at DtxnConstants.DUMMY_LAST_SEEN_TXN_ID, meaning "not heard from yet".
     */
    class LastInitiatorData {
        long m_lastSeenTxnId;
        long m_lastSafeTxnId;

        LastInitiatorData() {
            m_lastSeenTxnId = DtxnConstants.DUMMY_LAST_SEEN_TXN_ID;
            m_lastSafeTxnId = DtxnConstants.DUMMY_LAST_SEEN_TXN_ID;
        }

        @Override
        public String toString() {
            return "{" + TransactionIdManager.toString(m_lastSeenTxnId) + ","
                    + TransactionIdManager.toString(m_lastSafeTxnId) + "}";
        }
    }

    /** Scoreboard of known initiators, keyed by initiator HSId. */
    final LinkedHashMap<Long, LastInitiatorData> m_initiatorData = new LinkedHashMap<Long, LastInitiatorData>();

    /** Minimum of the last-seen txnIds across all known initiators. */
    long m_newestCandidateTransaction = -1;

    final long m_hsId;

    QueueState m_state = QueueState.BLOCKED_EMPTY;

    /** Used to send heartbeat responses; may be null (e.g. in tests). */
    final Mailbox m_mailbox;

    /** When false, the BLOCKED_SAFETY check is skipped entirely. */
    final boolean m_useSafetyDance;

    /**
     * Wall-clock time (System.currentTimeMillis()) of the most recent
     * transition into BLOCKED_ORDERING or BLOCKED_SAFETY.
     */
    long m_blockTime = 0;

    /**
     * Create a queue that initially knows no initiators. Initiators are
     * registered afterwards through {@link #ensureInitiatorIsKnown(long)};
     * until then {@link #add(OrderableTransaction)} rejects everything.
     *
     * @param hsId id placed in the heartbeat responses this queue sends
     * @param mbox mailbox used to send heartbeat responses, or null to send none
     * @param useSafetyDance whether to block transactions newer than their
     *        initiator's last safe txnId
     */
    public RestrictedPriorityQueue(long hsId, Mailbox mbox, boolean useSafetyDance) {
        m_hsId = hsId;
        m_mailbox = mbox;
        m_useSafetyDance = useSafetyDance;
    }

    /**
     * Remove and return the head of the queue, but only if it is ready to run.
     *
     * @return the head transaction, or null if the queue is blocked or empty
     */
    @Override
    public OrderableTransaction poll() {
        updateQueueState();
        if (m_state != QueueState.UNBLOCKED) {
            return null;
        }
        OrderableTransaction head = super.poll();
        // UNBLOCKED implies the queue was not empty
        assert (head != null);
        return head;
    }

    /**
     * Return (without removing) the head of the queue, but only if it is
     * ready to run.
     *
     * @return the head transaction, or null if the queue is blocked or empty
     */
    @Override
    public OrderableTransaction peek() {
        updateQueueState();
        if (m_state != QueueState.UNBLOCKED) {
            return null;
        }
        OrderableTransaction head = super.peek();
        // UNBLOCKED implies the queue is not empty
        assert (head != null);
        return head;
    }

    /**
     * Queue a transaction, dropping data for unknown initiators. This is the
     * only valid add interface; note that offer() is not overridden by this
     * class and therefore does not perform the initiator check.
     *
     * @return false if the initiator is unknown or the element was not added
     */
    @Override
    public boolean add(OrderableTransaction txnState) {
        if (!m_initiatorData.containsKey(txnState.initiatorHSId)) {
            return false;
        }
        boolean added = super.add(txnState);
        if (added) {
            updateQueueState();
        }
        return added;
    }

    /**
     * Remove a transaction from the queue and recompute the queue state.
     */
    @Override
    public boolean remove(Object txnState) {
        boolean removed = super.remove(txnState);
        updateQueueState();
        return removed;
    }

    /**
     * Update the information stored about the latest transaction
     * seen from each initiator. Compute the newest safe transaction id.
     *
     * @param initiatorHSId initiator the information came from
     * @param txnId txnId most recently seen from that initiator (must not be 0)
     * @param lastSafeTxnIdFromInitiator last safe txnId reported by that initiator
     * @return the (possibly updated) last seen txnId for the initiator, or
     *         DtxnConstants.DUMMY_LAST_SEEN_TXN_ID if the initiator is unknown
     */
    public long noteTransactionRecievedAndReturnLastSeen(long initiatorHSId, long txnId,
            long lastSafeTxnIdFromInitiator)
    {
        // this doesn't exclude dummy txnid but is also a sanity check
        assert (txnId != 0);

        // Drop old data from already-failed (no longer registered) initiators.
        LastInitiatorData lid = m_initiatorData.get(initiatorHSId);
        if (lid == null) {
            return DtxnConstants.DUMMY_LAST_SEEN_TXN_ID;
        }

        // Both values only ever move forward.
        /*
         * Why aren't we asserting that the txnId is > than the last seen/last safe?
         * It seems like this should be guaranteed by TCP ordering and we want to
         * know if it isn't!
         */
        lid.m_lastSeenTxnId = Math.max(lid.m_lastSeenTxnId, txnId);
        lid.m_lastSafeTxnId = Math.max(lid.m_lastSafeTxnId, lastSafeTxnIdFromInitiator);

        // The minimum across all initiators' latest transactions is the
        // guaranteed minimum, but is not yet necessarily 2PC'd to every site.
        long min = Long.MAX_VALUE;
        for (LastInitiatorData data : m_initiatorData.values()) {
            min = Math.min(min, data.m_lastSeenTxnId);
        }
        m_newestCandidateTransaction = min;

        // this will update the state of the queue if needed
        updateQueueState();

        // return the last seen id for the originating initiator
        return lid.m_lastSeenTxnId;
    }

    /**
     * Stop requiring heartbeats from a failed initiator. The initiator's last
     * seen txnId is first pushed to Long.MAX_VALUE so that the newest candidate
     * transaction is recomputed without it, then its scoreboard entry is
     * dropped. Transactions from that initiator already in the queue are not
     * removed by this method.
     *
     * @param initiatorId id of the failed initiator; asserted to be known
     */
    public void gotFaultForInitiator(long initiatorId) {
        // calculate the next minimum transaction w/o our dead friend
        noteTransactionRecievedAndReturnLastSeen(initiatorId, Long.MAX_VALUE,
                DtxnConstants.DUMMY_LAST_SEEN_TXN_ID);

        // remove initiator from minimum txnid scoreboard
        LastInitiatorData removed = m_initiatorData.remove(initiatorId);
        assert (removed != null);
    }

    /**
     * Remove a single transaction from the queue.
     */
    public void faultTransaction(OrderableTransaction txnState) {
        this.remove(txnState);
    }

    /**
     * After a catalog change, double check that an initiator in the catalog
     * that is known to be "up" is here, in the RPQ's list; register it if not.
     *
     * @param initiatorId Initiator present in the catalog.
     * @return 1 if the initiator was not previously known (and was added), else 0
     */
    public int ensureInitiatorIsKnown(long initiatorId) {
        if (m_initiatorData.get(initiatorId) != null) {
            return 0;
        }
        m_initiatorData.put(initiatorId, new LastInitiatorData());
        return 1;
    }

    /**
     * @return The id of the newest safe transaction to run.
     */
    long getNewestSafeTransaction() {
        return m_newestCandidateTransaction;
    }

    /**
     * Return the largest confirmed txn id for the initiator given.
     * Used to figure out what to do after an initiator fails.
     *
     * @param initiatorId The id of the initiator that has failed.
     * @return the initiator's last safe txnId, or null if the initiator is unknown
     */
    public Long getNewestSafeTransactionForInitiator(Long initiatorId) {
        LastInitiatorData lid = m_initiatorData.get(initiatorId);
        if (lid == null) {
            return null;
        }
        return lid.m_lastSafeTxnId;
    }

    /**
     * Currently a no-op.
     */
    public void shutdown() throws InterruptedException {
    }

    /**
     * @return the state computed by the most recent queue state update
     */
    public QueueState getQueueState() {
        return m_state;
    }

    /**
     * Recompute the queue state from the current head of the queue. Checks
     * are applied in this order: terminal state, empty queue, read-only
     * agreement request, ordering, safety.
     *
     * @return the resulting queue state (same as m_state on return)
     */
    QueueState updateQueueState() {
        // Terminal states (currently only BLOCKED_CLOSED) are never left.
        if (m_state == QueueState.BLOCKED_CLOSED) {
            return m_state;
        }

        // Bypass this class's peek(), which would recurse into this method.
        OrderableTransaction ts = super.peek();

        // Empty queue
        if (ts == null) {
            executeStateChange(QueueState.BLOCKED_EMPTY, null, null);
            return m_state;
        }

        // For reads, skip global agreement and just do the read
        if (isReadRequest(ts)) {
            executeStateChange(QueueState.UNBLOCKED, ts, null);
            return m_state;
        }

        // Sufficient ordering established?
        if (ts.txnId > m_newestCandidateTransaction) {
            executeStateChange(QueueState.BLOCKED_ORDERING, ts, null);
            return m_state;
        }

        // Remember, an 'in recovery' response satisfies the safety dance.
        // If the txn is newer than the last safe txn from its initiator, block,
        // except if this RPQ has safety turned off.
        // NOTE(review): lid is null when the head's initiator is not on the
        // scoreboard (presumably removed by gotFaultForInitiator after the
        // txn was queued - confirm); such a txn is not held back for safety.
        LastInitiatorData lid = m_initiatorData.get(ts.initiatorHSId);
        if (lid != null && m_useSafetyDance && ts.txnId > lid.m_lastSafeTxnId) {
            executeStateChange(QueueState.BLOCKED_SAFETY, ts, lid);
            return m_state;
        }

        // legitimately unblocked
        executeStateChange(QueueState.UNBLOCKED, ts, lid);
        return m_state;
    }

    /**
     * Whether the transaction is an agreement request of one of the read
     * types (exists, getChildren, getChildren2, getData).
     */
    private static boolean isReadRequest(OrderableTransaction ts) {
        if (!(ts instanceof AgreementTransactionState)) {
            return false;
        }
        switch (((AgreementTransactionState) ts).m_request.type) {
        case OpCode.exists:
        case OpCode.getChildren:
        case OpCode.getChildren2:
        case OpCode.getData:
            return true;
        default:
            return false;
        }
    }

    /**
     * Apply a state transition and its side effects. Does nothing if the
     * state is unchanged.
     *
     * @param newState state to move to
     * @param ts head of the queue; must be non-null when moving to BLOCKED_SAFETY
     * @param lid scoreboard entry for the head's initiator; must be non-null
     *        when moving to BLOCKED_SAFETY
     */
    private void executeStateChange(QueueState newState, OrderableTransaction ts,
            LastInitiatorData lid)
    {
        if (newState == m_state) {
            return;
        }

        // Remember when the queue became non-empty but blocked
        if (newState == QueueState.BLOCKED_ORDERING || newState == QueueState.BLOCKED_SAFETY) {
            m_blockTime = System.currentTimeMillis();
        }

        // Send a heartbeat response on blocked safety transitions.
        // This side-effect is a little broken. It results in extra
        // heartbeat responses in some paths.
        if (newState == QueueState.BLOCKED_SAFETY) {
            assert (ts != null);
            assert (lid != null);
            sendHeartbeatResponse(ts, lid);
        }

        m_state = newState;
    }

    /**
     * Send the initiator of the given transaction a heartbeat response
     * carrying the last txnId seen from it. No-op when there is no mailbox.
     */
    private void sendHeartbeatResponse(OrderableTransaction ts, LastInitiatorData lid) {
        // mailbox might be null in testing
        if (m_mailbox == null) {
            return;
        }
        HeartbeatResponseMessage hbr =
                new HeartbeatResponseMessage(m_hsId, lid.m_lastSeenTxnId, true);
        m_mailbox.send(ts.initiatorHSId, hbr);
    }

    /**
     * Determine if it is safe to recover and if it is, what txnid it is safe to recover at.
     * Recovery is initiated by the recovering source partition. It can't be initiated until the recovering
     * partition has heard from every initiator. This is because it is not possible to pick a point
     * in the global txn ordering for the recovery to start at where all subsequent procedure invocations
     * that need to be applied after recovery are available unless every initiator has been heard from.
     *
     * Once the initiators have all been heard from it is necessary to pick the lowest txnid possible for all pending
     * work. This means taking the min of the newest candidate transaction | the txnid of the next txn in the queue.
     *
     * The newest candidate transaction is used if there are no pending txns so recovery can start when
     * the system is idle.
     *
     * @return the txnId to recover at, or null if it is not currently safe to recover
     */
    public Long safeToRecover() {
        // Every known initiator must have been heard from at least once.
        for (LastInitiatorData data : m_initiatorData.values()) {
            if (data.m_lastSeenTxnId == DtxnConstants.DUMMY_LAST_SEEN_TXN_ID) {
                return null;
            }
        }

        // peek() also refreshes m_state, which is inspected below.
        OrderableTransaction next = peek();
        if (next != null) {
            // bingo - have a real transaction to return as the recovery point
            return next.txnId;
        }

        // no work - have heard from all initiators. use a heartbeat
        if (m_state == QueueState.BLOCKED_EMPTY) {
            return m_newestCandidateTransaction;
        }

        // waiting for some txn to be ordered or 2pc'd to this site.
        if (m_state == QueueState.BLOCKED_SAFETY || m_state == QueueState.BLOCKED_ORDERING) {
            return null;
        }

        // Any other state with nothing released (e.g. BLOCKED_CLOSED) is unexpected here.
        recoveryLog.error("Unexpected RPQ state " + m_state + " when attempting to start recovery at " +
                " the source site. Consider killing the recovering node and trying again");
        return null;
    }

    /**
     * Render the queue state, each initiator's last seen / last safe txnIds,
     * and then the underlying queue contents.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("State: ").append(m_state);
        for (Map.Entry<Long, LastInitiatorData> entry : m_initiatorData.entrySet()) {
            LastInitiatorData lid = entry.getValue();
            sb.append(' ')
              .append(CoreUtils.hsIdToString(entry.getKey()))
              .append("==")
              .append(lid.m_lastSeenTxnId)
              .append(':')
              .append(lid.m_lastSafeTxnId)
              .append(' ');
        }
        sb.append('\n');
        sb.append(super.toString());
        return sb.toString();
    }
}