/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.iv2;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.voltcore.logging.VoltLogger;
import org.voltdb.CatalogContext;
import org.voltdb.CatalogSpecificPlanner;
import org.voltdb.exceptions.TransactionRestartException;
import org.voltdb.messaging.FragmentResponseMessage;
import org.voltdb.messaging.FragmentTaskMessage;

/**
 * Provide an implementation of the TransactionTaskQueue specifically for the MPI.
 * This class will manage separating the stream of reads and writes to different
 * Sites and block appropriately so that reads and writes never execute concurrently.
 */
public class MpTransactionTaskQueue extends TransactionTaskQueue
{
    protected static final VoltLogger tmLog = new VoltLogger("TM");

    // Track the current writes and reads in progress.  If writes contains
    // anything, reads must be empty, and vice versa.
    private final Map<Long, TransactionTask> m_currentWrites = new HashMap<Long, TransactionTask>();
    private final Map<Long, TransactionTask> m_currentReads = new HashMap<Long, TransactionTask>();
    private Deque<TransactionTask> m_backlog = new ArrayDeque<TransactionTask>();

    private MpRoSitePool m_sitePool = null;

    MpTransactionTaskQueue(SiteTaskerQueue queue)
    {
        super(queue);
    }

    void setMpRoSitePool(MpRoSitePool sitePool)
    {
        m_sitePool = sitePool;
    }

    synchronized void updateCatalog(String diffCmds, CatalogContext context, CatalogSpecificPlanner csp)
    {
        m_sitePool.updateCatalog(diffCmds, context, csp);
    }

    synchronized void updateSettings(CatalogContext context, CatalogSpecificPlanner csp)
    {
        m_sitePool.updateSettings(context, csp);
    }

    void shutdown()
    {
        if (m_sitePool != null) {
            m_sitePool.shutdown();
        }
    }

    /**
     * Stick this task in the backlog.
     * Many network threads may be racing to reach here; synchronize to
     * serialize queue order.  Always returns true in this case, a side
     * effect of extending TransactionTaskQueue.
     */
    @Override
    synchronized boolean offer(TransactionTask task)
    {
        Iv2Trace.logTransactionTaskQueueOffer(task);
        m_backlog.addLast(task);
        taskQueueOffer();
        return true;
    }
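    // A minimal usage sketch (illustrative only, not part of this class) of the
    // offer()/flush() lifecycle this queue expects from its caller, the MPI
    // scheduler.  The "queue", "writeTask", and "readTask" names below are
    // hypothetical stand-ins:
    //
    //   queue.offer(writeTask);              // nothing in flight: dispatched to the RW site
    //   queue.offer(readTask);               // a write is in flight: parked in m_backlog
    //   ...                                  // the write finishes on the Site thread
    //   queue.flush(writeTask.getTxnId());   // retires the write, releases backlogged reads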
    // repair() is used by MPI repair to inject a repair task into the
    // SiteTaskerQueue.  Before it does this, it unblocks the MP transaction
    // that may be running in the Site thread and causes it to roll back by
    // faking an unsuccessful FragmentResponseMessage.
    synchronized void repair(SiteTasker task, List<Long> masters, Map<Integer, Long> partitionMasters)
    {
        // We know that every Site assigned to the MPI (either the main writer or
        // any of the MP read pool) will only have one active transaction at a time,
        // and that we either have active reads or active writes, but never both.
        // Figure out which we're doing, and then poison all of the appropriate sites.
        Map<Long, TransactionTask> currentSet;
        if (!m_currentReads.isEmpty()) {
            assert(m_currentWrites.isEmpty());
            tmLog.debug("MpTTQ: repairing reads");
            for (Long txnId : m_currentReads.keySet()) {
                m_sitePool.repair(txnId, task);
            }
            currentSet = m_currentReads;
        }
        else {
            tmLog.debug("MpTTQ: repairing writes");
            m_taskQueue.offer(task);
            currentSet = m_currentWrites;
        }
        for (Entry<Long, TransactionTask> e : currentSet.entrySet()) {
            if (e.getValue() instanceof MpProcedureTask) {
                MpProcedureTask next = (MpProcedureTask)e.getValue();
                tmLog.debug("MpTTQ: poisoning task: " + next);
                next.doRestart(masters, partitionMasters);
                MpTransactionState txn = (MpTransactionState)next.getTransactionState();
                // Inject the poison pill.
                FragmentTaskMessage dummy = new FragmentTaskMessage(0L, 0L, 0L, 0L, false, false, false);
                FragmentResponseMessage poison =
                    new FragmentResponseMessage(dummy, 0L); // Don't care about source HSID here
                // Provide a TransactionRestartException which will be converted
                // into a ClientResponse.RESTART, so that the MpProcedureTask can
                // detect the restart and take the appropriate actions.
                TransactionRestartException restart = new TransactionRestartException(
                        "Transaction being restarted due to fault recovery or shutdown.", next.getTxnId());
                poison.setStatus(FragmentResponseMessage.UNEXPECTED_ERROR, restart);
                txn.offerReceivedFragmentResponse(poison);
            }
            else {
                // We don't think EveryPartitionTasks need to do anything here,
                // since they don't actually run Java; they just exist for
                // sequencing.  Any cleanup should be to the duplicate counter
                // in MpScheduler for this transaction.
            }
        }
        // Now, iterate through the backlog and update the partition masters
        // for all ProcedureTasks.
        Iterator<TransactionTask> iter = m_backlog.iterator();
        while (iter.hasNext()) {
            TransactionTask tt = iter.next();
            if (tt instanceof MpProcedureTask) {
                MpProcedureTask next = (MpProcedureTask)tt;
                tmLog.debug("Repair updating task: " + next + " with masters: " + masters);
                next.updateMasters(masters, partitionMasters);
            }
            else if (tt instanceof EveryPartitionTask) {
                EveryPartitionTask next = (EveryPartitionTask)tt;
                tmLog.debug("Repair updating EPT task: " + next + " with masters: " + masters);
                next.updateMasters(masters);
            }
        }
    }
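    // A hedged sketch of how MPI repair is assumed to invoke the entry point
    // above during leader failover ("repairTask", "newMasters", and
    // "newPartitionMasters" are illustrative names):
    //
    //   queue.repair(repairTask, newMasters, newPartitionMasters);
    //   // 1) any in-flight MP reads or write are poisoned and will restart
    //   //    with ClientResponse.RESTART
    //   // 2) backlogged MpProcedureTasks/EveryPartitionTasks are re-pointed
    //   //    at the new masters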
    private void taskQueueOffer(TransactionTask task)
    {
        Iv2Trace.logSiteTaskerQueueOffer(task);
        if (task.getTransactionState().isReadOnly()) {
            m_sitePool.doWork(task.getTxnId(), task);
        }
        else {
            m_taskQueue.offer(task);
        }
    }

    private boolean taskQueueOffer()
    {
        // Do we have something to do?
        // - If so, is it a write?
        //   - If so, are there reads or writes outstanding?
        //     - if not, pull it from the backlog, add it to the current write set, and queue it
        //     - if so, bail for now
        //   - If not, are there writes outstanding?
        //     - if not, while there are reads on the backlog and the pool has capacity:
        //       - pull the read from the backlog, add it to the current read set, and queue it
        //       - bail when done
        //     - if so, bail for now
        boolean retval = false;
        if (!m_backlog.isEmpty()) {
            // We may not queue the next task; just peek to get the read-only state.
            TransactionTask task = m_backlog.peekFirst();
            if (!task.getTransactionState().isReadOnly()) {
                if (m_currentReads.isEmpty() && m_currentWrites.isEmpty()) {
                    task = m_backlog.pollFirst();
                    m_currentWrites.put(task.getTxnId(), task);
                    taskQueueOffer(task);
                    retval = true;
                }
            }
            else if (m_currentWrites.isEmpty()) {
                while (task != null && task.getTransactionState().isReadOnly() &&
                       m_sitePool.canAcceptWork())
                {
                    task = m_backlog.pollFirst();
                    assert(task.getTransactionState().isReadOnly());
                    m_currentReads.put(task.getTxnId(), task);
                    taskQueueOffer(task);
                    retval = true;
                    // Prime the pump with the head task, if any.  If the
                    // backlog is empty, task will be null.
                    task = m_backlog.peekFirst();
                }
            }
        }
        return retval;
    }

    /**
     * Indicate that the transaction associated with txnId is complete.  Perform
     * management of reads/writes in progress, then call taskQueueOffer() to
     * submit additional tasks to be done, determined by whatever the current
     * state is.  See the giant comment at the top of taskQueueOffer() for what
     * happens.
     */
    @Override
    synchronized int flush(long txnId)
    {
        int offered = 0;
        if (m_currentReads.containsKey(txnId)) {
            m_currentReads.remove(txnId);
            m_sitePool.completeWork(txnId);
        }
        else {
            assert(m_currentWrites.containsKey(txnId));
            m_currentWrites.remove(txnId);
            assert(m_currentWrites.isEmpty());
        }
        if (taskQueueOffer()) {
            ++offered;
        }
        return offered;
    }

    /**
     * Restart the current task at the head of the queue.  This will be called
     * instead of flush by the currently blocking MP transaction in the event
     * that a restart is necessary.
     */
    @Override
    synchronized void restart()
    {
        if (!m_currentReads.isEmpty()) {
            // Re-submit all the tasks in the current read set to the pool.
            // The pool will ensure that tasks submitted with the same txnId
            // go to the MpRoSite which is currently running that transaction.
            for (TransactionTask task : m_currentReads.values()) {
                taskQueueOffer(task);
            }
        }
        else {
            assert(!m_currentWrites.isEmpty());
            TransactionTask task;
            // There should only ever be one current write.  This is the
            // awkward way to get a single value out of a Map.
            task = m_currentWrites.entrySet().iterator().next().getValue();
            taskQueueOffer(task);
        }
    }

    /**
     * How many Tasks are not yet runnable?
     * @return the number of tasks still in the backlog
     */
    @Override
    synchronized int size()
    {
        return m_backlog.size();
    }

    @Override
    public String toString()
    {
        StringBuilder sb = new StringBuilder();
        sb.append("MpTransactionTaskQueue:").append("\n");
        sb.append("\tSIZE: ").append(m_backlog.size()).append("\n");
        if (!m_backlog.isEmpty()) {
            sb.append("\tHEAD: ").append(m_backlog.getFirst()).append("\n");
        }
        return sb.toString();
    }
}
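/*
 * A self-contained, test-style sketch of the exclusivity invariant this queue
 * maintains (assumptions: "siteTaskerQueue" is an appropriately constructed
 * SiteTaskerQueue, and "fakePool", "fakeWrite", and "fakeRead" are
 * hypothetical test doubles for the pool and tasks):
 *
 *   MpTransactionTaskQueue q = new MpTransactionTaskQueue(siteTaskerQueue);
 *   q.setMpRoSitePool(fakePool);
 *   q.offer(fakeWrite(1L));   // runs immediately; joins m_currentWrites
 *   q.offer(fakeRead(2L));    // blocked behind the in-flight write
 *   assert q.size() == 1;     // the read is still backlogged
 *   q.flush(1L);              // the write completes...
 *   assert q.size() == 0;     // ...and the read is dispatched to the pool
 */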