/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */
package org.voltdb;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.KeeperException.NoNodeException;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.apache.zookeeper_voltpatches.data.Stat;
import org.json_voltpatches.JSONObject;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.DBBPool;
import org.voltcore.utils.DBBPool.BBContainer;
import org.voltcore.utils.Pair;
import org.voltdb.catalog.Database;
import org.voltdb.catalog.Table;
import org.voltdb.iv2.MpInitiator;
import org.voltdb.iv2.SiteTaskerQueue;
import org.voltdb.iv2.SnapshotTask;
import org.voltdb.rejoin.StreamSnapshotDataTarget.StreamSnapshotTimeoutException;
import org.voltdb.sysprocs.saverestore.SnapshotPredicates;
import org.voltdb.utils.CatalogUtil;
import org.voltdb.utils.CompressionService;
import org.voltdb.utils.MiscUtils;

import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.collect.ListMultimap;
import com.google_voltpatches.common.collect.Lists;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;

/**
 * Encapsulates the state needed to manage an ongoing snapshot at the
 * per-execution-site level. Also contains some static global snapshot
 * counters. This class requires callers to maintain thread safety;
 * generally (exclusively?) it is driven by ExecutionSite, each of
 * which has a SnapshotSiteProcessor.
 */
public class SnapshotSiteProcessor {

    private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");

    /** Global count of execution sites on this node performing snapshot */
    public static final Set<Object> ExecutionSitesCurrentlySnapshotting =
            Collections.synchronizedSet(new HashSet<Object>());

    /**
     * Ensure only one thread running the setup fragment does the creation
     * of the targets and the distribution of the work.
     */
    public static final Object m_snapshotCreateLock = new Object();
    public static CyclicBarrier m_snapshotCreateSetupBarrier = null;
    public static CyclicBarrier m_snapshotCreateFinishBarrier = null;
    public static final Runnable m_snapshotCreateSetupBarrierAction = new Runnable() {
        @Override
        public void run() {
            Runnable r = SnapshotSiteProcessor.m_snapshotCreateSetupBarrierActualAction.getAndSet(null);
            if (r != null) {
                r.run();
            }
        }
    };
    public static AtomicReference<Runnable> m_snapshotCreateSetupBarrierActualAction =
            new AtomicReference<Runnable>();

    public static void readySnapshotSetupBarriers(int numSites) {
        synchronized (SnapshotSiteProcessor.m_snapshotCreateLock) {
            if (SnapshotSiteProcessor.m_snapshotCreateSetupBarrier == null) {
                SnapshotSiteProcessor.m_snapshotCreateFinishBarrier = new CyclicBarrier(numSites);
                SnapshotSiteProcessor.m_snapshotCreateSetupBarrier =
                        new CyclicBarrier(numSites, SnapshotSiteProcessor.m_snapshotCreateSetupBarrierAction);
            } else if (SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.isBroken()) {
                SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.reset();
                SnapshotSiteProcessor.m_snapshotCreateFinishBarrier.reset();
            }
        }
    }
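    /*
     * For illustration only (the real call sites live in the snapshot setup
     * path, e.g. SnapshotSaveAPI.startSnapshotting): each local site readies
     * the barriers and then rendezvouses on them, with exactly one site's
     * barrier action running the shared setup work. A sketch, with exception
     * handling elided and "setupWork" a hypothetical Runnable standing in for
     * target creation and work distribution:
     *
     *   SnapshotSiteProcessor.readySnapshotSetupBarriers(numLocalSites);
     *   SnapshotSiteProcessor.m_snapshotCreateSetupBarrierActualAction.set(setupWork);
     *   SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.await();  // one site runs setupWork
     *   // ... per-site setup work ...
     *   SnapshotSiteProcessor.m_snapshotCreateFinishBarrier.await();
     */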
    /**
     * Sequence numbers for export tables. This is repopulated before each
     * snapshot by each execution site that reaches the snapshot.
     */
    private static final Map<String, Map<Integer, Pair<Long, Long>>> m_exportSequenceNumbers =
            new HashMap<String, Map<Integer, Pair<Long, Long>>>();

    private static final Map<Integer, TupleStreamStateInfo> m_drTupleStreamInfo = new HashMap<>();

    private ExtensibleSnapshotDigestData m_extraSnapshotData;

    /*
     * Tasks that are deferred to the snapshot termination thread. The two
     * known uses are syncing/closing the digest file and the catalog copy.
     */
    public static final ConcurrentLinkedQueue<Runnable> m_tasksOnSnapshotCompletion =
            new ConcurrentLinkedQueue<Runnable>();

    /*
     * Tasks performed on each site after the snapshot tasks are finished but
     * before the snapshot transaction is finished.
     */
    public static final Map<Integer, PostSnapshotTask> m_siteTasksPostSnapshotting =
            Collections.synchronizedMap(new HashMap<Integer, PostSnapshotTask>());

    /**
     * Pick a buffer length that is big enough to store at least one tuple of
     * the largest size supported in the system (2 megabytes). Add a fudge
     * factor for metadata.
     */
    public static final int m_snapshotBufferLength = (1024 * 1024 * 2) + Short.MAX_VALUE;
    public static final int m_snapshotBufferCompressedLen =
            CompressionService.maxCompressedLength(m_snapshotBufferLength);

    /**
     * Limit the number of buffers that are outstanding at any given time.
     */
    private static final AtomicInteger m_availableSnapshotBuffers = new AtomicInteger(16);
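    /*
     * For scale (a back-of-the-envelope figure, not enforced anywhere): with
     * the default pool of 16 buffers, the worst-case direct memory reserved
     * for snapshot output is roughly
     *
     *   16 * ((1024 * 1024 * 2) + 32767) = 34,078,704 bytes, about 34 MB,
     *
     * not counting the larger compressed-length allowance above.
     */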
    /**
     * The last EE out has to shut off the lights. Cache a list
     * of targets in case this EE ends up being the one that needs
     * to close each target.
     */
    private volatile ArrayList<SnapshotDataTarget> m_snapshotTargets = null;

    /**
     * Map of tasks for tables that still need to be snapshotted.
     * Once a table has finished serializing, it is removed from the map.
     * Backed by a sorted multimap so that the site works through one table
     * at a time, which is easier to debug.
     */
    private ListMultimap<Integer, SnapshotTableTask> m_snapshotTableTasks = null;

    private Map<Integer, TableStreamer> m_streamers = null;

    private long m_lastSnapshotTxnId;
    private final int m_snapshotPriority;
    private boolean m_perSiteLastSnapshotSucceeded = true;

    /**
     * List of threads to join to block on snapshot completion
     * when using completeSnapshotWork().
     */
    private ArrayList<Thread> m_snapshotTargetTerminators = null;

    /**
     * When a buffer is returned to the pool, a new snapshot task will be offered
     * to the queue to ensure the EE wakes up and does any potential snapshot work
     * with that buffer.
     */
    private final SiteTaskerQueue m_siteTaskerQueue;

    private final Random m_random = new Random();

    /*
     * Interface that will be checked when scheduling snapshot work in IV2.
     * Reports whether the site is "idle" for whatever definition that may be.
     * If the site is idle, then work will be scheduled immediately instead of
     * being throttled. See the illustrative sketch below.
     */
    public interface IdlePredicate {
        public boolean idle(long now);
    }

    private final IdlePredicate m_idlePredicate;
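    /*
     * A minimal sketch of what an IdlePredicate might look like; this is
     * hypothetical, the real implementation is supplied by the site that
     * constructs this processor:
     *
     *   IdlePredicate predicate = new IdlePredicate() {
     *       @Override
     *       public boolean idle(long now) {
     *           // e.g. idle if the site executed no transaction "recently"
     *           return now - lastExecutionTime > idleThresholdMillis;
     *       }
     *   };
     *
     * "lastExecutionTime" and "idleThresholdMillis" are placeholder names.
     */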
    /*
     * Synchronization is handled by SnapshotSaveAPI.startSnapshotting.
     * Store the export sequence numbers for every table and partition. This will
     * be called by every execution site before the snapshot starts. Then the
     * execution site that gets the setup permit will use getExportSequenceNumbers
     * to retrieve the full set and reset the contents.
     */
    public static void populateSequenceNumbersForExecutionSite(SystemProcedureExecutionContext context) {
        Database database = context.getDatabase();
        for (Table t : database.getTables()) {
            if (!CatalogUtil.isTableExportOnly(database, t)) continue;

            Map<Integer, Pair<Long, Long>> sequenceNumbers = m_exportSequenceNumbers.get(t.getTypeName());
            if (sequenceNumbers == null) {
                sequenceNumbers = new HashMap<Integer, Pair<Long, Long>>();
                m_exportSequenceNumbers.put(t.getTypeName(), sequenceNumbers);
            }

            long[] ackOffsetAndSequenceNumber =
                    context.getSiteProcedureConnection().getUSOForExportTable(t.getSignature());
            sequenceNumbers.put(
                    context.getPartitionId(),
                    Pair.of(ackOffsetAndSequenceNumber[0], ackOffsetAndSequenceNumber[1]));
        }

        TupleStreamStateInfo drStateInfo = context.getSiteProcedureConnection().getDRTupleStreamStateInfo();
        m_drTupleStreamInfo.put(context.getPartitionId(), drStateInfo);
        if (drStateInfo.containsReplicatedStreamInfo) {
            m_drTupleStreamInfo.put(MpInitiator.MP_INIT_PID, drStateInfo);
        }
    }

    public static Map<String, Map<Integer, Pair<Long, Long>>> getExportSequenceNumbers() {
        HashMap<String, Map<Integer, Pair<Long, Long>>> sequenceNumbers =
                new HashMap<String, Map<Integer, Pair<Long, Long>>>(m_exportSequenceNumbers);
        m_exportSequenceNumbers.clear();
        return sequenceNumbers;
    }

    public static Map<Integer, TupleStreamStateInfo> getDRTupleStreamStateInfo() {
        Map<Integer, TupleStreamStateInfo> stateInfo = ImmutableMap.copyOf(m_drTupleStreamInfo);
        m_drTupleStreamInfo.clear();
        return stateInfo;
    }

    private long m_quietUntil = 0;

    public SnapshotSiteProcessor(SiteTaskerQueue siteQueue, int snapshotPriority) {
        this(siteQueue, snapshotPriority, new IdlePredicate() {
            @Override
            public boolean idle(long now) {
                throw new UnsupportedOperationException();
            }
        });
    }

    public SnapshotSiteProcessor(SiteTaskerQueue siteQueue, int snapshotPriority, IdlePredicate idlePredicate) {
        m_siteTaskerQueue = siteQueue;
        m_snapshotPriority = snapshotPriority;
        m_idlePredicate = idlePredicate;
    }

    public void shutdown() throws InterruptedException {
        m_snapshotCreateSetupBarrier = null;
        m_snapshotCreateFinishBarrier = null;
        if (m_snapshotTargetTerminators != null) {
            for (Thread t : m_snapshotTargetTerminators) {
                t.join();
            }
        }
    }

    public static boolean isSnapshotInProgress() {
        final int numSitesSnapshotting = SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.size();
        if (numSitesSnapshotting > 0) {
            if (SNAP_LOG.isDebugEnabled()) {
                SNAP_LOG.debug("Snapshot in progress, " + numSitesSnapshotting +
                        " sites are still snapshotting");
            }
            return true;
        }
        return false;
    }

    /**
     * Wrap a pooled buffer so that discarding it returns a permit to the pool
     * and, unless noSchedule is set, offers a new SnapshotTask so the EE wakes
     * up and does any potential snapshot work with that buffer.
     */
    private BBContainer createNewBuffer(final BBContainer origin, final boolean noSchedule) {
        return new BBContainer(origin.b()) {
            @Override
            public void discard() {
                checkDoubleFree();
                origin.discard();

                m_availableSnapshotBuffers.incrementAndGet();
                if (!noSchedule) {
                    rescheduleSnapshotWork();
                }
            }
        };
    }
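    /*
     * Worked example of the quiet-period arithmetic used below: with a
     * snapshot priority of 6, each rescheduling pushes m_quietUntil
     * 5 * 6 = 30 ms plus a random 0-14 ms of jitter into the future, so a
     * busy site does throttled snapshot work roughly every 30-44 ms.
     * Priority 0 disables throttling entirely; 10, the maximum, yields a
     * 50-64 ms spacing.
     */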
    private void rescheduleSnapshotWork() {
        /*
         * If IV2 is enabled, don't run the potential snapshot work until the
         * quiet period restrictions have been met. In IV2, doSnapshotWork is
         * always called with ignoreQuietPeriod, and the scheduling is instead
         * done via the STPE in RealVoltDB.
         *
         * The goal of the quiet period is to spread snapshot work out over time
         * and minimize the impact on latency.
         *
         * If the snapshot priority is 0, then running the work immediately is the
         * specified policy anyway. 10 would produce the largest delay.
         */
        if (m_snapshotPriority > 0) {
            final long now = System.currentTimeMillis();
            // Ask if the site is idle, and if it is, queue the work immediately
            if (m_idlePredicate.idle(now)) {
                m_siteTaskerQueue.offer(new SnapshotTask());
                return;
            }

            // Cache the value locally. The dirty secret is that in edge cases multiple
            // threads will read/write briefly, but it isn't a big deal since the scheduling
            // can be wrong briefly. Caching it locally makes the logic here saner, because
            // the value can't change as execution progresses.
            final long quietUntil = m_quietUntil;

            /*
             * If the current time is > quietUntil, then the quiet period is over
             * and the snapshot work should be done immediately.
             *
             * Otherwise it needs to be scheduled in the future, and the next quiet
             * period needs to be calculated.
             */
            if (now > quietUntil) {
                m_siteTaskerQueue.offer(new SnapshotTask());
                // Now push the quiet period further into the future. Generally no threads
                // will be racing to do this, since the execution site only interacts with
                // one snapshot data target at a time, except when it is switching tables.
                // It doesn't really matter if the value is wrong; that just results in a
                // little extra snapshot work being done close together.
                m_quietUntil = System.currentTimeMillis() +
                        (5 * m_snapshotPriority) + ((long)(m_random.nextDouble() * 15));
            } else {
                // Schedule it to happen after the quiet period has elapsed
                VoltDB.instance().schedulePriorityWork(
                        new Runnable() {
                            @Override
                            public void run() {
                                m_siteTaskerQueue.offer(new SnapshotTask());
                            }
                        },
                        quietUntil - now,
                        0,
                        TimeUnit.MILLISECONDS);

                /*
                 * This is the same calculation as above, except the future is not based
                 * on the current time, since the quiet period was already in the future
                 * and we need to move further past it, having just scheduled snapshot work
                 * at the end of the current quietUntil value.
                 */
                m_quietUntil = quietUntil +
                        (5 * m_snapshotPriority) + ((long)(m_random.nextDouble() * 15));
            }
        } else {
            m_siteTaskerQueue.offer(new SnapshotTask());
        }
    }

    public void initiateSnapshots(
            SystemProcedureExecutionContext context,
            SnapshotFormat format,
            Deque<SnapshotTableTask> tasks,
            long txnId,
            ExtensibleSnapshotDigestData extraSnapshotData) {
        ExecutionSitesCurrentlySnapshotting.add(this);
        final long now = System.currentTimeMillis();
        m_quietUntil = now + 200;
        m_perSiteLastSnapshotSucceeded = true;
        m_lastSnapshotTxnId = txnId;
        m_snapshotTableTasks = MiscUtils.sortedArrayListMultimap();
        m_streamers = Maps.newHashMap();
        m_snapshotTargetTerminators = new ArrayList<Thread>();
        m_extraSnapshotData = extraSnapshotData;

        // Table doesn't implement hashCode(), so use the table ID as the key
        for (Map.Entry<Integer, byte[]> tablePredicates : makeTablesAndPredicatesToSnapshot(tasks).entrySet()) {
            int tableId = tablePredicates.getKey();
            TableStreamer streamer =
                    new TableStreamer(tableId, format.getStreamType(), m_snapshotTableTasks.get(tableId));
            if (!streamer.activate(context, tablePredicates.getValue())) {
                VoltDB.crashLocalVoltDB("Failed to activate snapshot stream on table " +
                        CatalogUtil.getTableNameFromId(context.getDatabase(), tableId), false, null);
            }
            m_streamers.put(tableId, streamer);
        }

        /*
         * Size the buffer pool to contain enough buffers for the number of tasks. The
         * buffer pool will be cleaned up at the end of the snapshot.
         *
         * For the general case of only one snapshot at a time, this has the same behavior
         * as before: 5 buffers per snapshot.
         *
         * TODO: This is not a good algorithm for general snapshot coalescing. Rate limiting
         * won't work as expected with this approach. For general snapshot coalescing,
         * a better approach, like a pool per output target, should be used.
         */
        int maxTableTaskSize = 0;
        for (Collection<SnapshotTableTask> perTableTasks : m_snapshotTableTasks.asMap().values()) {
            maxTableTaskSize = Math.max(maxTableTaskSize, perTableTasks.size());
        }
    }
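    /*
     * Rough lifecycle of a snapshot on one site, pieced together from the
     * comments in this class (the actual cross-thread handoffs live in the
     * snapshot save path and are simplified here):
     *
     *   1. initiateSnapshots()        - site thread registers tasks and streamers
     *   2. startSnapshotWithTargets() - snapshot IO thread sets the data targets
     *                                   and queues the first SnapshotTask
     *   3. doSnapshotWork()           - site thread serializes blocks, rescheduling
     *                                   itself as buffers return to the pool
     *   4. the last site out closes the targets and logs completion to ZooKeeper
     */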
    /**
     * This is called from the snapshot IO thread when the deferred setup is finished. It sets
     * the data targets and queues a snapshot task onto the site thread.
     */
    public void startSnapshotWithTargets(Collection<SnapshotDataTarget> targets, long now) {
        // Basically asserts that there are no tasks with null targets at this point;
        // getTarget() checks and crashes otherwise
        for (SnapshotTableTask t : m_snapshotTableTasks.values()) {
            t.getTarget();
        }

        ArrayList<SnapshotDataTarget> targetsToClose = Lists.newArrayList();
        for (final SnapshotDataTarget target : targets) {
            if (target.needsFinalClose()) {
                targetsToClose.add(target);
            }
        }
        m_snapshotTargets = targetsToClose;

        // Queue the first snapshot task
        VoltDB.instance().schedulePriorityWork(
                new Runnable() {
                    @Override
                    public void run() {
                        m_siteTaskerQueue.offer(new SnapshotTask());
                    }
                },
                (m_quietUntil + (5 * m_snapshotPriority) - now),
                0,
                TimeUnit.MILLISECONDS);
        m_quietUntil += 5 * m_snapshotPriority;
    }

    private Map<Integer, byte[]> makeTablesAndPredicatesToSnapshot(Collection<SnapshotTableTask> tasks) {
        Map<Integer, SnapshotPredicates> tablesAndPredicates = Maps.newHashMap();
        Map<Integer, byte[]> predicateBytes = Maps.newHashMap();

        for (SnapshotTableTask task : tasks) {
            SNAP_LOG.debug("Examining SnapshotTableTask: " + task);

            // Add the task to the task list for the given table
            m_snapshotTableTasks.put(task.m_table.getRelativeIndex(), task);

            // Make sure there is a predicate object for each table; the predicate could contain
            // empty expressions, so activateTableStream() doesn't have to do a null check.
            SnapshotPredicates predicates = tablesAndPredicates.get(task.m_table.getRelativeIndex());
            if (predicates == null) {
                predicates = new SnapshotPredicates(task.m_table.getRelativeIndex());
                tablesAndPredicates.put(task.m_table.getRelativeIndex(), predicates);
            }
            predicates.addPredicate(task.m_predicate, task.m_deleteTuples);
        }

        for (Map.Entry<Integer, SnapshotPredicates> e : tablesAndPredicates.entrySet()) {
            predicateBytes.put(e.getKey(), e.getValue().toBytes());
        }

        return predicateBytes;
    }

    /**
     * Create an output buffer for each task.
     * @return null if there aren't enough buffers left in the pool.
     */
    private List<BBContainer> getOutputBuffers(Collection<SnapshotTableTask> tableTasks, boolean noSchedule) {
        final int desired = tableTasks.size();
        while (true) {
            int available = m_availableSnapshotBuffers.get();

            // Limit the number of buffers used concurrently
            if (desired > available) {
                return null;
            }
            if (m_availableSnapshotBuffers.compareAndSet(available, available - desired)) break;
        }

        List<BBContainer> outputBuffers = new ArrayList<BBContainer>(tableTasks.size());
        for (int ii = 0; ii < tableTasks.size(); ii++) {
            final BBContainer origin = DBBPool.allocateDirectAndPool(m_snapshotBufferLength);
            outputBuffers.add(createNewBuffer(origin, noSchedule));
        }
        return outputBuffers;
    }
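    /*
     * Example of the reservation above: a table with 3 outstanding tasks asks
     * for 3 buffers in one atomic step. If only 2 of the 16 pooled buffers
     * remain, getOutputBuffers() returns null and the caller reschedules; the
     * CAS loop retries only when another thread raced on the counter.
     */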
    /**
     * Replicated tables are assigned to a single execution site and that site
     * is responsible for closing the data target. Done in a separate thread
     * so the EE can continue working.
     */
    private void asyncTerminateReplicatedTableTasks(Collection<SnapshotTableTask> tableTasks) {
        for (final SnapshotTableTask tableTask : tableTasks) {
            if (tableTask.m_table.getIsreplicated() && tableTask.m_target.getFormat().canCloseEarly()) {
                final Thread terminatorThread = new Thread("Replicated SnapshotDataTarget terminator") {
                    @Override
                    public void run() {
                        try {
                            tableTask.m_target.close();
                        } catch (IOException e) {
                            m_perSiteLastSnapshotSucceeded = false;
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            m_perSiteLastSnapshotSucceeded = false;
                            throw new RuntimeException(e);
                        }
                    }
                };
                m_snapshotTargetTerminators.add(terminatorThread);
                terminatorThread.start();
            }
        }
    }

    /*
     * noSchedule means don't try to schedule snapshot work, because this is a blocking
     * task from completeSnapshotWork. This avoids creating thousands of task objects.
     */
    public Future<?> doSnapshotWork(SystemProcedureExecutionContext context, boolean noSchedule) {
        ListenableFuture<?> retval = null;

        /*
         * This thread will null out the reference to m_snapshotTableTasks when
         * a snapshot is finished. If the snapshot buffer is loaned out, that means
         * it is pending I/O somewhere, so there is no work to do until it comes back.
         */
        if (m_snapshotTableTasks == null) {
            return retval;
        }
        if (m_snapshotTargets == null) {
            return null;
        }

        /*
         * Try to serialize a block from a table; if the table is finished,
         * remove its tasks from the task map and move on to the next table. If a
         * block is successfully serialized, break out of the loop and release the
         * site thread for more transaction work.
         */
        Iterator<Map.Entry<Integer, Collection<SnapshotTableTask>>> taskIter =
                m_snapshotTableTasks.asMap().entrySet().iterator();
        while (taskIter.hasNext()) {
            Map.Entry<Integer, Collection<SnapshotTableTask>> taskEntry = taskIter.next();
            final int tableId = taskEntry.getKey();
            final Collection<SnapshotTableTask> tableTasks = taskEntry.getValue();

            final List<BBContainer> outputBuffers = getOutputBuffers(tableTasks, noSchedule);
            if (outputBuffers == null) {
                // Not enough buffers available
                if (!noSchedule) {
                    rescheduleSnapshotWork();
                }
                break;
            }

            // Stream more data and add a listener to handle any failures
            Pair<ListenableFuture, Boolean> streamResult =
                    m_streamers.get(tableId).streamMore(context, outputBuffers, null);
            if (streamResult.getFirst() != null) {
                final ListenableFuture writeFutures = streamResult.getFirst();
                writeFutures.addListener(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            writeFutures.get();
                        } catch (Throwable t) {
                            if (m_perSiteLastSnapshotSucceeded) {
                                if (t instanceof StreamSnapshotTimeoutException ||
                                        t.getCause() instanceof StreamSnapshotTimeoutException) {
                                    // This error is already logged by the watchdog when it
                                    // generates the exception
                                } else {
                                    SNAP_LOG.error("Error while attempting to write snapshot data", t);
                                }
                                m_perSiteLastSnapshotSucceeded = false;
                            }
                        }
                    }
                }, CoreUtils.SAMETHREADEXECUTOR);
            }

            /*
             * The table streamer returns false when there is no more data left to
             * pull from that table. The enclosing loop then moves on to the next table.
             */
            if (!streamResult.getSecond()) {
                asyncTerminateReplicatedTableTasks(tableTasks);
                SNAP_LOG.debug("Finished snapshot tasks for table " + tableId + ": " + tableTasks);
                // XXX: Guava's multimap will clear the tableTasks collection when the entry is
                // removed from the containing map, so don't use the collection after removal!
                // (That is why the debug log above runs before the removal.)
                taskIter.remove();
            } else {
                break;
            }
        }
        /*
         * If there are no more tasks, then this particular EE is finished doing
         * snapshot work. Check the global set of snapshotting sites to find out
         * if this is the last one.
         */
        if (m_snapshotTableTasks.isEmpty()) {
            SNAP_LOG.debug("Finished with tasks");

            // In case this is a non-blocking snapshot, do the post-snapshot tasks here.
            runPostSnapshotTasks(context);

            final ArrayList<SnapshotDataTarget> snapshotTargets = m_snapshotTargets;
            m_snapshotTargets = null;
            m_snapshotTableTasks = null;

            boolean IamLast = false;
            synchronized (ExecutionSitesCurrentlySnapshotting) {
                if (!ExecutionSitesCurrentlySnapshotting.contains(this)) {
                    VoltDB.crashLocalVoltDB(
                            "Currently snapshotting site didn't find itself in set of snapshotting sites",
                            true, null);
                }
                IamLast = ExecutionSitesCurrentlySnapshotting.size() == 1;
                if (!IamLast) {
                    ExecutionSitesCurrentlySnapshotting.remove(this);
                }
            }

            /*
             * If this is the last one, then this EE must close all the SnapshotDataTargets.
             * Done in a separate thread so the EE can go and do other work. It will
             * sync every file descriptor and that may block for a while.
             */
            if (IamLast) {
                SNAP_LOG.debug("I AM LAST!");
                final long txnId = m_lastSnapshotTxnId;
                final ExtensibleSnapshotDigestData snapshotDataForZookeeper = m_extraSnapshotData;
                m_extraSnapshotData = null;
                final Thread terminatorThread = new Thread("Snapshot terminator") {
                    @Override
                    public void run() {
                        boolean snapshotSucceeded = true;
                        try {
                            /*
                             * Be absolutely sure the snapshot is finished
                             * and synced to disk before another is started
                             */
                            for (Thread t : m_snapshotTargetTerminators) {
                                if (t == this) {
                                    continue;
                                }
                                try {
                                    t.join();
                                } catch (InterruptedException e) {
                                    return;
                                }
                            }
                            for (final SnapshotDataTarget t : snapshotTargets) {
                                try {
                                    t.close();
                                } catch (IOException e) {
                                    snapshotSucceeded = false;
                                    throw new RuntimeException(e);
                                } catch (InterruptedException e) {
                                    snapshotSucceeded = false;
                                    throw new RuntimeException(e);
                                }
                            }

                            Runnable r = null;
                            while ((r = m_tasksOnSnapshotCompletion.poll()) != null) {
                                try {
                                    r.run();
                                } catch (Exception e) {
                                    SNAP_LOG.error("Error running snapshot completion task", e);
                                }
                            }
                        } finally {
                            // txnId and snapshotDataForZookeeper were cached above, before this
                            // site removes itself from the ExecutionSitesCurrentlySnapshotting
                            // set, so logSnapshotCompleteToZK() will not see incorrect values
                            // from the next snapshot
                            try {
                                VoltDB.instance().getHostMessenger().getZK().delete(
                                        VoltZK.nodes_currently_snapshotting + "/" +
                                        VoltDB.instance().getHostMessenger().getHostId(), -1);
                            } catch (NoNodeException e) {
                                SNAP_LOG.warn("Snapshot node was already gone when deleting it", e);
                            } catch (Exception e) {
                                VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
                            } finally {
                                /*
                                 * Remove this last site from the set only now, after the terminator
                                 * has done its work, so that new snapshots won't start until
                                 * everything is on disk for the previous snapshot. This prevents a
                                 * really long snapshot initiation procedure from occurring, because
                                 * it has to contend for filesystem resources.
                                 *
                                 * Do this before logSnapshotCompleteToZK(), because the ZK operations
                                 * are slow, and they can trigger snapshot completion interests to
                                 * fire before this site removes itself from the set. The next
                                 * snapshot request may come in and see this snapshot is still in
                                 * progress.
                                 */
                                ExecutionSitesCurrentlySnapshotting.remove(SnapshotSiteProcessor.this);
                            }
                            logSnapshotCompleteToZK(txnId, snapshotSucceeded, snapshotDataForZookeeper);
                        }
                    }
                };
                m_snapshotTargetTerminators.add(terminatorThread);
                terminatorThread.start();
            }
        }
        return retval;
    }

    public static void runPostSnapshotTasks(SystemProcedureExecutionContext context) {
        SNAP_LOG.debug("Running post-snapshot tasks");
        PostSnapshotTask postSnapshotTask = m_siteTasksPostSnapshotting.remove(context.getPartitionId());
        if (postSnapshotTask != null) {
            postSnapshotTask.run(context);
        }
    }

    private static void logSnapshotCompleteToZK(
            long txnId,
            boolean snapshotSuccess,
            ExtensibleSnapshotDigestData extraSnapshotData) {
        ZooKeeper zk = VoltDB.instance().getHostMessenger().getZK();

        // Timeout after 10 minutes
        final long endTime = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(10);
        final String snapshotPath = VoltZK.completed_snapshots + "/" + txnId;
        boolean success = false;
        // Optimistic read-modify-write: read the node, update the JSON, and write it
        // back at the observed version, retrying if another host updated it first
        while (!success) {
            if (System.currentTimeMillis() > endTime) {
                VoltDB.crashLocalVoltDB("Timed out logging snapshot completion to ZK");
            }
            Stat stat = new Stat();
            byte[] data = null;
            try {
                data = zk.getData(snapshotPath, false, stat);
            } catch (NoNodeException e) {
                // The MPI creates the snapshot completion node asynchronously;
                // if the node doesn't exist yet, retry
                continue;
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("This ZK get should never fail", true, e);
            }
            if (data == null) {
                VoltDB.crashLocalVoltDB("Data should not be null if the node exists", false, null);
            }

            try {
                JSONObject jsonObj = new JSONObject(new String(data, "UTF-8"));
                if (jsonObj.getLong("txnId") != txnId) {
                    VoltDB.crashLocalVoltDB("TxnId should match", false, null);
                }

                int remainingHosts = jsonObj.getInt("hostCount") - 1;
                jsonObj.put("hostCount", remainingHosts);
                jsonObj.put("didSucceed", snapshotSuccess);
                if (!snapshotSuccess) {
                    jsonObj.put("isTruncation", false);
                }
                extraSnapshotData.mergeToZooKeeper(jsonObj, SNAP_LOG);

                byte[] zkData = jsonObj.toString().getBytes("UTF-8");
                if (zkData.length > 5000000) {
                    SNAP_LOG.warn("ZooKeeper node for snapshot digest unexpectedly large: " + zkData.length);
                }
                zk.setData(snapshotPath, zkData, stat.getVersion());
            } catch (KeeperException.BadVersionException e) {
                continue;
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("This ZK call should never fail", true, e);
            }
            success = true;
        }

        /*
         * If we are running without command logging, there will be no consumer for
         * the completed snapshot messages. Consume them here to bound space usage in ZK.
         */
        try {
            TreeSet<String> snapshots = new TreeSet<String>(zk.getChildren(VoltZK.completed_snapshots, false));
            while (snapshots.size() > 30) {
                try {
                    zk.delete(VoltZK.completed_snapshots + "/" + snapshots.first(), -1);
                } catch (NoNodeException e) {
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB(
                            "Deleting a snapshot completion record from ZK should only fail with NoNodeException",
                            true, e);
                }
                snapshots.remove(snapshots.first());
            }
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Retrieving list of completed snapshots from ZK should never fail", true, e);
        }
    }

    /**
     * Is the EE associated with this SnapshotSiteProcessor currently
     * snapshotting?
     *
     * No thread safety here, but assuming single-threaded access from
     * the IV2 site.
     */
    public boolean isEESnapshotting() {
        return m_snapshotTableTasks != null;
    }
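    /*
     * For illustration, a blocking caller (hypothetical, not part of this
     * file) would drain snapshot work and surface failures like so:
     *
     *   HashSet<Exception> failures = processor.completeSnapshotWork(context);
     *   if (!failures.isEmpty()) {
     *       // the snapshot did not complete cleanly; report the failures
     *   }
     *
     * where "processor" is this site's SnapshotSiteProcessor and "context" is
     * its SystemProcedureExecutionContext.
     */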
    /**
     * Do snapshot work exclusively until there is no more. Also blocks
     * until the fsync() and close() of snapshot data targets has completed.
     */
    public HashSet<Exception> completeSnapshotWork(SystemProcedureExecutionContext context)
            throws InterruptedException {
        HashSet<Exception> retval = new HashSet<Exception>();

        // Set the rate to 10 gigabytes/sec, which is effectively unlimited.
        // Does nothing if rate limiting is not enabled.
        DefaultSnapshotDataTarget.setRate(1024 * 10);
        try {
            while (m_snapshotTableTasks != null) {
                Future<?> result = doSnapshotWork(context, true);
                if (result != null) {
                    try {
                        result.get();
                    } catch (ExecutionException e) {
                        final boolean added = retval.add((Exception)e.getCause());
                        assert(added);
                    } catch (Exception e) {
                        final boolean added = retval.add((Exception)e.getCause());
                        assert(added);
                    }
                }
            }
        } finally {
            // Request the default rate again
            DefaultSnapshotDataTarget.setRate(null);
        }

        /*
         * Block until the sync has actually occurred in the forked threads.
         * The threads are spawned even in the blocking case to keep it simple.
         */
        if (m_snapshotTargetTerminators != null) {
            for (final Thread t : m_snapshotTargetTerminators) {
                t.join();
            }
            m_snapshotTargetTerminators = null;
        }

        return retval;
    }
}