/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */
package org.voltdb;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.zookeeper_voltpatches.KeeperException;
import org.apache.zookeeper_voltpatches.KeeperException.NoNodeException;
import org.apache.zookeeper_voltpatches.ZooKeeper;
import org.apache.zookeeper_voltpatches.data.Stat;
import org.json_voltpatches.JSONObject;
import org.voltcore.logging.VoltLogger;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.DBBPool;
import org.voltcore.utils.DBBPool.BBContainer;
import org.voltcore.utils.Pair;
import org.voltdb.catalog.Database;
import org.voltdb.catalog.Table;
import org.voltdb.iv2.MpInitiator;
import org.voltdb.iv2.SiteTaskerQueue;
import org.voltdb.iv2.SnapshotTask;
import org.voltdb.rejoin.StreamSnapshotDataTarget.StreamSnapshotTimeoutException;
import org.voltdb.sysprocs.saverestore.SnapshotPredicates;
import org.voltdb.utils.CatalogUtil;
import org.voltdb.utils.CompressionService;
import org.voltdb.utils.MiscUtils;

import com.google_voltpatches.common.collect.ImmutableMap;
import com.google_voltpatches.common.collect.ListMultimap;
import com.google_voltpatches.common.collect.Lists;
import com.google_voltpatches.common.collect.Maps;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;

/**
 * Encapsulates the state needed to manage an ongoing snapshot at the
 * per-execution-site level. Also contains some static global snapshot
 * counters. This class requires callers to maintain thread safety;
 * generally (exclusively?) it is driven by ExecutionSite, each of
 * which has a SnapshotSiteProcessor.
 */
public class SnapshotSiteProcessor {

    private static final VoltLogger SNAP_LOG = new VoltLogger("SNAPSHOT");

    /** Global count of execution sites on this node performing snapshot */
    public static final Set<Object> ExecutionSitesCurrentlySnapshotting =
            Collections.synchronizedSet(new HashSet<Object>());

    /**
     * Ensure only one thread running the setup fragment does the creation
     * of the targets and the distribution of the work.
     */
    public static final Object m_snapshotCreateLock = new Object();
    public static CyclicBarrier m_snapshotCreateSetupBarrier = null;
    public static CyclicBarrier m_snapshotCreateFinishBarrier = null;
    public static final Runnable m_snapshotCreateSetupBarrierAction = new Runnable() {
        @Override
        public void run() {
            Runnable r = SnapshotSiteProcessor.m_snapshotCreateSetupBarrierActualAction.getAndSet(null);
            if (r != null) {
                r.run();
            }
        }
    };
    public static AtomicReference<Runnable> m_snapshotCreateSetupBarrierActualAction =
            new AtomicReference<Runnable>();

    public static void readySnapshotSetupBarriers(int numSites) {
        synchronized (SnapshotSiteProcessor.m_snapshotCreateLock) {
            if (SnapshotSiteProcessor.m_snapshotCreateSetupBarrier == null) {
                SnapshotSiteProcessor.m_snapshotCreateFinishBarrier = new CyclicBarrier(numSites);
                SnapshotSiteProcessor.m_snapshotCreateSetupBarrier =
                        new CyclicBarrier(numSites, SnapshotSiteProcessor.m_snapshotCreateSetupBarrierAction);
            } else if (SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.isBroken()) {
                SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.reset();
                SnapshotSiteProcessor.m_snapshotCreateFinishBarrier.reset();
            }
        }
    }
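    /*
     * For illustration only (the real call sites live in the snapshot setup
     * path, e.g. SnapshotSaveAPI.startSnapshotting): each local site readies
     * the barriers and then rendezvouses on them, with exactly one site's
     * barrier action running the shared setup work. A sketch, with exception
     * handling elided and "setupWork" a hypothetical Runnable standing in for
     * target creation and work distribution:
     *
     *   SnapshotSiteProcessor.readySnapshotSetupBarriers(numLocalSites);
     *   SnapshotSiteProcessor.m_snapshotCreateSetupBarrierActualAction.set(setupWork);
     *   SnapshotSiteProcessor.m_snapshotCreateSetupBarrier.await();  // one site runs setupWork
     *   // ... per-site setup work ...
     *   SnapshotSiteProcessor.m_snapshotCreateFinishBarrier.await();
     */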
    /**
     * Sequence numbers for export tables. This is repopulated before each
     * snapshot by each execution site that reaches the snapshot.
     */
    private static final Map<String, Map<Integer, Pair<Long, Long>>> m_exportSequenceNumbers =
            new HashMap<String, Map<Integer, Pair<Long, Long>>>();

    private static final Map<Integer, TupleStreamStateInfo> m_drTupleStreamInfo = new HashMap<>();

    private ExtensibleSnapshotDigestData m_extraSnapshotData;

    /*
     * Tasks that are deferred to the snapshot termination thread. The two
     * known uses are syncing/closing the digest file and the catalog copy.
     */
    public static final ConcurrentLinkedQueue<Runnable> m_tasksOnSnapshotCompletion =
            new ConcurrentLinkedQueue<Runnable>();

    /*
     * Tasks performed on each site after the snapshot tasks are finished but
     * before the snapshot transaction is finished.
     */
    public static final Map<Integer, PostSnapshotTask> m_siteTasksPostSnapshotting =
            Collections.synchronizedMap(new HashMap<Integer, PostSnapshotTask>());

    /**
     * Pick a buffer length that is big enough to store at least one tuple of
     * the largest size supported in the system (2 megabytes). Add a fudge
     * factor for metadata.
     */
    public static final int m_snapshotBufferLength = (1024 * 1024 * 2) + Short.MAX_VALUE;
    public static final int m_snapshotBufferCompressedLen =
            CompressionService.maxCompressedLength(m_snapshotBufferLength);

    /**
     * Limit the number of buffers that are outstanding at any given time.
     */
    private static final AtomicInteger m_availableSnapshotBuffers = new AtomicInteger(16);
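    /*
     * For scale (a back-of-the-envelope figure, not enforced anywhere): with
     * the default pool of 16 buffers, the worst-case direct memory reserved
     * for snapshot output is roughly
     *
     *   16 * ((1024 * 1024 * 2) + 32767) = 34,078,704 bytes, about 34 MB,
     *
     * not counting the larger compressed-length allowance above.
     */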
    /**
     * The last EE out has to shut off the lights. Cache a list
     * of targets in case this EE ends up being the one that needs
     * to close each target.
     */
    private volatile ArrayList<SnapshotDataTarget> m_snapshotTargets = null;

    /**
     * Map of tasks for tables that still need to be snapshotted.
     * Once a table has finished serializing, it is removed from the map.
     * Backed by a sorted multimap so that the site works through one table
     * at a time, which is easier to debug.
     */
    private ListMultimap<Integer, SnapshotTableTask> m_snapshotTableTasks = null;

    private Map<Integer, TableStreamer> m_streamers = null;

    private long m_lastSnapshotTxnId;
    private final int m_snapshotPriority;
    private boolean m_perSiteLastSnapshotSucceeded = true;

    /**
     * List of threads to join to block on snapshot completion
     * when using completeSnapshotWork().
     */
    private ArrayList<Thread> m_snapshotTargetTerminators = null;

    /**
     * When a buffer is returned to the pool, a new snapshot task will be offered
     * to the queue to ensure the EE wakes up and does any potential snapshot work
     * with that buffer.
     */
    private final SiteTaskerQueue m_siteTaskerQueue;

    private final Random m_random = new Random();

    /*
     * Interface that will be checked when scheduling snapshot work in IV2.
     * Reports whether the site is "idle" for whatever definition that may be.
     * If the site is idle, then work will be scheduled immediately instead of
     * being throttled. See the illustrative sketch below.
     */
    public interface IdlePredicate {
        public boolean idle(long now);
    }

    private final IdlePredicate m_idlePredicate;
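    /*
     * A minimal sketch of what an IdlePredicate might look like; this is
     * hypothetical, the real implementation is supplied by the site that
     * constructs this processor:
     *
     *   IdlePredicate predicate = new IdlePredicate() {
     *       @Override
     *       public boolean idle(long now) {
     *           // e.g. idle if the site executed no transaction "recently"
     *           return now - lastExecutionTime > idleThresholdMillis;
     *       }
     *   };
     *
     * "lastExecutionTime" and "idleThresholdMillis" are placeholder names.
     */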
    /*
     * Synchronization is handled by SnapshotSaveAPI.startSnapshotting.
     * Store the export sequence numbers for every table and partition. This will
     * be called by every execution site before the snapshot starts. Then the
     * execution site that gets the setup permit will use getExportSequenceNumbers
     * to retrieve the full set and reset the contents.
     */
    public static void populateSequenceNumbersForExecutionSite(SystemProcedureExecutionContext context) {
        Database database = context.getDatabase();
        for (Table t : database.getTables()) {
            if (!CatalogUtil.isTableExportOnly(database, t)) continue;

            Map<Integer, Pair<Long, Long>> sequenceNumbers = m_exportSequenceNumbers.get(t.getTypeName());
            if (sequenceNumbers == null) {
                sequenceNumbers = new HashMap<Integer, Pair<Long, Long>>();
                m_exportSequenceNumbers.put(t.getTypeName(), sequenceNumbers);
            }

            long[] ackOffsetAndSequenceNumber =
                    context.getSiteProcedureConnection().getUSOForExportTable(t.getSignature());
            sequenceNumbers.put(
                    context.getPartitionId(),
                    Pair.of(ackOffsetAndSequenceNumber[0], ackOffsetAndSequenceNumber[1]));
        }

        TupleStreamStateInfo drStateInfo = context.getSiteProcedureConnection().getDRTupleStreamStateInfo();
        m_drTupleStreamInfo.put(context.getPartitionId(), drStateInfo);
        if (drStateInfo.containsReplicatedStreamInfo) {
            m_drTupleStreamInfo.put(MpInitiator.MP_INIT_PID, drStateInfo);
        }
    }

    public static Map<String, Map<Integer, Pair<Long, Long>>> getExportSequenceNumbers() {
        HashMap<String, Map<Integer, Pair<Long, Long>>> sequenceNumbers =
                new HashMap<String, Map<Integer, Pair<Long, Long>>>(m_exportSequenceNumbers);
        m_exportSequenceNumbers.clear();
        return sequenceNumbers;
    }

    public static Map<Integer, TupleStreamStateInfo> getDRTupleStreamStateInfo() {
        Map<Integer, TupleStreamStateInfo> stateInfo = ImmutableMap.copyOf(m_drTupleStreamInfo);
        m_drTupleStreamInfo.clear();
        return stateInfo;
    }

    private long m_quietUntil = 0;

    public SnapshotSiteProcessor(SiteTaskerQueue siteQueue, int snapshotPriority) {
        this(siteQueue, snapshotPriority, new IdlePredicate() {
            @Override
            public boolean idle(long now) {
                throw new UnsupportedOperationException();
            }
        });
    }

    public SnapshotSiteProcessor(SiteTaskerQueue siteQueue, int snapshotPriority, IdlePredicate idlePredicate) {
        m_siteTaskerQueue = siteQueue;
        m_snapshotPriority = snapshotPriority;
        m_idlePredicate = idlePredicate;
    }

    public void shutdown() throws InterruptedException {
        m_snapshotCreateSetupBarrier = null;
        m_snapshotCreateFinishBarrier = null;
        if (m_snapshotTargetTerminators != null) {
            for (Thread t : m_snapshotTargetTerminators) {
                t.join();
            }
        }
    }

    public static boolean isSnapshotInProgress() {
        final int numSitesSnapshotting = SnapshotSiteProcessor.ExecutionSitesCurrentlySnapshotting.size();
        if (numSitesSnapshotting > 0) {
            if (SNAP_LOG.isDebugEnabled()) {
                SNAP_LOG.debug("Snapshot in progress, " + numSitesSnapshotting +
                        " sites are still snapshotting");
            }
            return true;
        }
        return false;
    }

    /**
     * Wrap a pooled buffer so that discarding it returns a permit to the pool
     * and, unless noSchedule is set, offers a new SnapshotTask so the EE wakes
     * up and does any potential snapshot work with that buffer.
     */
    private BBContainer createNewBuffer(final BBContainer origin, final boolean noSchedule) {
        return new BBContainer(origin.b()) {
            @Override
            public void discard() {
                checkDoubleFree();
                origin.discard();

                m_availableSnapshotBuffers.incrementAndGet();
                if (!noSchedule) {
                    rescheduleSnapshotWork();
                }
            }
        };
    }
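    /*
     * Worked example of the quiet-period arithmetic used below: with a
     * snapshot priority of 6, each rescheduling pushes m_quietUntil
     * 5 * 6 = 30 ms plus a random 0-14 ms of jitter into the future, so a
     * busy site does throttled snapshot work roughly every 30-44 ms.
     * Priority 0 disables throttling entirely; 10, the maximum, yields a
     * 50-64 ms spacing.
     */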
    private void rescheduleSnapshotWork() {
        /*
         * If IV2 is enabled, don't run the potential snapshot work until the
         * quiet period restrictions have been met. In IV2, doSnapshotWork is
         * always called with ignoreQuietPeriod, and the scheduling is instead
         * done via the STPE in RealVoltDB.
         *
         * The goal of the quiet period is to spread snapshot work out over time
         * and minimize the impact on latency.
         *
         * If the snapshot priority is 0, then running the work immediately is the
         * specified policy anyway. 10 would produce the largest delay.
         */
        if (m_snapshotPriority > 0) {
            final long now = System.currentTimeMillis();
            // Ask if the site is idle, and if it is, queue the work immediately
            if (m_idlePredicate.idle(now)) {
                m_siteTaskerQueue.offer(new SnapshotTask());
                return;
            }

            // Cache the value locally. The dirty secret is that in edge cases multiple
            // threads will read/write briefly, but it isn't a big deal since the scheduling
            // can be wrong briefly. Caching it locally makes the logic here saner, because
            // the value can't change as execution progresses.
            final long quietUntil = m_quietUntil;

            /*
             * If the current time is > quietUntil, then the quiet period is over
             * and the snapshot work should be done immediately.
             *
             * Otherwise it needs to be scheduled in the future, and the next quiet
             * period needs to be calculated.
             */
            if (now > quietUntil) {
                m_siteTaskerQueue.offer(new SnapshotTask());
                // Now push the quiet period further into the future. Generally no threads
                // will be racing to do this, since the execution site only interacts with
                // one snapshot data target at a time, except when it is switching tables.
                // It doesn't really matter if the value is wrong; that just results in a
                // little extra snapshot work being done close together.
                m_quietUntil = System.currentTimeMillis() +
                        (5 * m_snapshotPriority) + ((long)(m_random.nextDouble() * 15));
            } else {
                // Schedule it to happen after the quiet period has elapsed
                VoltDB.instance().schedulePriorityWork(
                        new Runnable() {
                            @Override
                            public void run() {
                                m_siteTaskerQueue.offer(new SnapshotTask());
                            }
                        },
                        quietUntil - now,
                        0,
                        TimeUnit.MILLISECONDS);

                /*
                 * This is the same calculation as above, except the future is not based
                 * on the current time, since the quiet period was already in the future
                 * and we need to move further past it, having just scheduled snapshot work
                 * at the end of the current quietUntil value.
                 */
                m_quietUntil = quietUntil +
                        (5 * m_snapshotPriority) + ((long)(m_random.nextDouble() * 15));
            }
        } else {
            m_siteTaskerQueue.offer(new SnapshotTask());
        }
    }

    public void initiateSnapshots(
            SystemProcedureExecutionContext context,
            SnapshotFormat format,
            Deque<SnapshotTableTask> tasks,
            long txnId,
            ExtensibleSnapshotDigestData extraSnapshotData) {
        ExecutionSitesCurrentlySnapshotting.add(this);
        final long now = System.currentTimeMillis();
        m_quietUntil = now + 200;
        m_perSiteLastSnapshotSucceeded = true;
        m_lastSnapshotTxnId = txnId;
        m_snapshotTableTasks = MiscUtils.sortedArrayListMultimap();
        m_streamers = Maps.newHashMap();
        m_snapshotTargetTerminators = new ArrayList<Thread>();
        m_extraSnapshotData = extraSnapshotData;

        // Table doesn't implement hashCode(), so use the table ID as the key
        for (Map.Entry<Integer, byte[]> tablePredicates : makeTablesAndPredicatesToSnapshot(tasks).entrySet()) {
            int tableId = tablePredicates.getKey();
            TableStreamer streamer =
                    new TableStreamer(tableId, format.getStreamType(), m_snapshotTableTasks.get(tableId));
            if (!streamer.activate(context, tablePredicates.getValue())) {
                VoltDB.crashLocalVoltDB("Failed to activate snapshot stream on table " +
                        CatalogUtil.getTableNameFromId(context.getDatabase(), tableId), false, null);
            }
            m_streamers.put(tableId, streamer);
        }

        /*
         * Size the buffer pool to contain enough buffers for the number of tasks. The
         * buffer pool will be cleaned up at the end of the snapshot.
         *
         * For the general case of only one snapshot at a time, this has the same behavior
         * as before: 5 buffers per snapshot.
         *
         * TODO: This is not a good algorithm for general snapshot coalescing. Rate limiting
         * won't work as expected with this approach. For general snapshot coalescing,
         * a better approach, like a pool per output target, should be used.
         */
        int maxTableTaskSize = 0;
        for (Collection<SnapshotTableTask> perTableTasks : m_snapshotTableTasks.asMap().values()) {
            maxTableTaskSize = Math.max(maxTableTaskSize, perTableTasks.size());
        }
    }
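    /*
     * Rough lifecycle of a snapshot on one site, pieced together from the
     * comments in this class (the actual cross-thread handoffs live in the
     * snapshot save path and are simplified here):
     *
     *   1. initiateSnapshots()        - site thread registers tasks and streamers
     *   2. startSnapshotWithTargets() - snapshot IO thread sets the data targets
     *                                   and queues the first SnapshotTask
     *   3. doSnapshotWork()           - site thread serializes blocks, rescheduling
     *                                   itself as buffers return to the pool
     *   4. the last site out closes the targets and logs completion to ZooKeeper
     */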
    /**
     * This is called from the snapshot IO thread when the deferred setup is finished. It sets
     * the data targets and queues a snapshot task onto the site thread.
     */
    public void startSnapshotWithTargets(Collection<SnapshotDataTarget> targets, long now) {
        // Basically asserts that there are no tasks with null targets at this point;
        // getTarget() checks and crashes otherwise
        for (SnapshotTableTask t : m_snapshotTableTasks.values()) {
            t.getTarget();
        }

        ArrayList<SnapshotDataTarget> targetsToClose = Lists.newArrayList();
        for (final SnapshotDataTarget target : targets) {
            if (target.needsFinalClose()) {
                targetsToClose.add(target);
            }
        }
        m_snapshotTargets = targetsToClose;

        // Queue the first snapshot task
        VoltDB.instance().schedulePriorityWork(
                new Runnable() {
                    @Override
                    public void run() {
                        m_siteTaskerQueue.offer(new SnapshotTask());
                    }
                },
                (m_quietUntil + (5 * m_snapshotPriority) - now),
                0,
                TimeUnit.MILLISECONDS);
        m_quietUntil += 5 * m_snapshotPriority;
    }

    private Map<Integer, byte[]> makeTablesAndPredicatesToSnapshot(Collection<SnapshotTableTask> tasks) {
        Map<Integer, SnapshotPredicates> tablesAndPredicates = Maps.newHashMap();
        Map<Integer, byte[]> predicateBytes = Maps.newHashMap();

        for (SnapshotTableTask task : tasks) {
            SNAP_LOG.debug("Examining SnapshotTableTask: " + task);

            // Add the task to the task list for the given table
            m_snapshotTableTasks.put(task.m_table.getRelativeIndex(), task);

            // Make sure there is a predicate object for each table; the predicate could contain
            // empty expressions, so activateTableStream() doesn't have to do a null check.
            SnapshotPredicates predicates = tablesAndPredicates.get(task.m_table.getRelativeIndex());
            if (predicates == null) {
                predicates = new SnapshotPredicates(task.m_table.getRelativeIndex());
                tablesAndPredicates.put(task.m_table.getRelativeIndex(), predicates);
            }
            predicates.addPredicate(task.m_predicate, task.m_deleteTuples);
        }

        for (Map.Entry<Integer, SnapshotPredicates> e : tablesAndPredicates.entrySet()) {
            predicateBytes.put(e.getKey(), e.getValue().toBytes());
        }

        return predicateBytes;
    }

    /**
     * Create an output buffer for each task.
     * @return null if there aren't enough buffers left in the pool.
     */
    private List<BBContainer> getOutputBuffers(Collection<SnapshotTableTask> tableTasks, boolean noSchedule) {
        final int desired = tableTasks.size();
        while (true) {
            int available = m_availableSnapshotBuffers.get();

            // Limit the number of buffers used concurrently
            if (desired > available) {
                return null;
            }
            if (m_availableSnapshotBuffers.compareAndSet(available, available - desired)) break;
        }

        List<BBContainer> outputBuffers = new ArrayList<BBContainer>(tableTasks.size());
        for (int ii = 0; ii < tableTasks.size(); ii++) {
            final BBContainer origin = DBBPool.allocateDirectAndPool(m_snapshotBufferLength);
            outputBuffers.add(createNewBuffer(origin, noSchedule));
        }
        return outputBuffers;
    }
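    /*
     * Example of the reservation above: a table with 3 outstanding tasks asks
     * for 3 buffers in one atomic step. If only 2 of the 16 pooled buffers
     * remain, getOutputBuffers() returns null and the caller reschedules; the
     * CAS loop retries only when another thread raced on the counter.
     */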
    /**
     * Replicated tables are assigned to a single execution site and that site
     * is responsible for closing the data target. Done in a separate thread
     * so the EE can continue working.
     */
    private void asyncTerminateReplicatedTableTasks(Collection<SnapshotTableTask> tableTasks) {
        for (final SnapshotTableTask tableTask : tableTasks) {
            if (tableTask.m_table.getIsreplicated() && tableTask.m_target.getFormat().canCloseEarly()) {
                final Thread terminatorThread = new Thread("Replicated SnapshotDataTarget terminator") {
                    @Override
                    public void run() {
                        try {
                            tableTask.m_target.close();
                        } catch (IOException e) {
                            m_perSiteLastSnapshotSucceeded = false;
                            throw new RuntimeException(e);
                        } catch (InterruptedException e) {
                            m_perSiteLastSnapshotSucceeded = false;
                            throw new RuntimeException(e);
                        }
                    }
                };
                m_snapshotTargetTerminators.add(terminatorThread);
                terminatorThread.start();
            }
        }
    }

    /*
     * noSchedule means don't try to schedule snapshot work, because this is a blocking
     * task from completeSnapshotWork. This avoids creating thousands of task objects.
     */
    public Future<?> doSnapshotWork(SystemProcedureExecutionContext context, boolean noSchedule) {
        ListenableFuture<?> retval = null;

        /*
         * This thread will null out the reference to m_snapshotTableTasks when
         * a snapshot is finished. If the snapshot buffer is loaned out, that means
         * it is pending I/O somewhere, so there is no work to do until it comes back.
         */
        if (m_snapshotTableTasks == null) {
            return retval;
        }
        if (m_snapshotTargets == null) {
            return null;
        }

        /*
         * Try to serialize a block from a table; if the table is finished,
         * remove its tasks from the task map and move on to the next table. If a
         * block is successfully serialized, break out of the loop and release the
         * site thread for more transaction work.
         */
        Iterator<Map.Entry<Integer, Collection<SnapshotTableTask>>> taskIter =
                m_snapshotTableTasks.asMap().entrySet().iterator();
        while (taskIter.hasNext()) {
            Map.Entry<Integer, Collection<SnapshotTableTask>> taskEntry = taskIter.next();
            final int tableId = taskEntry.getKey();
            final Collection<SnapshotTableTask> tableTasks = taskEntry.getValue();

            final List<BBContainer> outputBuffers = getOutputBuffers(tableTasks, noSchedule);
            if (outputBuffers == null) {
                // Not enough buffers available
                if (!noSchedule) {
                    rescheduleSnapshotWork();
                }
                break;
            }

            // Stream more data and add a listener to handle any failures
            Pair<ListenableFuture, Boolean> streamResult =
                    m_streamers.get(tableId).streamMore(context, outputBuffers, null);
            if (streamResult.getFirst() != null) {
                final ListenableFuture writeFutures = streamResult.getFirst();
                writeFutures.addListener(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            writeFutures.get();
                        } catch (Throwable t) {
                            if (m_perSiteLastSnapshotSucceeded) {
                                if (t instanceof StreamSnapshotTimeoutException ||
                                        t.getCause() instanceof StreamSnapshotTimeoutException) {
                                    // This error is already logged by the watchdog when it
                                    // generates the exception
                                } else {
                                    SNAP_LOG.error("Error while attempting to write snapshot data", t);
                                }
                                m_perSiteLastSnapshotSucceeded = false;
                            }
                        }
                    }
                }, CoreUtils.SAMETHREADEXECUTOR);
            }

            /*
             * The table streamer returns false when there is no more data left to
             * pull from that table. The enclosing loop then moves on to the next table.
             */
            if (!streamResult.getSecond()) {
                asyncTerminateReplicatedTableTasks(tableTasks);
                SNAP_LOG.debug("Finished snapshot tasks for table " + tableId + ": " + tableTasks);
                // XXX: Guava's multimap will clear the tableTasks collection when the entry is
                // removed from the containing map, so don't use the collection after removal!
                // (That is why the debug log above runs before the removal.)
                taskIter.remove();
            } else {
                break;
            }
        }
        /*
         * If there are no more tasks, then this particular EE is finished doing
         * snapshot work. Check the global set of snapshotting sites to find out
         * if this is the last one.
         */
        if (m_snapshotTableTasks.isEmpty()) {
            SNAP_LOG.debug("Finished with tasks");

            // In case this is a non-blocking snapshot, do the post-snapshot tasks here.
            runPostSnapshotTasks(context);

            final ArrayList<SnapshotDataTarget> snapshotTargets = m_snapshotTargets;
            m_snapshotTargets = null;
            m_snapshotTableTasks = null;

            boolean IamLast = false;
            synchronized (ExecutionSitesCurrentlySnapshotting) {
                if (!ExecutionSitesCurrentlySnapshotting.contains(this)) {
                    VoltDB.crashLocalVoltDB(
                            "Currently snapshotting site didn't find itself in set of snapshotting sites",
                            true, null);
                }
                IamLast = ExecutionSitesCurrentlySnapshotting.size() == 1;
                if (!IamLast) {
                    ExecutionSitesCurrentlySnapshotting.remove(this);
                }
            }

            /*
             * If this is the last one, then this EE must close all the SnapshotDataTargets.
             * Done in a separate thread so the EE can go and do other work. It will
             * sync every file descriptor and that may block for a while.
             */
            if (IamLast) {
                SNAP_LOG.debug("I AM LAST!");
                final long txnId = m_lastSnapshotTxnId;
                final ExtensibleSnapshotDigestData snapshotDataForZookeeper = m_extraSnapshotData;
                m_extraSnapshotData = null;
                final Thread terminatorThread = new Thread("Snapshot terminator") {
                    @Override
                    public void run() {
                        boolean snapshotSucceeded = true;
                        try {
                            /*
                             * Be absolutely sure the snapshot is finished
                             * and synced to disk before another is started
                             */
                            for (Thread t : m_snapshotTargetTerminators) {
                                if (t == this) {
                                    continue;
                                }
                                try {
                                    t.join();
                                } catch (InterruptedException e) {
                                    return;
                                }
                            }
                            for (final SnapshotDataTarget t : snapshotTargets) {
                                try {
                                    t.close();
                                } catch (IOException e) {
                                    snapshotSucceeded = false;
                                    throw new RuntimeException(e);
                                } catch (InterruptedException e) {
                                    snapshotSucceeded = false;
                                    throw new RuntimeException(e);
                                }
                            }

                            Runnable r = null;
                            while ((r = m_tasksOnSnapshotCompletion.poll()) != null) {
                                try {
                                    r.run();
                                } catch (Exception e) {
                                    SNAP_LOG.error("Error running snapshot completion task", e);
                                }
                            }
                        } finally {
                            // txnId and snapshotDataForZookeeper were cached above, before this
                            // site removes itself from the ExecutionSitesCurrentlySnapshotting
                            // set, so logSnapshotCompleteToZK() will not see incorrect values
                            // from the next snapshot
                            try {
                                VoltDB.instance().getHostMessenger().getZK().delete(
                                        VoltZK.nodes_currently_snapshotting + "/" +
                                        VoltDB.instance().getHostMessenger().getHostId(), -1);
                            } catch (NoNodeException e) {
                                SNAP_LOG.warn("Snapshot node was already gone when deleting it", e);
                            } catch (Exception e) {
                                VoltDB.crashLocalVoltDB(e.getMessage(), true, e);
                            } finally {
                                /*
                                 * Remove this last site from the set only now, after the terminator
                                 * has done its work, so that new snapshots won't start until
                                 * everything is on disk for the previous snapshot. This prevents a
                                 * really long snapshot initiation procedure from occurring, because
                                 * it has to contend for filesystem resources.
                                 *
                                 * Do this before logSnapshotCompleteToZK(), because the ZK operations
                                 * are slow, and they can trigger snapshot completion interests to
                                 * fire before this site removes itself from the set. The next
                                 * snapshot request may come in and see this snapshot is still in
                                 * progress.
                                 */
                                ExecutionSitesCurrentlySnapshotting.remove(SnapshotSiteProcessor.this);
                            }
                            logSnapshotCompleteToZK(txnId, snapshotSucceeded, snapshotDataForZookeeper);
                        }
                    }
                };
                m_snapshotTargetTerminators.add(terminatorThread);
                terminatorThread.start();
            }
        }
        return retval;
    }

    public static void runPostSnapshotTasks(SystemProcedureExecutionContext context) {
        SNAP_LOG.debug("Running post-snapshot tasks");
        PostSnapshotTask postSnapshotTask = m_siteTasksPostSnapshotting.remove(context.getPartitionId());
        if (postSnapshotTask != null) {
            postSnapshotTask.run(context);
        }
    }

    private static void logSnapshotCompleteToZK(
            long txnId,
            boolean snapshotSuccess,
            ExtensibleSnapshotDigestData extraSnapshotData) {
        ZooKeeper zk = VoltDB.instance().getHostMessenger().getZK();

        // Timeout after 10 minutes
        final long endTime = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(10);
        final String snapshotPath = VoltZK.completed_snapshots + "/" + txnId;
        boolean success = false;
        // Optimistic read-modify-write: read the node, update the JSON, and write it
        // back at the observed version, retrying if another host updated it first
        while (!success) {
            if (System.currentTimeMillis() > endTime) {
                VoltDB.crashLocalVoltDB("Timed out logging snapshot completion to ZK");
            }
            Stat stat = new Stat();
            byte[] data = null;
            try {
                data = zk.getData(snapshotPath, false, stat);
            } catch (NoNodeException e) {
                // The MPI creates the snapshot completion node asynchronously;
                // if the node doesn't exist yet, retry
                continue;
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("This ZK get should never fail", true, e);
            }
            if (data == null) {
                VoltDB.crashLocalVoltDB("Data should not be null if the node exists", false, null);
            }

            try {
                JSONObject jsonObj = new JSONObject(new String(data, "UTF-8"));
                if (jsonObj.getLong("txnId") != txnId) {
                    VoltDB.crashLocalVoltDB("TxnId should match", false, null);
                }

                int remainingHosts = jsonObj.getInt("hostCount") - 1;
                jsonObj.put("hostCount", remainingHosts);
                jsonObj.put("didSucceed", snapshotSuccess);
                if (!snapshotSuccess) {
                    jsonObj.put("isTruncation", false);
                }
                extraSnapshotData.mergeToZooKeeper(jsonObj, SNAP_LOG);

                byte[] zkData = jsonObj.toString().getBytes("UTF-8");
                if (zkData.length > 5000000) {
                    SNAP_LOG.warn("ZooKeeper node for snapshot digest unexpectedly large: " + zkData.length);
                }
                zk.setData(snapshotPath, zkData, stat.getVersion());
            } catch (KeeperException.BadVersionException e) {
                continue;
            } catch (Exception e) {
                VoltDB.crashLocalVoltDB("This ZK call should never fail", true, e);
            }
            success = true;
        }

        /*
         * If we are running without command logging, there will be no consumer for
         * the completed snapshot messages. Consume them here to bound space usage in ZK.
         */
        try {
            TreeSet<String> snapshots = new TreeSet<String>(zk.getChildren(VoltZK.completed_snapshots, false));
            while (snapshots.size() > 30) {
                try {
                    zk.delete(VoltZK.completed_snapshots + "/" + snapshots.first(), -1);
                } catch (NoNodeException e) {
                } catch (Exception e) {
                    VoltDB.crashLocalVoltDB(
                            "Deleting a snapshot completion record from ZK should only fail with NoNodeException",
                            true, e);
                }
                snapshots.remove(snapshots.first());
            }
        } catch (Exception e) {
            VoltDB.crashLocalVoltDB("Retrieving list of completed snapshots from ZK should never fail", true, e);
        }
    }

    /**
     * Is the EE associated with this SnapshotSiteProcessor currently
     * snapshotting?
     *
     * No thread safety here, but assuming single-threaded access from
     * the IV2 site.
     */
    public boolean isEESnapshotting() {
        return m_snapshotTableTasks != null;
    }
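    /*
     * For illustration, a blocking caller (hypothetical, not part of this
     * file) would drain snapshot work and surface failures like so:
     *
     *   HashSet<Exception> failures = processor.completeSnapshotWork(context);
     *   if (!failures.isEmpty()) {
     *       // the snapshot did not complete cleanly; report the failures
     *   }
     *
     * where "processor" is this site's SnapshotSiteProcessor and "context" is
     * its SystemProcedureExecutionContext.
     */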
    /**
     * Do snapshot work exclusively until there is no more. Also blocks
     * until the fsync() and close() of snapshot data targets has completed.
     */
    public HashSet<Exception> completeSnapshotWork(SystemProcedureExecutionContext context)
            throws InterruptedException {
        HashSet<Exception> retval = new HashSet<Exception>();

        // Set the rate to 10 gigabytes/sec, which is effectively unlimited.
        // Does nothing if rate limiting is not enabled.
        DefaultSnapshotDataTarget.setRate(1024 * 10);
        try {
            while (m_snapshotTableTasks != null) {
                Future<?> result = doSnapshotWork(context, true);
                if (result != null) {
                    try {
                        result.get();
                    } catch (ExecutionException e) {
                        final boolean added = retval.add((Exception)e.getCause());
                        assert(added);
                    } catch (Exception e) {
                        final boolean added = retval.add((Exception)e.getCause());
                        assert(added);
                    }
                }
            }
        } finally {
            // Request the default rate again
            DefaultSnapshotDataTarget.setRate(null);
        }

        /*
         * Block until the sync has actually occurred in the forked threads.
         * The threads are spawned even in the blocking case to keep it simple.
         */
        if (m_snapshotTargetTerminators != null) {
            for (final Thread t : m_snapshotTargetTerminators) {
                t.join();
            }
            m_snapshotTargetTerminators = null;
        }

        return retval;
    }
}