/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.rejoin;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.Mailbox;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.DBBPool;
import org.voltcore.utils.DBBPool.BBContainer;
import org.voltdb.SnapshotDataTarget;
import org.voltdb.SnapshotFormat;
import org.voltdb.VoltDB;
import org.voltdb.utils.CompressionService;

import com.google_voltpatches.common.base.Preconditions;
import com.google_voltpatches.common.util.concurrent.Futures;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;
import com.google_voltpatches.common.util.concurrent.SettableFuture;

/**
 * A stream snapshot target for sending snapshot data directly to a rejoining
 * partition.
 */
public class StreamSnapshotDataTarget extends StreamSnapshotBase
implements SnapshotDataTarget, StreamSnapshotAckReceiver.AckCallback {
    private static final VoltLogger rejoinLog = new VoltLogger("REJOIN");

    // triggers specific test code for TestMidRejoinDeath
    static boolean m_rejoinDeathTestMode = System.getProperties().containsKey("rejoindeathtest");

    private static AtomicLong m_totalSnapshotTargetCount = new AtomicLong(0);
    final long m_targetId;

    // shortened when in test mode
    public final static long DEFAULT_WRITE_TIMEOUT_MS =
            m_rejoinDeathTestMode ? 10000 : Long.getLong("REJOIN_WRITE_TIMEOUT_MS", 60000);
    final static long WATCHDOG_PERIOD_S = 5;

    // schemas for all the tables on this partition
    private final Map<Integer, byte[]> m_schemas = new HashMap<Integer, byte[]>();
    // HSId of the destination mailbox
    private final long m_destHSId;
    // input and output threads
    private final SnapshotSender m_sender;
    private final StreamSnapshotAckReceiver m_ackReceiver;

    // Skip all subsequent writes if one fails
    private final AtomicReference<IOException> m_writeFailed = new AtomicReference<IOException>();
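    /*
     * Note: DEFAULT_WRITE_TIMEOUT_MS above falls back to 60 seconds but reads
     * the REJOIN_WRITE_TIMEOUT_MS system property first, so the write timeout
     * can be raised without a code change. A hypothetical invocation:
     *
     *   java -DREJOIN_WRITE_TIMEOUT_MS=120000 ... org.voltdb.VoltDB ...
     */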
    // true if the failure is already reported to the SnapshotSiteProcessor, prevent throwing
    // the same exception multiple times.
    private boolean m_failureReported = false;

    private volatile IOException m_reportedSerializationFailure = null;

    // number of sent, but un-acked buffers
    final AtomicInteger m_outstandingWorkCount = new AtomicInteger(0);
    // map of sent, but un-acked buffers, packaged up a bit
    private final TreeMap<Integer, SendWork> m_outstandingWork = new TreeMap<Integer, SendWork>();

    int m_blockIndex = 0;
    private final AtomicReference<Runnable> m_onCloseHandler = new AtomicReference<Runnable>(null);
    private final AtomicBoolean m_closed = new AtomicBoolean(false);

    public StreamSnapshotDataTarget(long HSId, byte[] hashinatorConfig, Map<Integer, byte[]> schemas,
                                    SnapshotSender sender, StreamSnapshotAckReceiver ackReceiver)
    {
        this(HSId, hashinatorConfig, schemas, DEFAULT_WRITE_TIMEOUT_MS, sender, ackReceiver);
    }

    public StreamSnapshotDataTarget(long HSId, byte[] hashinatorConfig, Map<Integer, byte[]> schemas,
                                    long writeTimeout, SnapshotSender sender,
                                    StreamSnapshotAckReceiver ackReceiver)
    {
        super();
        m_targetId = m_totalSnapshotTargetCount.getAndIncrement();
        m_schemas.putAll(schemas);
        m_destHSId = HSId;
        m_sender = sender;
        m_sender.registerDataTarget(m_targetId);
        m_ackReceiver = ackReceiver;
        m_ackReceiver.setCallback(m_targetId, this);

        rejoinLog.debug(String.format("Initializing snapshot stream processor " +
                "for source site id: %s, and with processor id: %d",
                CoreUtils.hsIdToString(HSId), m_targetId));

        // start a periodic task to look for timed out connections
        VoltDB.instance().scheduleWork(new Watchdog(0, writeTimeout), WATCHDOG_PERIOD_S, -1, TimeUnit.SECONDS);

        if (hashinatorConfig != null) {
            // Send the hashinator config as the first block
            send(StreamSnapshotMessageType.HASHINATOR, -1, hashinatorConfig);
        }
    }

    /**
     * Packages up a pending write into a piece of work that can be tracked
     * and can be scheduled.
     */
    public static class SendWork {
        BBContainer m_message;
        final long m_targetId;
        final long m_destHSId;
        final long m_ts;
        final boolean m_isEmpty;

        // A listenable future used to notify a listener when this buffer is discarded
        final SettableFuture<Boolean> m_future;

        /**
         * Creates an empty send work to terminate the sender thread
         */
        SendWork() {
            m_isEmpty = true;
            m_targetId = -1;
            m_destHSId = -1;
            m_ts = -1;
            m_future = null;
        }

        SendWork(long targetId, long destHSId, BBContainer message, SettableFuture<Boolean> future) {
            m_isEmpty = false;
            m_targetId = targetId;
            m_destHSId = destHSId;
            m_message = message;
            m_ts = System.currentTimeMillis();
            m_future = future;
        }

        /**
         * Idempotent method to cancel any pending work and release any
         * BBContainers held.
         */
        public synchronized void discard() {
            // discard the buffers and null them out
            if (m_message != null) {
                m_message.discard();
                m_message = null;
            }
        }
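        /*
         * A minimal sketch of how a caller observes completion through the
         * future handed to the constructor (the names here are illustrative,
         * not part of this class):
         *
         *   SettableFuture<Boolean> sent = SettableFuture.create();
         *   SendWork work = new SendWork(targetId, destHSId, container, sent);
         *   senderQueue.offer(work);
         *   sent.get(); // blocks until doWork() has handed the block to the mailbox
         *
         * Note the future signals "sent", not "acked"; the container itself is
         * only discarded once the matching ack arrives (see receiveAck below).
         */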
        /**
         * Compress the data in the BBContainer provided, then package it up in
         * a RejoinDataMessage instance, and finally hand it off to the
         * messaging subsystem.
         */
        protected int send(Mailbox mb, MessageFactory msgFactory, BBContainer message) throws IOException {
            final ByteBuffer messageBuffer = message.b();
            if (messageBuffer.isDirect()) {
                byte[] data = CompressionService.compressBuffer(messageBuffer);
                mb.send(m_destHSId, msgFactory.makeDataMessage(m_targetId, data));
                if (rejoinLog.isTraceEnabled()) {
                    rejoinLog.trace("Sending direct buffer");
                }
                return data.length;
            } else {
                byte compressedBytes[] = CompressionService.compressBytes(
                        messageBuffer.array(), messageBuffer.position(),
                        messageBuffer.remaining());
                mb.send(m_destHSId, msgFactory.makeDataMessage(m_targetId, compressedBytes));
                if (rejoinLog.isTraceEnabled()) {
                    rejoinLog.trace("Sending heap buffer");
                }
                return compressedBytes.length;
            }
        }

        public synchronized int doWork(Mailbox mb, MessageFactory msgFactory) throws Exception {
            // this work has already been discarded
            if (m_message == null) {
                return 0;
            }

            try {
                return send(mb, msgFactory, m_message);
            } finally {
                // Buffers are only discarded after they are acked. Discarding them here would cause the sender to
                // generate too much work for the receiver.
                m_future.set(true);
            }
        }
    }

    public static class StreamSnapshotTimeoutException extends IOException {
        public StreamSnapshotTimeoutException(String message) {
            super(message);
        }
    }

    public static class SnapshotSerializationException extends IOException {
        public SnapshotSerializationException(String message) {
            super(message);
        }
    }

    /**
     * Task run every so often to look for writes that haven't been acked
     * in writeTimeout time.
     */
    class Watchdog implements Runnable {
        final long m_bytesWrittenSinceConstruction;
        final long m_writeTimeout;

        Watchdog(long bytesWritten, long writeTimeout) {
            m_bytesWrittenSinceConstruction = bytesWritten;
            m_writeTimeout = writeTimeout;
        }

        @Override
        public void run() {
            if (m_closed.get()) {
                return;
            }

            long bytesWritten = 0;
            try {
                bytesWritten = m_sender.m_bytesSent.get(m_targetId).get();
                rejoinLog.info(String.format("While sending rejoin data to site %s, %d bytes have been sent in the past %s seconds.",
                        CoreUtils.hsIdToString(m_destHSId),
                        bytesWritten - m_bytesWrittenSinceConstruction,
                        WATCHDOG_PERIOD_S));

                checkTimeout(m_writeTimeout);
                if (m_writeFailed.get() != null) {
                    clearOutstanding(); // idempotent
                }
            } catch (Throwable t) {
                rejoinLog.error("Stream snapshot watchdog thread threw an exception", t);
            } finally {
                // schedule to run again
                VoltDB.instance().scheduleWork(new Watchdog(bytesWritten, m_writeTimeout),
                        WATCHDOG_PERIOD_S, -1, TimeUnit.SECONDS);
            }
        }
    }

    /**
     * Called by the watchdog from the periodic work thread to check if the
     * oldest unacked block is older than the timeout interval.
     */
    private synchronized void checkTimeout(final long timeoutMs) {
        final Entry<Integer, SendWork> oldest = m_outstandingWork.firstEntry();
        if (oldest != null) {
            final long now = System.currentTimeMillis();
            SendWork work = oldest.getValue();
            if ((now - work.m_ts) > timeoutMs) {
                StreamSnapshotTimeoutException exception =
                        new StreamSnapshotTimeoutException(String.format(
                                "A snapshot write task failed after a timeout (currently %d seconds outstanding). " +
                                "Node rejoin may need to be retried",
                                (now - work.m_ts) / 1000));
                rejoinLog.error(exception.getMessage());
                m_writeFailed.compareAndSet(null, exception);
            }
        }
    }
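    /*
     * The Watchdog above reschedules itself from its own finally block rather
     * than running on a fixed-rate timer, so a slow or wedged run can never
     * stack up overlapping executions. A minimal sketch of the same pattern
     * outside this class (the scheduler name is illustrative):
     *
     *   class Periodic implements Runnable {
     *       public void run() {
     *           try {
     *               doCheck();
     *           } finally {
     *               scheduler.schedule(new Periodic(), 5, TimeUnit.SECONDS);
     *           }
     *       }
     *   }
     */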
    /**
     * Idempotent, synchronized method to perform all cleanup of outstanding
     * work so buffers aren't leaked.
     */
    synchronized void clearOutstanding() {
        if (m_outstandingWork.isEmpty() && (m_outstandingWorkCount.get() == 0)) {
            return;
        }

        rejoinLog.trace("Clearing outstanding work.");

        for (Entry<Integer, SendWork> e : m_outstandingWork.entrySet()) {
            e.getValue().discard();
        }

        m_outstandingWork.clear();
        m_outstandingWorkCount.set(0);
    }

    /**
     * Synchronized method to handle the arrival of an Ack.
     * @param blockIndex The index of the block that is being acked.
     */
    @Override
    public synchronized void receiveAck(int blockIndex) {
        rejoinLog.trace("Received block ack for index " + String.valueOf(blockIndex));

        SendWork work = m_outstandingWork.remove(blockIndex);
        // A late ack can arrive after clearOutstanding() has already discarded
        // this work on failure; don't NPE or drive the count negative.
        if (work == null) {
            return;
        }
        m_outstandingWorkCount.decrementAndGet();

        // releases the BBContainers and cleans up
        work.discard();
    }

    /**
     * Thread that runs send work (sending snapshot blocks). One per node
     */
    public static class SnapshotSender implements Runnable {
        private final Mailbox m_mb;
        private final MessageFactory m_msgFactory;
        private final LinkedBlockingQueue<SendWork> m_workQueue;
        private final AtomicInteger m_expectedEOFs;

        final Map<Long, AtomicLong> m_bytesSent;
        final Map<Long, AtomicLong> m_worksSent;
        volatile Exception m_lastException = null;

        public SnapshotSender(Mailbox mb) {
            this(mb, new DefaultMessageFactory());
        }

        public SnapshotSender(Mailbox mb, MessageFactory msgFactory) {
            Preconditions.checkArgument(mb != null);
            m_mb = mb;
            m_msgFactory = msgFactory;
            m_workQueue = new LinkedBlockingQueue<SendWork>();
            m_expectedEOFs = new AtomicInteger();
            m_bytesSent = Collections.synchronizedMap(new HashMap<Long, AtomicLong>());
            m_worksSent = Collections.synchronizedMap(new HashMap<Long, AtomicLong>());
        }

        public void registerDataTarget(long targetId) {
            m_expectedEOFs.incrementAndGet();
            m_bytesSent.put(targetId, new AtomicLong());
            m_worksSent.put(targetId, new AtomicLong());
        }

        public void offer(SendWork work) {
            m_workQueue.offer(work);
        }
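        /*
         * A minimal sketch of the sender's lifecycle as driven by its data
         * targets (the names are illustrative): each target registers itself,
         * and each target's close() enqueues one empty SendWork; run() below
         * only exits once it has drained that many end-of-queue markers.
         *
         *   SnapshotSender sender = new SnapshotSender(mailbox);
         *   new Thread(sender, "snapshot-sender").start();
         *   sender.registerDataTarget(targetId); // one per data target
         *   sender.offer(work);                  // data blocks
         *   sender.offer(new SendWork());        // end-of-queue marker, one per target
         */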
        @Override
        public void run() {
            rejoinLog.trace("Starting stream sender thread");

            while (true) {
                SendWork work;

                try {
                    rejoinLog.trace("Blocking on sending work queue");
                    work = m_workQueue.poll(10, TimeUnit.MINUTES);

                    if (work == null) {
                        rejoinLog.warn("No stream snapshot send work was produced in the past 10 minutes");
                        break;
                    } else if (work.m_isEmpty) {
                        // Empty work indicates the end of the queue.
                        // The sender is shared by multiple data targets, each of them will
                        // send an end-of-queue work, must wait until all end-of-queue works
                        // are received before terminating the thread.
                        if (m_expectedEOFs.decrementAndGet() == 0) {
                            break;
                        } else {
                            continue;
                        }
                    }

                    m_bytesSent.get(work.m_targetId).addAndGet(work.doWork(m_mb, m_msgFactory));
                    m_worksSent.get(work.m_targetId).incrementAndGet();
                } catch (Exception e) {
                    m_lastException = e;
                    rejoinLog.error("Error sending a recovery stream message", e);
                }
            }

            CompressionService.releaseThreadLocal();
            rejoinLog.trace("Stream sender thread exiting");
        }
    }

    @Override
    public int getHeaderSize() {
        return contentOffset;
    }

    @Override
    public ListenableFuture<?> write(Callable<BBContainer> tupleData, int tableId) {
        rejoinLog.trace("Starting write");

        try {
            BBContainer chunkC;
            ByteBuffer chunk;
            try {
                chunkC = tupleData.call();
                chunk = chunkC.b();
            } catch (Exception e) {
                return Futures.immediateFailedFuture(e);
            }

            // cleanup and exit immediately if in failure mode
            // or on null input
            if (m_writeFailed.get() != null || (chunkC == null)) {
                if (chunkC != null) {
                    chunkC.discard();
                }

                if (m_failureReported) {
                    return null;
                } else {
                    m_failureReported = true;
                    return Futures.immediateFailedFuture(m_writeFailed.get());
                }
            }

            // cleanup and exit immediately if in failure mode
            // but here, throw an exception because this isn't supposed to happen
            if (m_closed.get()) {
                chunkC.discard();

                IOException e = new IOException("Trying to write snapshot data " +
                        "after the stream is closed");
                m_writeFailed.set(e);
                return Futures.immediateFailedFuture(e);
            }

            // Have we seen this table before, if not, send schema
            if (m_schemas.containsKey(tableId)) {
                // remove the schema once sent
                byte[] schema = m_schemas.remove(tableId);
                rejoinLog.debug("Sending schema for table " + tableId);

                rejoinLog.trace("Writing schema as part of this write");
                send(StreamSnapshotMessageType.SCHEMA, tableId, schema);
            }

            chunk.put((byte) StreamSnapshotMessageType.DATA.ordinal());
            chunk.putInt(m_blockIndex); // put chunk index
            chunk.putInt(tableId); // put table ID

            chunk.position(0);
            return send(m_blockIndex++, chunkC);
        } finally {
            rejoinLog.trace("Finished call to write");
        }
    }

    private ListenableFuture<Boolean> send(StreamSnapshotMessageType type, int tableId, byte[] content) {
        // 1 byte for the type, 4 bytes for the block index, 4 bytes for table Id
        ByteBuffer buf = ByteBuffer.allocate(1 + 4 + 4 + content.length);
        buf.put((byte) type.ordinal());
        buf.putInt(m_blockIndex);
        buf.putInt(tableId);
        buf.put(content);
        buf.flip();
        return send(m_blockIndex++, DBBPool.wrapBB(buf));
    }
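    /*
     * Every block leaving this target starts with the same header: one byte of
     * StreamSnapshotMessageType ordinal, a 4-byte block index, and a 4-byte
     * table id (see write() and send() above). A hypothetical receiver-side
     * sketch of parsing that header:
     *
     *   ByteBuffer block = ...;          // decompressed payload
     *   byte type = block.get();         // StreamSnapshotMessageType ordinal
     *   int blockIndex = block.getInt(); // used to ack this block
     *   int tableId = block.getInt();    // table the payload belongs to
     *   // remaining bytes: schema or tuple data, depending on type
     */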
    /**
     * Send data to the rejoining node, tracking what was sent for ack tracking.
     * Synchronized to protect access to m_outstandingWork and to keep
     * m_outstandingWorkCount in sync with m_outstandingWork.
     *
     * @param blockIndex Index useful for ack tracking and debugging
     * @param chunk Snapshot data to send
     * @return a listenable future for the caller to wait until the buffer is sent
     */
    synchronized ListenableFuture<Boolean> send(int blockIndex, BBContainer chunk) {
        SettableFuture<Boolean> sendFuture = SettableFuture.create();
        SendWork sendWork = new SendWork(m_targetId, m_destHSId, chunk, sendFuture);
        m_outstandingWork.put(blockIndex, sendWork);
        m_outstandingWorkCount.incrementAndGet();
        m_sender.offer(sendWork);
        return sendFuture;
    }

    @Override
    public void reportSerializationFailure(IOException ex) {
        m_reportedSerializationFailure = ex;
    }

    @Override
    public boolean needsFinalClose() {
        // Streamed snapshot targets always need to be closed by the last site
        return true;
    }

    @Override
    public void close() throws IOException, InterruptedException {
        /*
         * could be called multiple times, because all tables share one stream
         * target
         */
        if (!m_closed.get()) {
            rejoinLog.trace("Closing stream snapshot target");

            // block until all acks have arrived
            waitForOutstandingWork();

            // Send the EOS message after clearing outstanding work so that if there's a failure,
            // we'll send the correct EOS to the receiving end
            sendEOS();

            // Terminate the sender thread after the last block
            m_sender.offer(new SendWork());

            // locked so m_closed is true when the ack thread dies
            synchronized(this) {
                m_closed.set(true);
                assert(m_outstandingWork.size() == 0);
            }

            rejoinLog.trace("Closed stream snapshot target");
        }

        Runnable closeHandle = m_onCloseHandler.get();
        if (closeHandle != null) {
            closeHandle.run();
        }

        if (m_reportedSerializationFailure != null) {
            // There was an error reported by the EE during serialization
            throw m_reportedSerializationFailure;
        }

        // If there was an error during close(), throw it so that the snapshot
        // can be marked as failed.
        if (m_writeFailed.get() != null) {
            throw m_writeFailed.get();
        }
    }

    private void sendEOS() {
        // Send EOF
        ByteBuffer buf = ByteBuffer.allocate(1 + 4); // 1 byte type, 4 bytes index
        if (m_writeFailed.get() != null) {
            // signify failure, at least on this end
            buf.put((byte) StreamSnapshotMessageType.FAILURE.ordinal());
        } else {
            // success - join the cluster
            buf.put((byte) StreamSnapshotMessageType.END.ordinal());
        }
        buf.putInt(m_blockIndex);
        buf.flip();
        send(m_blockIndex++, DBBPool.wrapBB(buf));

        // Wait for the ack of the EOS message
        waitForOutstandingWork();
    }

    private void waitForOutstandingWork() {
        while (m_writeFailed.get() == null && (m_outstandingWorkCount.get() > 0)) {
            Thread.yield();
        }

        // if here because a write failed, cleanup outstanding work
        clearOutstanding();
    }

    @Override
    public long getBytesWritten() {
        return m_sender.m_bytesSent.get(m_targetId).get();
    }

    public long getWorksWritten() {
        return m_sender.m_worksSent.get(m_targetId).get();
    }

    @Override
    public void setOnCloseHandler(Runnable onClose) {
        m_onCloseHandler.set(onClose);
    }

    @Override
    public synchronized Throwable getLastWriteException() {
        Exception exception = m_sender.m_lastException;
        if (exception != null) {
            return exception;
        }

        exception = m_ackReceiver.m_lastException;
        if (exception != null) {
            return exception;
        }

        return m_writeFailed.get();
    }

    @Override
    public SnapshotFormat getFormat() {
        return SnapshotFormat.STREAM;
    }
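    /*
     * Tuple blocks produced by the EE carry two integers immediately after the
     * stream header: the partition id, then the row count (per the
     * TableOutputStream.cpp:TupleOutputStream::endRows() comment below). A
     * schematic of the layout assumed by getInContainerRowCount():
     *
     *   [ header (getHeaderSize() bytes) ][ int partitionId ][ int rowCount ][ tuple data ... ]
     */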
    /**
     * Get the row count if any, of the content wrapped in the given {@link BBContainer}
     * @param tupleData
     * @return the number of tuple data rows contained within a container
     */
    @Override
    public int getInContainerRowCount(BBContainer tupleData) {
        // according to TableOutputStream.cpp:TupleOutputStream::endRows() the row count is
        // at offset 4 (second integer) past the header
        ByteBuffer bb = tupleData.b().duplicate();
        bb.position(getHeaderSize());
        bb.getInt(); // skip first four bytes (partition id)
        return bb.getInt();
    }
}
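/*
 * A minimal end-to-end sketch of how the pieces in this file fit together
 * (mailbox and ack-receiver construction are elided; the names are
 * illustrative, not a prescribed wiring):
 *
 *   SnapshotSender sender = new SnapshotSender(mailbox);
 *   StreamSnapshotAckReceiver ackReceiver = ...;
 *   new Thread(sender, "snapshot sender").start();
 *
 *   StreamSnapshotDataTarget target = new StreamSnapshotDataTarget(
 *           destHSId, hashinatorConfig, schemas, sender, ackReceiver);
 *   ListenableFuture<?> f = target.write(tupleDataCallable, tableId); // per block
 *   target.close(); // waits for acks, sends END/FAILURE, terminates the sender
 */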