/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.rejoin;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.Callable;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;

import org.voltcore.logging.VoltLogger;
import org.voltcore.messaging.Mailbox;
import org.voltcore.utils.CoreUtils;
import org.voltcore.utils.DBBPool;
import org.voltcore.utils.DBBPool.BBContainer;
import org.voltdb.SnapshotDataTarget;
import org.voltdb.SnapshotFormat;
import org.voltdb.VoltDB;
import org.voltdb.utils.CompressionService;

import com.google_voltpatches.common.base.Preconditions;
import com.google_voltpatches.common.util.concurrent.Futures;
import com.google_voltpatches.common.util.concurrent.ListenableFuture;
import com.google_voltpatches.common.util.concurrent.SettableFuture;

/**
 * A stream snapshot target for sending snapshot data directly to a rejoining
 * partition.
 */
public class StreamSnapshotDataTarget extends StreamSnapshotBase
implements SnapshotDataTarget, StreamSnapshotAckReceiver.AckCallback {
    private static final VoltLogger rejoinLog = new VoltLogger("REJOIN");

    // triggers specific test code for TestMidRejoinDeath
    static boolean m_rejoinDeathTestMode = System.getProperties().containsKey("rejoindeathtest");

    private static AtomicLong m_totalSnapshotTargetCount = new AtomicLong(0);
    final long m_targetId;

    // shortened when in test mode
    public final static long DEFAULT_WRITE_TIMEOUT_MS =
            m_rejoinDeathTestMode ? 10000 : Long.getLong("REJOIN_WRITE_TIMEOUT_MS", 60000);
    final static long WATCHDOG_PERIOD_S = 5;

    // schemas for all the tables on this partition
    private final Map<Integer, byte[]> m_schemas = new HashMap<Integer, byte[]>();
    // HSId of the destination mailbox
    private final long m_destHSId;
    // input and output threads
    private final SnapshotSender m_sender;
    private final StreamSnapshotAckReceiver m_ackReceiver;

    // Skip all subsequent writes if one fails
    private final AtomicReference<IOException> m_writeFailed = new AtomicReference<IOException>();
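    /*
     * Note: DEFAULT_WRITE_TIMEOUT_MS above falls back to 60 seconds but reads
     * the REJOIN_WRITE_TIMEOUT_MS system property first, so the write timeout
     * can be raised without a code change. A hypothetical invocation:
     *
     *   java -DREJOIN_WRITE_TIMEOUT_MS=120000 ... org.voltdb.VoltDB ...
     */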
    // true if the failure is already reported to the SnapshotSiteProcessor, prevent throwing
    // the same exception multiple times.
    private boolean m_failureReported = false;

    private volatile IOException m_reportedSerializationFailure = null;

    // number of sent, but un-acked buffers
    final AtomicInteger m_outstandingWorkCount = new AtomicInteger(0);
    // map of sent, but un-acked buffers, packaged up a bit
    private final TreeMap<Integer, SendWork> m_outstandingWork = new TreeMap<Integer, SendWork>();

    int m_blockIndex = 0;
    private final AtomicReference<Runnable> m_onCloseHandler = new AtomicReference<Runnable>(null);
    private final AtomicBoolean m_closed = new AtomicBoolean(false);

    public StreamSnapshotDataTarget(long HSId, byte[] hashinatorConfig, Map<Integer, byte[]> schemas,
                                    SnapshotSender sender, StreamSnapshotAckReceiver ackReceiver)
    {
        this(HSId, hashinatorConfig, schemas, DEFAULT_WRITE_TIMEOUT_MS, sender, ackReceiver);
    }

    public StreamSnapshotDataTarget(long HSId, byte[] hashinatorConfig, Map<Integer, byte[]> schemas,
                                    long writeTimeout, SnapshotSender sender,
                                    StreamSnapshotAckReceiver ackReceiver)
    {
        super();
        m_targetId = m_totalSnapshotTargetCount.getAndIncrement();
        m_schemas.putAll(schemas);
        m_destHSId = HSId;
        m_sender = sender;
        m_sender.registerDataTarget(m_targetId);
        m_ackReceiver = ackReceiver;
        m_ackReceiver.setCallback(m_targetId, this);

        rejoinLog.debug(String.format("Initializing snapshot stream processor " +
                "for source site id: %s, and with processor id: %d",
                CoreUtils.hsIdToString(HSId), m_targetId));

        // start a periodic task to look for timed out connections
        VoltDB.instance().scheduleWork(new Watchdog(0, writeTimeout), WATCHDOG_PERIOD_S, -1, TimeUnit.SECONDS);

        if (hashinatorConfig != null) {
            // Send the hashinator config as the first block
            send(StreamSnapshotMessageType.HASHINATOR, -1, hashinatorConfig);
        }
    }

    /**
     * Packages up a pending write into a piece of work that can be tracked
     * and can be scheduled.
     */
    public static class SendWork {
        BBContainer m_message;
        final long m_targetId;
        final long m_destHSId;
        final long m_ts;
        final boolean m_isEmpty;

        // A listenable future used to notify a listener when this buffer is discarded
        final SettableFuture<Boolean> m_future;

        /**
         * Creates an empty send work to terminate the sender thread
         */
        SendWork() {
            m_isEmpty = true;
            m_targetId = -1;
            m_destHSId = -1;
            m_ts = -1;
            m_future = null;
        }

        SendWork(long targetId, long destHSId, BBContainer message, SettableFuture<Boolean> future) {
            m_isEmpty = false;
            m_targetId = targetId;
            m_destHSId = destHSId;
            m_message = message;
            m_ts = System.currentTimeMillis();
            m_future = future;
        }

        /**
         * Idempotent method to cancel any pending work and release any
         * BBContainers held.
         */
        public synchronized void discard() {
            // discard the buffers and null them out
            if (m_message != null) {
                m_message.discard();
                m_message = null;
            }
        }
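        /*
         * A minimal sketch of how a caller observes completion through the
         * future handed to the constructor (the names here are illustrative,
         * not part of this class):
         *
         *   SettableFuture<Boolean> sent = SettableFuture.create();
         *   SendWork work = new SendWork(targetId, destHSId, container, sent);
         *   senderQueue.offer(work);
         *   sent.get(); // blocks until doWork() has handed the block to the mailbox
         *
         * Note the future signals "sent", not "acked"; the container itself is
         * only discarded once the matching ack arrives (see receiveAck below).
         */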
        /**
         * Compress the data in the BBContainer provided, then package it up in
         * a RejoinDataMessage instance, and finally hand it off to the
         * messaging subsystem.
         */
        protected int send(Mailbox mb, MessageFactory msgFactory, BBContainer message) throws IOException {
            final ByteBuffer messageBuffer = message.b();
            if (messageBuffer.isDirect()) {
                byte[] data = CompressionService.compressBuffer(messageBuffer);
                mb.send(m_destHSId, msgFactory.makeDataMessage(m_targetId, data));
                if (rejoinLog.isTraceEnabled()) {
                    rejoinLog.trace("Sending direct buffer");
                }
                return data.length;
            } else {
                byte compressedBytes[] = CompressionService.compressBytes(
                        messageBuffer.array(), messageBuffer.position(),
                        messageBuffer.remaining());
                mb.send(m_destHSId, msgFactory.makeDataMessage(m_targetId, compressedBytes));
                if (rejoinLog.isTraceEnabled()) {
                    rejoinLog.trace("Sending heap buffer");
                }
                return compressedBytes.length;
            }
        }

        public synchronized int doWork(Mailbox mb, MessageFactory msgFactory) throws Exception {
            // this work has already been discarded
            if (m_message == null) {
                return 0;
            }

            try {
                return send(mb, msgFactory, m_message);
            } finally {
                // Buffers are only discarded after they are acked. Discarding them here would cause the sender to
                // generate too much work for the receiver.
                m_future.set(true);
            }
        }
    }

    public static class StreamSnapshotTimeoutException extends IOException {
        public StreamSnapshotTimeoutException(String message) {
            super(message);
        }
    }

    public static class SnapshotSerializationException extends IOException {
        public SnapshotSerializationException(String message) {
            super(message);
        }
    }

    /**
     * Task run every so often to look for writes that haven't been acked
     * in writeTimeout time.
     */
    class Watchdog implements Runnable {
        final long m_bytesWrittenSinceConstruction;
        final long m_writeTimeout;

        Watchdog(long bytesWritten, long writeTimeout) {
            m_bytesWrittenSinceConstruction = bytesWritten;
            m_writeTimeout = writeTimeout;
        }

        @Override
        public void run() {
            if (m_closed.get()) {
                return;
            }

            long bytesWritten = 0;
            try {
                bytesWritten = m_sender.m_bytesSent.get(m_targetId).get();
                rejoinLog.info(String.format("While sending rejoin data to site %s, %d bytes have been sent in the past %s seconds.",
                        CoreUtils.hsIdToString(m_destHSId),
                        bytesWritten - m_bytesWrittenSinceConstruction,
                        WATCHDOG_PERIOD_S));

                checkTimeout(m_writeTimeout);
                if (m_writeFailed.get() != null) {
                    clearOutstanding(); // idempotent
                }
            } catch (Throwable t) {
                rejoinLog.error("Stream snapshot watchdog thread threw an exception", t);
            } finally {
                // schedule to run again
                VoltDB.instance().scheduleWork(new Watchdog(bytesWritten, m_writeTimeout),
                        WATCHDOG_PERIOD_S, -1, TimeUnit.SECONDS);
            }
        }
    }

    /**
     * Called by the watchdog from the periodic work thread to check if the
     * oldest unacked block is older than the timeout interval.
     */
    private synchronized void checkTimeout(final long timeoutMs) {
        final Entry<Integer, SendWork> oldest = m_outstandingWork.firstEntry();
        if (oldest != null) {
            final long now = System.currentTimeMillis();
            SendWork work = oldest.getValue();
            if ((now - work.m_ts) > timeoutMs) {
                StreamSnapshotTimeoutException exception =
                        new StreamSnapshotTimeoutException(String.format(
                                "A snapshot write task failed after a timeout (currently %d seconds outstanding). " +
                                "Node rejoin may need to be retried",
                                (now - work.m_ts) / 1000));
                rejoinLog.error(exception.getMessage());
                m_writeFailed.compareAndSet(null, exception);
            }
        }
    }
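    /*
     * The Watchdog above reschedules itself from its own finally block rather
     * than running on a fixed-rate timer, so a slow or wedged run can never
     * stack up overlapping executions. A minimal sketch of the same pattern
     * outside this class (the scheduler name is illustrative):
     *
     *   class Periodic implements Runnable {
     *       public void run() {
     *           try {
     *               doCheck();
     *           } finally {
     *               scheduler.schedule(new Periodic(), 5, TimeUnit.SECONDS);
     *           }
     *       }
     *   }
     */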
    /**
     * Idempotent, synchronized method to perform all cleanup of outstanding
     * work so buffers aren't leaked.
     */
    synchronized void clearOutstanding() {
        if (m_outstandingWork.isEmpty() && (m_outstandingWorkCount.get() == 0)) {
            return;
        }

        rejoinLog.trace("Clearing outstanding work.");

        for (Entry<Integer, SendWork> e : m_outstandingWork.entrySet()) {
            e.getValue().discard();
        }

        m_outstandingWork.clear();
        m_outstandingWorkCount.set(0);
    }

    /**
     * Synchronized method to handle the arrival of an Ack.
     * @param blockIndex The index of the block that is being acked.
     */
    @Override
    public synchronized void receiveAck(int blockIndex) {
        rejoinLog.trace("Received block ack for index " + String.valueOf(blockIndex));

        SendWork work = m_outstandingWork.remove(blockIndex);
        // A late ack can arrive after clearOutstanding() has already discarded
        // this work on failure; don't NPE or drive the count negative.
        if (work == null) {
            return;
        }
        m_outstandingWorkCount.decrementAndGet();

        // releases the BBContainers and cleans up
        work.discard();
    }

    /**
     * Thread that runs send work (sending snapshot blocks). One per node
     */
    public static class SnapshotSender implements Runnable {
        private final Mailbox m_mb;
        private final MessageFactory m_msgFactory;
        private final LinkedBlockingQueue<SendWork> m_workQueue;
        private final AtomicInteger m_expectedEOFs;

        final Map<Long, AtomicLong> m_bytesSent;
        final Map<Long, AtomicLong> m_worksSent;
        volatile Exception m_lastException = null;

        public SnapshotSender(Mailbox mb) {
            this(mb, new DefaultMessageFactory());
        }

        public SnapshotSender(Mailbox mb, MessageFactory msgFactory) {
            Preconditions.checkArgument(mb != null);
            m_mb = mb;
            m_msgFactory = msgFactory;
            m_workQueue = new LinkedBlockingQueue<SendWork>();
            m_expectedEOFs = new AtomicInteger();
            m_bytesSent = Collections.synchronizedMap(new HashMap<Long, AtomicLong>());
            m_worksSent = Collections.synchronizedMap(new HashMap<Long, AtomicLong>());
        }

        public void registerDataTarget(long targetId) {
            m_expectedEOFs.incrementAndGet();
            m_bytesSent.put(targetId, new AtomicLong());
            m_worksSent.put(targetId, new AtomicLong());
        }

        public void offer(SendWork work) {
            m_workQueue.offer(work);
        }
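        /*
         * A minimal sketch of the sender's lifecycle as driven by its data
         * targets (the names are illustrative): each target registers itself,
         * and each target's close() enqueues one empty SendWork; run() below
         * only exits once it has drained that many end-of-queue markers.
         *
         *   SnapshotSender sender = new SnapshotSender(mailbox);
         *   new Thread(sender, "snapshot-sender").start();
         *   sender.registerDataTarget(targetId); // one per data target
         *   sender.offer(work);                  // data blocks
         *   sender.offer(new SendWork());        // end-of-queue marker, one per target
         */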
        @Override
        public void run() {
            rejoinLog.trace("Starting stream sender thread");

            while (true) {
                SendWork work;

                try {
                    rejoinLog.trace("Blocking on sending work queue");
                    work = m_workQueue.poll(10, TimeUnit.MINUTES);

                    if (work == null) {
                        rejoinLog.warn("No stream snapshot send work was produced in the past 10 minutes");
                        break;
                    } else if (work.m_isEmpty) {
                        // Empty work indicates the end of the queue.
                        // The sender is shared by multiple data targets, each of them will
                        // send an end-of-queue work, must wait until all end-of-queue works
                        // are received before terminating the thread.
                        if (m_expectedEOFs.decrementAndGet() == 0) {
                            break;
                        } else {
                            continue;
                        }
                    }

                    m_bytesSent.get(work.m_targetId).addAndGet(work.doWork(m_mb, m_msgFactory));
                    m_worksSent.get(work.m_targetId).incrementAndGet();
                } catch (Exception e) {
                    m_lastException = e;
                    rejoinLog.error("Error sending a recovery stream message", e);
                }
            }

            CompressionService.releaseThreadLocal();
            rejoinLog.trace("Stream sender thread exiting");
        }
    }

    @Override
    public int getHeaderSize() {
        return contentOffset;
    }

    @Override
    public ListenableFuture<?> write(Callable<BBContainer> tupleData, int tableId) {
        rejoinLog.trace("Starting write");

        try {
            BBContainer chunkC;
            ByteBuffer chunk;
            try {
                chunkC = tupleData.call();
                chunk = chunkC.b();
            } catch (Exception e) {
                return Futures.immediateFailedFuture(e);
            }

            // cleanup and exit immediately if in failure mode
            // or on null input
            if (m_writeFailed.get() != null || (chunkC == null)) {
                if (chunkC != null) {
                    chunkC.discard();
                }

                if (m_failureReported) {
                    return null;
                } else {
                    m_failureReported = true;
                    return Futures.immediateFailedFuture(m_writeFailed.get());
                }
            }

            // cleanup and exit immediately if in failure mode
            // but here, throw an exception because this isn't supposed to happen
            if (m_closed.get()) {
                chunkC.discard();

                IOException e = new IOException("Trying to write snapshot data " +
                        "after the stream is closed");
                m_writeFailed.set(e);
                return Futures.immediateFailedFuture(e);
            }

            // Have we seen this table before, if not, send schema
            if (m_schemas.containsKey(tableId)) {
                // remove the schema once sent
                byte[] schema = m_schemas.remove(tableId);
                rejoinLog.debug("Sending schema for table " + tableId);

                rejoinLog.trace("Writing schema as part of this write");
                send(StreamSnapshotMessageType.SCHEMA, tableId, schema);
            }

            chunk.put((byte) StreamSnapshotMessageType.DATA.ordinal());
            chunk.putInt(m_blockIndex); // put chunk index
            chunk.putInt(tableId); // put table ID

            chunk.position(0);
            return send(m_blockIndex++, chunkC);
        } finally {
            rejoinLog.trace("Finished call to write");
        }
    }

    private ListenableFuture<Boolean> send(StreamSnapshotMessageType type, int tableId, byte[] content) {
        // 1 byte for the type, 4 bytes for the block index, 4 bytes for table Id
        ByteBuffer buf = ByteBuffer.allocate(1 + 4 + 4 + content.length);
        buf.put((byte) type.ordinal());
        buf.putInt(m_blockIndex);
        buf.putInt(tableId);
        buf.put(content);
        buf.flip();
        return send(m_blockIndex++, DBBPool.wrapBB(buf));
    }
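    /*
     * Every block leaving this target starts with the same header: one byte of
     * StreamSnapshotMessageType ordinal, a 4-byte block index, and a 4-byte
     * table id (see write() and send() above). A hypothetical receiver-side
     * sketch of parsing that header:
     *
     *   ByteBuffer block = ...;          // decompressed payload
     *   byte type = block.get();         // StreamSnapshotMessageType ordinal
     *   int blockIndex = block.getInt(); // used to ack this block
     *   int tableId = block.getInt();    // table the payload belongs to
     *   // remaining bytes: schema or tuple data, depending on type
     */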
    /**
     * Send data to the rejoining node, tracking what was sent for ack tracking.
     * Synchronized to protect access to m_outstandingWork and to keep
     * m_outstandingWorkCount in sync with m_outstandingWork.
     *
     * @param blockIndex Index useful for ack tracking and debugging
     * @param chunk Snapshot data to send
     * @return a listenable future for the caller to wait until the buffer is sent
     */
    synchronized ListenableFuture<Boolean> send(int blockIndex, BBContainer chunk) {
        SettableFuture<Boolean> sendFuture = SettableFuture.create();
        SendWork sendWork = new SendWork(m_targetId, m_destHSId, chunk, sendFuture);
        m_outstandingWork.put(blockIndex, sendWork);
        m_outstandingWorkCount.incrementAndGet();
        m_sender.offer(sendWork);
        return sendFuture;
    }

    @Override
    public void reportSerializationFailure(IOException ex) {
        m_reportedSerializationFailure = ex;
    }

    @Override
    public boolean needsFinalClose() {
        // Streamed snapshot targets always need to be closed by the last site
        return true;
    }

    @Override
    public void close() throws IOException, InterruptedException {
        /*
         * could be called multiple times, because all tables share one stream
         * target
         */
        if (!m_closed.get()) {
            rejoinLog.trace("Closing stream snapshot target");

            // block until all acks have arrived
            waitForOutstandingWork();

            // Send the EOS message after clearing outstanding work so that if there's a failure,
            // we'll send the correct EOS to the receiving end
            sendEOS();

            // Terminate the sender thread after the last block
            m_sender.offer(new SendWork());

            // locked so m_closed is true when the ack thread dies
            synchronized(this) {
                m_closed.set(true);
                assert(m_outstandingWork.size() == 0);
            }

            rejoinLog.trace("Closed stream snapshot target");
        }

        Runnable closeHandle = m_onCloseHandler.get();
        if (closeHandle != null) {
            closeHandle.run();
        }

        if (m_reportedSerializationFailure != null) {
            // There was an error reported by the EE during serialization
            throw m_reportedSerializationFailure;
        }

        // If there was an error during close(), throw it so that the snapshot
        // can be marked as failed.
        if (m_writeFailed.get() != null) {
            throw m_writeFailed.get();
        }
    }

    private void sendEOS() {
        // Send EOF
        ByteBuffer buf = ByteBuffer.allocate(1 + 4); // 1 byte type, 4 bytes index
        if (m_writeFailed.get() != null) {
            // signify failure, at least on this end
            buf.put((byte) StreamSnapshotMessageType.FAILURE.ordinal());
        } else {
            // success - join the cluster
            buf.put((byte) StreamSnapshotMessageType.END.ordinal());
        }
        buf.putInt(m_blockIndex);
        buf.flip();
        send(m_blockIndex++, DBBPool.wrapBB(buf));

        // Wait for the ack of the EOS message
        waitForOutstandingWork();
    }

    private void waitForOutstandingWork() {
        while (m_writeFailed.get() == null && (m_outstandingWorkCount.get() > 0)) {
            Thread.yield();
        }

        // if here because a write failed, cleanup outstanding work
        clearOutstanding();
    }

    @Override
    public long getBytesWritten() {
        return m_sender.m_bytesSent.get(m_targetId).get();
    }

    public long getWorksWritten() {
        return m_sender.m_worksSent.get(m_targetId).get();
    }

    @Override
    public void setOnCloseHandler(Runnable onClose) {
        m_onCloseHandler.set(onClose);
    }

    @Override
    public synchronized Throwable getLastWriteException() {
        Exception exception = m_sender.m_lastException;
        if (exception != null) {
            return exception;
        }

        exception = m_ackReceiver.m_lastException;
        if (exception != null) {
            return exception;
        }

        return m_writeFailed.get();
    }

    @Override
    public SnapshotFormat getFormat() {
        return SnapshotFormat.STREAM;
    }
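    /*
     * Tuple blocks produced by the EE carry two integers immediately after the
     * stream header: the partition id, then the row count (per the
     * TableOutputStream.cpp:TupleOutputStream::endRows() comment below). A
     * schematic of the layout assumed by getInContainerRowCount():
     *
     *   [ header (getHeaderSize() bytes) ][ int partitionId ][ int rowCount ][ tuple data ... ]
     */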
    /**
     * Get the row count if any, of the content wrapped in the given {@link BBContainer}
     * @param tupleData
     * @return the number of tuple data rows contained within a container
     */
    @Override
    public int getInContainerRowCount(BBContainer tupleData) {
        // according to TableOutputStream.cpp:TupleOutputStream::endRows() the row count is
        // at offset 4 (second integer) past the header
        ByteBuffer bb = tupleData.b().duplicate();
        bb.position(getHeaderSize());
        bb.getInt(); // skip first four bytes (partition id)
        return bb.getInt();
    }
}
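/*
 * A minimal end-to-end sketch of how the pieces in this file fit together
 * (mailbox and ack-receiver construction are elided; the names are
 * illustrative, not a prescribed wiring):
 *
 *   SnapshotSender sender = new SnapshotSender(mailbox);
 *   StreamSnapshotAckReceiver ackReceiver = ...;
 *   new Thread(sender, "snapshot sender").start();
 *
 *   StreamSnapshotDataTarget target = new StreamSnapshotDataTarget(
 *           destHSId, hashinatorConfig, schemas, sender, ackReceiver);
 *   ListenableFuture<?> f = target.write(tupleDataCallable, tableId); // per block
 *   target.close(); // waits for acks, sends END/FAILURE, terminates the sender
 */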