/* This file is part of VoltDB.
 * Copyright (C) 2008-2010 VoltDB Inc.
 *
 * VoltDB is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * VoltDB is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.network;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.GatheringByteChannel;
import java.nio.channels.SelectionKey;
import java.util.ArrayDeque;

import org.apache.log4j.Logger;

import org.voltdb.messaging.FastSerializable;
import org.voltdb.messaging.FastSerializer;
import org.voltdb.utils.DBBPool;
import org.voltdb.utils.DBBPool.BBContainer;
import org.voltdb.utils.DeferredSerialization;
import org.voltdb.utils.EstTime;
import org.voltdb.utils.VoltLoggerFactory;

/**
 * Provides a queue for ByteBuffers and DeferredSerializations and drains them to a gathering ByteChannel.
 * Uses a thread-local memory pool for serializing messages that are smaller than MAX_GATHERING_WRITE and
 * HeapByteBuffers otherwise. Jumps through serious hoops to avoid ever writing large HeapByteBuffers to the
 * channel, because Java will allocate a DirectByteBuffer and copy ALL the data into it even if only a small
 * fraction can reasonably be written to the channel. That wastes time copying data that can never possibly
 * make it into the channel in non-blocking mode, and wastes space because the DirectByteBuffer is never
 * released, unlike the pool, which shrinks after a long period without usage.
 *
 * The value m_port.m_expectedOutgoingMessageSize is used to set the initial storage a FastSerializer will
 * allocate when doing deferred serialization of FastSerializables. FastSerializable + enqueue is the best
 * way to serialize data unless you can't pick a good value for m_port.m_expectedOutgoingMessageSize.
 * In most cases you are optimizing for the bulk of your messages, and it is fine to guess a little high
 * because the memory allocation works well.
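 *
 * Illustrative sketch of the deferred-serialization path described above (not part of this class; the
 * writeStream reference and the 8-byte payload are assumed for the example, everything else comes from
 * this file):
 * <pre>{@code
 * writeStream.enqueue(new DeferredSerialization() {
 *     public BBContainer serialize(DBBPool pool) {
 *         // Single allocation of exactly the right size, performed on the network thread
 *         BBContainer c = pool.acquire(8);
 *         c.b.putLong(42L);
 *         c.b.flip();
 *         return c;
 *     }
 *
 *     public void cancel() {}
 * });
 * }</pre>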
 */
public class NIOWriteStream implements WriteStream {
    private static final Logger LOG = Logger.getLogger(NIOWriteStream.class);

    /**
     * Reference to the port for changing interest ops
     */
    private final VoltPort m_port;

    private static final Logger networkLog = Logger.getLogger("NETWORK", VoltLoggerFactory.instance());

    private boolean m_isShutdown = false;

    /**
     * Contains serialized buffers ready to write to the socket
     */
    private final ArrayDeque<BBContainer> m_queuedBuffers = new ArrayDeque<BBContainer>();

    /**
     * Contains messages waiting to be serialized and written to the socket
     */
    private final ArrayDeque<DeferredSerialization> m_queuedWrites1 = new ArrayDeque<DeferredSerialization>();
    private final ArrayDeque<DeferredSerialization> m_queuedWrites2 = new ArrayDeque<DeferredSerialization>();
    private ArrayDeque<DeferredSerialization> m_queuedWrites = m_queuedWrites1;
    private final int m_maxQueuedWritesBeforeBackpressure = 100;

    private final Runnable m_offBackPressureCallback;
    private final Runnable m_onBackPressureCallback;
    private final QueueMonitor m_monitor;

    private long m_bytesWritten = 0;
    private long m_messagesWritten = 0;

    /*
     * Used to provide incremental reads of the amount of
     * data written
     */
    private long m_lastBytesWritten = 0;
    private long m_lastMessagesWritten = 0;

    synchronized long[] getBytesAndMessagesWritten(boolean interval) {
        if (interval) {
            final long bytesWrittenThisTime = m_bytesWritten - m_lastBytesWritten;
            m_lastBytesWritten = m_bytesWritten;

            final long messagesWrittenThisTime = m_messagesWritten - m_lastMessagesWritten;
            m_lastMessagesWritten = m_messagesWritten;
            return new long[] { bytesWrittenThisTime, messagesWrittenThisTime };
        } else {
            return new long[] { m_bytesWritten, m_messagesWritten };
        }
    }

    /**
     * Set to -1 when there are no pending writes. If there is a pending write it is set to the time
     * of the last successful write or the time the oldest pending write was queued.
     */
    private long m_lastPendingWriteTime = -1;

    NIOWriteStream(VoltPort port) {
        this(port, null, null, null);
    }

    NIOWriteStream(
            VoltPort port,
            Runnable offBackPressureCallback,
            Runnable onBackPressureCallback,
            QueueMonitor monitor) {
        m_port = port;
        m_offBackPressureCallback = offBackPressureCallback;
        m_onBackPressureCallback = onBackPressureCallback;
        m_monitor = monitor;
    }

    synchronized public boolean isEmpty() {
        return m_queuedBuffers.isEmpty() && m_queuedWrites.isEmpty();
    }

    /**
     * Returns true when a drainTo invocation was unable to completely drain all queued bytes
     * or when more than m_maxQueuedWritesBeforeBackpressure (100) writes are queued.
     */
    @Override
    public boolean hadBackPressure() {
        return m_hadBackPressure;
    }

    @Override
    public void setBackPressure(boolean enable) {
        if (enable) {
            this.backpressureStarted();
        } else {
            this.backpressureEnded();
        }
    }

    /**
     * Called when not all queued data could be flushed to the channel
     */
    public final void backpressureStarted() {
        if (networkLog.isTraceEnabled()) {
            networkLog.trace("Backpressure started for client " + m_port);
        }
        if (m_hadBackPressure == false) {
            m_hadBackPressure = true;
            if (m_onBackPressureCallback != null) {
                m_onBackPressureCallback.run();
            }
        }
    }

    /**
     * Called when all queued data is flushed to the channel
     */
    public final void backpressureEnded() {
        if (networkLog.isTraceEnabled()) {
            networkLog.trace("Backpressure ended for client " + m_port);
        }
        if (m_hadBackPressure == true) {
            m_hadBackPressure = false;
            if (m_offBackPressureCallback != null) {
                m_offBackPressureCallback.run();
            }
        }
    }
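
    /*
     * Illustrative sketch (not part of this class) of how an owner might wire the backpressure
     * callbacks through the constructor above. The port object and the callback bodies
     * (resumeAcceptingWork/stopAcceptingWork) are hypothetical; only the constructor signature
     * is taken from this file.
     *
     *   NIOWriteStream stream = new NIOWriteStream(
     *           port,
     *           new Runnable() { public void run() { resumeAcceptingWork(); } },  // off-backpressure
     *           new Runnable() { public void run() { stopAcceptingWork(); } },    // on-backpressure
     *           null);                                                            // no queue monitor
     */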

    /**
     * Boolean used to store the latest back pressure state
     * Does this need to be volatile?
     */
    private volatile boolean m_hadBackPressure = false;

    /**
     * The maximum amount that the stream will attempt to write to the channel under any circumstances
     */
    static final int MAX_GATHERING_WRITE = 262144;

    /**
     * Does the work of queueing additional buffers that have been serialized
     * and choosing between gathering and regular writes to the channel. Also splits up very large
     * writes of HeapByteBuffers into many smaller writes so Java doesn't allocate a monster DirectByteBuffer
     * that will never be freed.
     * @param channel
     * @param additional
     * @return
     * @throws IOException
     */
    int drainTo(final GatheringByteChannel channel, final BBContainer additional[]) throws IOException {
        /*
         * Add the containers that were serialized when swapAndSerializeQueuedWrites was
         * invoked to the end of the queue so they will eventually be written
         */
        if (additional != null) {
            for (final BBContainer c : additional) {
                m_queuedBuffers.offer(c);
            }
        }

        int bytesWritten = 0;
        long rc = 0;
        do {
            //For a gathering write (many small writes) store the list of buffers here
            ByteBuffer buffers[] = null;
            //For regular writes store the buffer to write here
            ByteBuffer buffer = null;

            /*
             * Nothing to write
             */
            if (m_queuedBuffers.isEmpty()) {
                if (m_hadBackPressure && m_queuedWrites.size() <= m_maxQueuedWritesBeforeBackpressure) {
                    backpressureEnded();
                }
                m_lastPendingWriteTime = -1;
                updateQueued(-bytesWritten, false);
                m_bytesWritten += bytesWritten;
                return bytesWritten;
            }

            /*
             * Peek the first buffer and inspect it to see if it needs any special handling.
             * If it is too large to use as part of a gathering write then branch and just focus on
             * writing the one
             */
            int queuedForWrite = 0;
            final BBContainer peekedBuffer = m_queuedBuffers.peek();
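
            /*
             * Worked example of the splitting below, with assumed sizes: a 600KB heap buffer and
             * a MAX_GATHERING_WRITE of 256KB produce three slices covering the first 88KB, the
             * middle 256KB, and the final 256KB of the message. The slices are pushed so the front
             * of the message is written first, and the drained source buffer sits behind them so it
             * is only discarded after every slice has been written.
             */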
            if (peekedBuffer.b.remaining() > MAX_GATHERING_WRITE) {
                /*
                 * If the buffer is not direct and it is this large it should be split up into separate writes
                 */
                if (!peekedBuffer.b.isDirect()) {
                    /**
                     * Split a big heap byte buffer into many smaller slices
                     * so Java doesn't allocate a bunch of direct ByteBuffers
                     */
                    //The source buffer for the slices that holds the memory goes in first
                    //so it will be discarded once all the slices have been read.
                    m_queuedBuffers.poll();//remove the peeked source buffer before re-adding it behind the slices
                    m_queuedBuffers.push(peekedBuffer);
                    final int originalPosition = peekedBuffer.b.position();
                    do {
                        final int amountToSplit = Math.min(peekedBuffer.b.remaining(), MAX_GATHERING_WRITE);
                        peekedBuffer.b.position(peekedBuffer.b.limit() - amountToSplit);
                        final BBContainer splice = DBBPool.wrapBB(peekedBuffer.b.slice());
                        m_queuedBuffers.push(splice);
                        m_messagesWritten--;//corrects message count
                        peekedBuffer.b.limit(peekedBuffer.b.position());
                        peekedBuffer.b.position(originalPosition);
                    } while (peekedBuffer.b.hasRemaining());
                    buffer = m_queuedBuffers.peek().b;
                } else {
                    //Elect to do a single regular write instead of all the gathering write work
                    //It is okay not to split it into many smaller buffers because it is direct
                    buffer = peekedBuffer.b;
                }
            } else {
                /*
                 * Iterate over the queued buffers until we have 10, there are none left, or
                 * adding another would make the write too big
                 */
                final int queuedBuffersSize = m_queuedBuffers.size();
                buffers = new ByteBuffer[queuedBuffersSize < 10 ? queuedBuffersSize : 10];
                int ii = 0;
                for (final BBContainer c : m_queuedBuffers) {
                    /*
                     * Don't queue insanely large gathering writes. It does bad things.
                     * This catches a series of writes that would collectively be too large.
                     * This doesn't skip the buffer that is too large and look
                     * for a smaller one in order to ensure that a large buffer
                     * doesn't get neglected
                     */
                    final int potentialQueuedForWrite = queuedForWrite + c.b.remaining();
                    if (potentialQueuedForWrite > MAX_GATHERING_WRITE) {
                        //Create a new correctly sized array
                        //to pass in for the gathering write
                        final ByteBuffer oldArray[] = buffers;
                        buffers = new ByteBuffer[ii];
                        for (int zz = 0; zz < ii; zz++) {
                            buffers[zz] = oldArray[zz];
                        }
                        break;
                    }

                    /*
                     * Never ever ever try to do a gathering write with a buffer that is
                     * not direct. Java will allocate a DirectByteBuffer every time and not
                     * pool them at all. If the buffer is not direct it will be skipped and
                     * the gathering write will go ahead with what has already come off of the queue.
                     * @TODO A potential optimization might be to continue looking ahead in the queue for
                     * more direct buffers. Probably more complex than it is worth.
                     */
                    if (!c.b.isDirect()) {
                        if (ii == 0) {
                            buffers = null;
                            buffer = c.b;
                            break;
                        } else {
                            final ByteBuffer oldArray[] = buffers;
                            buffers = new ByteBuffer[ii];
                            for (int zz = 0; zz < ii; zz++) {
                                buffers[zz] = oldArray[zz];
                            }
                            break;
                        }
                    }

                    /*
                     * The regular case where there is nothing wrong
                     * and the buffer can be added to the list for the gathering write
                     */
                    buffers[ii++] = c.b;
                    queuedForWrite = potentialQueuedForWrite;//track the running total so the size check above works
                    if (ii == 10) {
                        break;
                    }
                }
            }

            /*
             * Choose between a gathering write vs. a single buffer write based
             * on the presence of the buffers array
             */
            rc = 0;
            if (buffers != null) {
                assert(checkAllDirect(buffers));
                rc = channel.write(buffers);

                //Discard the buffer back to a pool if no data remains
                for (final ByteBuffer b : buffers) {
                    if (!b.hasRemaining()) {
                        m_queuedBuffers.poll().discard();
                        m_messagesWritten++;
                    } else {
                        if (!m_hadBackPressure) {
                            backpressureStarted();
                        }
                        break;
                    }
                }
            } else {
                rc = channel.write(buffer);

                //Discard the buffer back to a pool if no data remains
                if (buffer.hasRemaining()) {
                    if (!m_hadBackPressure) {
                        backpressureStarted();
                    }
                } else {
                    m_queuedBuffers.poll().discard();
                    m_messagesWritten++;
                }
            }
            bytesWritten += rc;
        } while (rc > 0);

        //This extra check is necessary because sometimes a buffer with nothing remaining
        //has to be queued in the above loop resulting in rc == 0. Since rc == 0
        //it won't loop around a last time and see that there are no more queued buffers
        //and thus no backpressure
        if (m_queuedBuffers.isEmpty() && m_hadBackPressure &&
                m_queuedWrites.size() <= m_maxQueuedWritesBeforeBackpressure) {
            backpressureEnded();
        }

        if (!isEmpty()) {
            if (bytesWritten > 0) {
                m_lastPendingWriteTime = EstTime.currentTimeMillis();
            }
        } else {
            m_lastPendingWriteTime = -1;
        }
        updateQueued(-bytesWritten, false);
        m_bytesWritten += bytesWritten;
        return bytesWritten;
    }

    /**
     * Used for assertions. Returns false if one of the buffers is not direct
     * @param buffers
     * @return
     */
    private final boolean checkAllDirect(final ByteBuffer buffers[]) {
        for (final ByteBuffer b : buffers) {
            if (!b.isDirect()) {
                return false;
            }
        }
        return true;
    }
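
    /*
     * Illustrative sketch (not part of this class): handing the stream a message that was already
     * serialized outside a network thread. The payload bytes and stream reference are hypothetical;
     * DBBPool.wrapBB and the enqueue overload below are the pieces taken from this file.
     *
     *   ByteBuffer serialized = ByteBuffer.allocateDirect(messageBytes.length);
     *   serialized.put(messageBytes);
     *   serialized.flip();
     *   writeStream.enqueue(DBBPool.wrapBB(serialized));
     */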

    /**
     * Queue a container for writing. This isn't the ideal API to use since the serialization has been done
     * outside of a network thread
     * @param c
     */
    @Override
    public boolean enqueue(final BBContainer c) {
        assert(c != null);
        assert(c.b != null);
        if (c.b.remaining() == 0) {
            c.discard();
            return false;
        }
        synchronized (this) {
            if (m_isShutdown) {
                c.discard();
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            updateQueued(c.b.remaining(), false);
            m_queuedBuffers.offer(c);
            m_port.setInterests(SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Queue a FastSerializable object for writing. This is the 3rd best way to serialize and queue messages.
     * Since no expected message size is provided, the default one for this port is used, which may not be
     * accurate for this particular message. Because FastSerializer is used to serialize the object there is
     * some overhead incurred every time the FastSerializer checks whether it needs to grow, and more if it
     * actually has to resize.
     * @param f
     */
    @Override
    public boolean enqueue(final FastSerializable f) {
        synchronized (this) {
            if (m_isShutdown) {
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            m_queuedWrites.offer(new DeferredSerialization() {
                @Override
                public BBContainer serialize(final DBBPool pool) throws IOException {
                    final FastSerializer fs = new FastSerializer(pool, m_port.m_expectedOutgoingMessageSize);
                    return fs.writeObjectForMessaging(f);
                }

                @Override
                public void cancel() {}
            });
            m_port.setInterests(SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Queue a FastSerializable object for writing. This is the 2nd best way to serialize and queue messages.
     * The expected message size is used to size the initial allocation for the FastSerializer.
     * Because FastSerializer is used to serialize the object there is some overhead incurred
     * when the FastSerializer has to check if it needs to grow, but the cost is pretty minor compared
     * to the cost of actually growing the FastSerializer.
     * @param f
     * @param expectedSize Expected size of the serialized message in bytes
     */
    @Override
    public boolean enqueue(final FastSerializable f, final int expectedSize) {
        synchronized (this) {
            if (m_isShutdown) {
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            m_queuedWrites.offer(new DeferredSerialization() {
                @Override
                public BBContainer serialize(final DBBPool pool) throws IOException {
                    final FastSerializer fs = new FastSerializer(pool, expectedSize);
                    return fs.writeObjectForMessaging(f);
                }

                @Override
                public void cancel() {}
            });
            m_port.setInterests(SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Queue a message and defer the serialization of the message until later. This is the ideal mechanism
     * for serializing and queueing network writes. It allows the sender to define an efficient serialization
     * mechanism that performs a single allocation of the correct size without the overhead of FastSerializer,
     * which has to constantly check if it needs to grow.
     * @param ds A deferred serialization task that will generate the message
     */
    @Override
    public boolean enqueue(final DeferredSerialization ds) {
        synchronized (this) {
            if (m_isShutdown) {
                ds.cancel();
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            m_queuedWrites.offer(ds);
            m_port.setInterests(SelectionKey.OP_WRITE, 0);
        }
        return true;
    }
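
    /*
     * Illustrative sketch (not part of this class): queueing a heap ByteBuffer. Assuming a payload
     * smaller than DBBPool.MAX_ALLOCATION_SIZE, the overload below copies it into a pooled direct
     * buffer on the network thread so it can participate in gathering writes; larger heap buffers
     * are wrapped and queued as-is. The stream reference and payload bytes are hypothetical.
     *
     *   ByteBuffer heapCopy = ByteBuffer.allocate(payloadBytes.length);
     *   heapCopy.put(payloadBytes);
     *   heapCopy.flip();
     *   writeStream.enqueue(heapCopy);
     */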

    /**
     * Queue a ByteBuffer for writing to the network. If the ByteBuffer is not direct then it will
     * be copied to a DirectByteBuffer if it is less than DBBPool.MAX_ALLOCATION_SIZE. This method
     * is a backup for code that isn't able to defer its serialization to a network thread
     * for whatever reason. It is reasonably efficient if a DirectByteBuffer is passed in,
     * but it would be better to keep allocations of DirectByteBuffers inside the network pools.
     * @param b
     */
    @Override
    public boolean enqueue(final ByteBuffer b) {
        assert(b != null);
        assert(!b.isDirect());//Don't queue direct buffers, they leak memory without a container
        if (b.remaining() == 0) {
            return false;
        }
        synchronized (this) {
            if (m_isShutdown) {
                return false;
            }
            updateLastPendingWriteTimeAndQueueBackpressure();
            /*
             * Attempt to use one of our own pooled direct byte buffers
             * so that a gathering write can be done later. Java gathering
             * writes spam direct byte buffers if many regular BBs are used
             */
            if (b.remaining() < DBBPool.MAX_ALLOCATION_SIZE) {
                m_queuedWrites.offer(new DeferredSerialization() {
                    @Override
                    public BBContainer serialize(final DBBPool pool) {
                        final BBContainer c = pool.acquire(b.remaining());
                        assert(c.b.isDirect());
                        c.b.put(b);
                        c.b.flip();
                        return c;
                    }

                    @Override
                    public void cancel() {}
                });
            } else {
                updateQueued(b.remaining(), false);
                m_queuedBuffers.offer(DBBPool.wrapBB(b));
            }
            m_port.setInterests(SelectionKey.OP_WRITE, 0);
        }
        return true;
    }

    /**
     * Swap the two queues of DeferredSerializations, serialize everything that was queued,
     * and return the resulting buffers as an array.
     * @return
     * @throws IOException
     */
    final BBContainer[] swapAndSerializeQueuedWrites(final DBBPool pool) throws IOException {
        ArrayDeque<DeferredSerialization> oldlist;
        synchronized (this) {
            if (m_queuedWrites.isEmpty()) {
                return null;
            } else {
                if (m_queuedWrites == m_queuedWrites1) {
                    oldlist = m_queuedWrites1;
                    m_queuedWrites = m_queuedWrites2;
                } else {
                    oldlist = m_queuedWrites2;
                    m_queuedWrites = m_queuedWrites1;
                }
            }
        }
        final BBContainer results[] = new BBContainer[oldlist.size()];
        int ii = 0;
        DeferredSerialization ds = null;
        int bytesQueued = 0;
        while ((ds = oldlist.poll()) != null) {
            results[ii] = ds.serialize(pool);
            assert(results[ii] != null);
            assert(results[ii].b != null);
            bytesQueued += results[ii].b.remaining();
            ii++;
        }
        updateQueued(bytesQueued, true);
        return results;
    }

    /**
     * Free the pool resources that are held by this WriteStream. The pool itself is thread local
     * and will be freed when the thread terminates.
     */
    synchronized void shutdown() {
        int bytesReleased = 0;
        m_isShutdown = true;
        BBContainer c = null;
        while ((c = m_queuedBuffers.poll()) != null) {
            bytesReleased += c.b.remaining();
            c.discard();
        }
        updateQueued(-bytesReleased, false);
        DeferredSerialization ds = null;
        while ((ds = m_queuedWrites.poll()) != null) {
            ds.cancel();
        }
    }

    @Override
    public synchronized int calculatePendingWriteDelta(final long now) {
        if (m_lastPendingWriteTime == -1) {
            return 0;
        }
        return (int)(now - m_lastPendingWriteTime);
    }

    private void updateLastPendingWriteTimeAndQueueBackpressure() {
        if (m_lastPendingWriteTime == -1) {
            m_lastPendingWriteTime = EstTime.currentTimeMillis();
        }
        if (m_queuedWrites.size() > m_maxQueuedWritesBeforeBackpressure && !m_hadBackPressure) {
            backpressureStarted();
        }
    }

    private void updateQueued(int queued, boolean noBackpressureSignal) {
        if (m_monitor != null) {
            boolean shouldSignalBackpressure = m_monitor.queue(queued);
            if (!noBackpressureSignal && shouldSignalBackpressure) {
                if (!m_hadBackPressure) {
                    backpressureStarted();
                }
            }
        }
    }
}