/* Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Apr 16, 2009 */ package com.bigdata.service.ndx.pipeline; import java.util.Iterator; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.Executor; import java.util.concurrent.Future; import java.util.concurrent.FutureTask; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.ReentrantLock; import org.apache.log4j.Logger; import com.bigdata.btree.keys.KVO; import com.bigdata.mdi.PartitionLocator; import com.bigdata.relation.accesspath.BlockingBuffer; import com.bigdata.relation.accesspath.BufferClosedException; import com.bigdata.relation.accesspath.IAsynchronousIterator; import com.bigdata.resources.StaleLocatorException; import com.bigdata.service.Split; import com.bigdata.service.ndx.ISplitter; import com.bigdata.util.concurrent.AbstractHaltableProcess; /** * Abstract base class for a master task which consumes chunks of elements * written onto a {@link BlockingBuffer} and distributes those chunks to * subtasks according to some abstraction which is not defined by this class. * * <h3>Design discussion</h3> * * The asynchronous write API exposes a blocking buffer to the application which * accepts concurrent writes of {@link KVO}[] chunks, in which each {@link KVO} * represents a tuple to be written on a scale-out index. The buffer is drained * by a master task, which transfers chunks to sinks tasks, each of which writes * on a specific index partition. * <p> * The master is provisioned with a CTOR which is used to convert the * {@link KVO}s into <code>unsigned byte[]</code> <i>keys</i> and * <code>byte[]</code> <i>values</i> for writes on the scale-out index. The * master may be provisioned with a mechanism to filter out duplicate tuples. * <p> * Writes on the index partitions are asynchronous with respect to the * application. However, a {@link KVOC} / {@link KVOLatch} combination can be * used to coordinate notification when some set of tuples of interest have been * successfully written onto the scale-out index. This combination can also be * used to pass back values from the write operation if they are assigned by * side-effect onto the {@link KVO#obj} reference. * <p> * The asynchronous write implementation is divided into a master, with an input * queue and a redirect queue, and sinks, each of which has an input queue and * writes chunks onto a specific index partition for the scale-out index. The * input queues for the master and the sinks are bounded. The redirect input * queue is unbounded. 
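 * <p>
 * As an illustration only (how the write buffer is obtained depends on the
 * concrete client index API, and the factory shown here is an assumption), the
 * application-side pattern looks roughly like:
 *
 * <pre>
 * // Hypothetical: obtain the asynchronous write buffer from the client index.
 * final BlockingBuffer&lt;KVO&lt;Object&gt;[]&gt; writeBuffer = ...;
 *
 * writeBuffer.add(chunk);        // producers add KVO[] chunks.
 * writeBuffer.close();           // nothing more will be written.
 * writeBuffer.getFuture().get(); // await the master; rethrows the first error.
 * </pre>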
 * <p> * The master and each sink are assigned their own worker threads. * <p> * The master transfers chunks from its input queue to the sinks. It polls the * redirect queue for a chunk. If that queue is empty, then it polls a chunk * from the input queue. If neither queue yields a chunk, the master polls again. * The master stops polling the input queue when the input queue is closed, but * it continues to drain the redirect queue until all sinks are done or the * master is canceled. * <p> * Note: The requirement for polling arises because: (a) we are not coordinating * signals for the arrival of a chunk on the input or redirect queues; and (b) a * chunk can be redirected at any time if there is an outstanding write by a * sink on an index partition. * <p> * The atomic decision to terminate the master is made using a {@link #lock}. * The lock is specific to the life cycle of the sinks. The lock is held when a * sink is created. When a sink terminates, its last action is to grab the lock * and signal the {@link #subtaskDone} {@link Condition}. The master terminates * when, while holding the lock, it observes that no sinks are running AND the * redirect queue is empty. Since chunks are placed onto the redirect queue by * sinks (and there are no sinks running) and by the master (which is not * issuing a redirect since it is running its termination logic) these criteria * are sufficient for termination. However, the sink must ensure that its buffer * is closed before it terminates, even if it terminates by exception, so that * an attempt to transfer a chunk to the sink will not block forever. * <p> * Once the master is holding a chunk, it splits the chunk into a set of dense * chunks correlated with the locators of the index partitions on which those * chunks will be written. The split operation is NOT atomic, but it is * consistent in the following sense. If a {@link Split} is identified based on * old information, then the chunk will be directed to an index partition which * no longer exists. An attempt to write on that index partition will result in * a stale locator exception, which is handled. * <p> * Once the chunk has been split, the split chunks are transferred to the * appropriate sink(s). Since the master is not holding any locks, a blocking * put() may be used to transfer the chunk to the sink. * <p> * The sink drains chunks from its input queue. If the input queue is empty and * the idle timeout expires before a chunk is transferred to the input queue, * then the sink will be asynchronously closed and an * {@link IdleTimeoutException} will be set on the input queue. If the master * attempts to transfer a chunk to the sink's input queue after the idle * timeout, then an exception wrapping the idle timeout exception will be * thrown. The master handles the wrapped idle timeout exception by re-opening * the sink and will retry the transfer of the chunk to the (new) sink's input * queue. After the sink closes its input queue by an idle timeout, it will * continue to drain the input queue until it is empty, at which point the sink * will terminate (this handles the case where the master concurrently * transferred a chunk to the sink's input queue before it was closed by the * idle timeout). * <p> * The sink combines chunks drained from its input queue until the target chunk * size for a write is achieved or until the chunk timeout for the sink is * exceeded. The sink then writes on the index partition corresponding to its * locator. This write occurs in the thread assigned to the sink and the sink * will block during the write request.
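 * <p>
 * The combine step can be pictured with the following sketch (illustrative
 * only; the method and parameter names are assumptions, not the sink's actual
 * implementation), which gathers <code>int[]</code> chunks from a
 * <code>BlockingQueue</code> until either a target size or a timeout is
 * reached:
 *
 * <pre>
 * static int[] combineChunks(final BlockingQueue&lt;int[]&gt; queue,
 *         final int targetChunkSize, final long chunkTimeoutNanos)
 *         throws InterruptedException {
 *     final long deadline = System.nanoTime() + chunkTimeoutNanos;
 *     final List&lt;int[]&gt; chunks = new ArrayList&lt;int[]&gt;();
 *     int size = 0;
 *     while (size &lt; targetChunkSize) {
 *         // Wait no longer than the remaining chunk timeout for another chunk.
 *         final int[] chunk = queue.poll(Math.max(0L, deadline
 *                 - System.nanoTime()), TimeUnit.NANOSECONDS);
 *         if (chunk == null)
 *             break; // chunk timeout expired.
 *         chunks.add(chunk);
 *         size += chunk.length;
 *     }
 *     // Concatenate whatever was gathered into a single dense chunk.
 *     final int[] combined = new int[size];
 *     int off = 0;
 *     for (int[] c : chunks) {
 *         System.arraycopy(c, 0, combined, off, c.length);
 *         off += c.length;
 *     }
 *     return combined;
 * }
 * </pre>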
 * <p> * If a stale locator exception is received by the sink in response to a write, * it will: (a) notify the client of the stale locator exception; (b) close the * input queue, setting the stale locator exception as the cause; (c) place the * chunk for that write onto the master's (unbounded) redirect queue; and (d) * drain its input queue and transfer the chunks to the master's redirect queue. * <p> * If the master attempts to transfer a chunk to the input queue for a sink * which has closed its input queue in response to a stale locator exception, * then an exception will be thrown with the stale locator exception as the * inner cause. The master will trap that exception and place the chunk on the * redirect queue instead. * <p> * If the sink's RMI is successful, the sink will invoke the optional result * handler and touch each tuple in the chunk using KVO#done(). These protocols * can be used to pass results from asynchronous writes back to the application. * * @param <H> * The generic type of the value returned by {@link Callable#call()} * for the master. * @param <E> * The generic type of the elements in the chunks stored in the * {@link BlockingBuffer}. * @param <S> * The generic type of the subtask implementation class. * @param <L> * The generic type of the locator object used to look up a subtask in * the internal map (must be unique and must implement hashCode() and * equals() per their contracts). * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ * @see ISplitter * * @todo Update javadoc to reflect that the master no longer waits for a closed * sink in {@link #getSink(Object, boolean)} but instead places the sink * onto the {@link #finishedSubtaskQueue}. */ public abstract class AbstractMasterTask<// H extends AbstractMasterStats<L, ? extends AbstractSubtaskStats>, // E, // S extends AbstractSubtask,// L>// extends AbstractHaltableProcess implements Callable<H>, IMasterTask<E,H> { static protected transient final Logger log = Logger.getLogger(AbstractMasterTask.class); /** * The top-level buffer on which the application is writing. */ protected final BlockingBuffer<E[]> buffer; /** * An unbounded queue of chunks for which a {@link StaleLocatorException} was * received. When this buffer is not empty, it is drained by preference over * the {@link #buffer}. * <p> * This design allows us to avoid deadlocks when a sink is full and the * master is blocked attempting to add another chunk to that sink. If a * {@link StaleLocatorException} is thrown for the outstanding write by that * sink, then this situation would deadlock since the {@link #lock} is * already held and the sink is unable to drain. */ private final BlockingQueue<E[]> redirectQueue = new LinkedBlockingQueue<E[]>(/* unbounded */); /** * The #of chunks on the master's redirectQueue. */ public final int getRedirectQueueSize() { return redirectQueue.size(); } /** * Places a chunk onto the master's redirectQueue. * * @param chunk * The chunk. * * @throws InterruptedException */ protected final void redirectChunk(final E[] chunk) throws InterruptedException { /* * @todo Acquiring the lock here should not be (and probably is not) * necessary for the master's atomic termination condition.
*/ lock.lockInterruptibly(); try { redirectQueue.put(chunk); } finally { lock.unlock(); } } public BlockingBuffer<E[]> getBuffer() { return buffer; } /** * The iterator draining the {@link #buffer}. * <p> * Note: DO NOT close this iterator from within {@link #call()} as that * would cause this task to interrupt itself! */ protected final IAsynchronousIterator<E[]> src; /** * Map from the index partition identifier to the open subtask handling * writes bound for that index partition. */ private final ConcurrentHashMap<L, S> sinks; /** * Maps an operation across the subtasks. * * @param op * The operation, which should be lightweight. * * @throws InterruptedException * @throws ExecutionException * if a subtask throws an exception. */ public void mapOperationOverSubtasks(final SubtaskOp<S> op) throws InterruptedException, ExecutionException { final Iterator<S> itr = sinks.values().iterator(); while(itr.hasNext()) { try { op.call(itr.next()); } catch (Exception ex) { throw new ExecutionException(ex); } } } /** * Lock used for sink life cycle events <em>only</em>. */ private final ReentrantLock lock = new ReentrantLock(); /** * Condition signaled by a subtask when it is finished. This is used by * {@link #awaitAll()} while waiting for subtasks to complete. If all * subtasks in {@link #sinks} are complete when this signal is received * then the master may terminate. */ private final Condition subtaskDone = lock.newCondition(); /** * A queue of subtasks which have finished running. The master polls this * queue in {@link #call()} in order to clear finished sinks from * {@link #sinks} in a timely manner during normal operations. * {@link #awaitAll()} and {@link #cancelAll(boolean)} both handle this in * their own way. */ private final BlockingQueue<S> finishedSubtaskQueue = new LinkedBlockingQueue<S>(); /** * Notify the master that a subtask is done. The subtask is placed onto the * {@link #finishedSubtaskQueue}. The master polls that queue in * {@link #call()} and {@link #awaitAll()} and checks the {@link Future} of * each finished subtask using {@link #drainFutures()}. If a {@link Future} * reports an error, then the master is halted. This is how we ensure that * all subtasks complete normally. * * @param subtask * The subtask. * * @throws InterruptedException */ protected void notifySubtaskDone(final AbstractSubtask subtask) throws InterruptedException { if (subtask == null) throw new IllegalArgumentException(); if (subtask.buffer.isOpen()) throw new IllegalStateException(); lock.lockInterruptibly(); try { // atomic transfer from [sinks] to [finishedSubtaskQueue]. moveSinkToFinishedQueueAtomically((L) subtask.locator, (AbstractSubtask) subtask); // signal condition (used by awaitAll()). subtaskDone.signalAll(); } finally { lock.unlock(); } } /** * Statistics for this master (and perhaps other masters). */ public final H stats; public H getStats() { return stats; } /** * The timeout in nanoseconds before closing an idle output sink. */ protected final long sinkIdleTimeoutNanos; /** * The time in nanoseconds that the {@link AbstractSubtask sink} will wait * inside of the {@link IAsynchronousIterator} when it polls the iterator * for a chunk. If this value is too large then the sink will block for * noticeable lengths of time and will be less responsive to interrupts. * Something in the 10s of milliseconds is appropriate. */ protected final long sinkPollTimeoutNanos; /** * * @param stats * Statistics for the master.
* @param buffer * The buffer on which data is written by the application and * from which it is drained by the master. * @param sinkIdleTimeoutNanos * The time in nanoseconds after which an idle sink will be * closed. Any buffered writes are flushed when the sink is * closed. This must be GTE the <i>sinkChunkTimeout</i> * otherwise the sink will decide that it is idle when it was * just waiting for enough data to prepare a full chunk. * @param sinkPollTimeoutNanos * The time in nanoseconds that the {@link AbstractSubtask sink} * will wait inside of the {@link IAsynchronousIterator} when it * polls the iterator for a chunk. If this value is too large * then the sink will block for noticeable lengths of time and * will be less responsive to interrupts. Something in the 10s of * milliseconds is appropriate. * * @todo sinkQueueCapacity, sinkChunkSize, and sinkChunkTimeoutNanos should * be arguments for this class and a default * {@link #newSubtaskBuffer()} implementation should be provided. The * unit tests for the {@link AbstractMasterTask} need to be updated * for that change, as do the other derived classes. */ public AbstractMasterTask(final H stats, final BlockingBuffer<E[]> buffer, final long sinkIdleTimeoutNanos, final long sinkPollTimeoutNanos) { if (stats == null) throw new IllegalArgumentException(); if (buffer == null) throw new IllegalArgumentException(); if (sinkIdleTimeoutNanos <= 0) throw new IllegalArgumentException(); if (sinkPollTimeoutNanos <= 0) throw new IllegalArgumentException(); this.stats = stats; this.buffer = buffer; this.sinkIdleTimeoutNanos = sinkIdleTimeoutNanos; this.sinkPollTimeoutNanos = sinkPollTimeoutNanos; this.src = buffer.iterator(); /* * The current access to this map is relatively limited. It is accessed * when assembling the performance counters and by the worker thread for * the master task. The size of the map is the #of index partitions. The * #of index partitions does not change rapidly, so this map will not be * resized frequently. The only time there is a rapid change in the #of * index partitions is when we scatter-split an index. */ this.sinks = new ConcurrentHashMap<L, S>(); stats.addMaster(this); } public H call() throws Exception { /* * Note: If idle timeouts are allowed then we need to reopen the sink if * it has closed by a timeout. */ final boolean reopen = sinkIdleTimeoutNanos != 0; try { /* * Note: This polls the master's input buffer so it can check the * redirectQueue in a timely manner. */ while (true) { halted(); drainFutures(); // drain the redirectQueue if not empty. E[] a = redirectQueue.poll(); if (a == null) { // poll the master's input queue. if (src.hasNext(buffer.getChunkTimeout(), TimeUnit.NANOSECONDS)) { // drain chunk from the master's input queue. a = src.next(); } else { /* * Nothing available right now. */ if (!buffer.isOpen() && buffer.isEmpty()) { /* * If the master's input buffer is closed and has * been drained then we stop polling here, but we * will continue to drain the redirectQueue in * awaitAll(). */ break; } else { // Nothing available - poll again. continue; } } } else { if (log.isInfoEnabled()) log.info("Read chunk from redirectQueue"); } // empty chunk? if (a.length == 0) continue; synchronized (stats) { // update the master stats. 
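// Note: [stats] may be shared by more than one master (see getStats() and
// AbstractMasterStats), so the chunk and element counters are updated together
// within this synchronized block.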
stats.chunksIn.incrementAndGet(); stats.elementsIn.addAndGet(a.length); } handleChunk(a, reopen); } // while(true) awaitAll(); } catch (Throwable t) { log.error("Cancelling: job=" + this + ", cause=" + t, t); try { cancelAll(true/* mayInterruptIfRunning */); } catch (Throwable t2) { log.error(t2); } throw new RuntimeException(t); } // Done. return stats; } /** * Handle the next chunk of elements from the {@link #buffer}. * * @param chunk * A chunk. * @param reopen * When <code>false</code> it is an error if the output buffer * has been closed. When <code>true</code> the output buffer * will be (re-)opened as necessary. This will be * <code>false</code> when invoked by {@link #call()} (since * the output buffers are not closed until the master's buffer is * closed) and should be <code>true</code> if you are handling * redirects. */ abstract protected void handleChunk(E[] chunk, boolean reopen) throws InterruptedException; /** * Extension hook for implementations where the clients accept work for * asynchronous processing and notify the master as work items complete * successfully or fail. The {@link AbstractMasterTask} will not terminate * unless this method returns <code>true</code> when queried while holding * the {@link #lock}. A true return indicates that there are no pending * work items. The default implementation returns <code>true</code>. * <p> * The work performed by the client must be idempotent (it must be safe to * re-perform the operation). Pending work items may be in an unknown state, * the master may submit the same work item to multiple clients (where that * is permitted by the locator semantics), the client task may fail before * the work item is complete, and a failed client can cause work items * associated with that client to be posted to another client. * <p> * To handle master termination, the pending set must track outstanding work * items. Those work items should be removed from the pending set as soon as * any client has successfully completed that work item (since the work is * idempotent). * <p> * To handle client failure, the subtask must track the pending set for its * client. If the client dies, then the subtask must handle the failure by * placing all pending work items for that client (including any in the * chunk for the current request) onto the {@link #redirectQueue}. */ protected boolean nothingPending() { return true; } /** * Extension hook invoked when the master's buffer is exhausted by * {@link #awaitAll()}. The default implementation is a NOP. */ protected void willShutdown() throws InterruptedException { // NOP. } /** * Await the completion of the writes on each index partition. The master * will terminate when there are no active subtasks and the redirect queue * is empty. That condition is tested atomically. * * @throws ExecutionException * This will report the first cause. * @throws InterruptedException * If interrupted while awaiting the {@link #lock} or the child * tasks. * * @todo The test suite should include a case where a set of redirects * appear just as the sinks are processing their last chunk. */ private void awaitAll() throws InterruptedException, ExecutionException { if(buffer.isOpen()) { /* * The buffer must be closed as a precondition otherwise the * application can continue to write data on the master's input * queue and the termination conditions cannot be satisfied. */ throw new IllegalStateException(); } /* * Extension hook may be used to map the pending set over the available * clients during shutdown.
*/ willShutdown(); while (true) { halted(); /* * Note: We need to hold this lock in order to make an atomic decision * concerning whether or not the master's termination * conditions have been satisfied. * * Note: If we poll the redirectQueue and find that there is another * chunk to be processed, then we MUST NOT hold the lock when * processing that chunk. Doing so will lead to a deadlock in * AbstractSubtask#call() when it tries to grab the lock so that it * can signal subtaskDone. Therefore we process the chunks outside * of this code block, once we have released the lock. */ final E[] a; lock.lockInterruptibly(); try { a = redirectQueue.poll(); if (a == null) { /* * There is nothing available from the redirect queue. */ if (finishedSubtaskQueue.isEmpty() && sinks.isEmpty() && redirectQueue.isEmpty() && nothingPending()) { /* * We are done since there are no running sinks, and no * sinks whose Future we still need to test, and the * redirectQueue is empty. * * Note: We MUST be holding the lock for this * termination condition to be atomic. */ if (log.isInfoEnabled()) log.info("All subtasks are done: " + this); return; } /* * Check for sinks which have finished. */ if (log.isDebugEnabled()) log.debug("Waiting for " + sinks.size() + " subtasks : " + this); drainFutures(); if (!finishedSubtaskQueue.isEmpty()) { /* * Yield the lock and wait up to a timeout for a sink to * complete. We cannot wait for long because we are * still polling the redirect queue! * * @todo config timeout */ subtaskDone.await(50, TimeUnit.MILLISECONDS); } } } finally { lock.unlock(); } if (a != null) { /* * Handle a redirected chunk. * * Note: We DO NOT hold the [lock] here! */ handleChunk(a, true/* reopen */); } } // while(true) } /** * Cancel all running tasks, discarding any buffered data. * <p> * Note: This method does not wait on the canceled tasks. * <p> * Note: The caller should have already invoked {@link #halt(Throwable)}. */ private void cancelAll(final boolean mayInterruptIfRunning) throws InterruptedException { log.warn("Cancelling job: " + this); /* * Close the buffer (nothing more may be written). * * Note: We DO NOT close the iterator draining the buffer since that * would cause this task to interrupt itself! */ buffer.close(); // Clear the backing queue. buffer.clear(); // cancel the futures. for (S sink : sinks.values()) { final Future<?> f = sink.buffer.getFuture(); if (!f.isDone()) { f.cancel(mayInterruptIfRunning); } } // wait for all sinks to complete. for (S sink : sinks.values()) { final Future<?> f = sink.buffer.getFuture(); try { f.get(); } catch (InterruptedException ex) { throw ex; } catch (ExecutionException ex) { /* * Ignore exceptions here since we are halting anyway and we can * expect a bunch of canceled tasks because we just interrupted * all of the subtasks. */ log.warn("sink=" + sink + " : " + ex); } } // clear references to the sinks. sinks.clear(); // clear queue of finished sinks (we do not need to check their futures). finishedSubtaskQueue.clear(); // clear the redirect queue. redirectQueue.clear(); } /** * Return the sink for the locator. The sink is created if it does not exist * using {@link #newSubtaskBuffer()} and * {@link #newSubtask(Object, BlockingBuffer)}. * <p> * Note: The caller is single threaded since this is invoked from the * master's thread. This code depends on that assumption. * * @param locator * The locator (unique subtask key).
* @param reopen * <code>true</code> IFF a closed buffer should be re-opened * (in fact, this causes a new buffer to be created and the new * buffer will be drained by a new * {@link IndexPartitionWriteTask}). * * @return The sink for that locator. * * @throws IllegalArgumentException * if the argument is <code>null</code>. * @throws InterruptedException * if interrupted. * @throws RuntimeException * if {@link #halted()} */ protected S getSink(final L locator, final boolean reopen) throws InterruptedException { if (locator == null) throw new IllegalArgumentException(); // operation not allowed if halted. halted(); S sink = sinks.get(locator); if (sink != null && sink.buffer.isOpen()) { /* * The sink is good, so return it. * * Note: the caller must handle the exception if the sink is * asynchronously closed before the caller can use it. */ return sink; } /* * @todo This lock should not be necessary (and probably is not) since * the caller is single threaded. */ lock.lockInterruptibly(); try { if (reopen && sink != null && !sink.buffer.isOpen()) { if (log.isInfoEnabled()) log.info("Reopening sink (was closed): " + this + ", locator=" + locator); /* * Note: Instead of waiting for the sink here, the sink will * notify the master when it is done and be placed onto the * [finishedSubtaskQueue]. When its Future#isDone() reports true, we will * drain it from the queue and check its Future for errors. * * This allows the master to not block when a sink is closed but * still awaiting an RMI. It also implies that there can be two * active sinks for the same locator, but only one will be * recorded in [sinks] at any given time. */ // remove the old sink from the map (IFF that is the reference // found). moveSinkToFinishedQueueAtomically(locator, sink); // clear reference so we can re-open the sink. sink = null; } if (sink == null) { if (log.isInfoEnabled()) log.info("Creating output buffer: " + this + ", locator=" + locator); final BlockingBuffer<E[]> out = newSubtaskBuffer(); sink = newSubtask(locator, out); final S oldval = sinks.put(locator, sink); // should not be an entry in the map for that locator. assert oldval == null : "locator=" + locator; // if (oldval == null) { /** * Start subtask. * * @see <a * href="https://sourceforge.net/apps/trac/bigdata/ticket/707"> * BlockingBuffer.close() does not unblock threads </a> */ { // Wrap the computation as a FutureTask. @SuppressWarnings({ "unchecked", "rawtypes" }) final FutureTask<? extends AbstractSubtaskStats> ft = new FutureTask( sink); // Set Future on the BlockingBuffer. out.setFuture(ft); // Assign a worker thread to the sink. submitSubtask(ft); } stats.subtaskStartCount.incrementAndGet(); // } else { // // // concurrent create of the sink. // sink = oldval; // // } } return sink; } finally { lock.unlock(); } } /** * Factory for a new buffer for a subtask. */ abstract protected BlockingBuffer<E[]> newSubtaskBuffer(); /** * Factory for a new subtask. * * @param locator * The unique key for the subtask. * @param out * The {@link BlockingBuffer} on which the master will write for * that subtask. * * @return The subtask. */ abstract protected S newSubtask(L locator, BlockingBuffer<E[]> out); // /** // * Submit the subtask to an {@link Executor}. // * // * @param subtask // * The subtask. // * // * @return The {@link Future}. // */ // abstract protected Future<? extends AbstractSubtaskStats> submitSubtask(S subtask); /** * Submit the subtask to an {@link Executor}. * * @param subtask * The {@link FutureTask} used to execute the subtask.
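 *            A typical implementation simply hands this task to an
 *            {@link Executor}, e.g. <code>executorService.submit(subtask)</code>
 *            (illustrative only; the <code>executorService</code> reference is
 *            an assumption about the concrete master class).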
* * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/707"> * BlockingBuffer.close() does not unblock threads </a> */ abstract protected void submitSubtask( FutureTask<? extends AbstractSubtaskStats> subtask); /** * Drains any {@link Future}s from {@link #finishedSubtaskQueue} which are done * and halts the master if there is an error for a {@link Future}. * * @throws InterruptedException * if interrupted. */ private void drainFutures() throws InterruptedException, ExecutionException { while (true) { halted(); /* * Poll the queue of sinks that have finished running * (non-blocking). * * Note: We don't actually remove the subtask from the queue until * its Future is available. */ final S sink = finishedSubtaskQueue.peek(); if (sink == null) { // queue is empty. return; } if(sink.buffer.isOpen()) throw new IllegalStateException(sink.toString()); final Future<?> f = sink.buffer.getFuture(); if(!sink.buffer.getFuture().isDone()) { // Future is not available for the next sink in the queue. return; } try { // check the future. f.get(); // remove the head of the queue. if (sink != finishedSubtaskQueue.remove()) { // The wrong sink is at the head of the queue. throw new AssertionError(); } } catch(ExecutionException ex) { // halt on error. throw halt(ex); } finally { /* * Increment this counter immediately when the subtask is done * regardless of the outcome. This is used by some unit tests to * verify idle timeouts and the like. */ stats.subtaskEndCount.incrementAndGet(); if (log.isDebugEnabled()) log.debug("subtaskEndCount incremented: " + sink.locator); } } } /** * Transfer a sink from {@link #sinks} to {@link #finishedSubtaskQueue}. The * entry for the locator is removed from {@link #sinks} atomically IFF the * given <i>sink</i> is associated with the given <i>locator</i> in * that map. * <p> * This is done atomically using the {@link #lock}. This is invoked both by * {@link AbstractSubtask#call()} (when it is preparing to exit call()) and * by {@link #getSink(Object, boolean)} (when it discovers that the sink for * a locator is closed, but not yet finished with its work). * <p> * Note: The sink MUST be atomically transferred from {@link #sinks} to * {@link #finishedSubtaskQueue} and {@link #awaitAll()} MUST verify that * both are empty in order for the termination condition to be atomic. * * @param locator * The locator. * @param sink * The sink. * * @todo this method really should be private. It is exposed for one of the * unit tests. */ protected void moveSinkToFinishedQueueAtomically(final L locator, final AbstractSubtask sink) throws InterruptedException { if (locator == null) throw new IllegalArgumentException(); if (sink == null) throw new IllegalArgumentException(); lock.lockInterruptibly(); try { /* * Place on queue (checked by call(), awaitAll()). It is safe to do * this even if the sink has already been moved, in which case its * Future will be checked twice, which is not a problem. */ finishedSubtaskQueue.put((S) sink); /* * Remove map entry IFF it is for the same reference. */ if (sinks.remove(locator, sink)) { if (log.isDebugEnabled()) log.debug("Removed output buffer: " + locator); } } finally { lock.unlock(); } } /** * Resolves the output buffer onto which the split must be written and adds * the data to that output buffer. * * @param split * The {@link Split} identifies both the tuples to be dispatched * and the {@link PartitionLocator} on which they must be * written. * @param a * The array of tuples.
Only those tuples addressed by the * <i>split</i> will be written onto the output buffer. * @param reopen * <code>true</code> IFF a closed buffer should be re-opened (in * fact, this causes a new buffer to be created and the new * buffer will be drained by a new {@link AbstractSubtask}). * * @throws InterruptedException * if the thread is interrupted. */ @SuppressWarnings("unchecked") protected void addToOutputBuffer(final L locator, final E[] a, final int fromIndex, final int toIndex, boolean reopen) throws InterruptedException { final int n = (toIndex - fromIndex); if (n == 0) return; /* * Make a dense chunk for this split. */ final E[] b = (E[]) java.lang.reflect.Array.newInstance(a.getClass().getComponentType(), n); System.arraycopy(a, fromIndex, b, 0, n); final long begin = System.nanoTime(); boolean added = false; while (!added) { halted(); // resolve the sink / create iff necessary. final S sink = getSink(locator, reopen); try { added = sink.buffer.add(b, offerWarningTimeoutNanos, TimeUnit.NANOSECONDS); final long now = System.nanoTime(); if (added) { /* * Update timestamp of the last chunk written on that sink. */ sink.lastChunkNanos = now; } else { log.warn("Sink is slow: elapsed=" + TimeUnit.NANOSECONDS.toMillis(now - begin) + "ms, sink=" + sink); } } catch (BufferClosedException ex) { if (ex.getCause() instanceof StaleLocatorException) { /* * Note: The sink sets the exception when it closes the * buffer while handling the stale locator exception and * transfers the outstanding and all queued chunks to the * redirectQueue. * * When we trap the stale locator exception here we need to * transfer the chunk to the redirectQueue since the buffer * was closed (and drained) asynchronously. */ if (log.isInfoEnabled()) log.info("Sink closed asynchronously by stale locator exception: " + sink); redirectChunk(b); added = true; } else if (ex.getCause() instanceof IdleTimeoutException || ex.getCause() instanceof MasterExhaustedException) { /* * Note: The sink sets the exception if it closes the input * queue by idle timeout or because the master was * exhausted. * * These exceptions are trapped here and cause the sink to * be re-opened (simply by restarting the loop). */ if (log.isInfoEnabled()) log.info("Sink closed asynchronously: cause=" + ex.getCause() + ", sink=" + sink); // definitely re-open! reopen = true; } else { // anything else is a problem. throw ex; } } } synchronized (stats) { stats.chunksTransferred.incrementAndGet(); stats.elementsTransferred.addAndGet(b.length); stats.elementsOnSinkQueues.addAndGet(b.length); stats.elapsedSinkOfferNanos += (System.nanoTime() - begin); } } /** * This timeout is used to log warning messages when a sink is slow. */ private final static long offerWarningTimeoutNanos = TimeUnit.MILLISECONDS.toNanos(5000); }