/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Aug 31, 2010
*/
package com.bigdata.bop.engine;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.Executor;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import com.bigdata.bop.BOp;
import com.bigdata.bop.BOpContext;
import com.bigdata.bop.BOpUtility;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.NoSuchBOpException;
import com.bigdata.bop.PipelineOp;
import com.bigdata.bop.fed.FederatedRunningQuery;
import com.bigdata.concurrent.FutureTaskMon;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Journal;
import com.bigdata.rdf.sparql.ast.QueryHints;
import com.bigdata.relation.accesspath.BufferClosedException;
import com.bigdata.relation.accesspath.DelegateBuffer;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.IMultiSourceCloseableIterator;
import com.bigdata.relation.accesspath.MultiSourceSequentialCloseableIterator;
import com.bigdata.rwstore.sector.IMemoryManager;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.util.concurrent.Memoizer;
import cutthecrap.utils.striterators.ICloseableIterator;
/**
* {@link IRunningQuery} implementation based on the assignment of
* {@link IChunkMessage}(s) to an operator task. Operators (other than those
* with "operator-at-once" evaluation semantics) will typically be executed
* multiple times, consuming at least one {@link IChunkMessage} each time they
* are evaluated. {@link IChunkMessage}s target a specific operator (bopId) and
* shard (shardId). In scale-out, binding sets will be mapped across the target
* access path and may be replicated to one or more nodes depending on the
* distribution of the shards. This evaluation strategy is compatible with both
* the {@link Journal} (aka standalone) and the {@link IBigdataFederation} (aka
* clustered or scale-out).
* <p>
* Note: The challenge with this implementation is managing the amount of data
* buffered on the JVM heap without introducing control structures which can
* result in deadlock or starvation. This has been addressed to a large extent
* by sharing a lock between this class and the per-operator input work queues
* using a modified version of the JSR 166 classes. For high volume operator at
* once evaluation, we need to buffer the data on the native process heap using
* the {@link IMemoryManager}.
*/
public class ChunkedRunningQuery extends AbstractRunningQuery {
private final static transient Logger log = Logger
.getLogger(ChunkedRunningQuery.class);
private final static boolean INFO = log.isInfoEnabled();
/**
* Logger for the {@link ChunkTask}.
*/
private final static Logger chunkTaskLog = Logger
.getLogger(ChunkTask.class);
/**
* Used to map {@link IBindingSet}s across the federation or to migrate
* solutions from the managed object heap onto the native heap.
*
* @see BLZG-533 Vector query engine on native heap.
*/
private final IChunkHandler chunkHandler;
/**
* A collection of (bopId,partitionId) keys mapped onto a collection of
* operator task evaluation contexts for currently executing operators for
* this query.
*/
private final ConcurrentHashMap<BSBundle, ConcurrentHashMap<ChunkFutureTask,ChunkFutureTask>> operatorFutures;
/**
* A map of unbounded work queues for each (bopId,partitionId). Empty queues
* are removed from the map.
* <p>
* The map is guarded by the {@link #lock}.
* <p>
* Note: Using a hash map here means that {@link #consumeChunks()} will draw
* from operator queues more or less at random. Using an ordered map will
* impose a bias, depending on the natural ordering of the map keys.
*
* @see BSBundle#compareTo(BSBundle)
*
* FIXME Either this and/or {@link #operatorFutures} must be a weak
* value map in order to ensure that entries are eventually cleared in
* scale-out where the #of entries can potentially be very large since
* they are per (bopId,shardId). While these maps were initially
* declared as {@link ConcurrentHashMap} instances, if we remove
* entries once the map/queue entry is empty, this appears to open a
* concurrency hole which does not exist if we leave entries with empty
* map/queue values in the map. Changing to a weak value map should
* provide the necessary pruning of unused entries without opening up
* this concurrency hole.
*/
private final ConcurrentMap<BSBundle, BlockingQueue<IChunkMessage<IBindingSet>>> operatorQueues;
/**
* Set to <code>true</code> to make {@link #operatorQueues} and ordered map.
* When <code>true</code>, {@link #consumeChunk()} will have an ordered bias
* in how it schedules work. [The historical behavior is present when this
* is <code>false</code>.]
*/
private static final boolean orderedOperatorQueueMap = false;
/**
* FIXME It appears that this is Ok based on a single unit test known to
* fail when {@link #removeMapOperatorQueueEntries} is <code>true</code>,
* but I expect that a similar concurrency problem could also exist for the
* {@link #operatorFutures} even through it does not produce a deadlock.
*/
static private final boolean removeMapOperatorFutureEntries = false;
/**
* FIXME See operatorQueues for why removing the map entries appears to
* cause problems. This is problem is demonstrated by
* TestQueryEngine#test_query_slice_noLimit() when
* {@link PipelineOp.Annotations#PIPELINE_QUEUE_CAPACITY} is ONE (1).
*/
static private final boolean removeMapOperatorQueueEntries = false;
/**
* When <code>true</code> the {@link IHaltOpMessage} is queued on an
* {@link Executor} and will be delivered asynchronously to the query
* controller. When <code>false</code> the message is delivered in the same
* thread that ran the {@link ChunkTask}. This has implications for the
* relative timing of changes to the {@link RunState} of the
* {@link IRunningQuery}.
* <p>
* Note: it appears to be valid to use either synchronous or asynchronous
* messaging for the {@link IHaltOpMessage}s. However, this can always be
* changes to <code>true</code> in order to rule out the asynchronous
* arrival of the {@link IHaltOpMessage} as a source of concurrency issues.
*
* TODO Test the performance impact of this option, e.g., on BSBM.
*/
static private final boolean asynchronousHaltMessage = true;
// /**
// * The chunks available for immediate processing (they must have been
// * materialized).
// * <p>
// * Note: This is package private so it will be visible to the
// * {@link QueryEngine}.
// */
// final/* private */BlockingQueue<IChunkMessage<IBindingSet>> chunksIn = new LinkedBlockingDeque<IChunkMessage<IBindingSet>>();
/**
* @param queryEngine
* The {@link QueryEngine} on which the query is running. In
* scale-out, a query is typically instantiated on many
* {@link QueryEngine}s.
* @param queryId
* The identifier for that query.
* @param controller
* <code>true</code> iff the {@link QueryEngine} is the query
* controller for this query (the {@link QueryEngine} which will
* coordinate the query evaluation).
* @param clientProxy
* The query controller. In standalone, this is the same as the
* <i>queryEngine</i>. In scale-out, this is a proxy for the
* query controller whenever the query is instantiated on a node
* other than the query controller itself.
* @param query
* The query.
* @param saStats
* Statistics object containing static analysis statistics
*
* @throws IllegalArgumentException
* if any argument is <code>null</code>.
* @throws IllegalArgumentException
* if the <i>readTimestamp</i> is {@link ITx#UNISOLATED}
* (queries may not read on the unisolated indices).
* @throws IllegalArgumentException
* if the <i>writeTimestamp</i> is neither
* {@link ITx#UNISOLATED} nor a read-write transaction
* identifier.
*/
public ChunkedRunningQuery(final QueryEngine queryEngine, final UUID queryId,
final boolean controller, final IQueryClient clientProxy,
final PipelineOp query, final IChunkMessage<IBindingSet> realSource) {
super(queryEngine, queryId, controller, clientProxy, query, realSource);
this.chunkHandler = getChunkHandler(queryEngine, query);
this.operatorFutures = new ConcurrentHashMap<BSBundle, ConcurrentHashMap<ChunkFutureTask, ChunkFutureTask>>();
if (orderedOperatorQueueMap) {
this.operatorQueues = new ConcurrentSkipListMap<BSBundle, BlockingQueue<IChunkMessage<IBindingSet>>>();
} else {
this.operatorQueues = new ConcurrentHashMap<BSBundle, BlockingQueue<IChunkMessage<IBindingSet>>>();
}
}
/**
* Make a chunk of binding sets available for consumption by the query.
* <p>
* Note: this is invoked by {@link QueryEngine#acceptChunk(IChunkMessage)}
*
* @param msg
* The chunk.
*
* @return <code>true</code> if the message was accepted.
*/
@Override
protected boolean acceptChunk(final IChunkMessage<IBindingSet> msg) {
if (msg == null)
throw new IllegalArgumentException();
if (!msg.isMaterialized())
throw new IllegalStateException();
// if (SolutionsLog.solutionsLog.isInfoEnabled())
// SolutionsLog.log(msg);
final BSBundle bundle = new BSBundle(msg.getBOpId(), msg
.getPartitionId());
lock.lock();
try {
if (isDone()) {
// The query is no longer running.
msg.release();
return false;
//throw new RuntimeException(ERR_QUERY_DONE, future.getCause());
}
BlockingQueue<IChunkMessage<IBindingSet>> queue = operatorQueues
.get(bundle);
if (queue == null) {
/*
* There is no input queue for this operator, so we create one
* now while we are holding the lock. If the target is a
* pipelined operator, then we impose a limit on the #of
* messages which may be buffered for that operator. If the
* operator is NOT pipelined, e.g., ORDER_BY, then we use an
* unbounded queue.
*
* TODO Unit/stress tests with capacity set to 1.
*/
// The target operator for this message.
final PipelineOp bop = (PipelineOp) getBOp(msg.getBOpId());
final int capacity = bop.isPipelinedEvaluation() ? bop.getProperty(
PipelineOp.Annotations.PIPELINE_QUEUE_CAPACITY,
PipelineOp.Annotations.DEFAULT_PIPELINE_QUEUE_CAPACITY)
: Integer.MAX_VALUE;
// Create a new queue using [lock].
queue = new com.bigdata.jsr166.LinkedBlockingDeque<IChunkMessage<IBindingSet>>(//
capacity,
lock);
// Add to the collection of operator input queues.
if (operatorQueues.put(bundle, queue) != null) {
/*
* There must not be an entry for this operator. We checked
* for this above. Nobody else should be adding entries into
* the [operatorQueues] map.
*/
throw new AssertionError(bundle.toString());
}
}
// offer (non-blocking)
if (!queue.offer(msg)) {
// Offer failed. Work queue will block.
getQueryEngine().counters.blockedWorkQueueCount.increment();
getQueryEngine().counters.blockedWorkQueueRunningTotal.increment();
try {
// blocking put()
queue.put(msg);
getQueryEngine().counters.bufferedChunkMessageCount.increment();
if (msg instanceof LocalNativeChunkMessage) {
getQueryEngine().counters.bufferedChunkMessageBytesOnNativeHeap
.add(((LocalNativeChunkMessage) msg).getByteCount());
}
} finally {
// Queue unblocked
getQueryEngine().counters.blockedWorkQueueCount.decrement();
}
}
// and see if the target operator can run now.
scheduleNext(bundle);
return true;
} catch(InterruptedException ex) {
// wrap interrupt thrown out of queue.put(msg);
msg.release();
throw new RuntimeException(ex);
} finally {
lock.unlock();
}
}
/**
* {@inheritDoc}.
* <p>
* Examines the input queue for each (bopId,partitionId). If there is work
* available and no task is currently running, then drain the work queue and
* submit a task to consume that work.
*/
@Override
protected void consumeChunk() {
lock.lock();
try {
for (BSBundle bundle : operatorQueues.keySet()) {
scheduleNext(bundle);
}
} catch (RuntimeException ex) {
halt(ex);
} finally {
lock.unlock();
}
}
/**
* Overridden to attempt to consume another chunk each time an operator
* reports that it has halted evaluation. This is necessary because the
* haltOp() message can arrive asynchronously, so we need to test the work
* queues in case there are "at-once" operators awaiting the termination of
* their predecessor(s) in the pipeline.
*/
@Override
protected void haltOp(final IHaltOpMessage msg) {
lock.lock();
try {
super.haltOp(msg);
consumeChunk();
} finally {
lock.unlock();
}
}
/**
* Examine the input queue for the (bopId,partitionId). If there is work
* available, then drain the work queue and submit a task to consume that
* work. This handles {@link PipelineOp.Annotations#MAX_PARALLEL},
* {@link PipelineOp.Annotations#PIPELINED}, and
* {@link PipelineOp.Annotations#MAX_MESSAGES_PER_TASK} as special cases.
*
* @param bundle
* The (bopId,partitionId).
*
* @return <code>true</code> if a new task was started.
*
* @todo Also handle {@link PipelineOp.Annotations#MAX_MEMORY} here by
* handshaking with the {@link FederatedRunningQuery}.
*/
private boolean scheduleNext(final BSBundle bundle) {
if (bundle == null)
throw new IllegalArgumentException();
final PipelineOp bop = (PipelineOp) getBOp(bundle.bopId);
final int maxParallel = bop.getMaxParallel();
final boolean pipelined = bop.isPipelinedEvaluation();
final int maxMessagesPerTask = bop.getProperty(
PipelineOp.Annotations.MAX_MESSAGES_PER_TASK,
PipelineOp.Annotations.DEFAULT_MAX_MESSAGES_PER_TASK);
// final long maxMemory = bop.getProperty(
// PipelineOp.Annotations.MAX_MEMORY,
// PipelineOp.Annotations.DEFAULT_MAX_MEMORY);
lock.lock();
try {
// Make sure the query is still running.
if(isDone())
return false;
// Is there a Future for this (bopId,partitionId)?
ConcurrentHashMap<ChunkFutureTask, ChunkFutureTask> map = operatorFutures
.get(bundle);
// #of instances of the operator already running.
int nrunning = 0;
if (map != null) {
for (ChunkFutureTask cft : map.keySet()) {
if (cft.isDone()) {
// Remove tasks which have already terminated.
map.remove(cft);
}
nrunning++;
}
if (nrunning == 0) {
// No tasks running for this operator.
if(removeMapOperatorFutureEntries)
if (map != operatorFutures.remove(bundle)) throw new AssertionError();
}
}
if (nrunning >= maxParallel) {
/*
* Defer concurrent execution for the same (bopId,shardId) since
* there are already at lease [maxParallel] instances of this
* operator running for that (bopId,shardId).
*/
if (log.isDebugEnabled())
log.debug("Deferring next execution: " + bundle
+ ", #running=" + nrunning + ", maxParallel="
+ maxParallel + ", runState=" + runStateString());
return false;
}
// {
// /*
// * Verify that we can acquire sufficient permits to do some
// * work.
// */
// final BlockingQueue<IChunkMessage<IBindingSet>> queue = operatorQueues
// .get(bundle);
// if (queue == null || queue.isEmpty()) {
// // No work.
// return false;
// }
// // The queue could be increased, but this will be its minimum size.
// final int minQueueSize = queue.size();
// if(!outstandingMessageSemaphore.tryAcquire(minQueueSize)) {
// // Not enough permits.
// System.err.println("Permits: required=" + minQueueSize
// + ", available="
// + outstandingMessageSemaphore.availablePermits()
// + ", bundle=" + bundle);
// return false;
// }
//
// }
// Get the work queue for that (bopId,partitionId).
final BlockingQueue<IChunkMessage<IBindingSet>> queue = operatorQueues
.get(bundle);
if (queue == null) {
// no work
return false;
}
// if (false && pipelined && !getQueryEngine().isScaleOut()) {
// /*
// * For pipelined operators, examine the sink and altSink (if
// * different). For each of the sink and altSink, if the
// * (alt)sink operator is also pipelined and there are more than
// * maxMessagesPerTask messages on its work queue, then this
// * operator needs to block, which we handle by deferring its
// * next evaluation task. This allows us to bound the amount of
// * data placed onto the JVM heap.
// *
// * FIXME In order to do this for scale-out, we need to examine
// * each of the shardIds for the (alt)sink, for example, by
// * organizing the operatorQueues as a nested map: (bopId =>
// * (shardId => queue)).
// */
//
// // parent (null if this is the root of the operator tree).
// final BOp p = BOpUtility.getParent(getQuery(), bop);
// /*
// * The sink is the parent. The parent MUST have an id so we can
// * target it with a message. (The sink will be null iff there is
// * no parent for this operator.)
// */
// final Integer sinkId = BOpUtility.getEffectiveDefaultSink(bop,
// p);
// // altSink (null when not specified).
// final Integer altSinkId = (Integer) bop
// .getProperty(PipelineOp.Annotations.ALT_SINK_REF);
//
// if (sinkId != null) {
//
// // Examine the work queue for the sink.
//
// final BlockingQueue<IChunkMessage<IBindingSet>> sinkQueue = operatorQueues
// .get(new BSBundle(sinkId, -1));
//
// final int sinkQueueSize = sinkQueue == null ? 0 : sinkQueue
// .size();
//
// final int sinkMaxMessagesPerTask = getBOp(sinkId).getProperty(
// PipelineOp.Annotations.MAX_MESSAGES_PER_TASK,
// PipelineOp.Annotations.DEFAULT_MAX_MESSAGES_PER_TASK);
//
// if (sinkQueueSize > sinkMaxMessagesPerTask) {
// log.warn("Waiting on consumer(s): bopId="
// + bundle.bopId + ", nrunning=" + nrunning
// + ", maxParallel=" + maxParallel + ", sink="
// + sinkId + ", sinkQueueSize=" + sinkQueueSize
// + ", sinkMaxMessagesPerTask="
// + sinkMaxMessagesPerTask+", runState="+runStateString());
// return false;
// }
//
// }
//
// if (altSinkId != null && !altSinkId.equals(sinkId)) {
//
// // Examine the work queue for the altSink.
//
// final BlockingQueue<IChunkMessage<IBindingSet>> sinkQueue = operatorQueues
// .get(new BSBundle(altSinkId, -1));
//
// final int sinkQueueSize = sinkQueue == null ? 0 : sinkQueue
// .size();
//
// final int sinkMaxMessagesPerTask = getBOp(altSinkId).getProperty(
// PipelineOp.Annotations.MAX_MESSAGES_PER_TASK,
// PipelineOp.Annotations.DEFAULT_MAX_MESSAGES_PER_TASK);
//
// if (sinkQueueSize > sinkMaxMessagesPerTask) {
// log.warn("Waiting on consumer(s): bopId="
// + bundle.bopId + ", nrunning=" + nrunning
// + ", maxParallel=" + maxParallel + ", altSink="
// + altSinkId + ", altSinkQueueSize="
// + sinkQueueSize + ", sinkMaxMessagesPerTask="
// + sinkMaxMessagesPerTask+", runState="+runStateString());
// return false;
// }
//
// }
//
// }
if (queue.isEmpty()) {
// No work, so remove work queue for (bopId,partitionId).
if(removeMapOperatorQueueEntries)
if (queue != operatorQueues.remove(bundle)) throw new AssertionError();
return false;
}
/*
* true iff operator requires at once evaluation and all solutions
* are now available for that operator.
*/
boolean atOnceReady = false;
if (!pipelined) {
if (!isAtOnceReady(bundle.bopId)) {
/*
* This operator is not pipelined, so we need to wait until
* all of its input solutions have been materialized (no
* prior operator in the pipeline is running or has inputs
* available which could cause it to run).
*
* TODO This is where we should examine MAX_MEMORY and the
* buffered data to see whether or not to trigger an
* evaluation pass for the operator based on the data
* already materialized for that operator.
*/
if (log.isDebugEnabled())
log.debug("Waiting on producer(s): bopId="
+ bundle.bopId);
return false;
}
atOnceReady = true;
}
/*
* Drain the work queue for that (bopId,partitionId).
*
* Note: If the operator is pipelined, then we do not drain more
* than [maxMessagesPerTask] messages at a time. The remainder are
* left on the work queue for the next task instance which we start
* for this operator.
*/
final List<IChunkMessage<IBindingSet>> accepted = new LinkedList<IChunkMessage<IBindingSet>>();
try {
/*
* Note: Once we drain these messages from the work queue we are
* responsible for calling release() on them.
*/
queue.drainTo(accepted, pipelined ? maxMessagesPerTask
: Integer.MAX_VALUE);
// #of messages accepted from the work queue.
final int naccepted = accepted.size();
getQueryEngine().counters.bufferedChunkMessageCount.add(-naccepted);
{
long byteCount = 0;
for(IChunkMessage<?> msg : accepted) {
if (msg instanceof LocalNativeChunkMessage) {
byteCount += ((LocalNativeChunkMessage) msg).getByteCount();
}
}
if(byteCount!=0) {
getQueryEngine().counters.bufferedChunkMessageBytesOnNativeHeap.add(-byteCount);
}
}
// #of messages remaining on the work queue.
final int nremaining = queue.size();
if (nremaining == 0) {
// Remove the work queue for that (bopId,partitionId).
if(removeMapOperatorQueueEntries)
if(queue != operatorQueues.remove(bundle)) throw new AssertionError();
} else if (pipelined) {
/*
* After removing the maximum amount from a pipelined operator,
* the work queue is still not empty.
*/
if (INFO)
log.info("Work queue is over capacity: bundle=" + bundle
+ ", naccepted=" + naccepted + ", nremaining="
+ nremaining + ", maxMessagesPerTask="
+ maxMessagesPerTask + ", runState="
+ runStateString());
}
/*
* Combine the messages into a single source to be consumed by a
* task.
*/
int nassigned = 1;
final Iterator<IChunkMessage<IBindingSet>> mitr = accepted.iterator();
final IChunkMessage<IBindingSet> firstChunk = mitr.next();
// See BOpContext#isLastInvocation()
final boolean isLastInvocation = pipelined
// && nremaining == 0
// && maxParallel == 1
// && isOperatorDone(bundle.bopId)
&& firstChunk.isLastInvocation()
;
/*
* Note: There is no longer any reliance on the IAsynchronous
* Iterator API here. It is perfectly sufficient to only
* implement ICloseableIterator. Query operator and chunk
* message implementations should be revisited with this
* simplifying assumption in mind.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/475
*/
final IMultiSourceCloseableIterator<IBindingSet[]> source = new MultiSourceSequentialCloseableIterator<IBindingSet[]>(//
// accepted.remove(0).getChunkAccessor().iterator()//
firstChunk.getChunkAccessor().iterator()//
);
// for (IChunkMessage<IBindingSet> msg : accepted) {
// source.add(msg.getChunkAccessor().iterator());
// #of solutions accepted across those chunk messages.
final long solutionsAccepted;
{
long na = firstChunk.getSolutionCount();
while (mitr.hasNext()) {
final IChunkMessage<IBindingSet> msg = mitr.next();
na += msg.getSolutionCount();
source.add(msg.getChunkAccessor().iterator());
nassigned++;
}
solutionsAccepted = na;
}
if (nassigned != naccepted)
throw new AssertionError();
/*
* Create task to consume that source.
*/
final ChunkFutureTask cft;
try {
cft = new ChunkFutureTask(
new ChunkTask(bundle.bopId, bundle.shardId,
naccepted, isLastInvocation, source));
} catch (Throwable t2) {
// Ensure accepted messages are released();
safeRelease(accepted);
halt(t2); // ensure query halts.
if (getCause() != null) {
// Abnormal termination - wrap and rethrow.
throw new RuntimeException(t2);
}
// normal termination - swallow the exception.
return false;
}
/*
* Save the Future for this task. Together with the logic above this
* may be used to limit the #of concurrent tasks per (bopId,shardId)
* to one for a given query.
*/
if (map == null) {
map = new ConcurrentHashMap<ChunkFutureTask, ChunkFutureTask>();
operatorFutures.put(bundle, map);
}
map.put(cft, cft);
/*
* Submit task for execution (asynchronous).
*/
if (INFO)
log.info("Running task: bop=" + bundle.bopId
+ (pipelined?"":", atOnceReady=" + atOnceReady) + ", bop="
+ bop.toShortString() + ", messages=" + naccepted
+ ", solutions=" + solutionsAccepted
+ (log.isDebugEnabled()?", runState=" + runStateString():""));
getQueryEngine().execute(cft);
return true;
} catch(Throwable t) {
// Ensure accepted messages are released();
safeRelease(accepted);
// wrap and rethrow cause.
throw new RuntimeException(t);
}
} finally {
lock.unlock();
}
}
/**
* Ensure messages are {@link IChunkMessage#release() released()}. Nothing
* is thrown unless the {@link Throwable} has a root cause which indicates
* an interrupt.
*
* @param accepted
* The messages.
*
* @throws RuntimeException
* wrapping an {@link InterruptedException} cause.
*/
private void safeRelease(final List<IChunkMessage<IBindingSet>> accepted) {
if (accepted == null)
return;
try {
for (IChunkMessage<IBindingSet> msg : accepted) {
try {
msg.release();
} catch (Throwable t2) {
if (isRootCauseInterrupt(t2)) {
throw t2;
}
log.error("Could not release message: " + msg, t2);
}
}
} catch (Throwable t2) {
if (isRootCauseInterrupt(t2)) {
// Wrap and throw out.
throw new RuntimeException(t2);
}
log.error(t2, t2);
}
}
/**
* A {@link FutureTask} which conditionally schedules another task for the
* same (bopId, partitionId) once this the wrapped {@link ChunkTask} is
* done. This is similar to the {@link Memoizer} pattern. This class
* coordinates with the {@link #operatorFutures}, which maintains a map of
* all currently running tasks which are consuming chunks.
* <p>
* The {@link ChunkTask} is wrapped by a {@link ChunkTaskWrapper} which is
* responsible for communicating the changes in the query's running state
* back to the {@link RunState} object on the query controller.
*/
private class ChunkFutureTask extends FutureTaskMon<Void> {
private final ChunkTask t;
public ChunkFutureTask(final ChunkTask chunkTask) {
/*
* Note: wraps chunk task to communicate run state changes back to
* the query controller.
*/
super(new ChunkTaskWrapper(chunkTask), null/* result */);
this.t = chunkTask;
}
@Override
public void run() {
try {
super.run();
} catch(Throwable t) {
// ensure query halts.
halt(t);
if (getCause() != null) {
// abnormal termination. wrap and rethrow.
throw new RuntimeException(t);
}
// otherwise ignore exception (normal termination).
} finally {
/*
* This task is done executing so remove its Future before we
* attempt to schedule another task for the same
* (bopId,partitionId).
*/
final ConcurrentHashMap<ChunkFutureTask, ChunkFutureTask> map = operatorFutures
.get(new BSBundle(t.bopId, t.partitionId));
if (map != null) {
map.remove(this, this);
}
}
// Schedule another task if any messages are waiting.
ChunkedRunningQuery.this.scheduleNext(new BSBundle(t.bopId,
t.partitionId));
}
}
/**
* Wraps the {@link ChunkTask} and handles various handshaking with the
* {@link ChunkedRunningQuery} and the {@link RunState} on the query controller.
* Since starting and stopping a {@link ChunkTask} requires handshaking with
* the query controller (and thus can require RMI), it is important that
* these actions take place once the task has been submitted - otherwise
* they would be synchronous in the loop which consumes available chunks and
* generates new {@link ChunkTask}s.
*/
private class ChunkTaskWrapper implements Runnable {
private final ChunkTask t;
public ChunkTaskWrapper(final ChunkTask chunkTask) {
if (chunkTask == null)
throw new IllegalArgumentException();
this.t = chunkTask;
}
@Override
public void run() {
final QueryEngine queryEngine = getQueryEngine();
final UUID serviceId = queryEngine.getServiceUUID();
final QueryEngineCounters c = queryEngine.counters;
try {
/*
* Notify query controller that operator task will start (sync
* notification).
*
* Note: This is potentially an RMI back to the controller. It
* is invoked from within the running task in order to remove
* the latency for that RMI from the thread which submits tasks
* to consume chunks.
*
* FIXME This is a protocol that should be optimized to provide
* better throughput for scale-out. E.g., a single socket on
* which we transmit and receive notice about operator
* start/stop metadata using some non-blocking service.
*/
// final boolean lastPassRequested = ((PipelineOp) (t.bop))
// .isLastPassRequested();
getQueryController().startOp(
new StartOpMessage(getQueryId(), t.bopId,
t.partitionId, serviceId, t.messagesIn
// ,t.bop.getEvaluationContext(),
// lastPassRequested
));
/*
* Run the operator task.
*/
final long begin = System.currentTimeMillis();
try {
c.operatorStartCount.increment();
c.operatorActiveCount.increment();
t.call();
} catch(Throwable t2) {
halt(t2); // ensure query halts.
if (getCause() != null) {
// Abnormal termination - wrap and rethrow.
throw new RuntimeException(t2);
}
// normal termination - swallow the exception.
} finally {
c.operatorHaltCount.increment();
c.operatorActiveCount.decrement();
t.context.getStats().elapsed.add(System.currentTimeMillis()
- begin);
}
// Notify query controller that operator task did run.
sendHaltMessage(serviceId, t, null/* firstCause */);
} catch (Throwable ex1) {
/*
* Mark the query as halted on this node regardless of whether
* we are able to communicate with the query controller.
*
* Note: Invoking halt(t) here will log an error. This logged
* error message is necessary in order to catch errors in
* clientProxy.haltOp() (above and below).
*/
// ensure halted.
halt(ex1);
if (getCause() != null) {
// Log an error (abnormal termination only).
log.error("queryId=" + getQueryId() + ", bopId=" + t.bopId
+ ", bop=" + t.bop, ex1);
}
// Notify query controller that operator task did run.
sendHaltMessage(serviceId, t, getCause()/* firstCauseIfError */);
}
} // runOnce()
}
/**
* Send a {@link IHaltOpMessage}. Whether the delivery is synchronous or not
* depends on {@link #asynchronousHaltMessage}. If this is the controller
* and we are using synchronous message delivery, then the message is
* delivered directly to {@link AbstractRunningQuery#haltOp(IHaltOpMessage)}.
* Otherwise it will pass through {@link IQueryClient#haltOp(IHaltOpMessage)}
* .
*
* @param serviceId
* The service to be notified.
* @param t
* The {@link ChunkTask}.
* @param cause
* The first cause (or <code>null</code> if the task did not end
* with an error).
*/
private final void sendHaltMessage(final UUID serviceId, final ChunkTask t,
final Throwable cause) {
final IHaltOpMessage msg = new HaltOpMessage(getQueryId(), t.bopId,
t.partitionId, serviceId, cause, //t.sinkId,
t.sinkMessagesOut.get(), //t.altSinkId,
t.altSinkMessagesOut.get(),//
t.context.getStats());
if (asynchronousHaltMessage) {
try {
/*
* Queue a task to send the halt message to the query controller
* (asynchronous notification).
*/
final SendHaltMessageTask sendTask = new SendHaltMessageTask(
getQueryController(), msg, ChunkedRunningQuery.this);
// submit task (asynchronous execution).
t.context.getExecutorService().execute(sendTask);
} catch (RejectedExecutionException ex) {
// e.g., service is shutting down.
if (INFO)
log.info("Could not send message: " + msg, ex);
} catch (Throwable ex) {
log.error("Could not send message: " + msg + " : " + ex, ex);
}
} else {
/*
* Synchronous delivery (local).
*/
if (isController()) {
haltOp(msg);
} else {
new SendHaltMessageTask(getQueryController(), msg,
ChunkedRunningQuery.this).run();
}
}
}
/**
* Runnable evaluates an operator for some chunk of inputs. In scale-out,
* the operator may be evaluated against some partition of a scale-out
* index.
*/
private class ChunkTask implements Callable<Void> {
/** Alias for the {@link ChunkTask}'s logger. */
private final Logger log = chunkTaskLog;
/** The index of the bop which is being evaluated. */
private final int bopId;
/**
* The index partition against which the operator is being evaluated and
* <code>-1</code> if the operator is not being evaluated against a
* shard.
*/
private final int partitionId;
/** The operator which is being evaluated. */
private final BOp bop;
/**
* The index of the operator which is the default sink for outputs
* generated by this evaluation. This is the
* {@link BOp.Annotations#BOP_ID} of the parent of this operator. This
* will be <code>null</code> if the operator does not have a parent and
* is not a query since no outputs will be generated in that case.
*/
private final Integer sinkId;
/**
* The index of the operator which is the alternative sink for outputs
* generated by this evaluation. This is <code>null</code> unless the
* operator explicitly specifies an alternative sink using either
* {@link PipelineOp.Annotations#ALT_SINK_REF} or
* {@link PipelineOp.Annotations#ALT_SINK_GROUP}.
*/
private final Integer altSinkId;
/**
* The sink on which outputs destined for the {@link #sinkId} operator
* will be written and <code>null</code> if {@link #sinkId} is
* <code>null</code>.
*/
private final IBlockingBuffer<IBindingSet[]> sink;
/**
* The sink on which outputs destined for the {@link #altSinkId}
* operator will be written and <code>null</code> if {@link #altSinkId}
* is <code>null</code>.
*/
private final IBlockingBuffer<IBindingSet[]> altSink;
/**
* The evaluation context for this operator.
*/
private final BOpContext<IBindingSet> context;
/**
* {@link FutureTask} which evaluates the operator (evaluation is
* delegated to this {@link FutureTask}).
*/
private final FutureTask<Void> ft;
/**
* The #of input messages.
*/
final int messagesIn;
/** #of chunk messages out to sink. */
final AtomicInteger sinkMessagesOut = new AtomicInteger(0);
/** #of chunk messages out to altSink. */
final AtomicInteger altSinkMessagesOut = new AtomicInteger(0);
/**
* A human readable representation of the {@link ChunkTask}'s state.
*/
@Override
public String toString() {
return "ChunkTask" + //
"{query=" + getQueryId() + //
",bopId=" + bopId + //
",partitionId=" + partitionId + //
",sinkId=" + sinkId + //
",altSinkId=" + altSinkId + //
"}";
}
/**
* Core implementation.
* <p>
* This looks up the {@link BOp} which is the target for the message in
* the {@link IRunningQuery#getBOpIndex() BOp Index}, creates the
* sink(s) for the {@link BOp}, creates the {@link BOpContext} for that
* {@link BOp}, and wraps the value returned by
* {@link PipelineOp#eval(BOpContext)} in order to handle the outputs
* written on those sinks.
*
* @param bopId
* The operator to which the message was addressed.
* @param partitionId
* The partition identifier to which the message was
* addressed.
* @param messagesIn
* The number of {@link IChunkMessage} to be consumed by this
* task.
* @param isLastInvocation
* iff the caller knows that no additional messages can be
* produced which target this operator (regardless of the
* shard).
* @param src
* Where the task will read its inputs.
*/
public ChunkTask(final int bopId, final int partitionId,
final int messagesIn, boolean isLastInvocation,
final ICloseableIterator<IBindingSet[]> src) {
this.bopId = bopId;
this.partitionId = partitionId;
this.messagesIn = messagesIn;
bop = getBOpIndex().get(bopId);
if (bop == null)
throw new NoSuchBOpException(bopId);
if (!(bop instanceof PipelineOp))
throw new UnsupportedOperationException(bop.getClass()
.getName());
// self
final PipelineOp op = ((PipelineOp) bop);
// parent (null if this is the root of the operator tree).
final BOp p = BOpUtility.getParent(getQuery(), op);
/*
* The sink is the parent. The parent MUST have an id so we can
* target it with a message. (The sink will be null iff there is no
* parent for this operator.)
*/
sinkId = BOpUtility.getEffectiveDefaultSink(bop, p);
// altSink (null when not specified).
altSinkId = (Integer) op
.getProperty(PipelineOp.Annotations.ALT_SINK_REF);
// {
// // altSink (null when not specified).
// final Integer altSinkId = (Integer) op
// .getProperty(PipelineOp.Annotations.ALT_SINK_REF);
// final Integer altSinkGroup = (Integer) op
// .getProperty(PipelineOp.Annotations.ALT_SINK_GROUP);
// if (altSinkId != null && altSinkGroup != null)
// throw new RuntimeException(
// "Annotations are mutually exclusive: "
// + PipelineOp.Annotations.ALT_SINK_REF
// + " and "
// + PipelineOp.Annotations.ALT_SINK_GROUP);
// if (altSinkGroup != null) {
// /*
// * Lookup the first pipeline op in the conditional binding
// * group and use its bopId as the altSinkId.
// */
// this.altSinkId = BOpUtility.getFirstBOpIdForConditionalGroup(
// getQuery(), altSinkGroup);
// } else {
// // MAY be null.
// this.altSinkId = altSinkId;
// }
// }
// Validate the altSink: it must exist and implies a primary sink.
if (altSinkId != null && !getBOpIndex().containsKey(altSinkId))
throw new NoSuchBOpException(altSinkId);
if (altSinkId != null && sinkId == null) {
throw new RuntimeException(
"The primary sink must be defined if the altSink is defined: "
+ bop);
}
/*
* Setup the BOpStats object. For some operators, e.g., SliceOp,
* this MUST be the same object across all invocations of that
* instance of that operator for this query. This is marked by the
* PipelineOp#isSharedState() method and is handled by a
* putIfAbsent() pattern when that method returns true.
*
* Note: RunState#haltOp() avoids adding a BOpStats object to itself
* since that would cause double counting when the same object is
* used for each invocation of the operator.
*
* Note: It tends to be more useful to have distinct BOpStats
* objects for each operator task instance that we run as this makes
* it possible to see how much work was performed by that task
* instance. The data are aggregated in the [statsMap] across the
* entire run of the query.
*/
final BOpStats stats;
if (((PipelineOp) bop).isSharedState()) {//|| statsMap != null) {
// shared stats object.
stats = getStats(bopId);
} else {
// distinct stats objects, aggregated as each task finishes.
stats = op.newStats();
// log.warn("bopId=" + bopId + ", stats=" + stats);
}
// Paranoia check: both getStats() and op.newStats() must return non-null.
if (stats == null) {
throw new AssertionError("No stats: op=" + op);
}
// // The groupId (if any) for this operator.
// final Integer fromGroupId = (Integer) op
// .getProperty(PipelineOp.Annotations.CONDITIONAL_GROUP);
if (p == null) {
/*
* The top-most operator in the query plan is the last operator
* in evaluation order. It MUST run on the query controller.
* This is required in order for it to add solutions to the
* query buffer. The QueryEngine verifies this before it
* executes a query plan.
*
* Note: Operators MUST NOT close the query buffer since they do
* not have enough context to decide when the query is done. The
* query buffer will be closed when the query is cancelled.
* Closing it prematurely will cause the query to fail if any
* other operator evaluation pass attempts to write on the query
* buffer.
*/
sink = new NoCloseBuffer<IBindingSet[]>(getQueryId(), bop, bopId,
partitionId, getQueryBuffer());
} else {
// final BOp targetOp = getBOpIndex().get(sinkId);
// final Integer toGroupId = (Integer) targetOp
// .getProperty(PipelineOp.Annotations.CONDITIONAL_GROUP);
// final SinkTransitionMetadata stm = new SinkTransitionMetadata(
// fromGroupId, toGroupId, true/* isSink */);
sink = newBuffer(op, sinkId, //null/*stm*/,
sinkMessagesOut, stats);
}
if (altSinkId == null) {
altSink = null;
// } else if(altSinkId.equals(sinkId)){
/*
* @todo Note: The optimization when altSink:=sink is now only
* possible when the groupId is not changing during the
* transition.
*/
// altSink = sink;
} else {
// final BOp targetOp = getBOpIndex().get(altSinkId);
// final Integer toGroupId = (Integer) targetOp
// .getProperty(PipelineOp.Annotations.CONDITIONAL_GROUP);
// final SinkTransitionMetadata stm = new SinkTransitionMetadata(
// fromGroupId, toGroupId, false/* isSink */);
altSink = newBuffer(op, altSinkId, //null/* stm */,
altSinkMessagesOut, stats);
}
// context : @todo pass in IChunkMessage or IChunkAccessor
context = new BOpContext<IBindingSet>(ChunkedRunningQuery.this,
partitionId, stats, op, isLastInvocation, src, sink,
altSink);
// if(isLastInvocation)
// context.setLastInvocation();
// FutureTask for operator execution (not running yet).
if ((ft = op.eval(context)) == null)
throw new RuntimeException("No future: " + op);
}
/**
* Factory returns the {@link IBlockingBuffer} on which the operator
* should write its outputs which target the specified <i>sinkId</i>.
*
* @param op
* The operator whose evaluation task is being constructed.
* @param sinkId
* The identifier for an operator to which the task will
* write its solutions (either the primary or alternative
* sink).
* @param stats
* The statistics object for the evaluation of the operator.
*
* @return The buffer on which the operator should write outputs which
* target that sink.
*/
private IBlockingBuffer<IBindingSet[]> newBuffer(final PipelineOp op,
final int sinkId,//
final AtomicInteger sinkMessagesOut, //
final BOpStats stats//
) {
return new HandleChunkBuffer(ChunkedRunningQuery.this, bopId,
partitionId, sinkId, op.getChunkCapacity(),
op.isReorderSolutions(), sinkMessagesOut, stats);
}
/**
* Evaluate the {@link IChunkMessage}.
*/
@Override
public Void call() throws Exception {
if (log.isDebugEnabled())
log.debug("Running chunk: " + this);
try {
// Evaluate the operator in the caller's thread, then check its outcome.
ft.run(); // run
ft.get(); // verify success
/*
* If the operator ran successfully, then it should have flushed
* its sink(s) and closed them.
*/
if (sink != null) {
sink.flush();
sink.close();
}
if (altSink != null) {
altSink.flush();
altSink.close();
}
} catch (Throwable t) {
/*
* Ensure query halts.
*
* Note: This is where we attach the metadata about the operator
* and query for which the error was observed.
*/
halt(new Exception("task=" + toString() + ", cause=" + t, t));
if (getCause() != null) {
// Abnormal termination - wrap and rethrow.
// TODO Why is this line empty? (I think that it is handled by the ChunkTaskWrapper.)
}
// otherwise ignore exception (normal completion).
} finally {
/*
* Ensure that the source is closed.
*
* TODO This is not being guarded by a lock so we might not
* safely publish the state change to the source iterator when
* it is closed.
*/
context.getSource().close();
/**
* Ensure that the task is cancelled.
*
* Note: This does not appear to be necessary. I am observing
* the interrupt of the operator evaluation task regardless.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/707
* (BlockingBuffer.close() does not unblock threads)
* @see https://sourceforge.net/apps/trac/bigdata/ticket/716
* (Verify that IRunningQuery instances (and nested
* queries) are correctly cancelled when interrupted)
*/
// ft.cancel(true/*mayInterruptIfRunning*/);
}
// Done.
return null;
} // call()
} // class ChunkTask
/**
 * A delegation pattern which does not pass on the {@link #close()} method.
 * This is used to prevent a {@link PipelineOp} from accidentally closing
 * the query buffer.
 */
private class NoCloseBuffer<E> extends DelegateBuffer<E> {

    /** The query identifier (used when logging solutions). */
    private final UUID queryId;

    /** The operator which writes its outputs on this buffer. */
    private final BOp bop;

    /** The identifier of that operator. */
    private final int bopId;

    /**
     * The index partition identifier (<code>-1</code> when the operator is
     * not evaluated against a shard).
     * <p>
     * Note: Declared <code>private</code> for consistency with the other
     * fields of this class (it was previously package-private).
     */
    private final int partitionId;

    /**
     * @param queryId
     *            The query identifier.
     * @param bop
     *            The operator which writes on this buffer.
     * @param bopId
     *            The identifier of that operator.
     * @param partitionId
     *            The index partition identifier (or <code>-1</code>).
     * @param delegate
     *            The buffer to which all methods other than
     *            {@link #close()} are delegated.
     */
    public NoCloseBuffer(final UUID queryId, final BOp bop, final int bopId,
            final int partitionId, final IBlockingBuffer<E> delegate) {
        super(delegate);
        this.queryId = queryId;
        this.bop = bop;
        this.bopId = bopId;
        this.partitionId = partitionId;
    }

    @Override
    public void add(final E e) {
        super.add(e);
        if (SolutionsLog.INFO) {
            // Trace the solutions written on this buffer.
            SolutionsLog
                    .log(queryId, bop, bopId, partitionId, (IBindingSet[]) e);
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * Overridden as a NOP so that an operator writing on the query buffer
     * can not close it.
     */
    @Override
    public void close() {
        // NOP - This makes sure that the query buffer is not closed.
    }

} // class NoCloseBuffer
/**
* Class traps {@link #add(IBindingSet[])} to handle the {@link IBindingSet}
* [] chunks as they are generated by the running operator task, invoking
* {@link ChunkedRunningQuery#handleOutputChunk(BOp, int, IBlockingBuffer)}
* for each generated chunk to synchronously emit {@link IChunkMessage}s.
* <p>
* The use of this class significantly increases the parallelism and
* throughput of selective queries. If output chunks are not "handled" until
* the {@link ChunkTask} is complete then the total latency of selective
* queries is increased dramatically.
*/
static private class HandleChunkBuffer implements
IBlockingBuffer<IBindingSet[]> {
private final ChunkedRunningQuery q;
private final int bopId;
private final int partitionId;
private final int sinkId;
// /**
// * The desired chunk size.
// */
// private final int chunkCapacity;
/** The minimum desired chunk size (50% of the {@link #chunkCapacity}). */
private final int minChunkSize;
/** The maximum desired chunk size (150% of the {@link #chunkCapacity}) */
private final int maxChunkSize;
/**
* When <code>true</code>, the buffer MAY reorder solutions. When
* <code>false</code>, it MUST NOT.
*/
private final boolean reorderSolutions;
private final AtomicInteger sinkMessagesOut;
private final BOpStats stats;
/**
* Set to <code>false</code> by {@link #close()} and
* {@link #abort(Throwable)}; checked by {@link #add(IBindingSet[])} and
* {@link #isOpen()}.
*/
private volatile boolean open = true;
/**
* A list of small chunks that will be combined into a single chunk. The
* solutions in this list are always evicted by {@link #flush()}.
*/
private List<IBindingSet[]> smallChunks = null;
/**
* The #of elements in the {@link #smallChunks} buffer. Each element is
* an {@link IBindingSet}, so this is the number of solutions that have
* not yet been flushed through because we have not yet made up a single
* decent sized chunk.
*/
private int chunkSize = 0;
/**
*
* @param q
* @param bopId
* @param sinkId
* @param chunkCapacity
* The target capacity for each chunk.
* @param reorderSolutions
* When <code>true</code>, the buffer MAY reorder solutions.
* When <code>false</code>, it MUST NOT.
* @param sinkMessagesOut
* @param stats
*/
public HandleChunkBuffer(final ChunkedRunningQuery q, final int bopId,
final int partitionId,
final int sinkId, final int chunkCapacity,
final boolean reorderSolutions,
final AtomicInteger sinkMessagesOut, final BOpStats stats) {
this.q = q;
this.bopId = bopId;
this.partitionId = partitionId;
this.sinkId = sinkId;
// this.chunkCapacity = chunkCapacity;
this.reorderSolutions = reorderSolutions;
this.sinkMessagesOut = sinkMessagesOut;
this.stats = stats;
this.minChunkSize = (chunkCapacity >> 1); // 50%
this.maxChunkSize = chunkCapacity + (chunkCapacity >> 1); // 150%
}
/**
* Handle sink output, sending appropriate chunk message(s). This method
* MUST NOT block since that will deadlock the caller.
* <p>
* Note: This maps output over shards/nodes in s/o.
* <p>
* Note: This must be synchronized in case the caller is multi-threaded
* since it has a possible side effect on the internal buffer.
*
* @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/798" >
* Solution order not always preserved. </a>
*/
@Override
public void add(final IBindingSet[] e) {
if(!open)
throw new BufferClosedException();
if (SolutionsLog.INFO) {
SolutionsLog.log(q.getQueryId(), q.getBOp(bopId), bopId,
partitionId, (IBindingSet[]) e);
}
if (false) {
/*
* Note: Do this INSTEAD if you want to completely disable both
* reordering and chunk combination. This should ONLY be used
* for debugging. Chunk combination is an important throughput
* enhancer.
*/
// outputChunk(e);
} else {
if (reorderSolutions) {
// Solutions MAY be reordered.
addReorderAllowed(e);
} else {
// Solutions MUST NOT be reordered.
addReorderNotAllowed(e);
}
}
} // add()
/**
* We are allowed to reorder the solutions.
* <p>
* This will reorder solutions by outputting the current chunk
* immediately if it is GTE 50% of the target chunkCapacity. This is
* also a non-blocking code path (no lock is taken in this method).
* <p>
* Otherwise, the chunk is added {@link #smallChunks} list. If the #of
* solutions on the {@link #smallChunks} reaches a threshold, then the
* {@link #smallChunks} list is converted into a single chunk and
* evicted.
*/
private void addReorderAllowed(final IBindingSet[] e) {
if (e.length < minChunkSize) {
/*
* The caller's array is significantly smaller than the target
* chunk size. Append the caller's array to the internal list
* and return immediately. The buffered chunks will be copied
* through either in a subsequent add() or in flush().
*/
synchronized (this) {
if (chunkSize + e.length > maxChunkSize) {
// flush the buffer first.
outputBufferedChunk();
}
if (smallChunks == null)
smallChunks = new LinkedList<IBindingSet[]>();
// Add to the buffer.
smallChunks.add(e);
chunkSize += e.length;
return;
}
}
// output the caller's chunk immediately.
outputChunk(e);
}
/**
* We are not allowed to reorder the solutions.
* <p>
* This always outputs solutions in the same order that they are added.
* In order to avoid pushing through small chunks, it allows the output
* chunk to be over the target capacity (by 50%).
*
* @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/798" >
* Solution order not always preserved. </a>
*/
private void addReorderNotAllowed(final IBindingSet[] e) {
synchronized (this) {
if (chunkSize + e.length > maxChunkSize) {
/*
* The combined chunk would be too large for the buffer.
*/
// Flush the buffer.
outputBufferedChunk();
if (e.length > minChunkSize) {
/*
* The internal buffer is empty. The chunk is big
* enough. Send it through immediately.
*/
outputChunk(e);
return;
}
}
/*
* Add the chunk to the internal buffer.
*/
if (smallChunks == null)
smallChunks = new LinkedList<IBindingSet[]>();
// Add to the buffer.
smallChunks.add(e);
chunkSize += e.length;
} // synchronized(this)
}
/**
* Output a chunk, updating the counters.
*
* @param e
* The chunk.
*/
private void outputChunk(final IBindingSet[] e) {
// Note: this local shadows the [chunkSize] field of the same name.
final int chunkSize = e.length;
stats.unitsOut.add(chunkSize);
stats.chunksOut.increment();
final int messagesOut = q.getChunkHandler().handleChunk(q, bopId,
sinkId, e);
sinkMessagesOut.addAndGet(messagesOut);
}
/**
* Output the internal buffer.
*/
synchronized // Note: has side-effect on internal buffer.
private void outputBufferedChunk() {
if (smallChunks == null || chunkSize == 0) {
return;
}
if (smallChunks.size() == 1) {
// directly output a single small chunk.
outputChunk(smallChunks.get(0));
chunkSize = 0;
smallChunks = null;
return;
}
// exact fit buffer.
final IBindingSet[] chunk = new IBindingSet[chunkSize];
// copy the small chunks into the buffer.
int destPos = 0;
for (IBindingSet[] e : smallChunks) {
System.arraycopy(e/* src */, 0/* srcPos */, chunk/* dest */,
destPos, e.length/* length */);
destPos += e.length;
}
outputChunk(chunk);
chunkSize = 0;
smallChunks = null;
}
@Override
synchronized // Note: possible side-effect on internal buffer.
public long flush() {
if (open)
outputBufferedChunk();
return 0L;
// return sink.flush();
}
@Override
public void abort(final Throwable cause) {
open = false;
q.halt(cause);
// sink.abort(cause);
}
@Override
public void close() {
// sink.close();
open = false;
}
@Override
public Future getFuture() {
// return sink.getFuture();
return null;
}
@Override
public boolean isEmpty() {
return true;
// return sink.isEmpty();
}
@Override
public boolean isOpen() {
return open && !q.isDone();
// return sink.isOpen();
}
@Override
public IAsynchronousIterator<IBindingSet[]> iterator() {
throw new UnsupportedOperationException();
// return sink.iterator();
}
@Override
public void reset() {
// sink.reset();
}
@Override
public void setFuture(Future future) {
throw new UnsupportedOperationException();
// sink.setFuture(future);
}
@Override
public int size() {
return 0;
// return sink.size();
}
} // class HandleChunkBuffer
/**
 * {@link Runnable} sends the {@link IQueryClient} a message indicating that
 * some query has halted on some node. This is used to send such messages
 * asynchronously.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
 *         Thompson</a>
 */
private static class SendHaltMessageTask implements Runnable {

    /** The (possibly remote) query controller to be notified. */
    private final IQueryClient controllerProxy;

    /** The halt message to be delivered. */
    private final IHaltOpMessage haltMsg;

    /** The query on whose behalf the message is sent. */
    private final ChunkedRunningQuery runningQuery;

    public SendHaltMessageTask(final IQueryClient clientProxy,
            final IHaltOpMessage msg, final ChunkedRunningQuery q) {
        // All arguments are required.
        if (clientProxy == null || msg == null || q == null)
            throw new IllegalArgumentException();
        this.controllerProxy = clientProxy;
        this.haltMsg = msg;
        this.runningQuery = q;
    }

    @Override
    public void run() {
        try {
            if (!runningQuery.isController()) {
                // RMI.
                controllerProxy.haltOp(haltMsg);
                return;
            }
            /*
             * Local method call.
             *
             * Note: This MUST NOT be done using RMI when the operator
             * is using shared state to provide live statistics update
             * for operators which evaluate on the query controller.
             * Using RMI here results in double-counting with each new
             * haltOp and can cause queries to terminate too soon
             * because they have false #of chunks and units out
             * reported.
             *
             * @see https://sourceforge.net/apps/trac/bigdata/ticket/464
             */
            runningQuery.haltOp(haltMsg);
        } catch (Throwable e) {
            if (!isRootCauseInterrupt(e)) {
                /*
                 * @see https://sourceforge.net/apps/trac/bigdata/ticket/479
                 */
                log.error("Could not notify query controller: " + e, e);
            }
            // The controller could not be notified: cancel the query.
            runningQuery.cancel(true/* mayInterruptIfRunning */);
        }
    }
}
@Override
protected boolean cancelRunningOperators(final boolean mayInterruptIfRunning) {
    boolean anyCancelled = false;
    // Visit the future(s) for each (operator, shard) evaluation pass.
    for (ConcurrentHashMap<ChunkFutureTask, ChunkFutureTask> futures : operatorFutures
            .values()) {
        for (ChunkFutureTask future : futures.keySet()) {
            /*
             * Note: This can wind up setting the interrupt status on the
             * thread in which it is called. For example, SLICE will call
             * halt(), which calls AbstractRunningQuery#cancel(), which
             * calls this method. Since the SliceOp() is still running it's
             * future will be cancelled here. The code in
             * AbstractRunningQuery#cancel() needs to be robust in the face
             * of that interrupt.
             */
            // Note: |= always invokes cancel() (no short-circuit).
            anyCancelled |= future.cancel(mayInterruptIfRunning);
        }
    }
    return anyCancelled;
}
@Override
protected void releaseAcceptedMessages() {
    // Drain and release the chunk messages queued for each operator.
    for (Map.Entry<BSBundle, BlockingQueue<IChunkMessage<IBindingSet>>> entry : operatorQueues
            .entrySet()) {
        final BlockingQueue<IChunkMessage<IBindingSet>> workQueue = entry
                .getValue();
        if (workQueue.isEmpty())
            continue;
        // Move the messages out of the queue before releasing them.
        final List<IChunkMessage<IBindingSet>> drained = new LinkedList<IChunkMessage<IBindingSet>>();
        workQueue.drainTo(drained);
        for (IChunkMessage<IBindingSet> msg : drained) {
            msg.release();
        }
    }
}
/**
 * Return a summary of the work queue for the operators in this query
 * (non-blocking).
 *
 * @return A map whose keys are the operator identifiers and whose values
 *         provide summary statistics for the work queue(s) for those
 *         operators.
 *         <p>
 *         Note: For a cluster, there is one work queue per (operator,shard)
 *         pair.
 */
protected Map<Integer/* bopId */, QueueStats> getQueueStats() {
    final Map<Integer, QueueStats> result = new HashMap<Integer, QueueStats>();
    for (Map.Entry<BSBundle, BlockingQueue<IChunkMessage<IBindingSet>>> entry : operatorQueues
            .entrySet()) {
        final BSBundle bundle = entry.getKey();
        // Non-blocking snapshot of the queued messages.
        @SuppressWarnings("unchecked")
        final IChunkMessage<IBindingSet>[] msgs = entry.getValue()
                .toArray(new IChunkMessage[0]);
        if (msgs.length == 0)
            continue;
        final Integer key = Integer.valueOf(bundle.bopId);
        // One QueueStats per operator, aggregated over its shards.
        QueueStats summary = result.get(key);
        if (summary == null) {
            summary = new QueueStats();
            result.put(key, summary);
        }
        summary.shardSet.add(bundle.shardId);
        for (IChunkMessage<IBindingSet> msg : msgs) {
            summary.chunkCount++;
            summary.solutionCount += msg.getSolutionCount();
        }
    }
    return result;
}
/**
 * Factory returns the effective {@link IChunkHandler} for this query.
 *
 * @param queryEngine
 * @param query
 *
 * @return The effective {@link IChunkHandler} policy.
 *
 * @see QueryHints#QUERY_ENGINE_CHUNK_HANDLER
 * @see BLZG-533 Vector query engine on native heap.
 */
protected IChunkHandler getChunkHandler(final QueryEngine queryEngine, final PipelineOp query) {
    // Honor whatever chunk handler policy the query hints configured,
    // falling back to the query engine's default.
    final IChunkHandler configured = query.getProperty(
            QueryEngine.Annotations.CHUNK_HANDLER,
            QueryHints.DEFAULT_QUERY_ENGINE_CHUNK_HANDLER);
    return configured;
}
/**
 * Return the effective {@link IChunkHandler} for this query (simple
 * accessor for the cached policy).
 *
 * @see BLZG-533 Vector query engine on native heap.
 */
protected final IChunkHandler getChunkHandler() {
    return chunkHandler;
}
}