package com.bigdata.relation.rule.eval.pipeline;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.UUID;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IVariable;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.relation.IMutableRelation;
import com.bigdata.relation.accesspath.AbstractUnsynchronizedArrayBuffer;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.eval.ActionEnum;
import com.bigdata.relation.rule.eval.IJoinNexus;
import com.bigdata.relation.rule.eval.IRuleState;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.service.AbstractDistributedFederation;
import com.bigdata.service.AbstractScaleOutFederation;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.IDataService;
import com.bigdata.service.Session;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.concurrent.Computable;
import com.bigdata.util.concurrent.Memoizer;
import cutthecrap.utils.striterators.Filter;
import cutthecrap.utils.striterators.Resolver;
import cutthecrap.utils.striterators.Striterator;
/**
* Implementation used by scale-out deployments. There will be one instance
* of this task per index partition on which the rule will read. Those
* instances will be in-process on the {@link DataService} hosting that
* index partition. Instances are created on the {@link DataService} using
* the {@link JoinTaskFactoryTask} helper class.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class DistributedJoinTask extends JoinTask {
/**
* When <code>true</code>, enables a trace on {@link System#err} of
* the code polling the source {@link IAsynchronousIterator}s from
* which this {@link DistributedJoinTask} draws its {@link IBindingSet}
* chunks.
*/
static private final boolean trace = false;
/**
* The federation is used to obtain locator scans for the access paths.
*/
final protected AbstractScaleOutFederation<?> fed;
/**
* The {@link IJoinNexus} for the {@link IBigdataFederation}. This is
* mainly used to set up the {@link #solutionBuffer} since it needs to
* write on the scale-out index while the {@link AccessPathTask} will
* read on the local index partition view.
*/
final protected IJoinNexus fedJoinNexus;
/**
* A (proxy for the) {@link Future} for this {@link DistributedJoinTask}.
*/
protected Future<Void> futureProxy;
/**
* @see IRuleState#getKeyOrder()
*/
final private IKeyOrder<?>[] keyOrders;
/**
* The name of the scale-out index associated with the next
* {@link IPredicate} in the evaluation order and <code>null</code>
* iff this is the last {@link IPredicate} in the evaluation order (used
* for logging only).
*/
final private String nextScaleOutIndexName;
/**
* Sources for {@link IBindingSet} chunks that will be processed by this
* {@link JoinTask}. There will be one such source for each upstream
* {@link JoinTask} that targets this {@link JoinTask}.
* <p>
* Note: This is a thread-safe collection since new sources may be added
* asynchronously during processing.
*/
final private Vector<IAsynchronousIterator<IBindingSet[]>> sources = new Vector<IAsynchronousIterator<IBindingSet[]>>();
/**
* <code>false</code> until all binding sets have been consumed and the
* join task has made an atomic decision that it will not accept any new
* sources. Note that the join task may still be consuming binding sets once
* this flag is set - it is not necessarily done with its work, just not
* willing to accept new {@link #sources}.
*
* @todo rename as sourcesClosed
*/
private boolean sourcesExhausted = false;
/**
* The {@link DataService} on which this task is executing. This is used to
* remove the entry for the task from {@link DataService#getSession()}.
*/
private final DataService dataService;
// /**
// * The {@link JoinTaskSink}s for the downstream
// * {@link DistributedJoinTask}s onto which the generated
// * {@link IBindingSet}s will be written. This is <code>null</code>
// * for the last join since we will write solutions onto the
// * {@link #getSolutionBuffer()} instead.
// *
// * @todo configure capacity based on expectations of index partition
// * fan-out for this join dimension
// */
// final private Map<PartitionLocator, JoinTaskSink> sinkCache;
public DistributedJoinTask(
// final String scaleOutIndexName,
final IRule rule,//
final IJoinNexus joinNexus,//
final int[] order,//
final int orderIndex,//
final int partitionId,//
final AbstractScaleOutFederation<?> fed,//
final IJoinMaster master,//
final UUID masterUUID,//
final IAsynchronousIterator<IBindingSet[]> src,//
final IKeyOrder[] keyOrders,//
final DataService dataService,//
final IVariable[][] requiredVars//
) {
super(
/*DataService.getIndexPartitionName(scaleOutIndexName,
partitionId),*/ rule, joinNexus, order, orderIndex,
partitionId, master, masterUUID, requiredVars);
if (fed == null)
throw new IllegalArgumentException();
if (src == null)
throw new IllegalArgumentException();
if (dataService == null)
throw new IllegalArgumentException();
// Note: This MUST be the index manager for the local data service.
if (joinNexus.getIndexManager() instanceof IBigdataFederation<?>)
throw new IllegalArgumentException();
this.fed = fed;
this.keyOrders = keyOrders;
this.dataService = dataService;
// This is the index manager for the federation (scale-out indices).
this.fedJoinNexus = joinNexus.getJoinNexusFactory().newInstance(fed);
if (lastJoin) {
// sinkCache = null;
memo = null;
nextScaleOutIndexName = null;
final ActionEnum action = fedJoinNexus.getAction();
if (action.isMutation()) {
/*
* Note: The solution buffer for mutation operations
* is obtained locally from a joinNexus that is
* backed by the federation NOT the local index
* manager. (This is because the solution buffer
* needs to write on the scale-out indices.)
*/
final IJoinNexus tmp = fedJoinNexus;
/*
* The view of the mutable relation for the _head_ of the
* rule.
*/
final IMutableRelation<?> relation = (IMutableRelation<?>) tmp
.getHeadRelationView(rule.getHead());
switch (action) {
case Insert: {
solutionBuffer = tmp.newInsertBuffer(relation);
break;
}
case Delete: {
solutionBuffer = tmp.newDeleteBuffer(relation);
break;
}
default:
throw new AssertionError();
}
} else {
/*
* The solution buffer for queries is obtained from the
* master.
*/
try {
solutionBuffer = masterProxy.getSolutionBuffer();
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
} else {
final IPredicate<?> nextPredicate = rule
.getTail(order[orderIndex + 1]);
final String namespace = nextPredicate.getOnlyRelationName();
nextScaleOutIndexName = namespace +"."
+ keyOrders[order[orderIndex + 1]];
solutionBuffer = null;
// sinkCache = new LinkedHashMap<PartitionLocator, JoinTaskSink>();
memo = new SinkMemoizer(getSink);
// System.err.println("orderIndex=" + orderIndex + ", resources="
// + Arrays.toString(getResource()) + ", nextPredicate="
// + nextPredicate + ", nextScaleOutIndexName="
// + nextScaleOutIndexName);
}
addSource(src);
}
/**
* Adds a source from which this {@link DistributedJoinTask} will read
* {@link IBindingSet} chunks.
*
* @param source
* The source.
*
* @return <code>true</code> iff the source was accepted.
*
* @throws IllegalArgumentException
* if the <i>source</i> is <code>null</code>.
*/
public boolean addSource(final IAsynchronousIterator<IBindingSet[]> source) {
if (source == null)
throw new IllegalArgumentException();
lock.lock();
try {
if (sourcesExhausted) {
// new source declarations are rejected.
if (INFO)
log.info("source rejected: orderIndex=" + orderIndex
+ ", partitionId=" + partitionId);
return false;
}
sources.add(source);
stats.fanIn++;
} finally {
lock.unlock();
}
if (DEBUG)
log.debug("orderIndex=" + orderIndex + ", partitionId="
+ partitionId + ", fanIn=" + stats.fanIn + ", fanOut="
+ stats.fanOut);
return true;
}
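/*
* Usage sketch for the atomic handoff described above. This is a minimal
* illustration, not part of this class: the session lookup/registration
* helpers named here are hypothetical stand-ins for what the
* JoinTaskFactoryTask does against the DataService session.
*
* DistributedJoinTask task = lookupInSession(namespace); // hypothetical
* if (task == null || !task.addSource(src)) {
*     // No task exists, or the existing task already made its atomic
*     // decision to stop accepting sources: create a new task for the
*     // same (masterUUID, orderIndex, partitionId) and register it.
*     task = createAndRegisterJoinTask(src); // hypothetical
* }
*/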
final protected IBuffer<ISolution[]> getSolutionBuffer() {
return solutionBuffer;
}
private final IBuffer<ISolution[]> solutionBuffer;
/**
* Sets a flag preventing new sources from being declared, closes all
* known {@link #sources}, and removes this task from the {@link Session}.
*/
protected void closeSources() {
if (INFO)
log.info(toString());
lock.lock();
try {
sourcesExhausted = true;
final IAsynchronousIterator<?>[] a = sources
.toArray(new IAsynchronousIterator[] {});
for (IAsynchronousIterator<?> source : a) {
source.close();
}
removeFromSession();
} finally {
lock.unlock();
}
}
/**
* Remove the task from the session, but only if the task in the session is
* this task (it will have been overwritten if this task decides not to
* accept more sources and another source shows up).
*/
private void removeFromSession() {
lock.lock();
try {
// @todo allocate this in the ctor.
final String namespace = JoinTaskFactoryTask.getJoinTaskNamespace(
masterUUID, orderIndex, partitionId);
/*
* Note: If something else has the entry in the session then that is
* Ok, but we need to make sure that we don't remove it by accident!
*/
dataService.getSession().remove(namespace, this);
} finally {
lock.unlock();
}
}
/**
* This lock is used to make {@link #nextChunk()} and
* {@link #addSource(IAsynchronousIterator)} into mutually exclusive
* operations. {@link #nextChunk()} is the reader.
* {@link #addSource(IAsynchronousIterator)} is the writer. These operations
* need to be exclusive and atomic so that the termination condition of
* {@link #nextChunk()} is consistent -- it should terminate when there are
* no sources remaining. The first source is added when the
* {@link DistributedJoinTask} is created. Additional sources are added (and
* can result in a fan-in greater than one) when a
* {@link JoinTaskFactoryTask} identifies that there is an existing
* {@link DistributedJoinTask} and is able to atomically assign a new source
* to that {@link DistributedJoinTask}. If the atomic assignment of the new
* source fails (because all sources are exhausted before the assignment
* occurs) then a new {@link DistributedJoinTask} will be created for the
* same {@link DistributedJoinMasterTask}, orderIndex, and index partition
* identifier and the source will be assigned to that
* {@link DistributedJoinTask} instead.
*
* @todo javadoc update
*/
// private ReadWriteLock lock = new ReentrantReadWriteLock(false/* fair */);
private final ReentrantLock lock = new ReentrantLock();
/**
* Returns a chunk of {@link IBindingSet}s by combining chunks from the
* various source {@link JoinTask}s.
*
* @return A chunk assembled from one or more chunks from one or more of
* the source {@link JoinTask}s.
*/
protected IBindingSet[] nextChunk() throws InterruptedException {
if (sourcesExhausted) {
// nothing remaining in any accepted source.
return null;
}
if (DEBUG)
log.debug("Reading chunk of bindings from source(s): orderIndex="
+ orderIndex + ", partitionId=" + partitionId);
// #of elements in the combined chunk(s)
int bindingSetCount = 0;
// source chunks read so far.
final List<IBindingSet[]> chunks = new LinkedList<IBindingSet[]>();
/*
* Assemble a chunk of suitable size
*
* @todo don't wait too long. if we have some data then it is probably
* better to process that data rather than waiting beyond a timeout for
* a full chunk. also, make sure that we are neither yielding nor
* spinning too long in this loop. However, the loop must wait if there
* is nothing available and the sources are not yet exhausted.
*
* @todo config. we need a different capacity here than the one used for
* batch index operations. on the order of 100 should work well.
*
* Note: The termination conditions under which we will return [null]
* indicating that no more binding sets can be read are: (a) [halt] is
* true; (b) [sourcesExhausted] is true; or (c) all sources are
* exhausted and we are able to acquire the lock.
*
* Once we do acquire the lock we set [sourcesExhausted] to true and any
* subsequent request to add another source to this joinTask will fail.
* This has the consequence that a new JoinTask will be created if a new
* source has been identified once this task halts.
*/
final int chunkCapacity = 100;// joinNexus.getChunkCapacity();
while (!sourcesExhausted) {
while (!halt && !sources.isEmpty()
&& bindingSetCount < chunkCapacity) {
if (trace)
System.err.print("\norderIndex=" + orderIndex);
if (trace)
System.err.print(": reading");
// if (DEBUG)
// log.debug("Testing " + nsources + " sources: orderIndex="
// + orderIndex + ", partitionId=" + partitionId);
// clone to avoid concurrent modification of sources during
// traversal.
@SuppressWarnings("unchecked")
final IAsynchronousIterator<IBindingSet[]>[] sources = (IAsynchronousIterator<IBindingSet[]>[]) this.sources
.toArray(new IAsynchronousIterator[] {});
// #of sources that are exhausted.
int nexhausted = 0;
for (int i = 0; i < sources.length
&& bindingSetCount < chunkCapacity; i++) {
if (trace)
System.err.print(" <<(" + i + ":" + sources.length
+ ")");
final IAsynchronousIterator<IBindingSet[]> src = sources[i];
// if there is something to read on that source.
if (src.hasNext(1L, TimeUnit.MILLISECONDS)) {
/*
* Read the chunk, waiting up to the timeout for
* additional chunks from this source which can be
* combined together by the iterator into a single
* chunk.
*
* @todo config chunkCombiner timeout here and
* experiment with the value with varying fanIns.
*/
final IBindingSet[] chunk = src.next(10L,
TimeUnit.MILLISECONDS);
/*
* Note: Since hasNext() returned [true] for this source
* we SHOULD get a chunk back since it is known to be
* there waiting for us. The timeout should only give
* the iterator an opportunity to combine multiple
* chunks together if they are already in the iterator's
* queue (or if they arrive in a timely manner).
*/
assert chunk != null;
chunks.add(chunk);
bindingSetCount += chunk.length;
if (trace)
System.err.print("[" + chunk.length + "]");
if (DEBUG)
log.debug("Read chunk from source: sources[" + i
+ "], chunkSize=" + chunk.length
+ ", orderIndex=" + orderIndex
+ ", partitionId=" + partitionId);
} else if (src.isExhausted()) {
nexhausted++;
if (trace)
System.err.print("X{" + nexhausted + "}");
if (DEBUG)
log.debug("Source is exhausted: nexhausted="
+ nexhausted);
// no longer consider an exhausted source.
if (!this.sources.remove(src)) {
// could happen if src.equals() is not defined.
throw new AssertionError("Could not find source: "
+ src);
}
}
}
if (nexhausted == sources.length) {
/*
* All sources on which we were reading in this loop have
* been exhausted.
*
* Note: we may have buffered some data, which is checked
* below.
*
* Note: new sources may have been added concurrently, so we
* get the lock and then test the [sources] field, not just
* the local array.
*/
lock.lock();
try {
if (this.sources.isEmpty()) {
if (INFO)
log.info("Sources are exhausted: orderIndex="
+ orderIndex + ", partitionId="
+ partitionId);
sourcesExhausted = true;
/*
* Remove ourselves from the Session since we will
* no longer accept any new sources.
*/
removeFromSession();
}
} finally {
lock.unlock();
}
break;
}
}
if (halt)
throw new RuntimeException(firstCause.get());
/*
* Combine the chunks.
*/
if (!chunks.isEmpty()) {
return combineChunks(chunks, bindingSetCount);
}
} // while(!sourcesExhausted)
/*
* Termination condition: we did not get any data from any source, we
* are not permitting any new sources, and there are no sources
* remaining.
*/
if (DEBUG)
log.debug("Sources are exhausted: orderIndex=" + orderIndex
+ ", partitionId=" + partitionId);
if (trace)
System.err.print(" exhausted");
return null;
}
/**
* Combine the chunk(s) into a single chunk.
*
* @param chunks
* A list of chunks read from the {@link #sources}.
* @param bindingSetCount
* The #of bindingSets in those chunks.
* @return A single chunk containing the binding sets from all of the
*         given chunks.
*/
protected IBindingSet[] combineChunks(final List<IBindingSet[]> chunks,
final int bindingSetCount) {
final int chunkCount = chunks.size();
assert chunkCount > 0; // at least one chunk.
assert bindingSetCount > 0; // at least one bindingSet.
final IBindingSet[] chunk;
if (chunkCount == 1) {
// Only one chunk is available.
chunk = chunks.get(0);
} else {
// Combine 2 or more chunks.
chunk = new IBindingSet[bindingSetCount];
final Iterator<IBindingSet[]> itr = chunks.iterator();
int offset = 0;
while (itr.hasNext()) {
final IBindingSet[] a = itr.next();
System.arraycopy(a, 0, chunk, offset, a.length);
offset += a.length;
}
}
if (halt)
throw new RuntimeException(firstCause.get());
if (DEBUG) {
log.debug("Read chunk(s): nchunks=" + chunkCount
+ ", #bindingSets=" + chunk.length + ", orderIndex="
+ orderIndex + ", partitionId=" + partitionId);
}
stats.bindingSetChunksIn += chunkCount;
stats.bindingSetsIn += bindingSetCount;
if (trace)
System.err.print(" chunk[" + chunk.length + "]");
return chunk;
}
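/*
* Worked example: given chunks = [ [b0,b1,b2], [b3,b4] ] and
* bindingSetCount = 5, the arraycopy loop above produces the single
* chunk [b0,b1,b2,b3,b4] (copying at offsets 0 and 3 respectively).
*/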
protected AbstractUnsynchronizedArrayBuffer<IBindingSet> newUnsyncOutputBuffer() {
final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncOutputBuffer;
/*
* On overflow, the generated binding sets are mapped across the
* JoinTaskSink(s) for the target index partition(s).
*/
final int chunkCapacity = fedJoinNexus.getChunkCapacity();
if (lastJoin) {
/*
* Accepted binding sets are flushed to the solution buffer.
*/
unsyncOutputBuffer = new UnsynchronizedSolutionBuffer<IBindingSet>(
this, fedJoinNexus, chunkCapacity);
} else {
/*
* Accepted binding sets are flushed to the next join dimension.
*
* Note: The index is key-range partitioned. Each bindingSet
* will be mapped across the index partition(s) on which the
* generated access path for that bindingSet will have to read.
* There will be a JoinTask associated with each such index
* partition. That JoinTask will execute locally on the
* DataService which hosts that index partition.
*/
unsyncOutputBuffer = new UnsyncDistributedOutputBuffer<IBindingSet>(
fed, this, chunkCapacity);
}
return unsyncOutputBuffer;
}
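/*
* Data flow sketch for the non-lastJoin case (illustrative only; the
* overflow handling shown in the comments is internal to the buffers):
*
* unsyncOutputBuffer.add(bindingSet); // join thread.
* // On overflow, the UnsyncDistributedOutputBuffer maps each binding set
* // over the target index partition(s) for the next predicate. For each
* // target shard:
* JoinTaskSink sink = getSink(locator); // memoized, see below.
* sink.unsyncBuffer.add(bindingSet); // per-sink unsync buffer.
* // That buffer overflows onto sink.blockingBuffer, whose proxied
* // iterator is drained by the downstream JoinTask.
*/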
/**
* Notifies each sink that this {@link DistributedJoinTask} will no
* longer generate new {@link IBindingSet} chunks and then waits for the
* sink task(s) to complete.
* <p>
* Note: Closing the {@link BlockingBuffer} from which a sink
* {@link JoinTask} is reading will cause the source iterator for that
* sink task to eventually return <code>false</code> indicating that
* it is exhausted (assuming that the sink keeps reading on the
* iterator).
*
* @throws InterruptedException
* if interrupted while awaiting the future for a sink.
*/
@Override
protected void flushAndCloseBuffersAndAwaitSinks()
throws InterruptedException, ExecutionException {
if (DEBUG)
log.debug("orderIndex="
+ orderIndex
+ ", partitionId="
+ partitionId
+ (lastJoin ? ", lastJoin" : ", sinkCount="
+ memo.size()));
/*
* For the last join dimension the JoinTask instead writes onto the
* [solutionBuffer]. For query, that is the shared solution buffer
* and will be a proxied object. For mutation, that is a per
* JoinTask buffer that writes onto the target relation. In the
* latter case we MUST report the mutationCount returned by flushing
* the solutionBuffer via JoinStats to the master.
*
* Note: JoinTask#flushUnsyncBuffers() will already have been
* invoked so all generated binding sets will already be in the sync
* buffer ready for output.
*/
if (lastJoin) {
// assert sinkCache == null;
assert memo == null;
if (DEBUG)
log.debug("\nWill flush buffer containing "
+ getSolutionBuffer().size() + " solutions.");
final long counter = getSolutionBuffer().flush();
if (DEBUG)
log.debug("\nFlushed buffer: mutationCount=" + counter);
if (joinNexus.getAction().isMutation()) {
/*
* Apply mutationCount to the JoinStats so that it will be
* reported back to the JoinMasterTask.
*/
stats.mutationCount.addAndGet(counter);
}
} else {
/*
* Close sinks.
*
* For all but the lastJoin, the buffers are writing onto the
* per-sink buffers. We flush and close those buffers now. The sink
* JoinTasks drain those buffers. Once the buffers are closed, the
* sink JoinTasks will eventually exhaust the buffers.
*
* Note: This flushes the buffers using a thread pool which should
* give better throughput when the fanOut is GT ONE (1).
*/
{
if (halt)
throw new RuntimeException(firstCause.get());
final List<Callable<Void>> tasks = new LinkedList<Callable<Void>>();
final Iterator<JoinTaskSink> itr = memo.getSinks();
while (itr.hasNext()) {
final JoinTaskSink sink = itr.next();
tasks.add(new FlushAndCloseSinkBufferTask(sink));
}
final List<Future<Void>> futures = fed.getExecutorService()
.invokeAll(tasks);
for (Future<?> f : futures) {
// make sure that all tasks were successful.
f.get();
}
}
// Await sinks.
{
final Iterator<JoinTaskSink> itr = memo.getSinks();
while (itr.hasNext()) {
if (halt)
throw new RuntimeException(firstCause.get());
final JoinTaskSink sink = itr.next();
final Future<?> f = sink.getFuture();
if (DEBUG)
log.debug("Waiting for Future: sink=" + sink);
// will throw any exception from the sink's Future.
f.get();
}
}
} // else (lastJoin)
if (DEBUG)
log.debug("Done: orderIndex="
+ orderIndex
+ ", partitionId="
+ partitionId
+ (lastJoin ? "lastJoin" : ", sinkCount="
+ memo.size()));
}
/**
* Flushes any buffered data for a {@link JoinTaskSink} and closes the
* {@link BlockingBuffer} for that sink so that the sink {@link JoinTask}'s
* iterator can eventually drain the buffer and report that it is exhausted.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
private class FlushAndCloseSinkBufferTask implements Callable<Void> {
final private JoinTaskSink sink;
public FlushAndCloseSinkBufferTask(final JoinTaskSink sink) {
if (sink == null)
throw new IllegalArgumentException();
this.sink = sink;
}
public Void call() throws Exception {
if (halt)
throw new RuntimeException(firstCause.get());
if (DEBUG)
log.debug("Closing sink: sink=" + sink
+ ", unsyncBufferSize=" + sink.unsyncBuffer.size()
+ ", blockingBufferSize=" + sink.blockingBuffer.size());
// flush to the blockingBuffer.
sink.unsyncBuffer.flush();
// close the blockingBuffer.
sink.blockingBuffer.close();
return null;
}
}
/**
* Cancel all {@link DistributedJoinTask}s that are sinks for this
* {@link DistributedJoinTask}.
*/
@Override
protected void cancelSinks() {
// no sinks.
if (lastJoin)
return;
if (DEBUG)
log.debug("orderIndex=" + orderIndex + ", partitionId="
+ partitionId + ", sinkCount=" + memo.size());
final Iterator<JoinTaskSink> itr = memo.getSinks();
while (itr.hasNext()) {
final JoinTaskSink sink = itr.next();
sink.unsyncBuffer.reset();
sink.blockingBuffer.reset();
sink.blockingBuffer.close();
sink.getFuture().cancel(true/* mayInterruptIfRunning */);
}
if (DEBUG)
log.debug("Done: orderIndex=" + orderIndex + ", partitionId="
+ partitionId + ", sinkCount=" + memo.size());
}
/**
* Return the sink on which we will write {@link IBindingSet} for the
* index partition associated with the specified locator. The sink will
* be backed by a {@link DistributedJoinTask} running on the
* {@link IDataService} that is host to that index partition. The
* scale-out index will be the scale-out index for the next
* {@link IPredicate} in the evaluation order.
*
* @param locator
* The locator for the index partition.
*
* @return The sink.
*
* @throws RuntimeException
* If the {@link JoinTaskFactoryTask} fails.
* @throws InterruptedException
* If the {@link JoinTaskFactoryTask} is interrupted.
*/
protected JoinTaskSink getSink(final PartitionLocator locator)
throws InterruptedException, RuntimeException {
return memo.compute(new SinkRequest(this, locator));
}
/**
* Helper class models a request to obtain a sink for a given join task and
* locator.
* <p>
* Note: This class must implement equals() and hashCode() since it is used
* within the {@link Memoizer} pattern.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
private static class SinkRequest {
final DistributedJoinTask joinTask;
final PartitionLocator locator;
/**
*
* @param joinTask
* The join task.
* @param locator
* The locator for the target shard.
*/
public SinkRequest(final DistributedJoinTask joinTask, final PartitionLocator locator) {
this.joinTask = joinTask;
this.locator = locator;
}
/**
* Equals returns true iff joinTask == o.joinTask and
* locator.equals(o.locator).
*/
public boolean equals(final Object o) {
if (!(o instanceof SinkRequest))
return false;
final SinkRequest r = (SinkRequest) o;
return joinTask == r.joinTask && locator.equals(r.locator);
}
/**
* The hashCode() is based directly on the hash code of the
* {@link PartitionLocator}. All requests against a given
* {@link Memoizer} will have the same {@link DistributedJoinTask} so
* that field can be factored out of the hash code.
*/
public int hashCode() {
return locator.hashCode();
}
}
/**
* Helper establishes a {@link JoinTaskSink} on the target {@link IDataService}.
*/
final private static Computable<SinkRequest, JoinTaskSink> getSink = new Computable<SinkRequest, JoinTaskSink>() {
public JoinTaskSink compute(final SinkRequest req)
throws InterruptedException {
try {
return req.joinTask._getSink(req.locator);
} catch (ExecutionException e) {
throw new RuntimeException(e);
}
}
};
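/*
* Memoizer pattern sketch: concurrent requests for the same target shard
* share a single FutureTask, so at most one JoinTaskFactoryTask is
* submitted per (joinTask, locator) pair, while requests for distinct
* shards are computed in parallel, e.g.:
*
* final JoinTaskSink sink = memo.compute(new SinkRequest(this, locator));
*/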
/**
* FIXME javadoc: A {@link Memoizer} subclass which exposes an additional
* method to remove a {@link FutureTask} from the internal cache. This is
* used as part of an explicit protocol to clear out cache entries once the
* sink reference has been set (see the commented out removeFromCache()
* below).
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
private static class SinkMemoizer extends
Memoizer<SinkRequest/* request */, JoinTaskSink/* sink */> {
/**
* @param c
*/
public SinkMemoizer(final Computable<SinkRequest, JoinTaskSink> c) {
super(c);
}
int size() {
return cache.size();
}
/**
* FIXME There are two distinct semantics available here. One is the set
* of current sinks (there is a join task fully up and running on a DS
* somewhere and we have a proxy for that DS). The other is the set of
* sinks which have been requested but may or may not have been fully
* realized yet. When we are breaking a join, we probably want to cancel
* all of the requests to obtain sinks in addition to canceling any
* running sinks. A similar problem may exist if we implement native
* SLICE since we could break the join while there are requests out to
* create sinks.
*
* One way to handle this is to pull the cancelSinks() method into this
* memoizer.
*
* However, if we broadcast the rule to the nodes and move away from
* this sink model to using NIO buffers, then we will just broadcast
* the close of each tail in turn or broadcast the break of the join.
*/
@SuppressWarnings("unchecked")
Iterator<JoinTaskSink> getSinks() {
return new Striterator(cache.values().iterator()).addFilter(new Filter(){
private static final long serialVersionUID = 1L;
@Override
public boolean isValid(final Object e) {
/*
* Filter out any tasks which are not done or which had an
* error.
*/
final Future<JoinTaskSink> f = (Future<JoinTaskSink>)e;
if(!f.isDone()) {
return false;
}
try {f.get();}
catch(final ExecutionException ex) {
return false;
} catch (final InterruptedException ex) {
return false;
}
return true;
}
}).addFilter(new Resolver(){
private static final long serialVersionUID = 1L;
@Override
protected Object resolve(final Object arg0) {
/*
* We filtered out any tasks which were not done and any
* tasks which had errors. The future should be immediately
* available and Future.get() should not throw an error.
*/
final Future<JoinTaskSink> f = (Future<JoinTaskSink>)arg0;
try {
return f.get();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
} catch (final ExecutionException e) {
throw new RuntimeException(e);
}
}
});
}
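/*
* Plain-Java equivalent of the Striterator pipeline above (a sketch for
* clarity; [cache] maps each SinkRequest to the Future computing its
* JoinTaskSink):
*
* final List<JoinTaskSink> sinks = new LinkedList<JoinTaskSink>();
* for (Future<JoinTaskSink> f : cache.values()) {
*     if (!f.isDone())
*         continue; // still being materialized.
*     try {
*         sinks.add(f.get()); // resolved sink.
*     } catch (ExecutionException ex) {
*         // sink failed: skip it.
*     } catch (InterruptedException ex) {
*         // should not occur since isDone() was true: skip it.
*     }
* }
* return sinks.iterator();
*/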
// /**
// * Called by the thread which atomically sets the sink reference once
// * the sink has been fully materialized. At that point a reference to
// * the sink exists outside of this cache.
// *
// * @param req
// * The request.
// */
// void removeFromCache(final SinkRequest req) {
//
// if (cache.remove(req) == null) {
//
// throw new AssertionError();
//
// }
//
// }
// /**
// * Discards the cached {@link FutureTask}s.
// *
// * @todo should we do this? There should be no further requests for
// * sinks once the join task halts, so there may be no reason to
// * clear the FutureTask cache explicitly.
// */
// void clear() {
//
// cache.clear();
//
// }
}
/**
* Used to materialize {@link JoinTaskSink}s without causing concurrent requests
* for different sinks to block.
*/
final private SinkMemoizer memo;
/**
* Inner implementation invoked from the {@link Memoizer}.
*
* @param locator
* The shard locator.
*
* @return The sink which will write on the downstream {@link JoinTask}
* running on the node for that shard.
*
* @throws ExecutionException
* @throws InterruptedException
*/
private JoinTaskSink _getSink(final PartitionLocator locator) throws InterruptedException, ExecutionException {
/*
* Allocate/discover JoinTask on the target data service and
* obtain a sink reference for its future and buffers.
*
* Note: The JoinMasterTask uses very similar logic to setup the
* first join dimension. Of course, it gets to assume that there
* is no such JoinTask in existence at the time.
*/
final int nextOrderIndex = orderIndex + 1;
if (DEBUG)
log.debug("Creating join task: nextOrderIndex="
+ nextOrderIndex + ", indexName="
+ nextScaleOutIndexName + ", partitionId="
+ locator.getPartitionId());
final UUID sinkUUID = locator.getDataServiceUUID();
final IDataService dataService;
if (sinkUUID.equals(fed.getServiceUUID())) {
/*
* As an optimization, special case when the downstream
* data service is _this_ data service.
*/
dataService = (IDataService)fed.getService();
} else {
dataService = fed.getDataService(sinkUUID);
}
final JoinTaskSink sink = new JoinTaskSink(fed, locator, this);
/*
* Export async iterator proxy.
*
* Note: This proxy is used by the sink to draw chunks from the
* source JoinTask(s).
*/
final IAsynchronousIterator<IBindingSet[]> sourceItrProxy;
if (fed.isDistributed()) {
sourceItrProxy = ((AbstractDistributedFederation<?>) fed)
.getProxy(sink.blockingBuffer.iterator(), joinNexus
.getBindingSetSerializer(), joinNexus
.getChunkOfChunksCapacity());
} else {
sourceItrProxy = sink.blockingBuffer.iterator();
}
// the future for the factory task (not the JoinTask).
final Future<?> factoryFuture;
try {
final JoinTaskFactoryTask factoryTask = new JoinTaskFactoryTask(
nextScaleOutIndexName, rule, joinNexus
.getJoinNexusFactory(), order, nextOrderIndex,
locator.getPartitionId(), masterProxy, masterUUID,
sourceItrProxy, keyOrders, requiredVars);
// submit the factory task, obtain its future.
factoryFuture = dataService.submit(factoryTask);
} catch (IOException ex) {
// RMI problem.
throw new RuntimeException(ex);
}
/*
* Obtain the future for the JoinTask from the factory task's
* Future.
*/
sink.setFuture((Future<?>) factoryFuture.get());
stats.fanOut++;
return sink;
}
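/*
* Note the two-level Future in the code above: submitting the
* JoinTaskFactoryTask yields a Future whose value is itself a (proxy for
* the) Future of the remote JoinTask:
*
* final Future<?> factoryFuture = dataService.submit(factoryTask);
* final Future<?> joinTaskFuture = (Future<?>) factoryFuture.get();
* sink.setFuture(joinTaskFuture);
*/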
/**
* Logs an error in {@link JoinTask#call()} on the local log file and adds
* some metadata about the operation which was being executed. This does not
* imply that the error originates with this join task. You have to inspect
* the error messages, the order in which the joins were being evaluated,
* and even correlate the {@link JoinTask#masterUUID} in order to figure out
* what really happened.
*/
@Override
protected void logCallError(final Throwable t) {
log.error("hostname=" + dataService.getHostname() + ", serviceName="
+ dataService.getServiceName() + ", joinTask=" + toString()
+ ", rule=" + rule, t);
}
}