package com.bigdata.relation.rule.eval.pipeline;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.relation.IMutableRelation;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.IAccessPath;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.accesspath.ThickAsynchronousIterator;
import com.bigdata.relation.rule.IQueryOptions;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.IAccessPathExpander;
import com.bigdata.relation.rule.eval.IJoinNexus;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.service.AbstractDistributedFederation;
import com.bigdata.service.AbstractScaleOutFederation;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.IDataService;
import com.bigdata.service.ndx.IClientIndex;
import com.bigdata.service.proxy.RemoteBuffer;
import com.bigdata.util.concurrent.ExecutionExceptions;

/**
 * Implementation for distributed join execution.
 * <p>
 * Note: For query, this object MUST be executed locally on the client. This
 * ensures that all data flows back to the client directly. For mutation, it is
 * possible to submit this object to any service in the federation and each
 * {@link DistributedJoinTask} will write directly on the scale-out view of the
 * target {@link IMutableRelation}.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class DistributedJoinMasterTask extends JoinMasterTask implements
        Serializable {

    /**
     *
     */
    private static final long serialVersionUID = 7096223893807015958L;

    /**
     * The proxy for this {@link DistributedJoinMasterTask}.
     */
    private final IJoinMaster masterProxy;

    /**
     * The proxy for the solution buffer (query only).
     * <p>
     * Note: The query buffer is always an {@link IBlockingBuffer}. The client
     * has the {@link IAsynchronousIterator} that drains the
     * {@link BlockingBuffer}. The master is local to the client so that data
     * from the distributed join tasks flows directly to the client.
     * <p>
     * Note: The reason why we do not use a {@link RemoteBuffer} for mutation
     * is that it would cause all data to flow through the master! Instead,
     * each {@link JoinTask} for the last join dimension uses its own buffer
     * to aggregate and write on the target {@link IMutableRelation}.
     */
    private final IBuffer<ISolution[]> solutionBufferProxy;
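    /*
     * Aside (not part of this class): a minimal sketch of how a client-side
     * caller might drain the query solution buffer described in the notes
     * above, assuming that {@link IBlockingBuffer#iterator()} hands back the
     * {@link IAsynchronousIterator} mentioned there and that the inherited
     * [solutionBuffer] is the query buffer. The cast and the local names are
     * illustrative assumptions, not code used by this class.
     *
     *   final IBlockingBuffer<ISolution[]> queryBuffer =
     *           (IBlockingBuffer<ISolution[]>) solutionBuffer;
     *
     *   final IAsynchronousIterator<ISolution[]> itr = queryBuffer.iterator();
     *   try {
     *       while (itr.hasNext()) {
     *           for (ISolution solution : itr.next()) {
     *               // consume each solution chunk on the client.
     *           }
     *       }
     *   } finally {
     *       itr.close();
     *   }
     */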
    /**
     * For queries, the master MUST execute locally to the client. If the
     * master were to be executed on a remote {@link DataService} then that
     * would cause the {@link #getSolutionBuffer()} to be created on the
     * remote service and all query results would be forced through that
     * remote JVM before being streamed back to the client.
     * <p>
     * This is not a problem when the rule is a mutation operation since the
     * individual join tasks will each allocate their own buffer that writes
     * on the target {@link IMutableRelation}.
     *
     * @throws UnsupportedOperationException
     *             if the operation is a query.
     */
    private void writeObject(java.io.ObjectOutputStream out)
            throws IOException {

        if (!joinNexus.getAction().isMutation()) {

            throw new UnsupportedOperationException(
                    "Join master may not be executed remotely for query.");

        }

        out.defaultWriteObject();

    }

    /**
     * @param rule
     * @param joinNexus
     * @param buffer
     *            The buffer on which the last {@link DistributedJoinTask}
     *            will write query {@link ISolution}s. It is ignored for
     *            mutation operations since each {@link DistributedJoinTask}
     *            for the last join dimension (there can be more than one when
     *            the target index has more than one index partition) will
     *            obtain and write on its own solution buffer in order to
     *            avoid moving all data through the master.
     *
     * @throws UnsupportedOperationException
     *             unless {@link IJoinNexus#getIndexManager()} reports an
     *             {@link AbstractScaleOutFederation}.
     */
    public DistributedJoinMasterTask(final IRule rule,
            final IJoinNexus joinNexus, final IBuffer<ISolution[]> buffer) {

        super(rule, joinNexus, buffer);

        if (!(joinNexus.getIndexManager() instanceof IBigdataFederation)
                || !(((IBigdataFederation) joinNexus.getIndexManager())
                        .isScaleOut())) {

            /*
             * Either not running in a scale-out deployment or executed in a
             * context (such as within the ConcurrencyManager) where the
             * joinNexus will not report the federation as the index manager
             * object.
             */

            throw new UnsupportedOperationException();

        }

        if (joinNexus.getAction().isMutation()) {

            /*
             * Check constraints on executing mutation operations.
             *
             * Note: These constraints arise from (a) the need to flush
             * solutions onto relations that may also be in the body of the
             * rule; and (b) the need to avoid stale locators when writing on
             * those relations.
             */

            if (!TimestampUtility.isReadOnly(joinNexus.getReadTimestamp())) {

                /*
                 * Must use a read-consistent view and advance the
                 * readTimestamp before each mutation operation.
                 */

                throw new UnsupportedOperationException();

            }

        } else {

            if (joinNexus.getReadTimestamp() == ITx.UNISOLATED) {

                /*
                 * Note: While you probably can run a query against the
                 * unisolated indices, it is a bad idea since the exclusive
                 * locks that would be taken prevent overflow processing.
                 */

                log.warn("Unisolated scale-out query");

            }

        }

        /*
         * Export proxies?
         *
         * Note: We need proxies if the federation is really distributed and
         * using RMI to communicate.
         *
         * @todo do we need distributed garbage collection for these proxies?
         */
        if (joinNexus.getIndexManager() instanceof AbstractDistributedFederation) {

            final AbstractDistributedFederation fed = (AbstractDistributedFederation) joinNexus
                    .getIndexManager();

            masterProxy = (IJoinMaster) fed.getProxy(this, true/* enableDGC */);

            if (joinNexus.getAction().isMutation()) {

                // mutation.
                solutionBufferProxy = null;

            } else {

                // query - export a proxy for the solution buffer.
                solutionBufferProxy = fed.getProxy(solutionBuffer);

            }

        } else {

            /*
             * Not really distributed, so just use the actual references.
             */

            masterProxy = this;

            solutionBufferProxy = solutionBuffer;

        }

    }

    @Override
    public IBuffer<ISolution[]> getSolutionBuffer() throws IOException {

        if (joinNexus.getAction().isMutation()) {

            /*
             * Note: Access is not permitted for mutation in order to keep
             * data from the distributed join tasks from flowing through the
             * master.
             */

            throw new UnsupportedOperationException();

        }

        return solutionBufferProxy;

    }
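    /*
     * Aside: the admission checks enforced by the constructor, restated as a
     * tiny standalone predicate. This is a sketch for illustration only (the
     * method below does not exist in this class); it uses only the calls the
     * constructor itself makes.
     *
     *   static boolean isAdmissible(final IJoinNexus joinNexus) {
     *
     *       // must be a scale-out federation.
     *       if (!(joinNexus.getIndexManager() instanceof IBigdataFederation)
     *               || !((IBigdataFederation) joinNexus.getIndexManager())
     *                       .isScaleOut())
     *           return false;
     *
     *       // mutation requires a read-only (read-consistent) view.
     *       if (joinNexus.getAction().isMutation()
     *               && !TimestampUtility.isReadOnly(joinNexus.getReadTimestamp()))
     *           return false;
     *
     *       return true;
     *   }
     */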
    /**
     * Create and run the {@link JoinTask}(s) that will evaluate the first
     * join dimension.
     * <p>
     * A {@link JoinTask} is created on the {@link DataService} for each index
     * partition that is spanned by the {@link IAccessPath} for the first
     * {@link IPredicate} in the evaluation order. Those {@link JoinTask}s are
     * run in parallel, so the actual parallelism for the first
     * {@link IPredicate} is the #of index partitions spanned by its
     * {@link IAccessPath}.
     *
     * @return The {@link Future} for each {@link DistributedJoinTask} created
     *         for the first join dimension (one per index partition spanned
     *         by the predicate that is first in the evaluation order given
     *         the initial bindingSet for the rule).
     */
    @Override
    final protected List<Future<Void>> start() throws Exception {

        /*
         * The initial bindingSet.
         *
         * Note: This bindingSet might not be empty since constants can be
         * bound before the rule is evaluated.
         */
        final IBindingSet initialBindingSet = joinNexus.newBindingSet(rule);

        /*
         * Map the initial binding set across all index partitions on which
         * the asBound() predicate would read for the first join dimension.
         */
        final List<Future> factoryTaskFutures = mapBindingSet(initialBindingSet);

        // await the futures for the factory tasks.
        final List<Future<Void>> joinTaskFutures = awaitFactoryFutures(factoryTaskFutures);

        return joinTaskFutures;

    }
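    /*
     * Aside: the parallelism claim above ("one JoinTask per index partition
     * spanned by the first predicate") can be made concrete by counting the
     * partition locators for the as-bound predicate, which is exactly the
     * iteration that mapBindingSet() below performs. Sketch only; the local
     * names are hypothetical.
     *
     *   final IPredicate<?> p = rule.getTail(order[0]).asBound(
     *           joinNexus.newBindingSet(rule));
     *
     *   final Iterator<PartitionLocator> locators = joinNexus.locatorScan(
     *           (AbstractScaleOutFederation<?>) joinNexus.getIndexManager(), p);
     *
     *   int fanOut = 0;
     *   while (locators.hasNext()) {
     *       locators.next();
     *       fanOut++; // one JoinTaskFactoryTask / DistributedJoinTask per partition.
     *   }
     */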
    /**
     * Map the given {@link IBindingSet} over the {@link JoinTask}(s) for the
     * index partition(s) that span the {@link IAccessPath} for that
     * {@link IBindingSet} in parallel.
     *
     * @param bindingSet
     *            The binding set.
     *
     * @return A list of {@link Future}s for the {@link JoinTaskFactoryTask}s
     *         that will create the {@link DistributedJoinTask}s for the first
     *         join dimension.
     *
     * @throws Exception
     *
     * FIXME If a predicate defines an {@link IAccessPathExpander} then we DO
     * NOT map the predicate. Instead, we use
     * {@link IJoinNexus#getTailAccessPath(IPredicate)} and evaluate the
     * {@link IAccessPath} with the layered {@link IAccessPathExpander} in
     * process. If the {@link IAccessPathExpander} touches the index, it will
     * be using an {@link IClientIndex}. While the {@link IClientIndex} is not
     * nearly as efficient as using a local index partition, it will provide a
     * view of the total key-range partitioned index.
     * <p>
     * Do this for each join dimension for which an
     * {@link IAccessPathExpander} is defined, including not only the first N
     * join dimensions (handles free text search) but also an intermediate
     * join dimension (requires that all source join tasks target a join task
     * having a view of the scale-out index rather than mapping the task
     * across the index partitions).
     *
     * FIXME The initial binding set should not be mapped across the index
     * partitions for the first join dimension if
     * {@link IQueryOptions#isStable()} is <code>true</code> (any parallel
     * evaluation violates the stable constraint).
     */
    protected List<Future> mapBindingSet(final IBindingSet bindingSet)
            throws Exception {

        /*
         * The first predicate in the evaluation order with the initial
         * bindings applied.
         */
        final IPredicate<?> predicate = rule.getTail(order[0]).asBound(
                bindingSet);

        // the scale-out index manager.
        final AbstractScaleOutFederation<?> fed = (AbstractScaleOutFederation<?>) joinNexus
                .getIndexManager();

        // the scale-out index on which this predicate must read (logging only).
        final String scaleOutIndexName = predicate.getOnlyRelationName() + "."
                + ruleState.getKeyOrder()[order[0]];

        final Iterator<PartitionLocator> itr = joinNexus.locatorScan(fed,
                predicate);

        final List<Future> futures = new LinkedList<Future>();

        while (itr.hasNext()) {

            final PartitionLocator locator = itr.next();

            final int partitionId = locator.getPartitionId();

            if (log.isDebugEnabled())
                log.debug("Will submit JoinTask: partitionId=" + partitionId);

            /*
             * Note: Since there is only a single binding set, we send a
             * serializable thick iterator to the client.
             */
            final ThickAsynchronousIterator<IBindingSet[]> sourceItr = newBindingSetIterator(bindingSet);

            final JoinTaskFactoryTask factoryTask = new JoinTaskFactoryTask(
                    scaleOutIndexName, rule, joinNexusFactory, order,
                    0/* orderIndex */, partitionId, masterProxy, masterUUID,
                    sourceItr, ruleState.getKeyOrder(), ruleState
                            .getRequiredVars());

            final IDataService dataService = fed.getDataService(locator
                    .getDataServiceUUID());

            /*
             * Submit the JoinTaskFactoryTask. It will begin to execute when
             * it is scheduled by the ConcurrencyManager. When it executes it
             * will consume the [initialBindingSet].
             */
            final Future f;
            try {

                f = dataService.submit(factoryTask);

            } catch (Exception ex) {

                throw new ExecutionException("Could not submit: task="
                        + factoryTask, ex);

            }

            // add to the list of futures that we need to await.
            futures.add(f);

        }

        return futures;

    }
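    /*
     * Aside on the factory task protocol (sketch only): the Future returned
     * by dataService.submit(factoryTask) is the future of the remote
     * JoinTaskFactoryTask, and its value is in turn the Future of the
     * DistributedJoinTask that the factory created. Unwrapping therefore
     * takes two get() calls:
     *
     *   final Future factoryFuture = dataService.submit(factoryTask);
     *
     *   // value of the factory task: the join task's future.
     *   final Future<Void> joinTaskFuture = (Future<Void>) factoryFuture.get();
     *
     *   // completes when that join task itself is done.
     *   joinTaskFuture.get();
     *
     * awaitFactoryFutures() below performs the first unwrapping step for each
     * factory task and returns the join task futures to the caller.
     */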
    /**
     * Await the {@link JoinTaskFactoryTask} {@link Future}s.
     * <p>
     * Note: The result of a {@link JoinTaskFactoryTask} {@link Future} is a
     * {@link DistributedJoinTask} {@link Future}.
     *
     * @param factoryTaskFutures
     *            A list of {@link Future}s, with one {@link Future} for each
     *            index partition that is spanned by the {@link IAccessPath}
     *            for the first {@link IPredicate} in the evaluation order.
     *
     * @return A list of {@link DistributedJoinTask} {@link Future}s. There
     *         will be one element in the list for each
     *         {@link JoinTaskFactoryTask} {@link Future} in the caller's
     *         list. The elements will be in the same order.
     *
     * @throws InterruptedException
     *             if the master itself was interrupted.
     * @throws ExecutionExceptions
     *             if any of the factory tasks fail.
     */
    protected List<Future<Void>> awaitFactoryFutures(
            final List<Future> factoryTaskFutures) throws InterruptedException,
            ExecutionExceptions {

        final int size = factoryTaskFutures.size();

        if (log.isDebugEnabled())
            log.debug("#futures=" + size);

        int ndone = 0;

        /*
         * A list containing any join tasks that were successfully created.
         * Since we process the factory task futures in order, the list will
         * be in the same order as the factory task futures.
         */
        final List<Future<Void>> joinTaskFutures = new ArrayList<Future<Void>>(
                size);

        final Iterator<Future> itr = factoryTaskFutures.iterator();

        /*
         * Initially empty. Populated with any errors encountered when trying
         * to execute the _factory_ tasks.
         */
        final List<ExecutionException> causes = new LinkedList<ExecutionException>();

        /*
         * Process all factory tasks.
         *
         * Note: If an error occurs for any factory task, then we cancel the
         * remaining factory tasks and also cancel any join task that was
         * already started.
         */
        while (itr.hasNext()) {

            /*
             * Note: The Future of the JoinTaskFactoryTask returns the Future
             * of the JoinTask.
             */

            // future for the JoinTaskFactoryTask.
            final Future factoryTaskFuture = itr.next();

            if (log.isDebugEnabled())
                log.debug("Waiting for factoryTask");

            // wait for the JoinTaskFactoryTask to finish.
            final Future<Void> joinTaskFuture;
            try {

                if (!causes.isEmpty()) {

                    /*
                     * We have to abort, so cancel the factory task in case it
                     * is still running, but fall through and try to get its
                     * future in case it has already created the join task.
                     */
                    factoryTaskFuture.cancel(true/* mayInterruptIfRunning */);

                }

//                log.fatal("\nWaiting on factoryTaskFuture: " + factoryTaskFuture);

                joinTaskFuture = (Future<Void>) factoryTaskFuture.get();

//                log.fatal("\nHave joinTaskFuture: " + joinTaskFuture);

            } catch (ExecutionException ex) {

                causes.add(ex);

                /*
                 * Note: The error is logged here because the
                 * ExecutionExceptions that we throw below does not print out
                 * all of its stack traces.
                 *
                 * @todo log iff unexpected exception class or get all traces
                 * from the ExecutionExceptions class.
                 */
                log.error(ex, ex);

                continue;

            }

            if (causes.isEmpty()) {

                // no errors yet, so remember the future for the join task.
                joinTaskFutures.add(joinTaskFuture);

            } else {

                // cancel the join task since we have to abort anyway.
                joinTaskFuture.cancel(true/* mayInterruptIfRunning */);

            }

            ndone++;

            if (log.isDebugEnabled())
                log.debug("ndone=" + ndone + " of " + size);

        }

        if (!causes.isEmpty()) {

            for (Future<Void> f : joinTaskFutures) {

                // cancel since we have to abort anyway.
                f.cancel(true/* mayInterruptIfRunning */);

            }

            throw new ExecutionExceptions(causes);

        }

        if (log.isDebugEnabled())
            log.debug("All factory tasks done: #futures=" + size);

        return joinTaskFutures;

    }

}
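/*
 * Appendix (illustrative only, not referenced by this class): the
 * abort-handling policy of awaitFactoryFutures() in isolation -- collect every
 * failure, cancel whatever already started, and report all causes at once via
 * ExecutionExceptions. A generic restatement, assuming outer futures that wrap
 * inner futures as described above; the helper name is hypothetical.
 *
 *   static <T> List<Future<T>> awaitAll(final List<Future<Future<T>>> outer)
 *           throws InterruptedException, ExecutionExceptions {
 *
 *       final List<Future<T>> inner = new ArrayList<Future<T>>(outer.size());
 *       final List<ExecutionException> causes = new LinkedList<ExecutionException>();
 *
 *       for (Future<Future<T>> f : outer) {
 *           try {
 *               if (!causes.isEmpty())
 *                   f.cancel(true); // mayInterruptIfRunning
 *               final Future<T> g = f.get();
 *               if (causes.isEmpty())
 *                   inner.add(g);
 *               else
 *                   g.cancel(true); // aborting: cancel the inner task too.
 *           } catch (ExecutionException ex) {
 *               causes.add(ex);
 *           }
 *       }
 *
 *       if (!causes.isEmpty()) {
 *           for (Future<T> g : inner)
 *               g.cancel(true); // abort anything that was accepted so far.
 *           throw new ExecutionExceptions(causes);
 *       }
 *
 *       return inner;
 *   }
 */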