package com.bigdata.relation.rule.eval.pipeline;

import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.relation.IMutableRelation;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.IAccessPath;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.accesspath.ThickAsynchronousIterator;
import com.bigdata.relation.rule.IQueryOptions;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.IAccessPathExpander;
import com.bigdata.relation.rule.eval.IJoinNexus;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.service.AbstractDistributedFederation;
import com.bigdata.service.AbstractScaleOutFederation;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.IDataService;
import com.bigdata.service.ndx.IClientIndex;
import com.bigdata.service.proxy.RemoteBuffer;
import com.bigdata.util.concurrent.ExecutionExceptions;

/**
 * Implementation for distributed join execution.
 * <p>
 * Note: For query, this object MUST be executed locally on the client. This
 * ensures that all data flows back to the client directly. For mutation, it is
 * possible to submit this object to any service in the federation and each
 * {@link DistributedJoinTask} will write directly on the scale-out view of the
 * target {@link IMutableRelation}.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class DistributedJoinMasterTask extends JoinMasterTask implements
        Serializable {

    /**
     *
     */
    private static final long serialVersionUID = 7096223893807015958L;

    /**
     * The proxy for this {@link DistributedJoinMasterTask}.
     */
    private final IJoinMaster masterProxy;

    /**
     * The proxy for the solution buffer (query only).
     * <p>
     * Note: The query buffer is always an {@link IBlockingBuffer}. The client
     * has the {@link IAsynchronousIterator} that drains the
     * {@link BlockingBuffer}. The master is local to the client so that data
     * from the distributed join tasks flows directly to the client.
     * <p>
     * Note: The reason why we do not use a {@link RemoteBuffer} for mutation
     * is that it would cause all data to flow through the master! Instead,
     * each {@link JoinTask} for the last join dimension uses its own buffer
     * to aggregate and write on the target {@link IMutableRelation}.
     */
    private final IBuffer<ISolution[]> solutionBufferProxy;
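    /*
     * Aside (not part of this class): a minimal sketch of how a client-side
     * caller might drain the query solution buffer described in the notes
     * above, assuming that {@link IBlockingBuffer#iterator()} hands back the
     * {@link IAsynchronousIterator} mentioned there and that the inherited
     * [solutionBuffer] is the query buffer. The cast and the local names are
     * illustrative assumptions, not code used by this class.
     *
     *   final IBlockingBuffer<ISolution[]> queryBuffer =
     *           (IBlockingBuffer<ISolution[]>) solutionBuffer;
     *
     *   final IAsynchronousIterator<ISolution[]> itr = queryBuffer.iterator();
     *   try {
     *       while (itr.hasNext()) {
     *           for (ISolution solution : itr.next()) {
     *               // consume each solution chunk on the client.
     *           }
     *       }
     *   } finally {
     *       itr.close();
     *   }
     */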
    /**
     * For queries, the master MUST execute locally to the client. If the
     * master were to be executed on a remote {@link DataService} then that
     * would cause the {@link #getSolutionBuffer()} to be created on the
     * remote service and all query results would be forced through that
     * remote JVM before being streamed back to the client.
     * <p>
     * This is not a problem when the rule is a mutation operation since the
     * individual join tasks will each allocate their own buffer that writes
     * on the target {@link IMutableRelation}.
     *
     * @throws UnsupportedOperationException
     *             if the operation is a query.
     */
    private void writeObject(java.io.ObjectOutputStream out)
            throws IOException {

        if (!joinNexus.getAction().isMutation()) {

            throw new UnsupportedOperationException(
                    "Join master may not be executed remotely for query.");

        }

        out.defaultWriteObject();

    }

    /**
     * @param rule
     * @param joinNexus
     * @param buffer
     *            The buffer on which the last {@link DistributedJoinTask}
     *            will write query {@link ISolution}s. It is ignored for
     *            mutation operations since each {@link DistributedJoinTask}
     *            for the last join dimension (there can be more than one when
     *            the target index has more than one index partition) will
     *            obtain and write on its own solution buffer in order to
     *            avoid moving all data through the master.
     *
     * @throws UnsupportedOperationException
     *             unless {@link IJoinNexus#getIndexManager()} reports an
     *             {@link AbstractScaleOutFederation}.
     */
    public DistributedJoinMasterTask(final IRule rule,
            final IJoinNexus joinNexus, final IBuffer<ISolution[]> buffer) {

        super(rule, joinNexus, buffer);

        if (!(joinNexus.getIndexManager() instanceof IBigdataFederation)
                || !(((IBigdataFederation) joinNexus.getIndexManager())
                        .isScaleOut())) {

            /*
             * Either not running in a scale-out deployment or executed in a
             * context (such as within the ConcurrencyManager) where the
             * joinNexus will not report the federation as the index manager
             * object.
             */

            throw new UnsupportedOperationException();

        }

        if (joinNexus.getAction().isMutation()) {

            /*
             * Check constraints on executing mutation operations.
             *
             * Note: These constraints arise from (a) the need to flush
             * solutions onto relations that may also be in the body of the
             * rule; and (b) the need to avoid stale locators when writing on
             * those relations.
             */

            if (!TimestampUtility.isReadOnly(joinNexus.getReadTimestamp())) {

                /*
                 * Must use a read-consistent view and advance the
                 * readTimestamp before each mutation operation.
                 */

                throw new UnsupportedOperationException();

            }

        } else {

            if (joinNexus.getReadTimestamp() == ITx.UNISOLATED) {

                /*
                 * Note: While you probably can run a query against the
                 * unisolated indices, it is a bad idea since the exclusive
                 * locks that would be taken prevent overflow processing.
                 */

                log.warn("Unisolated scale-out query");

            }

        }

        /*
         * Export proxies?
         *
         * Note: We need proxies if the federation is really distributed and
         * using RMI to communicate.
         *
         * @todo do we need distributed garbage collection for these proxies?
         */
        if (joinNexus.getIndexManager() instanceof AbstractDistributedFederation) {

            final AbstractDistributedFederation fed = (AbstractDistributedFederation) joinNexus
                    .getIndexManager();

            masterProxy = (IJoinMaster) fed.getProxy(this, true/* enableDGC */);

            if (joinNexus.getAction().isMutation()) {

                // mutation.
                solutionBufferProxy = null;

            } else {

                // query - export a proxy for the solution buffer.
                solutionBufferProxy = fed.getProxy(solutionBuffer);

            }

        } else {

            /*
             * Not really distributed, so just use the actual references.
             */

            masterProxy = this;

            solutionBufferProxy = solutionBuffer;

        }

    }

    @Override
    public IBuffer<ISolution[]> getSolutionBuffer() throws IOException {

        if (joinNexus.getAction().isMutation()) {

            /*
             * Note: Access is not permitted for mutation in order to keep
             * data from the distributed join tasks from flowing through the
             * master.
             */

            throw new UnsupportedOperationException();

        }

        return solutionBufferProxy;

    }
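    /*
     * Aside: the admission checks enforced by the constructor, restated as a
     * tiny standalone predicate. This is a sketch for illustration only (the
     * method below does not exist in this class); it uses only the calls the
     * constructor itself makes.
     *
     *   static boolean isAdmissible(final IJoinNexus joinNexus) {
     *
     *       // must be a scale-out federation.
     *       if (!(joinNexus.getIndexManager() instanceof IBigdataFederation)
     *               || !((IBigdataFederation) joinNexus.getIndexManager())
     *                       .isScaleOut())
     *           return false;
     *
     *       // mutation requires a read-only (read-consistent) view.
     *       if (joinNexus.getAction().isMutation()
     *               && !TimestampUtility.isReadOnly(joinNexus.getReadTimestamp()))
     *           return false;
     *
     *       return true;
     *   }
     */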
    /**
     * Create and run the {@link JoinTask}(s) that will evaluate the first
     * join dimension.
     * <p>
     * A {@link JoinTask} is created on the {@link DataService} for each index
     * partition that is spanned by the {@link IAccessPath} for the first
     * {@link IPredicate} in the evaluation order. Those {@link JoinTask}s are
     * run in parallel, so the actual parallelism for the first
     * {@link IPredicate} is the #of index partitions spanned by its
     * {@link IAccessPath}.
     *
     * @return The {@link Future} for each {@link DistributedJoinTask} created
     *         for the first join dimension (one per index partition spanned
     *         by the predicate that is first in the evaluation order given
     *         the initial bindingSet for the rule).
     */
    @Override
    final protected List<Future<Void>> start() throws Exception {

        /*
         * The initial bindingSet.
         *
         * Note: This bindingSet might not be empty since constants can be
         * bound before the rule is evaluated.
         */
        final IBindingSet initialBindingSet = joinNexus.newBindingSet(rule);

        /*
         * Map the initial binding set across all index partitions on which
         * the asBound() predicate would read for the first join dimension.
         */
        final List<Future> factoryTaskFutures = mapBindingSet(initialBindingSet);

        // await the futures for the factory tasks.
        final List<Future<Void>> joinTaskFutures = awaitFactoryFutures(factoryTaskFutures);

        return joinTaskFutures;

    }
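    /*
     * Aside: the parallelism claim above ("one JoinTask per index partition
     * spanned by the first predicate") can be made concrete by counting the
     * partition locators for the as-bound predicate, which is exactly the
     * iteration that mapBindingSet() below performs. Sketch only; the local
     * names are hypothetical.
     *
     *   final IPredicate<?> p = rule.getTail(order[0]).asBound(
     *           joinNexus.newBindingSet(rule));
     *
     *   final Iterator<PartitionLocator> locators = joinNexus.locatorScan(
     *           (AbstractScaleOutFederation<?>) joinNexus.getIndexManager(), p);
     *
     *   int fanOut = 0;
     *   while (locators.hasNext()) {
     *       locators.next();
     *       fanOut++; // one JoinTaskFactoryTask / DistributedJoinTask per partition.
     *   }
     */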
    /**
     * Map the given {@link IBindingSet} over the {@link JoinTask}(s) for the
     * index partition(s) that span the {@link IAccessPath} for that
     * {@link IBindingSet} in parallel.
     *
     * @param bindingSet
     *            The binding set.
     *
     * @return A list of {@link Future}s for the {@link JoinTaskFactoryTask}s
     *         that will create the {@link DistributedJoinTask}s for the first
     *         join dimension.
     *
     * @throws Exception
     *
     * FIXME If a predicate defines an {@link IAccessPathExpander} then we DO
     * NOT map the predicate. Instead, we use
     * {@link IJoinNexus#getTailAccessPath(IPredicate)} and evaluate the
     * {@link IAccessPath} with the layered {@link IAccessPathExpander} in
     * process. If the {@link IAccessPathExpander} touches the index, it will
     * be using an {@link IClientIndex}. While the {@link IClientIndex} is not
     * nearly as efficient as using a local index partition, it will provide a
     * view of the total key-range partitioned index.
     * <p>
     * Do this for each join dimension for which an
     * {@link IAccessPathExpander} is defined, including not only the first N
     * join dimensions (handles free text search) but also an intermediate
     * join dimension (requires that all source join tasks target a join task
     * having a view of the scale-out index rather than mapping the task
     * across the index partitions).
     *
     * FIXME The initial binding set should not be mapped across the index
     * partitions for the first join dimension if
     * {@link IQueryOptions#isStable()} is <code>true</code> (any parallel
     * evaluation violates the stable constraint).
     */
    protected List<Future> mapBindingSet(final IBindingSet bindingSet)
            throws Exception {

        /*
         * The first predicate in the evaluation order with the initial
         * bindings applied.
         */
        final IPredicate<?> predicate = rule.getTail(order[0]).asBound(
                bindingSet);

        // the scale-out index manager.
        final AbstractScaleOutFederation<?> fed = (AbstractScaleOutFederation<?>) joinNexus
                .getIndexManager();

        // the scale-out index on which this predicate must read (logging only).
        final String scaleOutIndexName = predicate.getOnlyRelationName() + "."
                + ruleState.getKeyOrder()[order[0]];

        final Iterator<PartitionLocator> itr = joinNexus.locatorScan(fed,
                predicate);

        final List<Future> futures = new LinkedList<Future>();

        while (itr.hasNext()) {

            final PartitionLocator locator = itr.next();

            final int partitionId = locator.getPartitionId();

            if (log.isDebugEnabled())
                log.debug("Will submit JoinTask: partitionId=" + partitionId);

            /*
             * Note: Since there is only a single binding set, we send a
             * serializable thick iterator to the client.
             */
            final ThickAsynchronousIterator<IBindingSet[]> sourceItr = newBindingSetIterator(bindingSet);

            final JoinTaskFactoryTask factoryTask = new JoinTaskFactoryTask(
                    scaleOutIndexName, rule, joinNexusFactory, order,
                    0/* orderIndex */, partitionId, masterProxy, masterUUID,
                    sourceItr, ruleState.getKeyOrder(), ruleState
                            .getRequiredVars());

            final IDataService dataService = fed.getDataService(locator
                    .getDataServiceUUID());

            /*
             * Submit the JoinTaskFactoryTask. It will begin to execute when
             * it is scheduled by the ConcurrencyManager. When it executes it
             * will consume the [initialBindingSet].
             */
            final Future f;
            try {

                f = dataService.submit(factoryTask);

            } catch (Exception ex) {

                throw new ExecutionException("Could not submit: task="
                        + factoryTask, ex);

            }

            // add to the list of futures that we need to await.
            futures.add(f);

        }

        return futures;

    }
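    /*
     * Aside on the factory task protocol (sketch only): the Future returned
     * by dataService.submit(factoryTask) is the future of the remote
     * JoinTaskFactoryTask, and its value is in turn the Future of the
     * DistributedJoinTask that the factory created. Unwrapping therefore
     * takes two get() calls:
     *
     *   final Future factoryFuture = dataService.submit(factoryTask);
     *
     *   // value of the factory task: the join task's future.
     *   final Future<Void> joinTaskFuture = (Future<Void>) factoryFuture.get();
     *
     *   // completes when that join task itself is done.
     *   joinTaskFuture.get();
     *
     * awaitFactoryFutures() below performs the first unwrapping step for each
     * factory task and returns the join task futures to the caller.
     */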
    /**
     * Await the {@link JoinTaskFactoryTask} {@link Future}s.
     * <p>
     * Note: The result of a {@link JoinTaskFactoryTask} {@link Future} is a
     * {@link DistributedJoinTask} {@link Future}.
     *
     * @param factoryTaskFutures
     *            A list of {@link Future}s, with one {@link Future} for each
     *            index partition that is spanned by the {@link IAccessPath}
     *            for the first {@link IPredicate} in the evaluation order.
     *
     * @return A list of {@link DistributedJoinTask} {@link Future}s. There
     *         will be one element in the list for each
     *         {@link JoinTaskFactoryTask} {@link Future} in the caller's
     *         list. The elements will be in the same order.
     *
     * @throws InterruptedException
     *             if the master itself was interrupted.
     * @throws ExecutionExceptions
     *             if any of the factory tasks fail.
     */
    protected List<Future<Void>> awaitFactoryFutures(
            final List<Future> factoryTaskFutures) throws InterruptedException,
            ExecutionExceptions {

        final int size = factoryTaskFutures.size();

        if (log.isDebugEnabled())
            log.debug("#futures=" + size);

        int ndone = 0;

        /*
         * A list containing any join tasks that were successfully created.
         * Since we process the factory task futures in order, the list will
         * be in the same order as the factory task futures.
         */
        final List<Future<Void>> joinTaskFutures = new ArrayList<Future<Void>>(
                size);

        final Iterator<Future> itr = factoryTaskFutures.iterator();

        /*
         * Initially empty. Populated with any errors encountered when trying
         * to execute the _factory_ tasks.
         */
        final List<ExecutionException> causes = new LinkedList<ExecutionException>();

        /*
         * Process all factory tasks.
         *
         * Note: If an error occurs for any factory task, then we cancel the
         * remaining factory tasks and also cancel any join task that was
         * already started.
         */
        while (itr.hasNext()) {

            /*
             * Note: The Future of the JoinTaskFactoryTask returns the Future
             * of the JoinTask.
             */

            // future for the JoinTaskFactoryTask.
            final Future factoryTaskFuture = itr.next();

            if (log.isDebugEnabled())
                log.debug("Waiting for factoryTask");

            // wait for the JoinTaskFactoryTask to finish.
            final Future<Void> joinTaskFuture;
            try {

                if (!causes.isEmpty()) {

                    /*
                     * We have to abort, so cancel the factory task in case it
                     * is still running, but fall through and try to get its
                     * future in case it has already created the join task.
                     */
                    factoryTaskFuture.cancel(true/* mayInterruptIfRunning */);

                }

//                log.fatal("\nWaiting on factoryTaskFuture: " + factoryTaskFuture);

                joinTaskFuture = (Future<Void>) factoryTaskFuture.get();

//                log.fatal("\nHave joinTaskFuture: " + joinTaskFuture);

            } catch (ExecutionException ex) {

                causes.add(ex);

                /*
                 * Note: The error is logged here because the
                 * ExecutionExceptions that we throw below does not print out
                 * all of its stack traces.
                 *
                 * @todo log iff unexpected exception class or get all traces
                 * from the ExecutionExceptions class.
                 */
                log.error(ex, ex);

                continue;

            }

            if (causes.isEmpty()) {

                // no errors yet, so remember the future for the join task.
                joinTaskFutures.add(joinTaskFuture);

            } else {

                // cancel the join task since we have to abort anyway.
                joinTaskFuture.cancel(true/* mayInterruptIfRunning */);

            }

            ndone++;

            if (log.isDebugEnabled())
                log.debug("ndone=" + ndone + " of " + size);

        }

        if (!causes.isEmpty()) {

            for (Future<Void> f : joinTaskFutures) {

                // cancel since we have to abort anyway.
                f.cancel(true/* mayInterruptIfRunning */);

            }

            throw new ExecutionExceptions(causes);

        }

        if (log.isDebugEnabled())
            log.debug("All factory tasks done: #futures=" + size);

        return joinTaskFutures;

    }

}
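/*
 * Appendix (illustrative only, not referenced by this class): the
 * abort-handling policy of awaitFactoryFutures() in isolation -- collect every
 * failure, cancel whatever already started, and report all causes at once via
 * ExecutionExceptions. A generic restatement, assuming outer futures that wrap
 * inner futures as described above; the helper name is hypothetical.
 *
 *   static <T> List<Future<T>> awaitAll(final List<Future<Future<T>>> outer)
 *           throws InterruptedException, ExecutionExceptions {
 *
 *       final List<Future<T>> inner = new ArrayList<Future<T>>(outer.size());
 *       final List<ExecutionException> causes = new LinkedList<ExecutionException>();
 *
 *       for (Future<Future<T>> f : outer) {
 *           try {
 *               if (!causes.isEmpty())
 *                   f.cancel(true); // mayInterruptIfRunning
 *               final Future<T> g = f.get();
 *               if (causes.isEmpty())
 *                   inner.add(g);
 *               else
 *                   g.cancel(true); // aborting: cancel the inner task too.
 *           } catch (ExecutionException ex) {
 *               causes.add(ex);
 *           }
 *       }
 *
 *       if (!causes.isEmpty()) {
 *           for (Future<T> g : inner)
 *               g.cancel(true); // abort anything that was accepted so far.
 *           throw new ExecutionExceptions(causes);
 *       }
 *
 *       return inner;
 *   }
 */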