package com.bigdata.relation.rule.eval.pipeline;

import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.FutureTask;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.atomic.AtomicReference;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IVariable;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.journal.AbstractTask;
import com.bigdata.journal.ConcurrencyManager;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.IIndexStore;
import com.bigdata.journal.IJournal;
import com.bigdata.journal.ITx;
import com.bigdata.rdf.spo.SPOKeyOrder;
import com.bigdata.relation.IRelation;
import com.bigdata.relation.accesspath.AbstractUnsynchronizedArrayBuffer;
import com.bigdata.relation.accesspath.AccessPath;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.BufferClosedException;
import com.bigdata.relation.accesspath.IAccessPath;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.accesspath.UnsynchronizedArrayBuffer;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.IStarJoin;
import com.bigdata.relation.rule.IStarJoin.IStarConstraint;
import com.bigdata.relation.rule.eval.ChunkTrace;
import com.bigdata.relation.rule.eval.IJoinNexus;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.service.DataService;
import com.bigdata.service.IDataService;
import com.bigdata.striterator.IChunkedOrderedIterator;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.InnerCause;
import com.bigdata.util.concurrent.LatchedExecutor;

/**
 * Consumes {@link IBindingSet} chunks from the previous join dimension.
 * <p>
 * Note: Instances of this class MUST be created on the {@link IDataService}
 * that hosts the index partition on which the task will read, and they MUST
 * run inside of an {@link AbstractTask} on the {@link ConcurrencyManager} in
 * order to have access to the local index object for the index partition.
 * <p>
 * This class is NOT serializable.
 * <p>
 * For a rule with 2 predicates, there will be two {@link JoinTask}s. The
 * {@link #orderIndex} is ZERO (0) for the first {@link JoinTask} and ONE (1)
 * for the second {@link JoinTask}. The first {@link JoinTask} will have a
 * single initialBinding from the {@link JoinMasterTask} and will read on the
 * {@link IAccessPath} for the first {@link IPredicate} in the evaluation
 * {@link #order}. The second {@link JoinTask} will read chunks of
 * {@link IBindingSet}s containing partial solutions from the first
 * {@link JoinTask} and will obtain and read on an {@link IAccessPath} for the
 * second predicate in the evaluation order for every partial solution.
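 * <p>
 * To make the indexing concrete (an illustrative sketch, not an API
 * contract): for a two-tail rule evaluated in tail order
 * <code>order = {1, 0}</code>, the pipeline is
 *
 * <pre>
 * // orderIndex=0 : reads tail[order[0]] = tail[1] using the initial
 * //                binding set from the JoinMasterTask.
 * // orderIndex=1 : reads tail[order[1]] = tail[0] once for each partial
 * //                solution received from the orderIndex=0 task, and
 * //                (being the lastJoin) writes on the solution buffer.
 * </pre>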
 * <p>
 * Since there are only two {@link IPredicate}s in the {@link IRule}, the
 * second and last {@link JoinTask} will write on the {@link ISolution}
 * buffer obtained from {@link JoinMasterTask#getSolutionBuffer()}. Each
 * {@link JoinTask} will report its {@link JoinStats} to the master, which
 * aggregates those statistics.
 * <p>
 * Note: {@link ITx#UNISOLATED} requests will deadlock if the same query uses
 * the same access path for two predicates! This is because the first such
 * join dimension in the evaluation order will obtain an exclusive lock on an
 * index partition, making it impossible for another {@link JoinTask} to
 * obtain an exclusive lock on the same index partition. This is not a
 * problem if you are using read-consistent timestamps!
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 *
 * @todo Allow the access paths to be consumed in parallel. This would let us
 *       use more threads for join dimensions that have to test more source
 *       binding sets.
 *       <p>
 *       Parallel {@link AccessPathTask} processing is useful when each
 *       {@link AccessPathTask} consumes only a small chunk and there is a
 *       large #of source binding sets to be processed. In this case,
 *       parallelism reduces the overall latency by allowing threads to
 *       progress as soon as the data can be materialized from the index.
 *       {@link AccessPathTask} parallelism is realized by submitting each
 *       {@link AccessPathTask} to a service imposing a parallelism limit on
 *       the shared {@link IIndexStore#getExecutorService()}. Since the
 *       {@link AccessPathTask}s are concurrent, each one requires its own
 *       {@link UnsynchronizedOutputBuffer} on which it will place any
 *       accepted {@link IBindingSet}s. Once an {@link AccessPathTask}
 *       completes, its buffer may be reused by the next
 *       {@link AccessPathTask} assigned to a worker thread (this reduces
 *       heap churn and allows us to assemble full chunks when each
 *       {@link IAccessPath} realizes only a few accepted
 *       {@link IBindingSet}s). For an {@link ExecutorService} with a
 *       parallelism limit of N, there are therefore N
 *       {@link UnsynchronizedOutputBuffer}s. Those buffers must be flushed
 *       when the {@link JoinTask} exhausts its source(s). If the same set of
 *       threads is not known to be reused for each {@link AccessPathTask},
 *       then the actual #of buffers will be the #of distinct threads used.
 *       To reduce the potential memory demand, striped locks could be used
 *       to protect a pool of {@link UnsynchronizedArrayBuffer}s, but that
 *       could lead to deadlock if the buffer reference was exposed to the
 *       task (as opposed to adding the object to the buffer within a private
 *       method, which hides that reference) since more than one thread could
 *       demand access to the same buffer.
 *
 * @todo Parallel {@link ChunkTask} processing may be useful when an
 *       {@link AccessPathTask} will consume a large #of chunks. Since the
 *       {@link IAccessPath#iterator()} is NOT thread-safe, reads on the
 *       {@link IAccessPath} must be sequential, but the chunks read from the
 *       {@link IAccessPath} can be placed onto a queue and parallel
 *       {@link ChunkTask}s can drain that queue, consuming the chunks. This
 *       can help by reducing the latency to materialize any given chunk.
 *       <p>
 *       The required change is to have a per-thread
 *       {@link UnsynchronizedArrayBuffer} feeding a thread-safe
 *       {@link UnsyncDistributedOutputBuffer} (potentially via a queue)
 *       which maps each generated binding set across the index partition(s)
 *       for the sink {@link JoinTask}s. A sketch of that flow appears below.
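 *       <p>
 *       A minimal sketch of that flow, assuming the hypothetical
 *       {@link UnsyncDistributedOutputBuffer} named above (only
 *       {@link UnsynchronizedArrayBuffer} exists in this class today):
 *
 *       <pre>
 *       // each worker thread accumulates into its own unsynchronized buffer
 *       UnsynchronizedArrayBuffer&lt;IBindingSet&gt; local = threadLocal.get();
 *       local.add(bindingSet);
 *       // on flush, the accumulated chunk moves onto the shared,
 *       // thread-safe sink which maps it across the target partitions.
 *       local.flush();
 *       </pre>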
 */
abstract public class JoinTask implements Callable<Void> {

    static protected final Logger log = Logger.getLogger(JoinTask.class);

    /**
     * True iff the {@link #log} level is WARN or less.
     */
    static final protected boolean WARN = log.isEnabledFor(Level.WARN);

    /**
     * True iff the {@link #log} level is INFO or less.
     */
    static final protected boolean INFO = log.isInfoEnabled();

    /**
     * True iff the {@link #log} level is DEBUG or less.
     */
    static final protected boolean DEBUG = log.isDebugEnabled();

    /** The rule that is being evaluated. */
    final protected IRule<?> rule;

    /**
     * The #of predicates in the tail of that rule.
     */
    final protected int tailCount;

    /**
     * The index partition on which this {@link JoinTask} is reading -or-
     * <code>-1</code> if the deployment does not support key-range
     * partitioned indices.
     */
    final protected int partitionId;

    /**
     * The tail index in the rule for the predicate on which we are reading
     * for this join dimension.
     */
    final protected int tailIndex;

    /**
     * The {@link IPredicate} on which we are reading for this join
     * dimension.
     */
    final protected IPredicate<?> predicate;

    /**
     * The {@link IRelation} view on which we are reading for this join
     * dimension.
     */
    final protected IRelation<?> relation;

    /**
     * The index into the evaluation {@link #order} for the predicate on
     * which we are reading for this join dimension.
     */
    final protected int orderIndex;

    /**
     * <code>true</code> iff this is the last join dimension in the
     * evaluation order.
     */
    final protected boolean lastJoin;

    /**
     * A proxy for the remote {@link JoinMasterTask}.
     */
    final protected IJoinMaster masterProxy;

    final protected UUID masterUUID;

    /**
     * A list of variables required for each tail, by tailIndex. Used to
     * filter downstream variable binding sets.
     */
    final protected IVariable<?>[][] requiredVars;

    /**
     * The {@link IJoinNexus} for the local {@link IIndexManager}, which
     * will be the live {@link IJournal}. This {@link IJoinNexus} MUST have
     * access to the local index objects, which means that this class MUST
     * be run inside of the {@link ConcurrencyManager}. The
     * {@link #joinNexus} is created from the {@link #joinNexusFactory} once
     * the task begins to execute.
     */
    protected IJoinNexus joinNexus;

    /**
     * Volatile flag is set <code>true</code> if the {@link JoinTask}
     * (including any tasks executing on its behalf) should halt. This flag
     * is monitored by the {@link BindingSetConsumerTask}, the
     * {@link AccessPathTask}, and the {@link ChunkTask}. It is set by any
     * of those tasks if they are interrupted or error out.
     *
     * @todo review handling of this flag. Should an exception always be
     *       thrown if the flag is set wrapping the {@link #firstCause}?
     *       Are there any cases where the behavior should be different?
     *       If not, then replace tests with halt() and encapsulate the
     *       logic in that method.
     */
    volatile protected boolean halt = false;

    /**
     * Set by {@link BindingSetConsumerTask}, {@link AccessPathTask}, and
     * {@link ChunkTask} if they throw an error. Tasks are required to use
     * an {@link AtomicReference#compareAndSet(Object, Object)} and must
     * specify <code>null</code> as the expected value. This ensures that
     * only the first cause is recorded by this field.
     */
    final protected AtomicReference<Throwable> firstCause = new AtomicReference<Throwable>(
            null);

    /**
     * Indicate that join processing should halt. This method is written
     * defensively and will not throw anything.
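     * <p>
     * The first-cause idiom used by this method, in miniature (the same
     * pattern is documented on {@link #firstCause}):
     *
     * <pre>
     * // Only the first caller can record the cause; later causes are ignored.
     * final boolean isFirstCause = firstCause.compareAndSet(null, cause);
     * </pre>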
     *
     * @param cause
     *            The cause.
     */
    protected void halt(final Throwable cause) {

        halt = true;

        final boolean isFirstCause = firstCause.compareAndSet(
                null/* expect */, cause);

        if (WARN)
            try {

                if (!InnerCause.isInnerCause(cause, InterruptedException.class)
                        && !InnerCause.isInnerCause(cause, CancellationException.class)
                        && !InnerCause.isInnerCause(cause, ClosedByInterruptException.class)
                        && !InnerCause.isInnerCause(cause, RejectedExecutionException.class)
                        && !InnerCause.isInnerCause(cause, BufferClosedException.class)) {

                    /*
                     * This logs all unexpected causes, not just the first
                     * one to be reported for this join task.
                     *
                     * Note: The master will log the firstCause that it
                     * receives as an error.
                     */
                    log.warn("orderIndex=" + orderIndex + ", partitionId="
                            + partitionId + ", isFirstCause=" + isFirstCause
                            + " : " + cause.getLocalizedMessage(), cause);

                }

            } catch (Throwable ex) {

                // error in logging system - ignore.

            }

    }

    /**
     * The evaluation order. {@link #orderIndex} is the index into this
     * array. The {@link #orderIndex} is zero (0) for the first join
     * dimension and is incremented by one for each subsequent join
     * dimension. The value at <code>order[orderIndex]</code> is the index
     * of the tail predicate that will be evaluated at a given
     * {@link #orderIndex}.
     */
    final int[] order;

    /**
     * The statistics for this {@link JoinTask}.
     */
    final JoinStats stats;

    /**
     * A factory pattern for per-thread objects whose life cycle is tied to
     * some container. For example, there may be an instance of this pool
     * for a {@link JoinTask} or an {@link AbstractBTree}. The pool can be
     * torn down when the container is torn down, which prevents its
     * thread-local references from escaping.
     *
     * @author thompsonbry@users.sourceforge.net
     * @param <T>
     *            The generic type of the thread-local object.
     *
     * @todo There should be two implementations of a common interface or
     *       abstract base class: one based on a private
     *       {@link ConcurrentHashMap} and the other on striped locks. The
     *       advantage of the {@link ConcurrentHashMap} is approximately 3x
     *       higher concurrency. The advantage of striped locks is that you
     *       can directly manage the #of buffers when the #of threads using
     *       those buffers is unbounded. However, doing so could lead to
     *       deadlock since two threads can be hashed onto the same buffer
     *       object.
     */
    abstract public class ThreadLocalFactory<T extends IBuffer<E>, E> {

        /**
         * The thread-local queues.
         */
        private final ConcurrentHashMap<Thread, T> map;

        /**
         * A list of all objects visible to the caller. This is used to
         * ensure that any objects allocated by the factory are visited.
         * <p>
         * Note: Since the collection is not thread-safe, synchronization is
         * required when adding to the collection and when visiting the
         * elements of the collection.
         */
        private final LinkedList<T> list = new LinkedList<T>();

        protected ThreadLocalFactory() {

            this(16/* initialCapacity */, .75f/* loadFactor */,
                    16/* concurrencyLevel */);

        }

        protected ThreadLocalFactory(final int initialCapacity,
                final float loadFactor, final int concurrencyLevel) {

            map = new ConcurrentHashMap<Thread, T>(initialCapacity,
                    loadFactor, concurrencyLevel);

        }

        /**
         * Return the #of thread-local objects.
         */
        final public int size() {

            return map.size();

        }

        /**
         * Add the element to the thread-local buffer.
         *
         * @param e
         *            An element.
         *
         * @throws IllegalStateException
         *             if the factory is asynchronously closed.
         */
        public void add(E e) {

            get().add(e);

        }

        /**
         * Return a thread-local buffer.
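         * <p>
         * The typical life cycle of the buffers returned here, as driven by
         * this {@link JoinTask} (an illustrative sketch; the methods shown
         * are the real methods of this factory):
         *
         * <pre>
         * factory.add(e);  // get() lazily binds a buffer to the calling thread
         * // ... when the join exhausts its source(s):
         * factory.flush(); // flush every thread-local buffer
         * // ... or, on error:
         * factory.reset(); // discard all buffered writes
         * </pre>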
         *
         * @return The thread-local buffer.
         *
         * @throws RuntimeException
         *             if the join is halted.
         */
        final private T get() {

            final Thread t = Thread.currentThread();

            T tmp = map.get(t);

            if (tmp == null) {

                if (map.put(t, tmp = initialValue()) != null) {

                    /*
                     * Note: Since the key is the thread it is not possible
                     * for there to be a concurrent put of an entry under
                     * the same key so we do not have to use putIfAbsent().
                     */

                    throw new AssertionError();

                }

                // Add to list.
                synchronized (list) {

                    list.add(tmp);

                }

            }

            if (halt)
                throw new RuntimeException(firstCause.get());

            return tmp;

        }

        /**
         * Flush each of the unsynchronized buffers onto their backing
         * synchronized buffer.
         *
         * @throws RuntimeException
         *             if the join is halted.
         */
        public void flush() {

            synchronized (list) {

                int n = 0;

                long m = 0L;

                for (T b : list) {

                    if (halt)
                        throw new RuntimeException(firstCause.get());

                    // #of elements to be flushed.
                    final int size = b.size();

                    // flush, returning total #of elements written onto this
                    // buffer.
                    final long counter = b.flush();

                    m += counter;

                    // count the buffers flushed (was never incremented).
                    n++;

                    if (DEBUG)
                        log.debug("Flushed buffer: size=" + size
                                + ", counter=" + counter);

                }

                if (INFO)
                    log.info("Flushed " + n
                            + " unsynchronized buffers totalling " + m
                            + " elements");

            }

        }

        /**
         * Reset each of the unsynchronized buffers, discarding their
         * buffered writes.
         * <p>
         * Note: This method is used during error processing, therefore it
         * DOES NOT check {@link JoinTask#halt}.
         */
        public void reset() {

            synchronized (list) {

                int n = 0;

                for (T b : list) {

                    // #of elements in the buffer before reset().
                    final int size = b.size();

                    // reset the buffer.
                    b.reset();

                    // count the buffers reset (was never incremented).
                    n++;

                    if (DEBUG)
                        log.debug("Reset buffer: size=" + size);

                }

                if (INFO)
                    log.info("Reset " + n + " unsynchronized buffers");

            }

        }

        /**
         * Create and return a new object.
         */
        abstract protected T initialValue();

    }

    final private ThreadLocalFactory<AbstractUnsynchronizedArrayBuffer<IBindingSet>, IBindingSet> threadLocalBufferFactory = new ThreadLocalFactory<AbstractUnsynchronizedArrayBuffer<IBindingSet>, IBindingSet>() {

        @Override
        protected AbstractUnsynchronizedArrayBuffer<IBindingSet> initialValue() {

            // new buffer created by the concrete JoinTask impl.
            return newUnsyncOutputBuffer();

        }

    };

    /**
     * A method used by the {@link #threadLocalBufferFactory} to create new
     * output buffers as required. The output buffer will be used to
     * aggregate {@link IBindingSet}s generated by this {@link JoinTask}.
     * <p>
     * Note: A different implementation class must be used depending on
     * whether or not this is the last join dimension for the query (when it
     * is, then we write on the solution buffer) and whether or not the
     * target join index is key-range partitioned (when it is, each binding
     * set is mapped across the sink {@link JoinTask}(s)).
     */
    abstract protected AbstractUnsynchronizedArrayBuffer<IBindingSet> newUnsyncOutputBuffer();

    /**
     * The buffer on which the last predicate in the evaluation order will
     * write its {@link ISolution}s.
     *
     * @return The buffer.
     *
     * @throws IllegalStateException
     *             unless {@link #lastJoin} is <code>true</code>.
     */
    abstract protected IBuffer<ISolution[]> getSolutionBuffer();

    /**
     * Return the index of the tail predicate to be evaluated at the given
     * index in the evaluation order.
     *
     * @param orderIndex
     *            The evaluation order index.
     *
     * @return The tail index to be evaluated at that index in the
     *         evaluation order.
     */
    final protected int getTailIndex(final int orderIndex) {

        assert order != null;

        // note: the range check must precede the array access.
        assert orderIndex >= 0 && orderIndex < tailCount : "orderIndex="
                + orderIndex + ", rule=" + rule;

        final int tailIndex = order[orderIndex];

        return tailIndex;

    }

    public String toString() {

        return getClass().getName() + "{ orderIndex=" + orderIndex
                + ", partitionId=" + partitionId + ", lastJoin=" + lastJoin
                + ", masterUUID=" + masterUUID + "}";

    }

    /**
     * Instances of this class MUST be created in the appropriate execution
     * context of the target {@link DataService} so that the federation and
     * the joinNexus references are both correct and so that it has access
     * to the local index object for the specified index partition.
     *
     * @param rule
     * @param joinNexus
     * @param order
     * @param orderIndex
     * @param partitionId
     *            The index partition identifier and <code>-1</code> if the
     *            deployment does not support key-range partitioned indices.
     * @param masterProxy
     * @param masterUUID
     * @param requiredVars
     *
     * @see JoinTaskFactoryTask
     */
    public JoinTask(/* final String indexName, */final IRule rule,
            final IJoinNexus joinNexus, final int[] order,
            final int orderIndex, final int partitionId,
            final IJoinMaster masterProxy, final UUID masterUUID,
            final IVariable[][] requiredVars) {

        if (rule == null)
            throw new IllegalArgumentException();
        if (joinNexus == null)
            throw new IllegalArgumentException();
        final int tailCount = rule.getTailCount();
        if (order == null)
            throw new IllegalArgumentException();
        if (order.length != tailCount)
            throw new IllegalArgumentException();
        if (orderIndex < 0 || orderIndex >= tailCount)
            throw new IllegalArgumentException();
        if (masterProxy == null)
            throw new IllegalArgumentException();
        if (masterUUID == null)
            throw new IllegalArgumentException();
        if (requiredVars == null)
            throw new IllegalArgumentException();

        this.rule = rule;
        this.partitionId = partitionId;
        this.tailCount = tailCount;
        this.orderIndex = orderIndex;
        this.joinNexus = joinNexus;
        this.order = order; // note: assign before using getTailIndex()
        this.tailIndex = getTailIndex(orderIndex);
        this.lastJoin = ((orderIndex + 1) == tailCount);
        this.predicate = rule.getTail(tailIndex);
        this.relation = joinNexus.getTailRelationView(predicate);
        this.stats = new JoinStats(partitionId, orderIndex);
        this.masterProxy = masterProxy;
        this.masterUUID = masterUUID;
        this.requiredVars = requiredVars;

        if (DEBUG)
            log.debug("orderIndex=" + orderIndex + ", partitionId="
                    + partitionId);

    }

    /**
     * Runs the {@link JoinTask}.
     *
     * @return <code>null</code>.
     */
    public Void call() throws Exception {

        if (DEBUG)
            log.debug("orderIndex=" + orderIndex + ", partitionId="
                    + partitionId);

        try {

            /*
             * Consume bindingSet chunks from the source JoinTask(s).
             */
            consumeSources();

            /*
             * Flush and close output buffers and wait for all sink
             * JoinTasks to complete.
             */

            // flush the unsync buffers.
            threadLocalBufferFactory.flush();

            // flush the sync buffer and await the sink JoinTasks
            flushAndCloseBuffersAndAwaitSinks();

            if (DEBUG)
                log.debug("JoinTask done: orderIndex=" + orderIndex
                        + ", partitionId=" + partitionId + ", halt=" + halt
                        + ", firstCause=" + firstCause.get());

            if (halt)
                throw new RuntimeException(firstCause.get());

            return null;

        } catch (Throwable t) {

            try {
                logCallError(t);
            } catch (Throwable t2) {
                log.error(t2.getLocalizedMessage(), t2);
            }

            /*
             * This is used for processing errors and also if this task is
             * interrupted (because a SLICE has been satisfied).
             *
             * @todo For a SLICE, consider that the query solution buffer
             * proxy could return the #of solutions added so far so that we
             * can halt each join task on the last join dimension in a
             * relatively timely manner producing no more than one chunk too
             * many (actually, it might not be that timely since some index
             * partitions might not produce any solutions; this suggests
             * that the master might need a fatter API than a Future for the
             * JoinTask so that it can directly notify the JoinTasks for the
             * first predicate and they can propagate that notice downstream
             * to their sinks). This will be an issue when fanOut GT ONE.
             */
            halt(t);

            // reset the unsync buffers.
            try {
                threadLocalBufferFactory.reset();
            } catch (Throwable t2) {
                log.error(t2.getLocalizedMessage(), t2);
            }

            // reset the sync buffer and cancel the sink JoinTasks.
            try {
                cancelSinks();
            } catch (Throwable t2) {
                log.error(t2.getLocalizedMessage(), t2);
            }

            // report join stats _before_ we close our source(s).
            try {
                reportOnce();
            } catch (Throwable t2) {
                log.error(t2.getLocalizedMessage(), t2);
            }

            /*
             * Close source iterators, which will cause any source JoinTasks
             * that are still executing to throw a CancellationException
             * when the Future associated with the source iterator is
             * cancelled.
             */
            try {
                closeSources();
            } catch (Throwable t2) {
                log.error(t2.getLocalizedMessage(), t2);
            }

            throw new RuntimeException(t);

        } finally {

            // report join stats iff they have not already been reported.
            reportOnce();

        }

    }

    /**
     * Method is used to log the primary exception thrown by
     * {@link #call()}. The default implementation does nothing and the
     * exception will be logged by the {@link JoinMasterTask}. However, this
     * method is overridden by {@link DistributedJoinTask} so that the
     * exception can be logged on the host and {@link DataService} where it
     * originates. This appears to be necessary in order to trace back the
     * cause of an exception which can otherwise be obscured (or even lost?)
     * in a deeply nested RMI stack trace.
     *
     * @param t
     *            The exception.
     */
    protected void logCallError(Throwable t) {

    }

    /**
     * Method reports {@link JoinStats} to the {@link JoinMasterTask}, but
     * only if they have not already been reported. This "report once"
     * constraint is used to make it safe to invoke during error handling
     * before actions which could cause the source {@link JoinTask}s (and
     * hence the {@link JoinMasterTask}) to terminate.
     */
    protected void reportOnce() {

        if (!didReport) {

            didReport = true;

            try {

                // report statistics to the master.
                masterProxy.report(stats);

            } catch (IOException ex) {

                log.warn("Could not report statistics to the master", ex);

            }

        }

    }

    private boolean didReport = false;

    /**
     * Consume {@link IBindingSet} chunks from source(s). The first join
     * dimension always has a single source - the initialBindingSet
     * established by the {@link JoinMasterTask}.
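     * <p>
     * The consumption loop itself is simple; what varies per subclass is
     * where {@link #nextChunk()} reads from (see
     * {@link BindingSetConsumerTask#call()} for the actual loop):
     *
     * <pre>
     * IBindingSet[] chunk;
     * while (!halt &amp;&amp; (chunk = nextChunk()) != null) {
     *     // generate, reorder, and run one AccessPathTask per distinct
     *     // asBound predicate licensed by the chunk.
     * }
     * </pre>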
     * <p>
     * Downstream join dimensions read from {@link IAsynchronousIterator}(s)
     * from the upstream join dimension. When the {@link IIndexManager}
     * allows key-range partitions, then the fan-in for the sources may be
     * larger than one as there will be one {@link JoinTask} for each index
     * partition touched by each join dimension.
     *
     * @throws Exception
     * @throws BufferClosedException
     *             if there is an attempt to output a chunk of
     *             {@link IBindingSet}s or {@link ISolution}s and the output
     *             buffer is an {@link IBlockingBuffer} (true for all join
     *             dimensions except the lastJoin and also true for query on
     *             the lastJoin) and that {@link IBlockingBuffer} has been
     *             closed.
     */
    protected void consumeSources() throws Exception {

        if (INFO)
            log.info(toString());

        /*
         * The maximum parallelism with which the {@link JoinTask} will
         * consume the source {@link IBindingSet}s.
         *
         * Note: When ZERO (0), everything will run in the caller's
         * {@link Thread}. When GT ZERO (0), tasks will run on an
         * {@link ExecutorService} with the specified maximum parallelism.
         *
         * Note: even when maxParallel is zero there will be one thread per
         * join dimension. For many queries that may be just fine.
         *
         * FIXME parallel execution requires some thread-local
         * unsynchronized buffers -- see my notes elsewhere in this class
         * for what has to be done to support this (actually, it all appears
         * to work just fine).
         */
        final int maxParallel = 0;
        // final int maxParallel = joinNexus.getMaxParallelSubqueries();

        /*
         * Note: There is no reason for parallelism in the first join
         * dimension as there will be only a single source bindingSet and
         * hence a single AccessPathTask so the Executor is just overhead.
         *
         * @todo this will not be true when we support binding set joins as
         * the input could be a stream of binding sets (basically, when the
         * first join dimension is a subrule, it can have lots of access
         * path tasks).
         */
        if (orderIndex > 0 && maxParallel > 0) {

            // the sharedService.
            final ExecutorService sharedService = joinNexus
                    .getIndexManager().getExecutorService();

            // impose a parallelism limit on the shared service.
            final Executor limitedService = new LatchedExecutor(
                    sharedService, maxParallel);

            /*
             * consume chunks until done (using caller's thread to consume
             * and service to run subtasks).
             */
            new BindingSetConsumerTask(limitedService).call();

            if (halt)
                throw new RuntimeException(firstCause.get());

        } else {

            /*
             * consume chunks until done using the caller's thread and run
             * subtasks in the caller's thread as well.
             */
            new BindingSetConsumerTask(null/* noService */).call();

        }

    }

    /**
     * Close any source {@link IAsynchronousIterator}(s). This method is
     * invoked when a {@link JoinTask} fails.
     */
    abstract void closeSources();
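    /*
     * Illustrative sketch only (the real implementations live in the
     * concrete subclasses, e.g. DistributedJoinTask): closeSources()
     * typically just closes each source iterator so that anything blocked
     * on a source terminates promptly.
     *
     *     for (IAsynchronousIterator<IBindingSet[]> src : sources)
     *         src.close();
     */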
    /**
     * Flush and close all output buffers and await sink {@link JoinTask}(s).
     * <p>
     * Note: You MUST close the {@link BlockingBuffer} from which each sink
     * reads <em>before</em> invoking this method in order for those sinks
     * to terminate. Otherwise the source {@link IAsynchronousIterator}(s)
     * on which the sink is reading will remain open and the sink will never
     * decide that it has exhausted its source(s).
     *
     * @throws InterruptedException
     * @throws ExecutionException
     */
    abstract protected void flushAndCloseBuffersAndAwaitSinks()
            throws InterruptedException, ExecutionException;

    /**
     * Cancel sink {@link JoinTask}(s).
     */
    abstract protected void cancelSinks();

    /**
     * Return a chunk of {@link IBindingSet}s from the
     * {@link IAsynchronousIterator}s. The 1st join dimension is always fed
     * by the {@link JoinMasterTask}. The nth+1 join dimension is always fed
     * by the nth {@link JoinTask}(s).
     *
     * @return The next available chunk of {@link IBindingSet}s -or-
     *         <code>null</code> IFF all known source(s) are exhausted.
     */
    abstract protected IBindingSet[] nextChunk() throws InterruptedException;

    /**
     * Class consumes chunks from the source(s) until canceled, interrupted,
     * or all source(s) are exhausted. For each distinct asBound predicate
     * generated from the {@link IBindingSet}s in each chunk, an
     * {@link AccessPathTask} is created which will consume the
     * {@link IBindingSet}s licensing that predicate. The
     * {@link AccessPathTask}s for a given source chunk are sorted based on
     * their <code>fromKey</code> so as to order the execution of those
     * tasks in a manner that will maximize the efficiency of index reads.
     * The ordered {@link AccessPathTask}s are then submitted to the
     * caller's {@link Executor}.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id$
     */
    protected class BindingSetConsumerTask implements Callable<Void> {

        private final Executor executor;

        /**
         * @param executor
         *            The service that will execute the generated
         *            {@link AccessPathTask}s -or- <code>null</code> IFF you
         *            want the {@link AccessPathTask}s to be executed in the
         *            caller's thread.
         */
        public BindingSetConsumerTask(final Executor executor) {

            this.executor = executor;

        }

        /**
         * Read chunks from one or more sources until canceled, interrupted,
         * or all sources are exhausted, and submit {@link AccessPathTask}s
         * to the caller's {@link ExecutorService} -or- execute those tasks
         * in the caller's thread if no {@link ExecutorService} was provided
         * to the ctor.
         * <p>
         * Note: When running with an {@link ExecutorService}, the caller is
         * responsible for waiting on that {@link ExecutorService} until the
         * {@link AccessPathTask}s complete and must verify that all tasks
         * completed successfully.
         *
         * @return <code>null</code>
         *
         * @throws BufferClosedException
         *             if there is an attempt to output a chunk of
         *             {@link IBindingSet}s or {@link ISolution}s and the
         *             output buffer is an {@link IBlockingBuffer} (true for
         *             all join dimensions except the lastJoin and also true
         *             for query on the lastJoin) and that
         *             {@link IBlockingBuffer} has been closed.
         */
        public Void call() throws Exception {

            try {

                if (DEBUG)
                    log.debug("begin: orderIndex=" + orderIndex
                            + ", partitionId=" + partitionId);

                IBindingSet[] chunk;

                while (!halt && (chunk = nextChunk()) != null) {

                    /*
                     * @todo ChunkTrace for bindingSet chunks in as well as
                     * access path chunks consumed.
                     */

                    if (DEBUG)
                        log.debug("Read chunk of bindings: chunkSize="
                                + chunk.length + ", orderIndex="
                                + orderIndex + ", partitionId="
                                + partitionId);

                    /*
                     * Aggregate the source bindingSets that license the
                     * same asBound predicate.
                     */
                    final Map<IPredicate<?>, Collection<IBindingSet>> map = combineBindingSets(chunk);

                    /*
                     * Generate an AccessPathTask from each distinct asBound
                     * predicate that will consume all of the source
                     * bindingSets in the chunk which resulted in the same
                     * asBound predicate.
                     */
                    final AccessPathTask[] tasks = getAccessPathTasks(map);

                    /*
                     * Reorder those tasks for better index read
                     * performance.
                     */
                    reorderTasks(tasks);

                    /*
                     * Execute the tasks (either in the caller's thread or
                     * on the supplied service).
                     */
                    executeTasks(tasks);

                }

                if (halt)
                    throw new RuntimeException(firstCause.get());

                if (DEBUG)
                    log.debug("done: orderIndex=" + orderIndex
                            + ", partitionId=" + partitionId);

                return null;

            } catch (Throwable t) {

                halt(t);

                throw new RuntimeException(t);

            }

        }

        /**
         * Populates a map of asBound predicates paired to a set of
         * bindingSets.
         * <p>
         * Note: The {@link AccessPathTask} will apply each bindingSet to
         * each element visited by the {@link IAccessPath} obtained for the
         * asBound {@link IPredicate}. This has the natural consequence of
         * eliminating subqueries within the chunk.
         *
         * @param chunk
         *            A chunk of bindingSets from the source join dimension.
         *
         * @return A map which pairs the distinct asBound predicates to the
         *         bindingSets in the chunk from which the predicate was
         *         generated.
         */
        protected Map<IPredicate<?>, Collection<IBindingSet>> combineBindingSets(
                final IBindingSet[] chunk) {

            if (DEBUG)
                log.debug("chunkSize=" + chunk.length);

            final int tailIndex = getTailIndex(orderIndex);

            final Map<IPredicate<?>, Collection<IBindingSet>> map = new LinkedHashMap<IPredicate<?>, Collection<IBindingSet>>(
                    chunk.length);

            for (IBindingSet bindingSet : chunk) {

                if (halt)
                    throw new RuntimeException(firstCause.get());

                // constrain the predicate to the given bindings.
                IPredicate<?> predicate = rule.getTail(tailIndex).asBound(
                        bindingSet);

                if (partitionId != -1) {

                    /*
                     * Constrain the predicate to the desired index
                     * partition.
                     *
                     * Note: we do this for scale-out joins since the access
                     * path will be evaluated by a JoinTask dedicated to
                     * this index partition, which is part of how the
                     * JoinTask gains access to the local index object for
                     * an index partition.
                     */
                    predicate = predicate.setPartitionId(partitionId);

                }

                // lookup the asBound predicate in the map.
                Collection<IBindingSet> values = map.get(predicate);

                if (values == null) {

                    /*
                     * This is the first bindingSet for this asBound
                     * predicate. We create a collection of bindingSets to
                     * be paired with that predicate and put the collection
                     * into the map using that predicate as the key.
                     */

                    values = new LinkedList<IBindingSet>();

                    map.put(predicate, values);

                } else {

                    // more than one bindingSet will use the same access
                    // path.
                    stats.accessPathDups++;

                }

                /*
                 * Add the bindingSet to the collection of bindingSets
                 * paired with the asBound predicate.
                 */
                values.add(bindingSet);

            }

            if (DEBUG)
                log.debug("chunkSize=" + chunk.length
                        + ", #distinct predicates=" + map.size());

            return map;

        }

        /**
         * Creates an {@link AccessPathTask} for each distinct asBound
         * predicate in the given map, pairing it with the
         * {@link IBindingSet}s which licensed that predicate.
         *
         * @param map
         *            A map from the distinct asBound predicates to the
         *            {@link IBindingSet}s (from one or more source
         *            {@link JoinTask}s) which licensed those predicates.
         *
         * @return An array of {@link AccessPathTask}s (the caller imposes
         *         the desired execution order).
         */
        protected AccessPathTask[] getAccessPathTasks(
                final Map<IPredicate<?>, Collection<IBindingSet>> map) {

            final int n = map.size();

            if (DEBUG)
                log.debug("#distinct predicates=" + n);

            final AccessPathTask[] tasks = new AccessPathTask[n];

            final Iterator<Map.Entry<IPredicate<?>, Collection<IBindingSet>>> itr = map
                    .entrySet().iterator();

            int i = 0;

            while (itr.hasNext()) {

                if (halt)
                    throw new RuntimeException(firstCause.get());

                final Map.Entry<IPredicate<?>, Collection<IBindingSet>> entry = itr
                        .next();

                tasks[i++] = new AccessPathTask(entry.getKey(), entry
                        .getValue());

            }

            return tasks;

        }

        /**
         * The tasks are ordered based on the <i>fromKey</i> for the
         * associated {@link IAccessPath} as licensed by each
         * {@link IBindingSet}. This order tends to focus the reads on the
         * same parts of the index partitions with a steady progression in
         * the <i>fromKey</i> as we process a chunk of {@link IBindingSet}s.
         *
         * @param tasks
         *            The tasks.
         */
        protected void reorderTasks(final AccessPathTask[] tasks) {

            // @todo layered access paths do not expose a fromKey.
            if (tasks[0].accessPath instanceof AccessPath<?>) {

                // reorder the tasks.
                Arrays.sort(tasks);

            }

        }

        /**
         * Either execute the tasks in the caller's thread or schedule them
         * for execution on the supplied service.
         *
         * @param tasks
         *            The tasks.
         *
         * @throws Exception
         */
        protected void executeTasks(final AccessPathTask[] tasks)
                throws Exception {

            if (executor == null) {

                /*
                 * No Executor, so run each task in the caller's thread.
                 */
                for (AccessPathTask task : tasks) {

                    task.call();

                }

                return;

            }

            /*
             * Build a list of FutureTasks. This list is used to check all
             * tasks for errors and ensure that any running tasks are
             * cancelled.
             */
            final List<FutureTask<Void>> futureTasks = new LinkedList<FutureTask<Void>>();

            for (AccessPathTask task : tasks) {

                final FutureTask<Void> ft = new FutureTask<Void>(task);

                futureTasks.add(ft);

            }

            try {

                /*
                 * Execute all tasks.
                 */
                for (FutureTask<Void> ft : futureTasks) {

                    if (halt)
                        throw new RuntimeException(firstCause.get());

                    // Queue for execution.
                    executor.execute(ft);

                } // next task.

                /*
                 * Wait for each task. If any task throws an exception, then
                 * [halt] will become true and any running tasks will error
                 * out quickly.
                 * Once [halt := true], we do not wait for any more tasks,
                 * but proceed to cancel all tasks in the finally {} clause
                 * below.
                 */
                for (FutureTask<Void> ft : futureTasks) {

                    // Wait for a task.
                    if (!halt)
                        ft.get();

                }

            } finally {

                /*
                 * Ensure that all tasks are cancelled, regardless of
                 * whether they were started or have already finished.
                 */
                for (FutureTask<Void> ft : futureTasks) {

                    ft.cancel(true/* mayInterruptIfRunning */);

                }

            }

        }

    }

    /**
     * Accepts an asBound {@link IPredicate} and a (non-empty) collection of
     * {@link IBindingSet}s, each of which licenses the same asBound
     * predicate for the current join dimension. The task obtains the
     * corresponding {@link IAccessPath} and delegates each chunk visited on
     * that {@link IAccessPath} to a {@link ChunkTask}. Note that optionals
     * are also handled by this task.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id$
     */
    protected class AccessPathTask implements Callable<Void>,
            Comparable<AccessPathTask> {

        /**
         * The {@link IBindingSet}s from the source join dimension to be
         * combined with each element visited on the {@link #accessPath}.
         * If there is only a single source {@link IBindingSet} in a given
         * chunk of source {@link IBindingSet}s that results in the same
         * asBound {@link IPredicate} then this will be a collection with a
         * single member. However, if multiple source {@link IBindingSet}s
         * result in the same asBound {@link IPredicate} within the same
         * chunk then those are aggregated and appear together in this
         * collection.
         * <p>
         * Note: An array is used for thread-safe traversal.
         */
        final private IBindingSet[] bindingSets;

        /**
         * The {@link IAccessPath} corresponding to the asBound
         * {@link IPredicate} for this join dimension. The asBound
         * {@link IPredicate} is {@link IAccessPath#getPredicate()}.
         */
        final private IAccessPath<?> accessPath;

        /**
         * Return the <em>fromKey</em> for the {@link IAccessPath} generated
         * from the {@link IBindingSet} for this task.
         *
         * @todo layered access paths do not expose a fromKey. This
         *       information is always available from the
         *       {@link SPOKeyOrder} and that method will be raised into the
         *       {@link IKeyOrder}. Unfortunately, for RDF we also need to
         *       know if triples or quads are being used, which is a
         *       property on the container or the relation.
         */
        protected byte[] getFromKey() {

            return ((AccessPath<?>) accessPath).getFromKey();

        }

        /**
         * Return <code>true</code> iff the tasks are equivalent (same
         * asBound predicate). This test may be used to eliminate duplicates
         * that arise when different source {@link JoinTask}s generate the
         * same {@link IBindingSet}.
         *
         * @param o
         *            Another task.
         *
         * @return <code>true</code> iff the asBound predicate is equals().
         */
        public boolean equals(final AccessPathTask o) {

            return accessPath.getPredicate().equals(
                    o.accessPath.getPredicate());

        }

        /**
         * Evaluate an {@link IBindingSet} for the join dimension. When the
         * task runs, it will pair each element visited on the
         * {@link IAccessPath} with the asBound {@link IPredicate}. For each
         * element visited, if the binding is acceptable for the constraints
         * on the asBound {@link IPredicate}, then the task will emit one
         * {@link IBindingSet} for each source {@link IBindingSet}.
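         * <p>
         * For example (illustrative values): if the source binding sets
         * <code>{x=a, y=b1}</code> and <code>{x=a, y=b2}</code> both
         * license the asBound predicate <code>pred(a, ?z)</code>, then a
         * single {@link AccessPathTask} reads that access path once and
         * joins each element visited against both binding sets.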
         *
         * @param predicate
         *            The asBound {@link IPredicate}.
         * @param bindingSets
         *            A collection of {@link IBindingSet}s from the source
         *            join dimension that all result in the same asBound
         *            {@link IPredicate}.
         */
        public AccessPathTask(final IPredicate<?> predicate,
                final Collection<IBindingSet> bindingSets) {

            if (predicate == null)
                throw new IllegalArgumentException();

            if (bindingSets == null)
                throw new IllegalArgumentException();

            /*
             * Note: this needs to be the access path for the local index
             * partition. We handle this by (a) constraining the predicate
             * to the desired index partition; (b) using an IJoinNexus that
             * is initialized once the JoinTask starts to execute inside of
             * the ConcurrencyManager; and (c) declaring and using the index
             * partition name, NOT the scale-out index name.
             */

            final int n = bindingSets.size();

            if (n == 0)
                throw new IllegalArgumentException();

            this.accessPath = joinNexus.getTailAccessPath(relation,
                    predicate);

            if (DEBUG)
                log.debug("orderIndex=" + orderIndex + ", tailIndex="
                        + tailIndex + ", tail=" + rule.getTail(tailIndex)
                        + ", #bindingSets=" + n + ", accessPath="
                        + accessPath);

            // convert to array for thread-safe traversal.
            this.bindingSets = bindingSets.toArray(new IBindingSet[n]);

        }

        public String toString() {

            return JoinTask.this.getClass().getSimpleName()
                    + "{ orderIndex=" + orderIndex + ", partitionId="
                    + partitionId + ", #bindingSets=" + bindingSets.length
                    + "}";

        }

        /**
         * Evaluate the {@link #accessPath} against the
         * {@link #bindingSets}. If nothing is accepted and
         * {@link IPredicate#isOptional()} then the {@link #bindingSets} is
         * output anyway (this implements the semantics of OPTIONAL).
         *
         * @return <code>null</code>.
         *
         * @throws BufferClosedException
         *             if there is an attempt to output a chunk of
         *             {@link IBindingSet}s or {@link ISolution}s and the
         *             output buffer is an {@link IBlockingBuffer} (true for
         *             all join dimensions except the lastJoin and also true
         *             for query on the lastJoin) and that
         *             {@link IBlockingBuffer} has been closed.
         */
        public Void call() throws Exception {

            if (halt)
                throw new RuntimeException(firstCause.get());

            stats.accessPathCount++;

            if (accessPath.getPredicate() instanceof IStarJoin<?>) {

                handleStarJoin();

            } else {

                handleJoin();

            }

            return null;

        }

        /**
         * A vectored pipeline join (chunk at a time processing).
         */
        protected void handleJoin() {

            boolean nothingAccepted = true;

            // Obtain the iterator for the current join dimension.
            final IChunkedOrderedIterator<?> itr = accessPath.iterator();

            try {

                /*
                 * @todo In order to run the chunks on a thread pool, pass
                 * in [null] for the unsyncBuffer and each chunk will get
                 * its own buffer.
                 */
                final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncBuffer = threadLocalBufferFactory
                        .get();

                while (itr.hasNext()) {

                    final Object[] chunk = itr.nextChunk();

                    stats.chunkCount++;

                    // process the chunk in the caller's thread.
                    final boolean somethingAccepted = new ChunkTask(
                            bindingSets, unsyncBuffer, chunk).call();

                    if (somethingAccepted) {

                        // something in the chunk was accepted.
                        nothingAccepted = false;

                    }

                } // next chunk.

                if (nothingAccepted && predicate.isOptional()) {

                    /*
                     * Note: when NO binding sets were accepted AND the
                     * predicate is OPTIONAL then we output the _original_
                     * binding set(s) to the sink join task(s).
                     */
                    for (IBindingSet bs : this.bindingSets) {

                        unsyncBuffer.add(bs);

                    }

                }

                return;

            } catch (Throwable t) {

                halt(t);

                throw new RuntimeException(t);

            } finally {

                itr.close();

            }

        }

        protected void handleStarJoin() {

            IBindingSet[] solutions = this.bindingSets;

            final IStarJoin starJoin = (IStarJoin) accessPath
                    .getPredicate();

            final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncBuffer = threadLocalBufferFactory
                    .get();

            // Obtain the iterator for the current join dimension.
            final IChunkedOrderedIterator<?> itr = accessPath.iterator();

            // The actual #of elements scanned.
            int numElements = 0;

            try {

                /*
                 * Note: The fast range count would give us an upper bound,
                 * unless expanders are used, in which case there can be
                 * more elements visited.
                 */
                final Object[] elements;
                {

                    /*
                     * First, gather all chunks.
                     */
                    int nchunks = 0;

                    final List<Object[]> chunks = new LinkedList<Object[]>();

                    while (itr.hasNext()) {

                        final Object[] chunk = (Object[]) itr.nextChunk();

                        // add to list of chunks.
                        chunks.add(chunk);

                        numElements += chunk.length;

                        stats.chunkCount++;

                        nchunks++;

                    } // next chunk.

                    /*
                     * Now flatten the chunks into a simple array.
                     */
                    if (nchunks == 0) {

                        // No match.
                        return;

                    }

                    if (nchunks == 1) {

                        // A single chunk.
                        elements = chunks.get(0);

                    } else {

                        // Flatten the chunks.
                        elements = new Object[numElements];

                        {

                            int n = 0;

                            for (Object[] chunk : chunks) {

                                System.arraycopy(chunk/* src */,
                                        0/* srcPos */, elements/* dst */,
                                        n/* dstPos */, chunk.length/* len */);

                                n += chunk.length;

                            }

                        }

                    }

                    stats.elementCount += numElements;

                }

                if (numElements > 0) {

                    final Iterator<IStarConstraint<?>> it = starJoin
                            .getStarConstraints();

                    boolean constraintFailed = false;

                    while (it.hasNext()) {

                        final IStarConstraint constraint = it.next();

                        Collection<IBindingSet> constraintSolutions = null;

                        int numVars = constraint.getNumVars();

                        for (int i = 0; i < numElements; i++) {

                            Object e = elements[i];

                            if (constraint.isMatch(e)) {

                                /*
                                 * For each match for the constraint, we
                                 * clone the old solutions and create new
                                 * solutions that append the variable
                                 * bindings from this match.
                                 *
                                 * At the end, we set the old solutions
                                 * collection to the new solutions
                                 * collection.
                                 */

                                if (constraintSolutions == null) {

                                    constraintSolutions = new LinkedList<IBindingSet>();

                                }

                                for (IBindingSet bs : solutions) {

                                    if (numVars > 0) {

                                        bs = bs.clone();

                                        constraint.bind(bs, e);

                                    }

                                    constraintSolutions.add(bs);

                                }

                                /*
                                 * No reason to keep testing SPOs: when
                                 * there are no variables there can be only
                                 * one match.
                                 */
                                if (numVars == 0) {

                                    break;

                                }

                            }

                        }

                        if (constraintSolutions == null) {

                            /*
                             * We did not find any matches for this
                             * constraint. That is ok, as long as the
                             * constraint is optional.
                             */
                            if (!constraint.isOptional()) {

                                constraintFailed = true;

                                break;

                            }

                        } else {

                            /*
                             * Set the old solutions to the new solutions,
                             * and move on to the next constraint.
                             */
                            solutions = constraintSolutions
                                    .toArray(new IBindingSet[constraintSolutions
                                            .size()]);

                        }

                    }

                    if (!constraintFailed) {

                        for (IBindingSet bs : solutions) {

                            unsyncBuffer.add(bs);

                        }

                    }

                }

                return;

            } catch (Throwable t) {

                halt(t);

                throw new RuntimeException(t);

            } finally {

                itr.close();

            }

        }

        /**
         * Imposes an order based on the <em>fromKey</em> for the
         * {@link IAccessPath} associated with the task.
         *
         * @param o
         *            Another task.
         *
         * @return As per {@link Comparable#compareTo(Object)}.
         */
        public int compareTo(final AccessPathTask o) {

            return BytesUtil.compareBytes(getFromKey(), o.getFromKey());

        }

    }

    /**
     * Task processes a chunk of elements read from the {@link IAccessPath}
     * for a join dimension. Each element in the chunk is paired with a copy
     * of the given bindings. If that {@link IBindingSet} is accepted by the
     * {@link IRule}, then the {@link IBindingSet} will be output. The
     * {@link IBindingSet}s to be output are buffered into chunks and the
     * chunks added to the {@link JoinPipelineTask#bindingSetBuffers} for
     * the corresponding predicate.
     *
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id$
     */
    protected class ChunkTask implements Callable<Boolean> {

        /**
         * The index of the predicate for the access path that is being
         * consumed.
         */
        private final int tailIndex;

        /**
         * The {@link IBindingSet}s with which each element in the chunk
         * will be paired to create {@link IBindingSet}s for the downstream
         * join dimension.
         */
        private final IBindingSet[] bindingSets;

        /**
         * A per-{@link Thread} buffer that is used to collect
         * {@link IBindingSet}s into chunks before handing them off to the
         * next join dimension. The hand-off occurs no later than when the
         * current join dimension finishes consuming its source(s).
         */
        private final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncBuffer;

        /**
         * A chunk of elements read from the {@link IAccessPath} for the
         * current join dimension.
         */
        private final Object[] chunk;

        /**
         * @param bindingSet
         *            The bindings with which each element in the chunk will
         *            be paired to create the bindings for the downstream
         *            join dimension.
         * @param unsyncBuffer
         *            A per-{@link Thread} buffer used to accumulate chunks
         *            of generated {@link IBindingSet}s (optional). When the
         *            {@link ChunkTask} will be run in its own thread, pass
         *            <code>null</code> and the buffer will be obtained in
         *            {@link #call()}.
         * @param chunk
         *            A chunk of elements read from the {@link IAccessPath}
         *            for the current join dimension.
         */
        public ChunkTask(
                final IBindingSet[] bindingSet,
                final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncBuffer,
                final Object[] chunk) {

            if (bindingSet == null)
                throw new IllegalArgumentException();

            // Note: null is allowed for the unsyncBuffer (see above).

            if (chunk == null)
                throw new IllegalArgumentException();

            this.tailIndex = getTailIndex(orderIndex);

            this.bindingSets = bindingSet;

            this.chunk = chunk;

            this.unsyncBuffer = unsyncBuffer;

        }

        /**
         * @return <code>true</code> iff at least one element in the chunk
         *         (as read from the access path by the caller) was accepted
         *         when combined with the {@link #bindingSets} from the
         *         source {@link JoinTask}.
         *
         * @throws BufferClosedException
         *             if there is an attempt to output a chunk of
         *             {@link IBindingSet}s or {@link ISolution}s and the
         *             output buffer is an {@link IBlockingBuffer} (true for
         *             all join dimensions except the lastJoin and also true
         *             for query on the lastJoin) and that
         *             {@link IBlockingBuffer} has been closed.
         */
        public Boolean call() throws Exception {

            try {

                ChunkTrace.chunk(orderIndex, chunk);

                boolean nothingAccepted = true;

                // Use the caller's buffer or obtain our own as necessary.
                final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncBuffer = (this.unsyncBuffer == null) ? threadLocalBufferFactory
                        .get()
                        : this.unsyncBuffer;

                for (Object e : chunk) {

                    if (halt)
                        // note: the return convention is "something was
                        // accepted".
                        return !nothingAccepted;

                    // naccepted for the current element (trace only).
                    int naccepted = 0;

                    stats.elementCount++;

                    for (IBindingSet bset : bindingSets) {

                        final IVariable<?>[] variablesToKeep = requiredVars[tailIndex];

                        if (INFO) {
                            log.info("tailIndex: " + tailIndex);
                            log.info("bset before: " + bset);
                        }

                        /*
                         * Clone the binding set since it is tested for each
                         * element visited.
                         */
                        bset = bset.clone();

                        if (INFO) {
                            log.info("tailIndex: " + tailIndex);
                            log.info("bset after: " + bset);
                            log.info("element: " + e);
                        }

                        // propagate bindings from the visited element.
                        if (joinNexus.bind(rule, tailIndex, e, bset)) {

                            bset = bset.copy(variablesToKeep);

                            /*
                             * Accept this binding set.
                             *
                             * @todo This is the place to intervene for
                             * scale-out default graph queries. Instead of
                             * directly accepting the bset, place the
                             * (bset,e) pair on a queue which targets a
                             * distributed hash map imposing distinct on [e]
                             * and only insert into the unsyncBuffer those
                             * [bset]s which pass the filter.
                             *
                             * The life cycle of that filter needs to be
                             * protected with a latch or zlock. Each
                             * JoinTask must wait until the filter has
                             * answered each of its queued (bset,e) pairs,
                             * which could be done using a latch.
                             */
                            unsyncBuffer.add(bset);

                            naccepted++;

                            nothingAccepted = false;

                        }

                    }

                    if (DEBUG)
                        log.debug("Accepted element for " + naccepted
                                + " of " + bindingSets.length
                                + " possible bindingSet combinations: "
                                + e.toString() + ", orderIndex="
                                + orderIndex + ", lastJoin=" + lastJoin
                                + ", rule=" + rule.getName());

                }

                // if something was accepted in the chunk, return true.
                return nothingAccepted ? Boolean.FALSE : Boolean.TRUE;

            } catch (Throwable t) {

                halt(t);

                throw new RuntimeException(t);

            }

        }

    }

}
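/*
 * Appendix (illustrative only, not part of the class): the essential logic
 * of the vectored pipeline join implemented by AccessPathTask and ChunkTask
 * above, reduced to a nested loop. The loop bodies shown are simplified
 * from the actual code in ChunkTask#call(); the collection names are
 * hypothetical.
 *
 *     for (Object e : chunkFromAccessPath) {          // one ChunkTask
 *         for (IBindingSet bset : sourceBindingSets) {
 *             IBindingSet copy = bset.clone();
 *             if (joinNexus.bind(rule, tailIndex, e, copy)) {
 *                 // emit, keeping only the variables required downstream.
 *                 unsyncBuffer.add(copy.copy(requiredVars[tailIndex]));
 *             }
 *         }
 *     }
 */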