package com.bigdata.relation.rule.eval.pipeline;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.UUID;
import java.util.Vector;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.bop.IVariable;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.relation.IMutableRelation;
import com.bigdata.relation.accesspath.AbstractUnsynchronizedArrayBuffer;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.eval.ActionEnum;
import com.bigdata.relation.rule.eval.IJoinNexus;
import com.bigdata.relation.rule.eval.IRuleState;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.service.AbstractDistributedFederation;
import com.bigdata.service.AbstractScaleOutFederation;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.IDataService;
import com.bigdata.service.Session;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.concurrent.Computable;
import com.bigdata.util.concurrent.Memoizer;
import cutthecrap.utils.striterators.Filter;
import cutthecrap.utils.striterators.Resolver;
import cutthecrap.utils.striterators.Striterator;
/**
* Implementation used by scale-out deployments. There will be one instance
* of this task per index partition on which the rule will read. Those
* instances will be in-process on the {@link DataService} hosting that
* index partition. Instances are created on the {@link DataService} using
* the {@link JoinTaskFactoryTask} helper class.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class DistributedJoinTask extends JoinTask {
/**
* When <code>true</code>, enables a trace on {@link System#err} of
* the code polling the source {@link IAsynchronousIterator}s from
* which this {@link DistributedJoinTask} draws its {@link IBindingSet}
* chunks.
*/
static private final boolean trace = false;
/**
* The federation is used to obtain locator scans for the access paths.
*/
final protected AbstractScaleOutFederation<?> fed;
/**
* The {@link IJoinNexus} for the {@link IBigdataFederation}. This is
* mainly used to set up the {@link #solutionBuffer} since it needs to
* write on the scale-out index while the {@link AccessPathTask} will
* read on the local index partition view.
*/
final protected IJoinNexus fedJoinNexus;
/**
* A (proxy for the) {@link Future} for this {@link DistributedJoinTask}.
*/
protected Future<Void> futureProxy;
/**
* @see IRuleState#getKeyOrder()
*/
final private IKeyOrder<?>[] keyOrders;
/**
* The name of the scale-out index associated with the next
* {@link IPredicate} in the evaluation order and <code>null</code>
* iff this is the last {@link IPredicate} in the evaluation order (used
* for logging only).
*/
final private String nextScaleOutIndexName;
/**
* Sources for {@link IBindingSet} chunks that will be processed by this
* {@link JoinTask}. There will be one such source for each upstream
* {@link JoinTask} that targets this {@link JoinTask}.
* <p>
* Note: This is a thread-safe collection since new sources may be added
* asynchronously during processing.
*/
final private Vector<IAsynchronousIterator<IBindingSet[]>> sources = new Vector<IAsynchronousIterator<IBindingSet[]>>();
/**
* <code>false</code> until all binding sets have been consumed and the
* join task has made an atomic decision that it will not accept any new
* sources. Note that the join task may still be consuming binding sets once
* this flag is set - it is not necessarily done with its work, just not
* willing to accept new {@link #sources}.
*
* @todo rename as sourcesClosed
*/
private boolean sourcesExhausted = false;
/**
* The {@link DataService} on which this task is executing. This is used to
* remove the entry for the task from {@link DataService#getSession()}.
*/
private final DataService dataService;
// /**
// * The {@link JoinTaskSink}s for the downstream
// * {@link DistributedJoinTask}s onto which the generated
// * {@link IBindingSet}s will be written. This is <code>null</code>
// * for the last join since we will write solutions onto the
// * {@link #getSolutionBuffer()} instead.
// *
// * @todo configure capacity based on expectations of index partition
// * fan-out for this join dimension
// */
// final private Map<PartitionLocator, JoinTaskSink> sinkCache;
public DistributedJoinTask(
// final String scaleOutIndexName,
final IRule rule,//
final IJoinNexus joinNexus,//
final int[] order,//
final int orderIndex,//
final int partitionId,//
final AbstractScaleOutFederation<?> fed,//
final IJoinMaster master,//
final UUID masterUUID,//
final IAsynchronousIterator<IBindingSet[]> src,//
final IKeyOrder[] keyOrders,//
final DataService dataService,//
final IVariable[][] requiredVars//
) {
super(
/*DataService.getIndexPartitionName(scaleOutIndexName,
partitionId),*/ rule, joinNexus, order, orderIndex,
partitionId, master, masterUUID, requiredVars);
if (fed == null)
throw new IllegalArgumentException();
if (src == null)
throw new IllegalArgumentException();
if (dataService == null)
throw new IllegalArgumentException();
// Note: This MUST be the index manager for the local data service.
if (joinNexus.getIndexManager() instanceof IBigdataFederation<?>)
throw new IllegalArgumentException();
this.fed = fed;
this.keyOrders = keyOrders;
this.dataService = dataService;
// This is the index manager for the federation (scale-out indices).
this.fedJoinNexus = joinNexus.getJoinNexusFactory().newInstance(fed);
if (lastJoin) {
// sinkCache = null;
memo = null;
nextScaleOutIndexName = null;
final ActionEnum action = fedJoinNexus.getAction();
if (action.isMutation()) {
/*
* Note: The solution buffer for mutation operations
* is obtained locally from a joinNexus that is
* backed by the federation NOT the local index
* manager. (This is because the solution buffer
* needs to write on the scale-out indices.)
*/
final IJoinNexus tmp = fedJoinNexus;
/*
* The view of the mutable relation for the _head_ of the
* rule.
*/
final IMutableRelation<?> relation = (IMutableRelation<?>) tmp
.getHeadRelationView(rule.getHead());
switch (action) {
case Insert: {
solutionBuffer = tmp.newInsertBuffer(relation);
break;
}
case Delete: {
solutionBuffer = tmp.newDeleteBuffer(relation);
break;
}
default:
throw new AssertionError();
}
} else {
/*
* The solution buffer for queries is obtained from the
* master.
*/
try {
solutionBuffer = masterProxy.getSolutionBuffer();
} catch (IOException ex) {
throw new RuntimeException(ex);
}
}
} else {
final IPredicate<?> nextPredicate = rule
.getTail(order[orderIndex + 1]);
final String namespace = nextPredicate.getOnlyRelationName();
nextScaleOutIndexName = namespace +"."
+ keyOrders[order[orderIndex + 1]];
solutionBuffer = null;
// sinkCache = new LinkedHashMap<PartitionLocator, JoinTaskSink>();
memo = new SinkMemoizer(getSink);
// System.err.println("orderIndex=" + orderIndex + ", resources="
// + Arrays.toString(getResource()) + ", nextPredicate="
// + nextPredicate + ", nextScaleOutIndexName="
// + nextScaleOutIndexName);
}
addSource(src);
}
/**
* Adds a source from which this {@link DistributedJoinTask} will read
* {@link IBindingSet} chunks.
*
* @param source
* The source.
*
* @return <code>true</code> iff the source was accepted.
*
* @throws IllegalArgumentException
* if the <i>source</i> is <code>null</code>.
*/
public boolean addSource(final IAsynchronousIterator<IBindingSet[]> source) {
if (source == null)
throw new IllegalArgumentException();
lock.lock();
try {
if (sourcesExhausted) {
// new source declarations are rejected.
if (INFO)
log.info("source rejected: orderIndex=" + orderIndex
+ ", partitionId=" + partitionId);
return false;
}
sources.add(source);
stats.fanIn++;
} finally {
lock.unlock();
}
if (DEBUG)
log.debug("orderIndex=" + orderIndex + ", partitionId="
+ partitionId + ", fanIn=" + stats.fanIn + ", fanOut="
+ stats.fanOut);
return true;
}
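/*
* Usage sketch for the atomic handoff described above. This is a minimal
* illustration, not part of this class: the session lookup/registration
* helpers named here are hypothetical stand-ins for what the
* JoinTaskFactoryTask does against the DataService session.
*
* DistributedJoinTask task = lookupInSession(namespace); // hypothetical
* if (task == null || !task.addSource(src)) {
*     // No task exists, or the existing task already made its atomic
*     // decision to stop accepting sources: create a new task for the
*     // same (masterUUID, orderIndex, partitionId) and register it.
*     task = createAndRegisterJoinTask(src); // hypothetical
* }
*/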
final protected IBuffer<ISolution[]> getSolutionBuffer() {
return solutionBuffer;
}
private final IBuffer<ISolution[]> solutionBuffer;
/**
* Sets a flag preventing new sources from being declared, closes all
* known {@link #sources}, and removes this task from the {@link Session}.
*/
protected void closeSources() {
if (INFO)
log.info(toString());
lock.lock();
try {
sourcesExhausted = true;
final IAsynchronousIterator<?>[] a = sources
.toArray(new IAsynchronousIterator[] {});
for (IAsynchronousIterator<?> source : a) {
source.close();
}
removeFromSession();
} finally {
lock.unlock();
}
}
/**
* Remove the task from the session, but only if the task in the session is
* this task (it will have been overwritten if this task decides not to
* accept more sources and another source shows up).
*/
private void removeFromSession() {
lock.lock();
try {
// @todo allocate this in the ctor.
final String namespace = JoinTaskFactoryTask.getJoinTaskNamespace(
masterUUID, orderIndex, partitionId);
/*
* Note: If something else has the entry in the session then that is
* Ok, but we need to make sure that we don't remove it by accident!
*/
dataService.getSession().remove(namespace, this);
} finally {
lock.unlock();
}
}
/**
* This lock is used to make {@link #nextChunk()} and
* {@link #addSource(IAsynchronousIterator)} into mutually exclusive
* operations. {@link #nextChunk()} is the reader.
* {@link #addSource(IAsynchronousIterator)} is the writer. These operations
* need to be exclusive and atomic so that the termination condition of
* {@link #nextChunk()} is consistent -- it should terminate when there are
* no sources remaining. The first source is added when the
* {@link DistributedJoinTask} is created. Additional sources are added (and
* can result in a fan-in greater than one) when a
* {@link JoinTaskFactoryTask} identifies that there is an existing
* {@link DistributedJoinTask} and is able to atomically assign a new source
* to that {@link DistributedJoinTask}. If the atomic assignment of the new
* source fails (because all sources are exhausted before the assignment
* occurs) then a new {@link DistributedJoinTask} will be created for the
* same {@link DistributedJoinMasterTask}, orderIndex, and index partition
* identifier and the source will be assigned to that
* {@link DistributedJoinTask} instead.
*
* @todo javadoc update
*/
// private ReadWriteLock lock = new ReentrantReadWriteLock(false/* fair */);
private final ReentrantLock lock = new ReentrantLock();
/**
* Returns a chunk of {@link IBindingSet}s by combining chunks from the
* various source {@link JoinTask}s.
*
* @return A chunk assembled from one or more chunks from one or more of
* the source {@link JoinTask}s.
*/
protected IBindingSet[] nextChunk() throws InterruptedException {
if (sourcesExhausted) {
// nothing remaining in any accepted source.
return null;
}
if (DEBUG)
log.debug("Reading chunk of bindings from source(s): orderIndex="
+ orderIndex + ", partitionId=" + partitionId);
// #of elements in the combined chunk(s)
int bindingSetCount = 0;
// source chunks read so far.
final List<IBindingSet[]> chunks = new LinkedList<IBindingSet[]>();
/*
* Assemble a chunk of suitable size
*
* @todo don't wait too long. if we have some data then it is probably
* better to process that data rather than waiting beyond a timeout for
* a full chunk. also, make sure that we are neither yielding nor
* spinning too long in this loop. However, the loop must wait if there
* is nothing available and the sources are not yet exhausted.
*
* @todo config. we need a different capacity here than the one used for
* batch index operations. on the order of 100 should work well.
*
* Note: The termination conditions under which we will return [null]
* indicating that no more binding sets can be read are: (a) [halt] is
* true; (b) [sourcesExhausted] is true; or (c) all sources are
* exhausted and we are able to acquire the lock.
*
* Once we do acquire the lock we set [sourcesExhausted] to true and any
* subsequent request to add another source to this joinTask will fail.
* This has the consequence that a new JoinTask will be created if a new
* source has been identified once this task halts.
*/
final int chunkCapacity = 100;// joinNexus.getChunkCapacity();
while (!sourcesExhausted) {
while (!halt && !sources.isEmpty()
&& bindingSetCount < chunkCapacity) {
if (trace)
System.err.print("\norderIndex=" + orderIndex);
if (trace)
System.err.print(": reading");
// if (DEBUG)
// log.debug("Testing " + nsources + " sources: orderIndex="
// + orderIndex + ", partitionId=" + partitionId);
// clone to avoid concurrent modification of sources during
// traversal.
@SuppressWarnings("unchecked")
final IAsynchronousIterator<IBindingSet[]>[] sources = (IAsynchronousIterator<IBindingSet[]>[]) this.sources
.toArray(new IAsynchronousIterator[] {});
// #of sources that are exhausted.
int nexhausted = 0;
for (int i = 0; i < sources.length
&& bindingSetCount < chunkCapacity; i++) {
if (trace)
System.err.print(" <<(" + i + ":" + sources.length
+ ")");
final IAsynchronousIterator<IBindingSet[]> src = sources[i];
// if there is something to read on that source.
if (src.hasNext(1L, TimeUnit.MILLISECONDS)) {
/*
* Read the chunk, waiting up to the timeout for
* additional chunks from this source which can be
* combined together by the iterator into a single
* chunk.
*
* @todo config chunkCombiner timeout here and
* experiment with the value with varying fanIns.
*/
final IBindingSet[] chunk = src.next(10L,
TimeUnit.MILLISECONDS);
/*
* Note: Since hasNext() returned [true] for this source
* we SHOULD get a chunk back since it is known to be
* there waiting for us. The timeout should only give
* the iterator an opportunity to combine multiple
* chunks together if they are already in the iterator's
* queue (or if they arrive in a timely manner).
*/
assert chunk != null;
chunks.add(chunk);
bindingSetCount += chunk.length;
if (trace)
System.err.print("[" + chunk.length + "]");
if (DEBUG)
log.debug("Read chunk from source: sources[" + i
+ "], chunkSize=" + chunk.length
+ ", orderIndex=" + orderIndex
+ ", partitionId=" + partitionId);
} else if (src.isExhausted()) {
nexhausted++;
if (trace)
System.err.print("X{" + nexhausted + "}");
if (DEBUG)
log.debug("Source is exhausted: nexhausted="
+ nexhausted);
// no longer consider an exhausted source.
if (!this.sources.remove(src)) {
// could happen if src.equals() is not defined.
throw new AssertionError("Could not find source: "
+ src);
}
}
}
if (nexhausted == sources.length) {
/*
* All sources on which we were reading in this loop have
* been exhausted.
*
* Note: we may have buffered some data, which is checked
* below.
*
* Note: new sources may have been added concurrently, so we
* get the lock and then test the [sources] field, not just
* the local array.
*/
lock.lock();
try {
if (this.sources.isEmpty()) {
if (INFO)
log.info("Sources are exhausted: orderIndex="
+ orderIndex + ", partitionId="
+ partitionId);
sourcesExhausted = true;
/*
* Remove ourselves from the Session since we will
* no longer accept any new sources.
*/
removeFromSession();
}
} finally {
lock.unlock();
}
break;
}
}
if (halt)
throw new RuntimeException(firstCause.get());
/*
* Combine the chunks.
*/
if (!chunks.isEmpty()) {
return combineChunks(chunks, bindingSetCount);
}
} // while(!sourcesExhausted)
/*
* Termination condition: we did not get any data from any source, we
* are not permitting any new sources, and there are no sources
* remaining.
*/
if (DEBUG)
log.debug("Sources are exhausted: orderIndex=" + orderIndex
+ ", partitionId=" + partitionId);
if (trace)
System.err.print(" exhausted");
return null;
}
/**
* Combine the chunk(s) into a single chunk.
*
* @param chunks
* A list of chunks read from the {@link #sources}.
* @param bindingSetCount
* The #of bindingSets in those chunks.
* @return A single chunk containing the binding sets from all of the
*         given chunks.
*/
protected IBindingSet[] combineChunks(final List<IBindingSet[]> chunks,
final int bindingSetCount) {
final int chunkCount = chunks.size();
assert chunkCount > 0; // at least one chunk.
assert bindingSetCount > 0; // at least one bindingSet.
final IBindingSet[] chunk;
if (chunkCount == 1) {
// Only one chunk is available.
chunk = chunks.get(0);
} else {
// Combine 2 or more chunks.
chunk = new IBindingSet[bindingSetCount];
final Iterator<IBindingSet[]> itr = chunks.iterator();
int offset = 0;
while (itr.hasNext()) {
final IBindingSet[] a = itr.next();
System.arraycopy(a, 0, chunk, offset, a.length);
offset += a.length;
}
}
if (halt)
throw new RuntimeException(firstCause.get());
if (DEBUG) {
log.debug("Read chunk(s): nchunks=" + chunkCount
+ ", #bindingSets=" + chunk.length + ", orderIndex="
+ orderIndex + ", partitionId=" + partitionId);
}
stats.bindingSetChunksIn += chunkCount;
stats.bindingSetsIn += bindingSetCount;
if (trace)
System.err.print(" chunk[" + chunk.length + "]");
return chunk;
}
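/*
* Worked example: given chunks = [ [b0,b1,b2], [b3,b4] ] and
* bindingSetCount = 5, the arraycopy loop above produces the single
* chunk [b0,b1,b2,b3,b4] (copying at offsets 0 and 3 respectively).
*/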
protected AbstractUnsynchronizedArrayBuffer<IBindingSet> newUnsyncOutputBuffer() {
final AbstractUnsynchronizedArrayBuffer<IBindingSet> unsyncOutputBuffer;
/*
* On overflow, the generated binding sets are mapped across the
* JoinTaskSink(s) for the target index partition(s).
*/
final int chunkCapacity = fedJoinNexus.getChunkCapacity();
if (lastJoin) {
/*
* Accepted binding sets are flushed to the solution buffer.
*/
unsyncOutputBuffer = new UnsynchronizedSolutionBuffer<IBindingSet>(
this, fedJoinNexus, chunkCapacity);
} else {
/*
* Accepted binding sets are flushed to the next join dimension.
*
* Note: The index is key-range partitioned. Each bindingSet
* will be mapped across the index partition(s) on which the
* generated access path for that bindingSet will have to read.
* There will be a JoinTask associated with each such index
* partition. That JoinTask will execute locally on the
* DataService which hosts that index partition.
*/
unsyncOutputBuffer = new UnsyncDistributedOutputBuffer<IBindingSet>(
fed, this, chunkCapacity);
}
return unsyncOutputBuffer;
}
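/*
* Data flow sketch for the non-lastJoin case (illustrative only; the
* overflow handling shown in the comments is internal to the buffers):
*
* unsyncOutputBuffer.add(bindingSet); // join thread.
* // On overflow, the UnsyncDistributedOutputBuffer maps each binding set
* // over the target index partition(s) for the next predicate. For each
* // target shard:
* JoinTaskSink sink = getSink(locator); // memoized, see below.
* sink.unsyncBuffer.add(bindingSet); // per-sink unsync buffer.
* // That buffer overflows onto sink.blockingBuffer, whose proxied
* // iterator is drained by the downstream JoinTask.
*/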
/**
* Notifies each sink that this {@link DistributedJoinTask} will no
* longer generate new {@link IBindingSet} chunks and then waits for the
* sink task(s) to complete.
* <p>
* Note: Closing the {@link BlockingBuffer} from which a sink
* {@link JoinTask} is reading will cause the source iterator for that
* sink task to eventually return <code>false</code> indicating that
* it is exhausted (assuming that the sink keeps reading on the
* iterator).
*
* @throws InterruptedException
* if interrupted while awaiting the future for a sink.
*/
@Override
protected void flushAndCloseBuffersAndAwaitSinks()
throws InterruptedException, ExecutionException {
if (DEBUG)
log.debug("orderIndex="
+ orderIndex
+ ", partitionId="
+ partitionId
+ (lastJoin ? ", lastJoin" : ", sinkCount="
+ memo.size()));
/*
* For the last join dimension the JoinTask instead writes onto the
* [solutionBuffer]. For query, that is the shared solution buffer
* and will be a proxied object. For mutation, that is a per
* JoinTask buffer that writes onto the target relation. In the
* latter case we MUST report the mutationCount returned by flushing
* the solutionBuffer via JoinStats to the master.
*
* Note: JoinTask#flushUnsyncBuffers() will already have been
* invoked so all generated binding sets will already be in the sync
* buffer ready for output.
*/
if (lastJoin) {
// assert sinkCache == null;
assert memo == null;
if (DEBUG)
log.debug("\nWill flush buffer containing "
+ getSolutionBuffer().size() + " solutions.");
final long counter = getSolutionBuffer().flush();
if (DEBUG)
log.debug("\nFlushed buffer: mutationCount=" + counter);
if (joinNexus.getAction().isMutation()) {
/*
* Apply mutationCount to the JoinStats so that it will be
* reported back to the JoinMasterTask.
*/
stats.mutationCount.addAndGet(counter);
}
} else {
/*
* Close sinks.
*
* For all but the lastJoin, the buffers are writing onto the
* per-sink buffers. We flush and close those buffers now. The sink
* JoinTasks drain those buffers. Once the buffers are closed, the
* sink JoinTasks will eventually exhaust the buffers.
*
* Note: This flushes the buffers using a thread pool which should
* give better throughput when the fanOut is GT ONE (1).
*/
{
if (halt)
throw new RuntimeException(firstCause.get());
final List<Callable<Void>> tasks = new LinkedList<Callable<Void>>();
final Iterator<JoinTaskSink> itr = memo.getSinks();
while (itr.hasNext()) {
final JoinTaskSink sink = itr.next();
tasks.add(new FlushAndCloseSinkBufferTask(sink));
}
final List<Future<Void>> futures = fed.getExecutorService()
.invokeAll(tasks);
for (Future<?> f : futures) {
// make sure that all tasks were successful.
f.get();
}
}
// Await sinks.
{
final Iterator<JoinTaskSink> itr = memo.getSinks();
while (itr.hasNext()) {
if (halt)
throw new RuntimeException(firstCause.get());
final JoinTaskSink sink = itr.next();
final Future<?> f = sink.getFuture();
if (DEBUG)
log.debug("Waiting for Future: sink=" + sink);
// will throw any exception from the sink's Future.
f.get();
}
}
} // else (lastJoin)
if (DEBUG)
log.debug("Done: orderIndex="
+ orderIndex
+ ", partitionId="
+ partitionId
+ (lastJoin ? "lastJoin" : ", sinkCount="
+ memo.size()));
}
/**
* Flushes any buffered data for a {@link JoinTaskSink} and closes the
* {@link BlockingBuffer} for that sink so that the sink {@link JoinTask}'s
* iterator can eventually drain the buffer and report that it is exhausted.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
private class FlushAndCloseSinkBufferTask implements Callable<Void> {
final private JoinTaskSink sink;
public FlushAndCloseSinkBufferTask(final JoinTaskSink sink) {
if (sink == null)
throw new IllegalArgumentException();
this.sink = sink;
}
public Void call() throws Exception {
if (halt)
throw new RuntimeException(firstCause.get());
if (DEBUG)
log.debug("Closing sink: sink=" + sink
+ ", unsyncBufferSize=" + sink.unsyncBuffer.size()
+ ", blockingBufferSize=" + sink.blockingBuffer.size());
// flush to the blockingBuffer.
sink.unsyncBuffer.flush();
// close the blockingBuffer.
sink.blockingBuffer.close();
return null;
}
}
/**
* Cancel all {@link DistributedJoinTask}s that are sinks for this
* {@link DistributedJoinTask}.
*/
@Override
protected void cancelSinks() {
// no sinks.
if (lastJoin)
return;
if (DEBUG)
log.debug("orderIndex=" + orderIndex + ", partitionId="
+ partitionId + ", sinkCount=" + memo.size());
final Iterator<JoinTaskSink> itr = memo.getSinks();
while (itr.hasNext()) {
final JoinTaskSink sink = itr.next();
sink.unsyncBuffer.reset();
sink.blockingBuffer.reset();
sink.blockingBuffer.close();
sink.getFuture().cancel(true/* mayInterruptIfRunning */);
}
if (DEBUG)
log.debug("Done: orderIndex=" + orderIndex + ", partitionId="
+ partitionId + ", sinkCount=" + memo.size());
}
/**
* Return the sink on which we will write {@link IBindingSet} for the
* index partition associated with the specified locator. The sink will
* be backed by a {@link DistributedJoinTask} running on the
* {@link IDataService} that is host to that index partition. The
* scale-out index will be the scale-out index for the next
* {@link IPredicate} in the evaluation order.
*
* @param locator
* The locator for the index partition.
*
* @return The sink.
*
* @throws RuntimeException
* If the {@link JoinTaskFactoryTask} fails.
* @throws InterruptedException
* If the {@link JoinTaskFactoryTask} is interrupted.
*/
protected JoinTaskSink getSink(final PartitionLocator locator)
throws InterruptedException, RuntimeException {
return memo.compute(new SinkRequest(this, locator));
}
/**
* Helper class models a request to obtain a sink for a given join task and
* locator.
* <p>
* Note: This class must implement equals() and hashCode() since it is used
* within the {@link Memoizer} pattern.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
private static class SinkRequest {
final DistributedJoinTask joinTask;
final PartitionLocator locator;
/**
*
* @param joinTask
* The join task.
* @param locator
* The locator for the target shard.
*/
public SinkRequest(final DistributedJoinTask joinTask, final PartitionLocator locator) {
this.joinTask = joinTask;
this.locator = locator;
}
/**
* Equals returns true iff joinTask == o.joinTask and
* locator.equals(o.locator).
*/
public boolean equals(final Object o) {
if (!(o instanceof SinkRequest))
return false;
final SinkRequest r = (SinkRequest) o;
return joinTask == r.joinTask && locator.equals(r.locator);
}
/**
* The hashCode() is based directly on the hash code of the
* {@link PartitionLocator}. All requests against a given
* {@link Memoizer} will have the same {@link DistributedJoinTask} so
* that field can be factored out of the hash code.
*/
public int hashCode() {
return locator.hashCode();
}
}
/**
* Helper establishes a {@link JoinTaskSink} on the target {@link IDataService}.
*/
final private static Computable<SinkRequest, JoinTaskSink> getSink = new Computable<SinkRequest, JoinTaskSink>() {
public JoinTaskSink compute(final SinkRequest req)
throws InterruptedException {
try {
return req.joinTask._getSink(req.locator);
} catch (ExecutionException e) {
throw new RuntimeException(e);
}
}
};
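/*
* Memoizer pattern sketch: concurrent requests for the same target shard
* share a single FutureTask, so at most one JoinTaskFactoryTask is
* submitted per (joinTask, locator) pair, while requests for distinct
* shards are computed in parallel, e.g.:
*
* final JoinTaskSink sink = memo.compute(new SinkRequest(this, locator));
*/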
/**
* FIXME javadoc: A {@link Memoizer} subclass which exposes an additional
* method to remove a {@link FutureTask} from the internal cache. This is
* used as part of an explicit protocol to clear out cache entries once the
* sink reference has been set (see the commented out removeFromCache()
* below).
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
*/
private static class SinkMemoizer extends
Memoizer<SinkRequest/* request */, JoinTaskSink/* sink */> {
/**
* @param c
*/
public SinkMemoizer(final Computable<SinkRequest, JoinTaskSink> c) {
super(c);
}
int size() {
return cache.size();
}
/**
* FIXME There are two distinct semantics available here. One is the set
* of current sinks (there is a join task fully up and running on a DS
* somewhere and we have a proxy for that DS). The other is the set of
* sinks which have been requested but may or may not have been fully
* realized yet. When we are breaking a join, we probably want to cancel
* all of the requests to obtain sinks in addition to canceling any
* running sinks. A similar problem may exist if we implement native
* SLICE since we could break the join while there are requests out to
* create sinks.
*
* One way to handle this is to pull the cancelSinks() method into this
* memoizer.
*
* However, if we broadcast the rule to the nodes and move away from
* this sink model to using NIO buffers, then we will just broadcast
* the close of each tail in turn or broadcast the break of the join.
*/
@SuppressWarnings("unchecked")
Iterator<JoinTaskSink> getSinks() {
return new Striterator(cache.values().iterator()).addFilter(new Filter(){
private static final long serialVersionUID = 1L;
@Override
public boolean isValid(final Object e) {
/*
* Filter out any tasks which are not done or which had an
* error.
*/
final Future<JoinTaskSink> f = (Future<JoinTaskSink>)e;
if(!f.isDone()) {
return false;
}
try {f.get();}
catch(final ExecutionException ex) {
return false;
} catch (final InterruptedException ex) {
return false;
}
return true;
}
}).addFilter(new Resolver(){
private static final long serialVersionUID = 1L;
@Override
protected Object resolve(final Object arg0) {
/*
* We filtered out any tasks which were not done and any
* tasks which had errors. The future should be immediately
* available and Future.get() should not throw an error.
*/
final Future<JoinTaskSink> f = (Future<JoinTaskSink>)arg0;
try {
return f.get();
} catch (final InterruptedException e) {
throw new RuntimeException(e);
} catch (final ExecutionException e) {
throw new RuntimeException(e);
}
}
});
}
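/*
* Plain-Java equivalent of the Striterator pipeline above (a sketch for
* clarity; [cache] maps each SinkRequest to the Future computing its
* JoinTaskSink):
*
* final List<JoinTaskSink> sinks = new LinkedList<JoinTaskSink>();
* for (Future<JoinTaskSink> f : cache.values()) {
*     if (!f.isDone())
*         continue; // still being materialized.
*     try {
*         sinks.add(f.get()); // resolved sink.
*     } catch (ExecutionException ex) {
*         // sink failed: skip it.
*     } catch (InterruptedException ex) {
*         // should not occur since isDone() was true: skip it.
*     }
* }
* return sinks.iterator();
*/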
// /**
// * Called by the thread which atomically sets the sink reference once
// * the sink has been fully materialized. At that point a reference to
// * the sink exists outside of this cache.
// *
// * @param req
// * The request.
// */
// void removeFromCache(final SinkRequest req) {
//
// if (cache.remove(req) == null) {
//
// throw new AssertionError();
//
// }
//
// }
// /**
// * Discards the cached {@link FutureTask}s.
// *
// * @todo should we do this? There should be no further requests for
// * sinks once the join task halts, so there may be no reason to
// * clear the FutureTask cache explicitly.
// */
// void clear() {
//
// cache.clear();
//
// }
}
/**
* Used to materialize {@link JoinTaskSink}s without causing concurrent requests
* for different sinks to block.
*/
final private SinkMemoizer memo;
/**
* Inner implementation invoked from the {@link Memoizer}.
*
* @param locator
* The shard locator.
*
* @return The sink which will write on the downstream {@link JoinTask}
* running on the node for that shard.
*
* @throws ExecutionException
* @throws InterruptedException
*/
private JoinTaskSink _getSink(final PartitionLocator locator) throws InterruptedException, ExecutionException {
/*
* Allocate/discover JoinTask on the target data service and
* obtain a sink reference for its future and buffers.
*
* Note: The JoinMasterTask uses very similar logic to setup the
* first join dimension. Of course, it gets to assume that there
* is no such JoinTask in existence at the time.
*/
final int nextOrderIndex = orderIndex + 1;
if (DEBUG)
log.debug("Creating join task: nextOrderIndex="
+ nextOrderIndex + ", indexName="
+ nextScaleOutIndexName + ", partitionId="
+ locator.getPartitionId());
final UUID sinkUUID = locator.getDataServiceUUID();
final IDataService dataService;
if (sinkUUID.equals(fed.getServiceUUID())) {
/*
* As an optimization, special case when the downstream
* data service is _this_ data service.
*/
dataService = (IDataService)fed.getService();
} else {
dataService = fed.getDataService(sinkUUID);
}
final JoinTaskSink sink = new JoinTaskSink(fed, locator, this);
/*
* Export async iterator proxy.
*
* Note: This proxy is used by the sink to draw chunks from the
* source JoinTask(s).
*/
final IAsynchronousIterator<IBindingSet[]> sourceItrProxy;
if (fed.isDistributed()) {
sourceItrProxy = ((AbstractDistributedFederation<?>) fed)
.getProxy(sink.blockingBuffer.iterator(), joinNexus
.getBindingSetSerializer(), joinNexus
.getChunkOfChunksCapacity());
} else {
sourceItrProxy = sink.blockingBuffer.iterator();
}
// the future for the factory task (not the JoinTask).
final Future<?> factoryFuture;
try {
final JoinTaskFactoryTask factoryTask = new JoinTaskFactoryTask(
nextScaleOutIndexName, rule, joinNexus
.getJoinNexusFactory(), order, nextOrderIndex,
locator.getPartitionId(), masterProxy, masterUUID,
sourceItrProxy, keyOrders, requiredVars);
// submit the factory task, obtain its future.
factoryFuture = dataService.submit(factoryTask);
} catch (IOException ex) {
// RMI problem.
throw new RuntimeException(ex);
}
/*
* Obtain the future for the JoinTask from the factory task's
* Future.
*/
sink.setFuture((Future<?>) factoryFuture.get());
stats.fanOut++;
return sink;
}
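/*
* Note the two-level Future in the code above: submitting the
* JoinTaskFactoryTask yields a Future whose value is itself a (proxy for
* the) Future of the remote JoinTask:
*
* final Future<?> factoryFuture = dataService.submit(factoryTask);
* final Future<?> joinTaskFuture = (Future<?>) factoryFuture.get();
* sink.setFuture(joinTaskFuture);
*/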
/**
* Logs an error in {@link JoinTask#call()} on the local log file and adds
* some metadata about the operation which was being executed. This does not
* imply that the error originates with this join task. You have to inspect
* the error messages, the order in which the joins were being evaluated,
* and even correlate the {@link JoinTask#masterUUID} in order to figure out
* what really happened.
*/
@Override
protected void logCallError(final Throwable t) {
log.error("hostname=" + dataService.getHostname() + ", serviceName="
+ dataService.getServiceName() + ", joinTask=" + toString()
+ ", rule=" + rule, t);
}
}