/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Oct 16, 2008
*/
package com.bigdata.relation.rule.eval.pipeline;
import java.io.IOException;
import java.nio.channels.ClosedByInterruptException;
import java.util.LinkedList;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.log4j.Logger;
import com.bigdata.bop.IBindingSet;
import com.bigdata.bop.IPredicate;
import com.bigdata.journal.ConcurrencyManager;
import com.bigdata.journal.ITx;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.relation.accesspath.BufferClosedException;
import com.bigdata.relation.accesspath.IAccessPath;
import com.bigdata.relation.accesspath.IAsynchronousIterator;
import com.bigdata.relation.accesspath.IBlockingBuffer;
import com.bigdata.relation.accesspath.IBuffer;
import com.bigdata.relation.accesspath.ThickAsynchronousIterator;
import com.bigdata.relation.accesspath.UnsynchronizedArrayBuffer;
import com.bigdata.relation.rule.IQueryOptions;
import com.bigdata.relation.rule.IRule;
import com.bigdata.relation.rule.ISlice;
import com.bigdata.relation.rule.eval.ActionEnum;
import com.bigdata.relation.rule.eval.IJoinNexus;
import com.bigdata.relation.rule.eval.IJoinNexusFactory;
import com.bigdata.relation.rule.eval.IRuleState;
import com.bigdata.relation.rule.eval.ISolution;
import com.bigdata.relation.rule.eval.IStepTask;
import com.bigdata.relation.rule.eval.RuleLog;
import com.bigdata.relation.rule.eval.RuleState;
import com.bigdata.relation.rule.eval.RuleStats;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.striterator.IKeyOrder;
import com.bigdata.util.InnerCause;
import com.bigdata.util.concurrent.ExecutionExceptions;
/**
* Master providing efficient distributed evaluation of {@link IRule}s. For
* query, this task should be run by the client that wishes to materialize the
* query results. For mutation, this task may be run by any client or service
* since the data does not flow through the master for mutation.
* <p>
* For the first join dimension, the {@link JoinMasterTask} creates a
* {@link JoinTask} per index partition that will be spanned by the
* {@link IAccessPath} for the first {@link IPredicate} in the evaluation order
* and feeds each {@link JoinTask}(s) in the first join dimension with an
* {@link IAsynchronousIterator} reading on a buffer containing single empty
* {@link IBindingSet}.
* <p>
* Each {@link JoinTask} consumes {@link IBindingSet} chunks read from the
* previous join dimension. For each {@link IBindingSet} chunk read, a new
* {@link IAccessPath} is obtained. Elements are then read from that
* {@link IAccessPath} in chunks. Given the {@link IBindingSet} used to obtain
* the {@link IAccessPath}, a new {@link IBindingSet} is created for each
* element in each chunk read from the {@link IAccessPath}. If the new
* {@link IBindingSet} satisfies the constraint(s) on the {@link IRule} then it
* will be output to the next join dimension. An {@link IBindingSet} is output
* by placing it onto the {@link UnsynchronizedArrayBuffer} for the join
* dimension. Periodically that {@link UnsynchronizedArrayBuffer} will overflow,
* and a chunk of {@link IBindingSet}s will be placed onto the
* {@link IBlockingBuffer} from which the next join dimension will read its
* {@link IBindingSet} chunks.
* <p>
* The last join dimension is slightly different. Its
* {@link UnsynchronizedArrayBuffer} writes onto the
* {@link IJoinNexus#newQueryBuffer()},
* {@link IJoinNexus#newInsertBuffer(com.bigdata.relation.IMutableRelation)}, or
* {@link IJoinNexus#newDeleteBuffer(com.bigdata.relation.IMutableRelation)}
* depending on the {@link ActionEnum}.
* <p>
* For each {@link JoinTask}, once its source iterator(s) have been exhausted
* and the {@link IAccessPath} reading from the last source {@link IBindingSet}
* has been exhausted, then the {@link JoinTask} for that join dimension is done
* and it will flush its {@link UnsynchronizedArrayBuffer} and close its output
* {@link IBuffer} and wait for the downstream {@link JoinTask}s to report their
* {@link RuleStats}. Those {@link RuleStats} are aggregated and passed back to
* its caller in turn.
* <p>
* Each join dimension is single-threaded. Coordination of resources is achieved
* using the output buffer for each join dimension. This allows a source join
* dimension to read ahead and forces the sink join dimension to process chunks
* of {@link IBindingSet}s at a time.
* <p>
* The {@link JoinMasterTask} is responsible for the {@link JoinTask}s for the
* first join dimension. Each {@link JoinTask} is responsible for the downstream
* {@link JoinTask}s. If the {@link JoinMasterTask} is interrupted or cancelled,
* then it interrupts or cancels the {@link JoinTask}s for the first join
* dimension. If a {@link JoinTask} is interrupted or cancelled then it must
* cancel any {@link JoinTask}s which it has created for the next join
* dimension.
*
* <h2>Choosing the view</h2>
*
* Rules SHOULD be evaluated against a read-historical state.
* <p>
* This is a hard requirement when computing the fix point closure of a rule
* (set). Each round of closure MUST be evaluated against the commit time
* reported by {@link IBigdataFederation#getLastCommitTime()} and is applied for
* all rules in that round. This allows unisolated tasks to write the
* generated solutions onto the indices. This is a strong requirement since the
* {@link JoinTask}s will otherwise wind up holding an exclusive lock on the
* {@link ITx#UNISOLATED} index partitions, which would cause a deadlock when
* attempting to write the generated solutions onto the index partitions. At the
* start of the next round of closure, simply update the read-historical
* timestamp to the then current value of
* {@link IBigdataFederation#getLastCommitTime()}.
* <p>
* Queries that use {@link ITx#READ_COMMITTED} or {@link ITx#UNISOLATED} will
* not generate deadlocks, but they are subject to abort from the
* split/join/move of index partition(s) during query evaluation. This problem
* WILL NOT arise if you read instead from the
* {@link IBigdataFederation#getLastCommitTime()}.
*
* <h2>Key-range partitioned joins</h2>
*
* In order to scale-out efficiently, the {@link JoinMasterTask} must distribute
* the {@link JoinTask}s such that they run inside of the
* {@link ConcurrencyManager} on the various {@link DataService}s on which the
* index partitions reside from which the {@link IAccessPath}s must read. This
* allows the {@link IAccessPath} to read on the local index object and reduces
* the message traffic to pulling chunks of {@link IBindingSet}s from the source
* {@link JoinTask}s.
* <p>
* For the {@link JoinMasterTask} and for each {@link JoinTask}, the fan out of
* {@link JoinTask}s is determined by the #of index partitions that are spanned
* by the {@link IAccessPath}s required to evaluate the {@link IBindingSet}s for
* the next join dimension. The {@link IAccessPath} will not be used by the
* source join dimension to read on the index, merely to discover the index
* partitions to which the generated {@link IBindingSet}s must be assigned. The
* index partition spanned for a given {@link IBindingSet} is determined by
* generating an as bound {@link IPredicate} for the next join dimension,
* instantiating the {@link IAccessPath} on the source join dimension that will
* be used by the target join dimension, and then using a locator scan for the
* <i>fromKey</i> and <i>toKey</i> for that {@link IAccessPath}. In the case
* where the {@link IPredicate} is fully bound, the {@link IAccessPath} will be
* restricted to a single index partition, but we still need to know which index
* partition.
* <p>
* The {@link IBindingSet} is written on an {@link UnsynchronizedArrayBuffer}
* corresponding to the target index partition. The
* {@link UnsynchronizedArrayBuffer} (together with the output {@link IBuffer}
* for the {@link IBindingSet} chunks and the {@link Future} for the
* {@link JoinTask} for that index partition) for the target index partition
* exists in an LRU. If it falls off of the end of the LRU, then the
* {@link UnsynchronizedArrayBuffer} is flushed and the output {@link IBuffer}
* is closed. The downstream {@link JoinTask} will eventually exhaust the
* corresponding {@link IAsynchronousIterator} source.
* <p>
* When the source join dimension and the sink join dimension have the same
* {@link IKeyOrder} there will be an orderly progression through the indices
* and each sink {@link JoinTask} can be safely closed once a {@link JoinTask}
* is created on the {@link DataService} for the next index partition. However,
* the {@link IKeyOrder}s often differ, which can lead to more scattered
* assignment of output {@link IBindingSet}s to index partitions. The LRU helps
* to manage this fan out.
* <p>
* Fan out means that there may be N>1 {@link JoinTask}s for each join
* dimension. For this reason, a QUERY {@link ISlice} must be applied by the
* client reading on the {@link IAsynchronousIterator} returned by the
* {@link JoinMasterTask}.
* <p>
* Fan out also implies a requirement for fan-in in order to reduce the scatter
* of {@link JoinTask}s. Fan-in must aggregate the source {@link JoinTask}s such
* that they target the same sink {@link JoinTask} instance for the same rule
* execution instance, the same orderIndex (hence the same {@link IPredicate}),
* and the same index partition. This means that a factory mechanism must be
* used to either create a new {@link JoinTask} or return the existing
* {@link JoinTask} on the {@link DataService} based on those identifying
* properties. This must be done in a thread-safe manner, but contention should
* be restricted to the case where the identifying properties are the same. The
* factory must be given the {@link IAsynchronousIterator} reading
* {@link IBindingSet} chunks from the source join dimension and the
* {@link JoinTask} must not close (unless interrupted or cancelled) until all
* of its source {@link IAsynchronousIterator}s have been exhausted.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* @todo fold these comments into the javadoc.
* <p>
* The goal is to have no more than one {@link JoinTask} per index
* partition per rule execution. If the #of index partitions is very large
* then we may have to use an LRU cache in an attempt to release
* {@link JoinTask}s that are not being written on by a given source
* {@link JoinTask}.
* <p>
* There is a strong requirement for closure to get back the
* mutationCount. That would require us to keep alive a source
* {@link JoinTask} until all downstream {@link JoinTask}s complete.
*
* @todo The pipeline join needs to be modified to force stable evaluation
* (single-threaded) and to observe the {@link ISlice} constraints and
* {@link IQueryOptions#isStable()} constraint.
* <p>
* In order to avoid a hot spot on {@link RuleStats#solutionCount}, that
* field should only be updated as chunks of solutions are produced. When
* evaluating a SLICE, we will constrain the evaluation to be single
* threaded so once again there will not be a hot spot on that field.
* <p>
* All of this should be done when we refactor the pipeline join to use a
* fixed thread pool for join processing. In the meantime, query hints can
* be used to specify the nested subquery join if you need stable
* evaluation. In the long run, SPARQL aware query caching is the right
* way to handle stable query and SLICE for scale-out since a stable
* evaluation order can impose far too much constraint on parallelism
* otherwise.
*
* @todo We are not seeing the totals when a SLICE is used. I believe that the
* test harness is simply exiting once it gets its N results and the
* daemon threads for the workers are not keeping the JVM alive. Ideally
* either the JoinMasterTask or the last JoinTask would notice that the
* solution buffer was closed and would use that information to halt the
* ongoing {@link JoinTask}s.
*/
abstract public class JoinMasterTask implements IStepTask, IJoinMaster {
/**
 * Logger for this class (also used by subclasses).
 */
protected static final Logger log = Logger.getLogger(JoinMasterTask.class);
/**
 * True iff the {@link #log} level is DEBUG or less.
 * <p>
 * Note: evaluated once at class load, so later changes to the log level are
 * not reflected here.
 */
protected static final boolean DEBUG = log.isDebugEnabled();
/*
 * from the ctor.
 */
/** The rule to be evaluated (from the ctor; never <code>null</code>). */
protected final IRule rule;
/** Used to evaluate the rule (from the ctor; never <code>null</code>). */
protected final IJoinNexus joinNexus;
/** The factory reported by {@link IJoinNexus#getJoinNexusFactory()}. */
protected final IJoinNexusFactory joinNexusFactory;
/**
 * From the ctor. This will be the {@link IBuffer} on which the last join
 * dimension writes the computed {@link ISolution}s.
 * <p>
 * Note: {@link LocalJoinMasterTask} always passes this along to the last
 * {@link LocalJoinTask}.
 * <p>
 * Note: For a {@link DistributedJoinMasterTask} running a Query this gets
 * proxied and the {@link DistributedJoinTask}s all write on the proxy.
 * However, the {@link DistributedJoinMasterTask} DOES NOT proxy this for
 * mutation in order to keep all data from flowing through the master.
 */
protected final IBuffer<ISolution[]> solutionBuffer;
/** The #of predicates in the tail of the rule. */
protected final int tailCount;
/** Encapsulates the rule together with its evaluation plan (computed in the ctor). */
protected final IRuleState ruleState;
/**
 * The evaluation order (as reported by the plan on {@link #ruleState}).
 */
protected final int[] order;
/** Aggregated statistics for the rule evaluation as a whole. */
protected final RuleStats ruleStats;
/**
 * Statistics on {@link JoinTask} behavior for each {@link IPredicate} in
 * the tail of the rule. These statistics are reported by each
 * {@link JoinTask} and then aggregated for each join dimension.
 * <p>
 * Note: The index into this array is the evaluation order of the predicate.
 */
protected final JoinStats[] joinStats;
/**
 * The unique identifier for this {@link JoinMasterTask} instance.
 */
protected final UUID masterUUID;
/**
 *
 * @param rule
 * The rule to be executed.
 * @param joinNexus
 * The {@link IJoinNexus}.
 * @param solutionBuffer
 * The {@link ISolution} buffer.
 *
 * @throws IllegalArgumentException
 * if <i>rule</i> or <i>joinNexus</i> is <code>null</code>.
 * Note: <i>solutionBuffer</i> is NOT checked here (for mutation
 * it may be supplied differently by subclasses — see
 * {@link #getSolutionBuffer()}).
 */
protected JoinMasterTask(final IRule rule, final IJoinNexus joinNexus,
final IBuffer<ISolution[]> solutionBuffer) {
if (rule == null)
throw new IllegalArgumentException();
if (joinNexus == null)
throw new IllegalArgumentException();
this.rule = rule;
this.joinNexus = joinNexus;
this.joinNexusFactory = joinNexus.getJoinNexusFactory();
this.tailCount = rule.getTailCount();
this.masterUUID = UUID.randomUUID();
// computes the eval order.
this.ruleState = new RuleState(rule, joinNexus);
// the evaluation order.
this.order = ruleState.getPlan().getOrder();
// note: evaluation order is fixed by now.
this.ruleStats = joinNexus.getRuleStatisticsFactory().newInstance(
ruleState);
{
// one JoinStats per join dimension, indexed by evaluation order.
this.joinStats = new JoinStats[tailCount];
for (int orderIndex = 0; orderIndex < tailCount; orderIndex++) {
this.joinStats[orderIndex] = new JoinStats(orderIndex);
}
}
this.solutionBuffer = solutionBuffer;
}
/**
 * The unique identifier assigned to this master in the ctor.
 */
final public UUID getUUID() {
return masterUUID;
}
/**
 * Evaluate the rule.
 */
public RuleStats call() throws Exception {
if (ruleState.getPlan().isEmpty()) {
// the query plan proved that there can be no solutions.
if (log.isInfoEnabled())
log.info("Rule proven to have no solutions.");
return ruleStats;
}
final long begin = System.currentTimeMillis();
// start the JoinTask(s) for the first join dimension.
final List<Future<Void>> futures = start();
try {
// effectively "no timeout" (saturates in TimeUnit#toNanos()).
awaitAll(futures, Long.MAX_VALUE, TimeUnit.SECONDS);
} catch(InterruptedException ex) {
/*
 * The master itself was interrupted.
 *
 * Note: The most common reason for this exception is a SLICE. When
 * the query consumer decides that it has satisfied the SLICE it
 * will close the iterator consuming the query and that will cause
 * the query buffer to be closed and the task (this
 * JoinMasterTask) that is writing on that query buffer to be
 * interrupted.
 *
 * Note: This can also happen if you shutdown the service on which
 * the master is running or deliberately interrupt the master.
 */
if(log.isInfoEnabled())
log.info("Interrupted");
/*
 * Fall through!
 *
 * Note: We fall through so that the rule evaluation appears to
 * complete normally for the common case where a SLICE causes the
 * master to be interrupted. For this case the query buffer will
 * already contain at least those solutions that satisfied the
 * slice and we need do nothing more.
 *
 * Note: The JoinStats information may be incomplete as one or more
 * JoinTask(s) may still be running.
 */
if (log.isInfoEnabled()) {
/*
 * Give the join tasks a chance to complete so that the join
 * stats will get reported to the master so that the master can
 * report out the correct stats to its caller.
 *
 * Note: This is completely optional. You DO NOT need to wait
 * here. Whether or not you wait here depends mainly on whether
 * the potential additional latency or the potential of having
 * the join stats on hand is more important for you.
 */
try {
awaitAll(futures, 1L, TimeUnit.SECONDS);
} catch (Throwable t) {
// ignore.
}
}
} catch(ExecutionExceptions ex) {
// something unexpected
log.error(ex, ex);
throw new RuntimeException(ex);
}
ruleStats.elapsed += System.currentTimeMillis() - begin;
/*
 * Aggregate statistics from each join dimension and log anything that
 * is interesting.
 */
combineJoinStats();
if (log.isDebugEnabled())
log.debug("Done");
return ruleStats;
}
/**
 * Start one or more {@link JoinTask}s for the rule.
 *
 * @return The {@link Future}s for those {@link JoinTask}s.
 */
abstract List<Future<Void>> start() throws Exception;
/**
 * Make sure that each {@link JoinTask} completed successfully.
 * <p>
 * Note: This waits until all {@link JoinTask}s complete, regardless of
 * their outcome (or until the timeout expires), so that all
 * {@link JoinTask} have the opportunity to report their {@link JoinStats}
 * to the {@link JoinMasterTask}.
 *
 * @param futures
 * The {@link Future} for each {@link JoinTask} that was created
 * by the {@link JoinMasterTask}.
 * @param timeout
 * The timeout for awaiting those futures.
 * @param unit
 * The unit for that timeout.
 *
 * @throws ExecutionExceptions
 * if one or more {@link JoinTask}s fail.
 * @throws InterruptedException
 * if the {@link JoinMasterTask} itself was interrupted while
 * awaiting its {@link JoinTask}s.
 * @throws TimeoutException
 * if the timeout expires first.
 */
protected void awaitAll(final List<Future<Void>> futures,
final long timeout, final TimeUnit unit) throws ExecutionExceptions,
InterruptedException, TimeoutException {
final long begin = System.nanoTime();
final long nanos = unit.toNanos(timeout);
long remaining = nanos;
// errors.
final List<ExecutionException> errors = new LinkedList<ExecutionException>();
for (Future<Void> f : futures) {
if (remaining < 0L) {
// timeout expired: cancel everything still outstanding and give up.
int ncancelled = 0;
for (Future<Void> x : futures) {
if (x.cancel(true/* mayInterruptIfRunning */)) {
ncancelled++;
}
}
log.warn("Cancelled " + ncancelled + " futures due to timeout");
throw new TimeoutException();
}
try {
f.get(remaining, TimeUnit.NANOSECONDS);
} catch (CancellationException ex) {
/*
 * A JoinTask will be canceled if any of its output buffers are
 * asynchronously closed. This will occur if a downstream
 * JoinTask discovers that it has satisfied a SLICE or
 * encountered an error during processing. Either way, we treat
 * the CancellationException as "info" NOT an error.
 *
 * Note: This exception can also be wrapped, in which case we
 * catch it below.
 */
if (log.isInfoEnabled())
log.info(ex.getLocalizedMessage(), ex);
} catch (ExecutionException ex) {
if (InnerCause.isInnerCause(ex, InterruptedException.class)||
InnerCause.isInnerCause(ex, ClosedByInterruptException.class)||
InnerCause.isInnerCause(ex, BufferClosedException.class) ||
InnerCause.isInnerCause(ex, CancellationException.class)) {
/*
 * The root cause was the asynchronous close of the
 * buffer that is the overflow() target for the
 * unsynchronized buffer. This will occur if the
 * high-level iterator was closed() while join thread(s)
 * are still executing.
 *
 * Note: InterruptedException will be thrown during
 * query if the BlockingBuffer on which the query
 * solutions are being written is closed, e.g., because
 * someone closed a high-level iterator reading
 * solutions from the BlockingBuffer. Closing the
 * BlockingBuffer causes the Future that is writing on
 * the BlockingBuffer to be interrupted in order to
 * eagerly terminate processing.
 *
 * Note: ClosedByInterruptException will be the cause if
 * the interrupt was noticed during an IO by the thread
 * in which this exception was thrown.
 *
 * Note: AsynchronousCloseException will be the cause if
 * the interrupt was noticed during an IO by a different
 * thread resulting in the asynchronous close of the
 * backing channel. However, the
 * AsynchronousCloseException is trapped by
 * DiskOnlyStrategy and results in the transparent
 * re-opening of the backing channel. Since the target
 * buffer will be closed, the AsynchronousCloseException
 * should be swiftly followed by an BlockingBuffer#add()
 * throwing an IllegalStateException if there is an
 * attempt to write on a closed buffer.
 *
 * Note: Using Thread#interrupt() to halt asynchronous
 * processing for query is NOT ideal as it will
 * typically force the FileChannel to be closed
 * asynchronously. You are better off using a SLICE.
 * However, when the query has a FILTER as well as a
 * SLICE and the filter can not be evaluated inside of
 * the JOINs then the caller must pull solutions
 * through the filter and close the iterator once the
 * slice is satisfied. That will trigger an interrupt
 * of join thread(s) unless join processing is already
 * complete.
 */
if (log.isInfoEnabled())
log.info(ex.getLocalizedMessage(), ex);
} else {
/*
 * Something unexpected.
 */
// add to list of errors.
errors.add(new ExecutionException(ex));
// log w/ stack trace so that we can see where this came from.
log.error(ex.getMessage(), ex);
}
}
// subtract out the elapsed time so far.
remaining = nanos - (System.nanoTime() - begin);
}
if (!errors.isEmpty()) {
/*
 * Throw exception containing all failures.
 */
throw new ExecutionExceptions(errors);
}
}
/**
 * Return an {@link IAsynchronousIterator} that will read a single
 * {@link IBindingSet}.
 *
 * @param bindingSet
 * the binding set.
 */
protected ThickAsynchronousIterator<IBindingSet[]> newBindingSetIterator(
final IBindingSet bindingSet) {
// a single chunk containing a single binding set.
return new ThickAsynchronousIterator<IBindingSet[]>(
new IBindingSet[][] { new IBindingSet[] { bindingSet } });
}
/**
 * Aggregates statistics from each {@link JoinTask} onto {@link #ruleStats}.
 * There are N {@link JoinTask}s per {@link IPredicate} in the tail of the
 * rule, where N is the #of index partitions on which we must read to
 * evaluate the {@link IRule} for a given {@link IPredicate} in the tail (N
 * is per {@link IPredicate}, not the same for each {@link IPredicate}).
 */
protected void combineJoinStats() {
/*
 * Get the #of solutions produced.
 */
final long solutionCount;
if (!joinNexus.getAction().isMutation()) {
/*
 * For query all solutions flow through the master so we get to see
 * the solution count.
 */
solutionCount = ((BlockingBuffer) solutionBuffer).getElementsAddedCount();
} else {
/*
 * The #of binding sets output from the last join dimension is
 * another way to get the solution count.
 *
 * Note: This should work regardless of whether we are evaluating a
 * rule for mutation or query.
 */
solutionCount = joinStats[order[tailCount - 1]].bindingSetsOut;
}
ruleStats.solutionCount.addAndGet(solutionCount);
/*
 * The mutation count is taken from the last join dimension.
 */
ruleStats.mutationCount
.addAndGet(joinStats[order[tailCount - 1]].mutationCount.get());
// NOTE(review): this local shadows the [order] field; it appears to hold
// the same evaluation order as the field initialized in the ctor.
final int[] order = ruleState.getPlan().getOrder();
for (int tailIndex = 0; tailIndex < tailCount; tailIndex++) {
final JoinStats o = joinStats[order[tailIndex]];
ruleStats.chunkCount[tailIndex] += o.chunkCount;
ruleStats.elementCount[tailIndex] += o.elementCount;
}
if(log.isInfoEnabled()) {
// the rule state.
log.info("\n" + ruleState);
// the rule statistics.
log.info("\n" + ruleStats);
}
/*
 * Note: This provides more detail on this join algorithm than the
 * RuleStats view, however the per-predicate per-index partition
 * details are not available since these data aggregate across all
 * index partitions for a given tail predicate.
 */
RuleLog.log(rule, ruleState, joinStats);
}
/**
 * Aggregates the statistics for some join dimension.
 *
 * @param joinStats
 * Statistics for an index partition of some join dimension.
 */
public void report(final JoinStats joinStats) {
if (log.isDebugEnabled()) {
log.debug("\n"+joinStats.toString());
}
// the totals for that join dimension.
final JoinStats total = this.joinStats[joinStats.orderIndex];
total.add(joinStats);
}
/**
 * Returns the buffer specified to the ctor (overridden for distributed
 * joins).
 *
 * @throws IOException
 * declared for the benefit of remote (proxied) overrides; the
 * local implementation does not throw it.
 */
public IBuffer<ISolution[]> getSolutionBuffer() throws IOException {
return solutionBuffer;
}
}