/* Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Jun 19, 2008 */ package com.bigdata.relation.accesspath; import java.util.Iterator; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.FutureTask; import java.util.concurrent.RejectedExecutionException; import org.apache.log4j.Logger; import com.bigdata.bop.BOpContext; import com.bigdata.bop.BufferAnnotations; import com.bigdata.bop.IBindingSet; import com.bigdata.bop.IPredicate; import com.bigdata.bop.ap.filter.SameVariableConstraint; import com.bigdata.bop.cost.BTreeCostModel; import com.bigdata.bop.cost.DiskCostModel; import com.bigdata.bop.cost.IndexSegmentCostModel; import com.bigdata.bop.cost.ScanCostReport; import com.bigdata.bop.join.BaseJoinStats; import com.bigdata.btree.AbstractBTree; import com.bigdata.btree.BTree; import com.bigdata.btree.IBTreeStatistics; import com.bigdata.btree.IBloomFilter; import com.bigdata.btree.IIndex; import com.bigdata.btree.ILocalBTreeView; import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.ITupleIterator; import com.bigdata.btree.IndexSegment; import com.bigdata.btree.Tuple; import com.bigdata.btree.UnisolatedReadWriteIndex; import com.bigdata.btree.isolation.IsolatedFusedView; import com.bigdata.btree.keys.IKeyBuilder; import com.bigdata.btree.proc.ISimpleIndexProcedure; import com.bigdata.btree.view.FusedView; import com.bigdata.io.DirectBufferPool; import com.bigdata.journal.IIndexManager; import com.bigdata.journal.ITx; import com.bigdata.journal.Journal; import com.bigdata.journal.NoSuchIndexException; import com.bigdata.journal.TimestampUtility; import com.bigdata.mdi.IMetadataIndex; import com.bigdata.mdi.LocalPartitionMetadata; import com.bigdata.relation.AbstractResource; import com.bigdata.relation.IRelation; import com.bigdata.service.AbstractClient; import com.bigdata.service.DataService; import com.bigdata.service.IBigdataFederation; import com.bigdata.service.ndx.IClientIndex; import com.bigdata.service.ndx.IScaleOutClientIndex; import com.bigdata.striterator.ChunkedArrayIterator; import com.bigdata.striterator.ChunkedWrappedIterator; import com.bigdata.striterator.EmptyChunkedIterator; import com.bigdata.striterator.IChunkedIterator; import com.bigdata.striterator.IChunkedOrderedIterator; import com.bigdata.striterator.IKeyOrder; import com.bigdata.util.Bytes; import com.bigdata.util.BytesUtil; import cutthecrap.utils.striterators.FilterBase; import cutthecrap.utils.striterators.ICloseableIterator; import cutthecrap.utils.striterators.IFilter; import cutthecrap.utils.striterators.NOPFilter; import cutthecrap.utils.striterators.Striterator; /** * Abstract base class for type-specific {@link IAccessPath} implementations. *<p> * Note: Filters should be specified when the {@link IAccessPath} is constructed * so that they will be evaluated on the data service rather than materializing * the elements and then filtering them. This can be accomplished by adding the * filter as a constraint on the predicate when specifying the access path. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @param R * The generic type of the elements of the {@link IRelation}. * * @todo Add support for non-perfect access paths. This class should layer on an * index local {@link IFilter} which rejects tuples which do not satisfy * the {@link IPredicate}'s bindings. This will give the effect of a SCAN * with an implied filter. The javadoc on * {@link IRelation#getKeyOrder(IPredicate)} should also be updated to * reflect the allowance for non-perfect access paths. */ public class AccessPath<R> implements IAccessPath<R>, IBindingSetAccessPath<R> { static final protected Logger log = Logger.getLogger(IAccessPath.class); private static final boolean DEBUG = log.isDebugEnabled(); /** Relation (resolved lazily if not specified to the ctor). */ private final IRelation<R> relation; /** Access to the index, resource locator, executor service, etc. */ protected final IIndexManager indexManager; /** Timestamp of the view. */ protected final long timestamp; /** Predicate (the resource name on the predicate is the relation namespace). */ protected final IPredicate<R> predicate; /** * The description of the index partition iff the {@link #predicate} is * constrained to an index partition and <code>null</code> otherwise. */ final LocalPartitionMetadata pmd; /** * Index order (the relation namespace plus the index order and the option * partitionId constraint on the predicate identify the index). */ protected final IKeyOrder<R> keyOrder; /** The index. */ protected final IIndex ndx; /** Iterator flags. */ protected final int flags; protected final int chunkOfChunksCapacity; protected final int chunkCapacity; protected final int fullyBufferedReadThreshold; /** * <code>true</code> iff the {@link IPredicate}is fully bound. */ private final boolean isFullyBoundForKey; /** * <code>true</code> iff there is a filter for the access path (either local * or remote). */ private final boolean hasFilter; /** * <code>true</code> iff there is a filter for the access path (either local * or remote). */ public final boolean hasFilter() { return hasFilter; } /** * <code>true</code> iff all elements in the predicate which are required * to generate the key are bound to constants. */ public boolean isFullyBoundForKey() { return isFullyBoundForKey; } /** * @see AbstractResource#getChunkCapacity() */ public int getChunkCapacity() { return chunkCapacity; } /** * @see AbstractResource#getChunkOfChunksCapacity() */ public int getChunkOfChunksCapacity() { return chunkOfChunksCapacity; } /** * The maximum <em>limit</em> that is allowed for a fully-buffered read. The * {@link #asynchronousIterator(Iterator)} will always be used above this * limit. * * FIXME Array limits in truth maintenance code. This should probably be * close to the branching factor or chunk capacity. It has been temporarily * raised to a very large value in order to support truth maintenance where * the code assumes access to the fully buffered result. That change needs * to be examined for an impact on query performance. It is effectively * forcing all access path reads to be fully buffered rather than using an * asynchronous iterator pattern. * * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/606"> * Array limits in truth maintenance code. </a> */ protected static final int MAX_FULLY_BUFFERED_READ_LIMIT = 10000000; /** * We cache some stuff for historical reads. * <p> * Note: We cache results on a per-{@link IAccessPath} basis rather than a * per-{@link IIndex} basis since range counts and range iterators are both * constrained to a specific key range of interest for an * {@link IAccessPath} while they would span the entire {@link IIndex} * otherwise. * * @todo cache the {@link IAccessPath}s themselves so that we benefit from * reuse of the cached data. * * @todo we could also cache small iterator result sets. */ private final boolean historicalRead; /** * For {@link #historicalRead}s only, the range count is cached once it is * computed. It is also set if we discover using {@link #isEmpty()} or * {@link #iterator(long, long, int)} that the {@link IAccessPath} is empty. * Likewise, those methods test this flag to see if we have proven the * {@link IAccessPath} to be empty. */ private long rangeCount = -1L; /** * The filter derived from optional * {@link IPredicate.Annotations#INDEX_LOCAL_FILTER}. If there are shared * variables in the {@link IPredicate} then a {@link SameVariableConstraint} * is added regardless of whether the {@link IPredicate} specified a filter * or not. */ final protected IFilter indexLocalFilter; /** * The filter derived from optional * {@link IPredicate.Annotations#ACCESS_PATH_FILTER}. */ final protected IFilter accessPathFilter; /** * Used to detect failure to call {@link #init()}. */ private boolean didInit = false; private final byte[] fromKey; private final byte[] toKey; /** * The key corresponding to the inclusive lower bound for the * {@link IAccessPath} <code>null</code> if there is no lower bound. */ final public byte[] getFromKey() { return fromKey; } /** * The key corresponding to the exclusive upper bound for the * {@link IAccessPath} -or- <code>null</code> if there is no upper bound. */ final public byte[] getToKey() { return toKey; } @Override final public IKeyOrder<R> getKeyOrder() { return keyOrder; } /** * @param relation * The relation for the access path (optional). The * <i>relation</> is not specified when requested an * {@link IAccessPath} for a specific index partition in order to * avoid forcing the materialization of the {@link IRelation}. * @param localIndexManager * Access to the indices, resource locators, executor service, * etc. * @param predicate * The constraints on the access path. * @param keyOrder * The order in which the elements would be visited for this * access path. */ public AccessPath(// final IRelation<R> relation,// final IIndexManager localIndexManager, // final IPredicate<R> predicate,// final IKeyOrder<R> keyOrder // ) { if (relation == null) throw new IllegalArgumentException(); if (predicate == null) throw new IllegalArgumentException(); if (keyOrder == null) throw new IllegalArgumentException(); this.relation = relation; final int partitionId = predicate.getPartitionId(); /* * If the predicate is addressing a specific shard, then the default is * to assume that it will not be using a remote access path. However, if * a remote access path was explicitly request and the partitionId was * specified, then it will be an error (which is trapped below). */ final boolean remoteAccessPath = predicate .getProperty( IPredicate.Annotations.REMOTE_ACCESS_PATH, partitionId == -1 ? IPredicate.Annotations.DEFAULT_REMOTE_ACCESS_PATH : false); /* * Chose the right index manger. If relation.getIndexManager() is not * federation, then always use that index manager. Otherwise, if AP is * REMOTE use the relation's index manager. Otherwise, the * localIndexManager MUST NOT be null and we will use it. */ if (!(relation.getIndexManager() instanceof IBigdataFederation<?>)) { this.indexManager = relation.getIndexManager(); } else if (remoteAccessPath) { this.indexManager = relation.getIndexManager(); } else { if (localIndexManager == null) { throw new RuntimeException("Local index manager not given but" + " access path specifies local index: pred="+predicate); } this.indexManager = localIndexManager; } this.predicate = predicate; this.keyOrder = keyOrder; final int flags = predicate.getProperty( IPredicate.Annotations.FLAGS, IPredicate.Annotations.DEFAULT_FLAGS); this.flags = flags; /* * Choose the timestamp of the view. If the request is for the * unisolated index but the predicate was flagged as READONLY then * automatically choose READ_COMMITTED instead. */ { long timestamp = relation.getTimestamp(); timestamp = (timestamp == ITx.UNISOLATED && (flags & IRangeQuery.READONLY) != 0 ? ITx.READ_COMMITTED : timestamp); this.timestamp = timestamp; } this.historicalRead = TimestampUtility.isReadOnly(timestamp); // final int partitionId = predicate.getPartitionId(); final IIndex ndx; if (partitionId != -1) { if (remoteAccessPath) { /* * A request for a specific shard is not compatible with a * request for a remote access path. */ throw new RuntimeException("Annotations are not compatible: " + IPredicate.Annotations.REMOTE_ACCESS_PATH + "=" + remoteAccessPath + ", but " + IPredicate.Annotations.PARTITION_ID + "=" + partitionId + " for "+predicate ); } final String namespace = relation.getNamespace(); // The name of the desired index partition. final String name = DataService.getIndexPartitionName(namespace + "." + keyOrder.getIndexName(), partitionId); try { // MUST be a local index view. ndx = (ILocalBTreeView) indexManager.getIndex(name, timestamp); } catch (Throwable t) { throw new RuntimeException(predicate.toString(), t); } if (ndx == null) { /* * Some possible root causes for failing to find a shard on a DS * are listed below. You should verify that the addressed shard * was actually present on the addressed data service as of the * effect read time of the request. * * * - The as-bound predicate was mapped onto the wrong shard. * Some subtle problems have been tracked back to this. See * https://sourceforge.net/apps/trac/bigdata/ticket/457. There * was also a problem where as were failing to use the as-bound * predicate when mapping the predicate onto a shard. * * - A failure in IndexManager to locate the shard. This could * include concurrency holes in the indexCache, the access to * the journal for the appropriate commit time, a * read-historical request without a read-lock (application * error), etc. * * - The shard was moved (but this will be a * StaleLocatorException and can only occur with the unisolated * index view, at least until we implement shard caching as part * of the hybrid shared disk / shared nothing architecture). */ // // For debugging only - comment this out. // dumpMDI((AbstractScaleOutFederation<?>) relation // .getIndexManager(), relation.getNamespace(), timestamp, // keyOrder); throw new RuntimeException("No such index: relation=" + relation.getNamespace() + ", timestamp=" + timestamp + ", keyOrder=" + keyOrder + ", pred=" + predicate + ", indexManager=" + indexManager); } /* * An index partition constraint was specified, so verify that we * were given a local index object and that the index object is for * the correct index partition. */ pmd = ndx.getIndexMetadata().getPartitionMetadata(); if (pmd == null) throw new RuntimeException("Not an index partition"); if (pmd.getPartitionId() != partitionId) { throw new RuntimeException("Expecting partitionId=" + partitionId + ", but have " + pmd.getPartitionId()); } } else { // The predicate is not constrained to an index partition. pmd = null; /* * Obtain the index. * * FIXME The getIndex(IKeyOrder) code path is optimized by * SPORelation and LexiconRelation. However, we should have * automatic caching of the index references to avoid the * significant penalty of going down to the commitRecordIndex and * Name2Addr each time we need to resolve an index. (Scale-out has * separate caching for this in IndexManager.) */ ndx = relation.getIndex(keyOrder); // final String fqn = AbstractRelation.getFQN(relation, keyOrder); // // ndx = AbstractRelation.getIndex(indexManager, fqn, timestamp); if (ndx == null) { throw new RuntimeException("No such index: relation=" + relation.getNamespace() + ", timestamp=" + timestamp + ", keyOrder=" + keyOrder + ", pred=" + predicate + ", indexManager=" + indexManager); } } this.ndx = ndx; /** * See AST2BOpUtility.toPredicate(). It is responsible for copying these * annotations from the StatementPatternNode onto the Predicate so they * can influence the behavior of the AccessPath. * * @see <a href="http://sourceforge.net/apps/trac/bigdata/ticket/791" > * Clean up query hints </a> */ final int chunkOfChunksCapacity = predicate.getProperty( BufferAnnotations.CHUNK_OF_CHUNKS_CAPACITY, BufferAnnotations.DEFAULT_CHUNK_OF_CHUNKS_CAPACITY); final int chunkCapacity = predicate.getProperty( BufferAnnotations.CHUNK_CAPACITY, BufferAnnotations.DEFAULT_CHUNK_CAPACITY); final int fullyBufferedReadThreshold = predicate.getProperty( IPredicate.Annotations.FULLY_BUFFERED_READ_THRESHOLD, IPredicate.Annotations.DEFAULT_FULLY_BUFFERED_READ_THRESHOLD); this.chunkOfChunksCapacity = chunkOfChunksCapacity; this.chunkCapacity = chunkCapacity; this.fullyBufferedReadThreshold = fullyBufferedReadThreshold; this.isFullyBoundForKey = predicate.isFullyBound(keyOrder); { /* * The filter to be evaluated at the index (optional). * * Note: This MUST be an implementation which is "aware" of the * reuse of tuples within tuple iterators. That is why it is being * cast to a BOpTupleIterator. * * @todo if not a perfect index then impose additional filter first * to skip over tuples which do not satisfy the concrete asBound * predicate. This allows us to use the "best" index, not just a * "perfect" index. */ final IFilter indexLocalFilter = predicate.getIndexLocalFilter(); /* * Optional constraint enforces the "same variable" constraint. The * constraint will be null unless at least one variable appears in * more than one position in the predicate. */ final SameVariableConstraint<R> sameVarConstraint = SameVariableConstraint .newInstance(predicate); if (sameVarConstraint != null) { /* * Stack filters. */ final FilterBase tmp = new NOPFilter(); if (indexLocalFilter != null) tmp.addFilter(indexLocalFilter); tmp.addFilter(new SameVariableConstraintTupleFilter<R>( sameVarConstraint)); this.indexLocalFilter = tmp; } else { this.indexLocalFilter = indexLocalFilter; } } // optional filter to be evaluated by the AccessPath. this.accessPathFilter = predicate.getAccessPathFilter(); // true iff there is a filter (either local or remote). this.hasFilter = (indexLocalFilter != null || accessPathFilter != null); final IKeyBuilder keyBuilder = ndx.getIndexMetadata() .getTupleSerializer().getKeyBuilder(); fromKey = keyOrder.getFromKey(keyBuilder, predicate); toKey = keyOrder.getToKey(keyBuilder, predicate); } @Override public String toString() { return getClass().getName() + "{predicate=" + predicate + ", keyOrder=" + keyOrder + ", flags=" + Tuple.flagString(flags) + ", fromKey=" + (fromKey == null ? "n/a" : BytesUtil.toString(fromKey)) + ", toKey=" + (toKey == null ? "n/a" : BytesUtil.toString(toKey)) + ", hasFilter=" + hasFilter + ", indexLocalFilter=" + (indexLocalFilter == null ? "n/a" : indexLocalFilter) + ", accessPathFilter=" + (accessPathFilter == null ? "n/a" : accessPathFilter) + ", indexManager="+indexManager + "}"; } /** * @throws IllegalStateException * unless {@link #init()} has been invoked. */ final protected void assertInitialized() { if (!didInit) throw new IllegalStateException(); } /** * Required post-ctor initialization. * * @return <i>this</i> */ public AccessPath<R> init() { if (didInit) throw new IllegalStateException(); didInit = true; if(DEBUG) { if (fromKey != null && toKey != null) { if (BytesUtil.compareBytes(fromKey, toKey) >= 0) { throw new AssertionError("keys are out of order: " + toString()); } } log.debug(toString()); } return this; } public IRelation<R> getRelation() { return relation; } public IIndexManager getIndexManager() { return indexManager; } public long getTimestamp() { return timestamp; } @Override public IPredicate<R> getPredicate() { return predicate; } @Override public IIndex getIndex() { return ndx; } /** * @todo for scale-out, it may be better to implement {@link #isEmpty()} * without specifying a capacity of ONE (1) and then caching the * returned iterator. This could avoid an expensive RMI test if we * invoke {@link #iterator()} shortly after {@link #isEmpty()} returns * <code>false</code>. */ @Override public boolean isEmpty() { assertInitialized(); if (historicalRead && rangeCount != -1) { /* * Optimization for a historical read in which we have already * proven that the access path is empty. */ return rangeCount == 0L; } if(DEBUG) { log.debug(toString()); } final IChunkedIterator<R> itr = iterator(0L/* offset */, 1L/* limit */, 1/* capacity */); try { final boolean empty = ! itr.hasNext(); if (empty && historicalRead) { // the access path is known to be empty. rangeCount = 0L; } return empty; } finally { itr.close(); } } // /** // * {@inheritDoc} // * // * @see https://sourceforge.net/apps/trac/bigdata/ticket/209 (Access path // * should visit solutions for high level query). // */ // public ICloseableIterator<IBindingSet> solutions(final BaseJoinStats stats) { // //// final IVariable<?>[] vars = BOpUtility //// .getDistinctArgumentVariables(predicate); // // return BOpContext.solutions(iterator(), predicate, /*vars,*/ stats); // // } /** * {@inheritDoc} * * @see https://sourceforge.net/apps/trac/bigdata/ticket/209 (Access path * should visit solutions for high level query). */ @Override public ICloseableIterator<IBindingSet[]> solutions(final BOpContext context, final long limit, final BaseJoinStats stats) { // final IVariable<?>[] vars = BOpUtility // .getDistinctArgumentVariables(predicate); return context.solutions( iterator(0L/* offset */, limit, 0/* capacity */), predicate, stats); } @Override final public IChunkedOrderedIterator<R> iterator() { return iterator(0L/* offset */, 0L/* limit */, 0); } // final public IChunkedOrderedIterator<R> iterator(final int limit, // final int capacity) { // // return iterator(0L/* offset */, limit, capacity); // // } /** * @throws RejectedExecutionException * if the iterator is run asynchronously and the * {@link ExecutorService} is shutdown or has a maximum capacity * and is saturated. * * FIXME Support both offset and limit for asynchronous * iterators. right now this will force the use of the * {@link #synchronousIterator(long, long, Iterator)} when the * offset or limit are non-zero, but that is only permitted up * to a limit of {@link #MAX_FULLY_BUFFERED_READ_LIMIT}. * * FIXME in order to support large limits we need to verify that * the asynchronous iterator can correctly handle REMOVEALL and * that incremental materialization up to the [limit] will not * effect the semantics for REMOVEALL or the other iterator * flags (per above). (In fact, the asynchronous iterator does * not support either [offset] or [limit] at this time). * * FIXME write unit tests for slice handling by this method and * modify the SAIL integration to use it for SLICE on an * {@link IAccessPath} scan. Note that there are several * {@link IAccessPath} implementations and they all need to be * tested with SLICE. * * Those tests should be located in * {@link com.bigdata.rdf.spo.TestSPOAccessPath}. * * FIXME The offset and limit should probably be rolled into the * predicate and removed from the {@link IAccessPath}. This way * they will be correctly applied when {@link #isEmpty()} is * implemented using the {@link #iterator()} to determine if any */ @Override @SuppressWarnings("unchecked") final public IChunkedOrderedIterator<R> iterator(final long offset, long limit, int capacity) { if (offset < 0) throw new IllegalArgumentException(); if (limit < 0) throw new IllegalArgumentException(); if (limit == Long.MAX_VALUE) { // treat MAX_VALUE as meaning NO limit. limit = 0L; } if (limit > MAX_FULLY_BUFFERED_READ_LIMIT) { // Note: remove constraint when async itr supports SLICE. throw new UnsupportedOperationException("limit=" + limit + " exceeds maximum fully buffered read limit: " + MAX_FULLY_BUFFERED_READ_LIMIT); } if (historicalRead && rangeCount >= 0L && ((rangeCount - offset) <= 0L)) { /* * The access path has already been proven to be empty. */ if (DEBUG) log.debug("Proven empty by historical range count"); return new EmptyChunkedIterator<R>(keyOrder); } if (DEBUG) log.debug("offset=" + offset + ", limit=" + limit + ", capacity=" + capacity + ", accessPath=" + this); final boolean fullyBufferedRead; // true iff a point test is a hit on the bloom filter. boolean bloomHit = false; if(isFullyBoundForKey) { if (DEBUG) log.debug("Predicate is fully bound for the key."); /* * If the predicate is fully bound then there can be at most one * element matched so we constrain the limit and capacity * accordingly. */ if (offset > 0L) { // the iterator will be empty if the offset is GT zero. return new EmptyChunkedIterator<R>(keyOrder); } capacity = 1; limit = 1L; fullyBufferedRead = true; /* * Note: Since this is a point test, we apply the bloom filter for * fast rejection. However, we can only apply the bloom filter if * (a) you are using the local index object (either a BTree or a * FusedView); and (b) the bloom filter exists (and is enabled). * * Note: The scale-out case is dealt with by pipelining the * intermediate binding sets to the data service on which the index * partition resides, at which point we again can apply the local * bloom filter efficiently. */ if(ndx instanceof ILocalBTreeView) { final IBloomFilter filter = ((ILocalBTreeView)ndx).getBloomFilter(); if (filter != null) { if(!filter.contains(fromKey)) { // proven to not exist. return new EmptyChunkedIterator<R>(keyOrder); } bloomHit = true; // fall through } // fall through } // fall through } else if (limit > 0L) { /* * A [limit] was specified. * * NOTE: When the [limit] is (GT ZERO) we MUST NOT let the * DataService layer iterator read more than [limit] elements at a * time. * * This is part of the contract for REMOVEALL - when you set the * [limit] and specify REMOVEALL you are only removing the 1st * [limit] elements in the traversal order. * * This is also part of the atomic queue operations contract - the * head and tail queue operations function by specifying [limit := * 1] (tail also specifies the REVERSE traversal option). * * Note: When the [limit] is specified we always do a fully buffered * (aka synchronous) read. This simplifies the behavior of the * iterator and limits are generally quite small. */ capacity = (int) limit; fullyBufferedRead = true; } else { /* * No limit was specified. * * Range count the access path and use a synchronous read if the * rangeCount is LTE the threshold. * * Note: the range count is corrected by the offset so that it gives * the effective remaining range count. When the effective remaining * range count is zero we know that the iterator will not visit * anything. * * @todo this kind of rangeCount might be replaced by an estimated * range count basic on historical data and NOT requiring RMI. */ final long rangeCountRemaining = rangeCount(false/* exact */) - offset; if (DEBUG) log.debug("offset=" + offset + ", limit=" + limit + ", rangeCountRemaining=" + rangeCountRemaining + ", fullyBufferedReadThreashold=" + fullyBufferedReadThreshold); if(rangeCountRemaining <= 0) { /* * Since the range count is an upper bound we KNOW that the * iterator would not visit anything. */ if (DEBUG) log.debug("No elements based on range count."); return new EmptyChunkedIterator<R>(keyOrder); } if(rangeCountRemaining < fullyBufferedReadThreshold) { // adjust limit to no more than the #of remaining elements. if (limit == 0L) { limit = rangeCountRemaining; } else { limit = Math.min(limit, rangeCountRemaining); } // adjust capacity to no more than the maximum capacity. capacity = (int) Math.min(MAX_FULLY_BUFFERED_READ_LIMIT, limit); fullyBufferedRead = true; } else { fullyBufferedRead = false; } } /* * Note: The [capacity] gets passed through to the DataService layer. * * Note: The ElementFilter on the IPredicate (if any) is encapsulated * within [filter] and is passed through to the DataService layer. It * MUST be Serializable and it will be executed right up against the * data. * * FIXME pass the offset and limit into the source iterator * (IRangeQuery, ITupleIterator). This will require a lot of changes to * the code as that gets used everywhere. */ // The raw tuple iterator: the impl depends on the IIndex impl (BTree, // IndexSegment, ClientIndexView, or DataServiceIndexView). final ITupleIterator<R> tupleItr = rangeIterator(capacity, flags, indexLocalFilter); // Wrap raw tuple iterator with resolver that materializes the elements // from the visited tuples. final Iterator<R> src = new Striterator(tupleItr) .addFilter(new TupleObjectResolver()); if (accessPathFilter != null) { /* * Chain in the optional access path filter stack. */ ((Striterator) src).addFilter(accessPathFilter); } if (fullyBufferedRead) { /* * Synchronous fully buffered read of no more than [limit] elements. */ final IChunkedOrderedIterator<R> tmp = synchronousIterator(offset, limit, src); if(bloomHit) { if(!tmp.hasNext()) { // notify filter of a false positive. ((ILocalBTreeView)ndx).getBloomFilter().falsePos(); } } return tmp; } else { /* * Asynchronous read (does not support either offset or limit for * now). */ assert offset == 0L : "offset=" + limit; assert limit == 0L : "limit=" + limit; return asynchronousIterator(src); } } /** * Fully buffers all elements that would be visited by the * {@link IAccessPath} iterator. * * @param accessPath * The access path (including the triple pattern). * @param offset * The first element that will be materialized (non-negative). * @param limit * The maximum #of elements that will be materialized (must be * positive, so use a range count before calling this method if * there was no limit specified by the caller). * * FIXME pass the offset and limit into the source iterator and remove them * from this method's signature. This will require a change to the * {@link IRangeQuery} API and {@link ITupleIterator} impls. */ @SuppressWarnings("unchecked") final protected IChunkedOrderedIterator<R> synchronousIterator( final long offset, final long limit, final Iterator<R> src) { if (offset < 0) throw new IllegalArgumentException(); if (limit <= 0) throw new IllegalArgumentException(); assert limit < MAX_FULLY_BUFFERED_READ_LIMIT : "limit=" + limit + ", max=" + MAX_FULLY_BUFFERED_READ_LIMIT; if (DEBUG) { log.debug("offset=" + offset + ", limit=" + limit); } int nread = 0; int nused = 0; // skip past the offset elements. while (nread < offset && src.hasNext()) { src.next(); nread++; } // read up to [limit] elements into the buffer. R[] buffer = null; while (nused < limit && src.hasNext()) { final R e = src.next(); if (buffer == null) { buffer = (R[]) java.lang.reflect.Array.newInstance( e.getClass(), (int) limit); } buffer[nused] = e; nused++; nread++; } if(DEBUG) { log.debug("Fully buffered: read=" + nread + ", used=" + nused + ", offset=" + offset + ", limit=" + limit); } // if (limit == 1) // System.err.println("Fully buffered: used=" + nused + ", limit=" + limit); if (nread == 0) { return new EmptyChunkedIterator<R>(keyOrder); } return new ChunkedArrayIterator<R>(nused, buffer, keyOrder); } /** * Asynchronous read using a {@link BlockingBuffer}. * * @param src * The source iterator. * * @return * * @throws RejectedExecutionException * if the {@link ExecutorService} is shutdown or has a maximum * capacity and is saturated. */ final protected IChunkedOrderedIterator<R> asynchronousIterator( final Iterator<R> src) { if (src == null) throw new IllegalArgumentException(); if (DEBUG) log.debug(""); /* * Note: The filter is applied by the ITupleIterator so that it gets * evaluated close to the data, not here where it would be evaluated * once the elements were materialized on the client. */ final BlockingBuffer<R[]> buffer = new BlockingBuffer<R[]>( chunkOfChunksCapacity); /** * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/707"> * BlockingBuffer.close() does not unblock threads </a> */ // Wrap computation as FutureTask. final FutureTask<Void> ft = new FutureTask<Void>( new ChunkConsumerTask<R>(this, src, buffer)); // Set Future on BlockingBuffer *before* starting computation. buffer.setFuture(ft); // Start computation. indexManager.getExecutorService().submit(ft); return new ChunkConsumerIterator<R>(buffer.iterator(), keyOrder); } /** * Consumes elements from the source iterator, converting them into chunks * on a {@link BlockingBuffer}. The consumer will drain the chunks from the * buffer. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> */ static private class ChunkConsumerTask<R> implements Callable<Void> { static protected final Logger log = Logger.getLogger(ChunkConsumerTask.class); private final AccessPath<R> accessPath; private final Iterator<R> src; private final BlockingBuffer<R[]> buffer; /** * * @param src * The source iterator visiting elements read from the * relation. * @param buffer * The buffer onto which chunks of those elements will be * written. */ public ChunkConsumerTask(final AccessPath<R> accessPath, final Iterator<R> src, final BlockingBuffer<R[]> buffer) { if (accessPath == null) throw new IllegalArgumentException(); if (src == null) throw new IllegalArgumentException(); if (buffer == null) throw new IllegalArgumentException(); this.accessPath = accessPath; this.src = src; this.buffer = buffer; } @Override public Void call() throws Exception { /* * Chunked iterator reading from the ITupleIterator. The filter was * already applied by the ITupleIterator so we do not use it here. * * Note: The chunk size is determined [chunkCapacity]. * * Note: The BlockingBuffer can combine multiple chunks together * dynamically to provide a larger effective chunk size as long as * those chunks are available with little or no added latency. */ final IChunkedOrderedIterator<R> itr = new ChunkedWrappedIterator<R>( src, accessPath.chunkCapacity, accessPath.keyOrder, null/* filter */); long nchunks = 0; long nelements = 0; try { while (src.hasNext()) { final R[] chunk = itr.nextChunk(); nchunks++; nelements += chunk.length; if (DEBUG) log.debug("#chunks=" + nchunks + ", chunkSize=" + chunk.length + ", nelements=" + nelements); buffer.add(chunk); } } finally { if (log.isInfoEnabled()) log.info("Closing buffer: #chunks=" + nchunks + ", #elements=" + nelements + ", accessPath=" + accessPath); buffer.close(); itr.close(); } return null; } } @Override final public long rangeCount(final boolean exact) { assertInitialized(); long n = 0L; if (exact) { /* * @todo we can cache exact range counts also, but we can not return * a cached estimated range count when an exact range count is * requested. */ if (hasFilter) { /* * If there is a filter, then we need to visit the elements and * apply the filter to those elements. * * FIXME If the filter is properly driven through to the indices * then the index should be able to enable the (KEYS,VALS) flags * locally and we can avoid sending back the full tuple when * just doing a range count. This could be done using a * rangeCount(exact,filter) method on IIndex. */ final IChunkedOrderedIterator<R> itr = iterator(); while (itr.hasNext()) { itr.next(); n++; } } else { n = ndx.rangeCountExact(fromKey, toKey); } } else { if (historicalRead) { // cachable. n = historicalRangeCount(fromKey, toKey); } else { // not cachable. n = ndx.rangeCount(fromKey, toKey); } } if (DEBUG) { log.debug("exact=" + exact + ", filter=" + hasFilter + ", n=" + n + " : " + toString()); } return n; } /** * Note: the range count is cached for a historical read to reduce round * trips to the DataService. */ final private long historicalRangeCount(final byte[] fromKey, final byte[] toKey) { if (rangeCount == -1L) { // do query and cache the result. return rangeCount = ndx.rangeCount(fromKey, toKey); } else { // cached value. return rangeCount; } } // @Override // final public ITupleIterator<R> rangeIterator() { // // return rangeIterator(0/* capacity */, flags, indexLocalFilter); // // } @SuppressWarnings( { "unchecked" }) protected ITupleIterator<R> rangeIterator(final int capacity, final int flags, final IFilter filter) { assertInitialized(); if (DEBUG) { log.debug(this + " : capacity=" + capacity + ", flags=" + flags + ", filter=" + filter); } return ndx.rangeIterator(fromKey, toKey, capacity, flags, filter); } /** * This implementation removes all tuples that would be visited by the * access path from the backing index. * <p> * Note: If you are maintaining multiple indices then you MUST override this * method to remove the data from each of those indices. */ @Override public long removeAll() { assertInitialized(); if (DEBUG) { log.debug(this.toString()); } /* * Remove everything in the key range which satisfies the filter. Do * not materialize keys or values. * * @todo if offset and limit are rolled into the access path then * they would also belong here. */ final ITupleIterator<?> itr = rangeIterator(0/* capacity */, IRangeQuery.REMOVEALL, indexLocalFilter); long n = 0; while (itr.hasNext()) { itr.next(); n++; } return n; } /** * Return an estimate of the cost of a scan on the predicate. * * @param pred * The predicate. * * @return The estimated cost of a scan on that predicate. */ public ScanCostReport estimateCost() { if(ndx instanceof UnisolatedReadWriteIndex) { return ((UnisolatedReadWriteIndex) ndx).estimateCost(diskCostModel, rangeCount(false/* exact */)); } if (ndx instanceof BTree) { /* * Fast path for a local BTree. */ // fast range count (may be cached by the access path). final long rangeCount = rangeCount(false/*exact*/); return estimateCost(diskCostModel, (BTree) ndx, rangeCount); } if (ndx instanceof ILocalBTreeView) { /* * A local view. This path is for both transactions and local * shards. */ // fast range count (may be cached by the access path). final long rangeCount = rangeCount(false/* exact */); return estimateCost((ILocalBTreeView) ndx, rangeCount, fromKey, toKey); } if (ndx instanceof IScaleOutClientIndex) { /* * A scale-out index is being addressed. */ return estimateCost((IScaleOutClientIndex) ndx); } throw new UnsupportedOperationException("index=" + ndx); } /** * Return the estimated cost of an index scan on a local {@link BTree}. * * @param btree * The {@link BTree}. * * @return The estimated cost of the scan. */ private ScanCostReport estimateCost(final DiskCostModel diskCostModel, final BTree btree, final long rangeCount) { // BTree is its own statistics view. final IBTreeStatistics stats = (BTree) btree; // Estimate cost based on random seek per node/leaf. final double cost = new BTreeCostModel(diskCostModel).rangeScan( rangeCount, stats.getBranchingFactor(), stats.getHeight(), stats.getUtilization().getLeafUtilization()); return new ScanCostReport(rangeCount, cost); } /** * Return the estimated cost of a key-range scan for a local B+Tree view. * This handles both {@link IsolatedFusedView} (transactions) and * {@link FusedView} (shards). * * @param view * The view. * * @return The estimated cost. */ static private ScanCostReport estimateCost(final ILocalBTreeView view, final long rangeCount, final byte[] fromKey, final byte[] toKey) { double cost = 0d; final AbstractBTree[] sources = view.getSources(); for (AbstractBTree source : sources) { final IBTreeStatistics stats = source.getStatistics(); // fast range count on that source. final long sourceRangeCount = source.rangeCount(fromKey, toKey); if (source instanceof IndexSegment) { // Cost for an index segment based on multi-block IO. final IndexSegment seg = (IndexSegment) source; final long extentLeaves = seg.getStore().getCheckpoint().extentLeaves; final long leafCount = stats.getLeafCount(); // Note: bytesPerLeaf is never more than an int32 value! final int bytesPerLeaf = (int) Math .ceil(((double) extentLeaves) / leafCount); cost += new IndexSegmentCostModel(diskCostModel).rangeScan( (int) sourceRangeCount, stats.getBranchingFactor(), bytesPerLeaf, DirectBufferPool.INSTANCE .getBufferCapacity()); } else { // Cost for a B+Tree based on random seek per node/leaf. cost += new BTreeCostModel(diskCostModel).rangeScan( sourceRangeCount, stats.getBranchingFactor(), stats .getHeight(), stats.getUtilization() .getLeafUtilization()); } } // @todo pass details per source back in the cost report. return new ScanCostReport(rangeCount, cost); } /** * Return the estimated cost of a key-range scan on a remote view of a * scale-out index. * * @param ndx * The scale-out index. * * @return * * @todo Remote scans can be parallelized. If flags includes PARALLEL then * the cost can be as little as the cost of scanning one shard. * However, the {@link IClientIndex} has a configuration value which * specifies the maximum parallelism of any given operation (this is * self-reported if we cast to the implementation class). Further, * even if we assume that the shards are evenly distributed over the * nodes, when the #of shards is significantly larger than the #of * nodes then the scan can interfere with itself. Finally, this should * include an estimate of the RMI overhead. */ private ScanCostReport estimateCost(final IScaleOutClientIndex ndx) { final String name = ndx.getIndexMetadata().getName(); final AbstractClient<?> client = ndx.getFederation().getClient(); // maximum parallelization by the client : @todo not used yet. final int maxParallel = client.getMaxParallelTasksPerRequest(); // the metadata index for that scale-out index. final IMetadataIndex mdi = ndx.getFederation().getMetadataIndex(name, timestamp); if (mdi == null) throw new NoSuchIndexException("name=" + name + "@" + TimestampUtility.toString(timestamp)); // #of index partitions to be scanned. final long partitionCount = mdi.rangeCount(fromKey, toKey); if (partitionCount == 0) { /* * SWAG in case zero partition count is reported (I am not sure that * this code path is possible). * * @todo This is proven possible. Now figure out why. Maybe this is * fromKey==toKey, in which case we can optimize that out. */ return new ScanCostReport(0L/* rangeCount */, partitionCount, 100/* millis */); // /* // * Should never be "zero" partition count. // */ // throw new AssertionError(); } // fast range count (may be cached by the access path). final long rangeCount = rangeCount(false/* exact */); if (partitionCount == 1) { /* * Delegate the operation to the remote shard. */ return (ScanCostReport) ndx.submit( fromKey == null ? BytesUtil.EMPTY : fromKey, new EstimateShardScanCost(rangeCount, fromKey, toKey)); } /* * Assume a statistical model. Each partition is comprised of 1 journal * with 50k tuples plus two index segments of 100M each. */ // one journal per shard. final int njournals = 1; // two segments per shard. final int nsegments = 2; final long rangeCountOnJournal = rangeCount / (partitionCount * (njournals + nsegments)); final double costPerJournal = new BTreeCostModel(diskCostModel) .rangeScan(rangeCountOnJournal, // mdi.getIndexMetadata().getBranchingFactor(), // 5,// height (SWAG) 70// leafUtilization (percent, SWAG). ); final double costPerSegment = diskCostModel.seekTime + Bytes.megabyte * 100; final double costPerShard = costPerJournal + 2 * costPerSegment; // @todo ignores potential parallelism. final double cost = costPerShard * partitionCount; return new ScanCostReport(rangeCount, partitionCount, cost); } /** * Procedure to estimate the cost of an index range scan on a remote shard. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> */ private static final class EstimateShardScanCost implements ISimpleIndexProcedure<ScanCostReport> { private static final long serialVersionUID = 1L; private final long rangeCount; private final byte[] fromKey; private final byte[] toKey; public EstimateShardScanCost(final long rangeCount, final byte[] fromKey, final byte[] toKey) { this.rangeCount = rangeCount; this.fromKey = fromKey; this.toKey = toKey; } @Override public ScanCostReport apply(final IIndex ndx) { final ScanCostReport scanCostReport = AccessPath.estimateCost( ((ILocalBTreeView) ndx), rangeCount, fromKey, toKey); return scanCostReport; } @Override public boolean isReadOnly() { return true; } } /* * Cost models. */ /** * The cost model associated with the disk on which the indices are stored. * For a {@link Journal}, this is just the cost model of the backing disk. * For the federation, this should be an average cost model. * * @todo This is not parameterized. A simple cost model is always assumed. * The correct cost model is necessary in order to get the tradeoff * point right for SCAN+FILTER versus SUBQUERY on SSD or RAID arrays * with lots of spindles versus normal disk. * * @todo In a shared disk deployment, we might introduce one cost model for * local SSD used to cache journals, one for local non-SSD disks used * to cache index segments, and one for remote storage used to * materialize historical journals and index segments for query. * * @todo In a federation, this should be reported out as metadata for the * federation. Perhaps as a Jini attribute. Or we could self-publish * this using a System property whose value was either the name of the * desired cost model enum or a representation of the cost model which * we could then parse. */ private static final DiskCostModel diskCostModel = DiskCostModel.DEFAULT; // /** // * Dumps the locators for an index of a relation. // * // * @param fed // * @param namespace // * The relation namespace. // * @param timestamp // * The timestamp of the view. // * @param keyOrder // * The index. // */ // private static void dumpMDI(AbstractScaleOutFederation<?> fed, // final String namespace, final long timestamp, // final IKeyOrder<?> keyOrder) { // // final String name = namespace + "." + keyOrder.getIndexName(); // // final Iterator<PartitionLocator> itr = fed // .locatorScan(name, timestamp, new byte[] {}/* fromKey */, // null/* toKey */, false/* reverseScan */); // // System.err.println("name=" + name + " @ " // + TimestampUtility.toString(timestamp)); // while (itr.hasNext()) { // System.err.println(itr.next()); // } // // } }