/* Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Mar 31, 2009 */ package com.bigdata.service.ndx; import java.io.IOException; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.FutureTask; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import org.apache.log4j.Level; import org.apache.log4j.Logger; import com.bigdata.btree.AsynchronousIndexWriteConfiguration; import com.bigdata.btree.ICounter; import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.ITupleCursor; import com.bigdata.btree.ITupleIterator; import com.bigdata.btree.ITupleSerializer; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.ResultSet; import com.bigdata.btree.keys.KVO; import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBitBuffer; import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBuffer; import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedureConstructor; import com.bigdata.btree.proc.BatchContains.BatchContainsConstructor; import com.bigdata.btree.proc.BatchInsert.BatchInsertConstructor; import com.bigdata.btree.proc.BatchLookup.BatchLookupConstructor; import com.bigdata.btree.proc.BatchPutIfAbsent.BatchPutIfAbsentConstructor; import com.bigdata.btree.proc.BatchRemove.BatchRemoveConstructor; import com.bigdata.btree.proc.IIndexProcedure; import com.bigdata.btree.proc.IKeyArrayIndexProcedure; import com.bigdata.btree.proc.IKeyRangeIndexProcedure; import com.bigdata.btree.proc.IParallelizableIndexProcedure; import com.bigdata.btree.proc.IResultHandler; import com.bigdata.btree.proc.ISimpleIndexProcedure; import com.bigdata.btree.proc.LongAggregator; import com.bigdata.btree.proc.RangeCountProcedure; import com.bigdata.counters.CounterSet; import com.bigdata.journal.ITx; import com.bigdata.journal.TimestampUtility; import com.bigdata.mdi.IMetadataIndex; import com.bigdata.mdi.IResourceMetadata; import com.bigdata.mdi.MetadataIndex; import com.bigdata.mdi.MetadataIndex.MetadataIndexMetadata; import com.bigdata.mdi.PartitionLocator; import com.bigdata.relation.accesspath.BlockingBuffer; import com.bigdata.resources.StaleLocatorException; import com.bigdata.service.AbstractClient; import com.bigdata.service.AbstractScaleOutFederation; import com.bigdata.service.IBigdataClient; import com.bigdata.service.IBigdataClient.Options; import com.bigdata.service.IBigdataFederation; import com.bigdata.service.IDataService; import com.bigdata.service.IMetadataService; import com.bigdata.service.Split; import com.bigdata.service.ndx.pipeline.IDuplicateRemover; import com.bigdata.service.ndx.pipeline.IndexWriteTask; import com.bigdata.util.BytesUtil; import cutthecrap.utils.striterators.IFilter; /** * Abstract base class for the {@link IScaleOutClientIndex} implementation(s). * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ * * @todo If the index was dropped then that should cause the operation to abort * (only possible for read committed or unisolated operations). * <p> * Likewise, if a transaction is aborted, then then index should refuse * further operations. * * @todo detect data service failure and coordinate cutover to the failover data * services. ideally you can read on a failover data service at any time * but it should not accept write operations unless it is the primary data * service in the failover chain. * <p> * Offer policies for handling index partitions that are unavailable at * the time of the request (continued operation during partial failure). * * @todo We should be able to transparently use either a hash mod N approach to * distributed index partitions or a dynamic approach based on overflow. * This could even be decided on a per-index basis. The different * approaches would be hidden by appropriate implementations of this * class. * <p> * A hash partitioned index will need to enforce optional read-consistent * semantics. This can be done by choosing a recent broadcast commitTime * for the read or by re-issuing queries that come in with a different * commitTime. */ abstract public class AbstractScaleOutClientIndexView implements IScaleOutClientIndex { /** * Note: Invocations of the non-batch API are logged at the WARN level since * they result in an application that can not scale-out efficiently. */ protected static final transient Logger log = Logger .getLogger(AbstractScaleOutClientIndexView.class); /** * True iff the {@link #log} level is WARN or less. */ final protected boolean WARN = log.getEffectiveLevel().toInt() <= Level.WARN .toInt(); /** * Error message used if we were unable to start a new transaction in order * to provide read-consistent semantics for an {@link ITx#READ_COMMITTED} * view or for a read-only operation on an {@link ITx#UNISOLATED} view. */ static protected final transient String ERR_NEW_TX = "Could not start transaction"; /** * Error message used if we were unable to abort a transaction that we * started in order to provide read-consistent semantics for an * {@link ITx#READ_COMMITTED} view or for a read-only operation on an * {@link ITx#UNISOLATED} view. */ static protected final transient String ERR_ABORT_TX = "Could not abort transaction: tx="; protected final AbstractScaleOutFederation fed; public AbstractScaleOutFederation getFederation() { return fed; } /** * The thread pool exposed by {@link IBigdataFederation#getExecutorService()} */ protected ThreadPoolExecutor getThreadPool() { return (ThreadPoolExecutor) fed.getExecutorService(); } /** * The timeout in milliseconds for tasks run on an {@link IDataService}. * * @see Options#CLIENT_TASK_TIMEOUT */ protected final long taskTimeout; /** * */ protected static final String NON_BATCH_API = "Non-batch API"; /** * This may be used to disable the non-batch API, which is quite convenient * for locating code that needs to be re-written to use * {@link IIndexProcedure}s. */ protected final boolean batchOnly; /** * The default capacity for the {@link #rangeIterator(byte[], byte[])} */ private final int capacity; /** * The timestamp from the ctor. */ protected final long timestamp; @Override final public long getTimestamp() { return timestamp; } /** * The name of the scale-out index (from the ctor). */ protected final String name; @Override final public String getName() { return name; } /** * The {@link IMetadataIndex} for this scale-out index. * * @todo This is a bit dangerous since most of the time when you want the * metadata index you may have a timestamp in effect which is * different from the timestamp of the view (e.g., a read-consistent * transaction). */ private final IMetadataIndex metadataIndex; /** * The {@link IndexMetadata} for the {@link MetadataIndex} that manages the * scale-out index. The metadata template for the managed scale-out index is * available as a field on this object. */ private final MetadataIndexMetadata metadataIndexMetadata; /** * Obtain the proxy for a metadata service. if this instance fails, then we * can always ask for a new instance for the same federation (failover). */ final protected IMetadataService getMetadataService() { return fed.getMetadataService(); } /** * Return a view of the metadata index for the scale-out index as of the * timestamp associated with this index view. * * @todo This is a bit dangerous since most of the time when you want the * metadata index you may have a timestamp in effect which is * different from the timestamp of the view (e.g., a read-consistent * transaction). * * @see IBigdataFederation#getMetadataIndex(String, long) */ final protected IMetadataIndex getMetadataIndex() { return metadataIndex; } /** * @see IBigdataClient#isReadConsistent() */ final protected boolean readConsistent; @Override public String toString() { final StringBuilder sb = new StringBuilder(); sb.append(getClass().getSimpleName()); sb.append("{ name=" + name); sb.append(", timestamp=" + timestamp); sb.append(", readConsistent=" + readConsistent); sb.append("}"); return sb.toString(); } /** * Create a view on a scale-out index. * * @param fed * The federation containing the index. * @param name * The index name. * @param timestamp * A transaction identifier, {@link ITx#UNISOLATED} for the * unisolated index view, {@link ITx#READ_COMMITTED}, or * <code>timestamp</code> for a historical view no later than * the specified timestamp. * @param metadataIndex * The {@link IMetadataIndex} for the named scale-out index as of * that timestamp. Note that the {@link IndexMetadata} on this * object contains the template {@link IndexMetadata} for the * scale-out index partitions. */ public AbstractScaleOutClientIndexView(final AbstractScaleOutFederation fed, final String name, final long timestamp, final IMetadataIndex metadataIndex) { if (fed == null) throw new IllegalArgumentException(); if (name == null) throw new IllegalArgumentException(); if (metadataIndex == null) throw new IllegalArgumentException(); this.fed = fed; this.name = name; this.timestamp = timestamp; this.metadataIndex = metadataIndex; this.metadataIndexMetadata = metadataIndex.getIndexMetadata(); final AbstractClient<?> client = fed.getClient(); this.capacity = client.getDefaultRangeQueryCapacity(); this.batchOnly = client.getBatchApiOnly(); this.taskTimeout = client.getTaskTimeout(); this.readConsistent = client.isReadConsistent(); } /** * Metadata for the {@link MetadataIndex} that manages the scale-out index * (cached). */ public MetadataIndexMetadata getMetadataIndexMetadata() { return metadataIndexMetadata; } /** * The metadata for the managed scale-out index. Among other things, this * gets used to determine how we serialize keys and values for * {@link IKeyArrayIndexProcedure}s when we serialize a procedure to be * sent to a remote {@link IDataService}. */ @Override public IndexMetadata getIndexMetadata() { return metadataIndexMetadata.getManagedIndexMetadata(); } @Override public IDataService getDataService(final PartitionLocator pmd) { return fed.getDataService(pmd.getDataServiceUUID()); } @Override @SuppressWarnings("unchecked") public Iterator<PartitionLocator> locatorScan(final long ts, final byte[] fromKey, final byte[] toKey, final boolean reverseScan) { return fed.locatorScan(name, ts, fromKey, toKey, reverseScan); } /** * This operation is not supported - the resource description of a scale-out * index would include all "live" resources in the corresponding * {@link MetadataIndex}. */ @Override public IResourceMetadata[] getResourceMetadata() { throw new UnsupportedOperationException(); } @Override public ICounter getCounter() { throw new UnsupportedOperationException(); } private volatile ITupleSerializer tupleSer = null; protected ITupleSerializer getTupleSerializer() { if (tupleSer == null) { synchronized (this) { if (tupleSer == null) { tupleSer = getIndexMetadata().getTupleSerializer(); } } } return tupleSer; } @Override public boolean contains(Object key) { key = getTupleSerializer().serializeKey(key); return contains((byte[])key); } @Override public boolean contains(final byte[] key) { if (batchOnly) log.error(NON_BATCH_API,new RuntimeException()); else if(WARN) log.warn(NON_BATCH_API); final byte[][] keys = new byte[][] { key }; final IResultHandler resultHandler = new IdentityHandler(); submit(0/* fromIndex */, 1/* toIndex */, keys, null/* vals */, BatchContainsConstructor.INSTANCE, resultHandler); return ((ResultBitBuffer) resultHandler.getResult()).getResult()[0]; } @Override public Object insert(Object key,Object val) { final ITupleSerializer tupleSer = getTupleSerializer(); key = tupleSer.serializeKey(key); val = tupleSer.serializeKey(val); final byte[] oldval = insert((byte[])key, (byte[])val); // FIXME decode tuple to old value. throw new UnsupportedOperationException(); } @Override public byte[] insert(final byte[] key, final byte[] value) { if (batchOnly) log.error(NON_BATCH_API,new RuntimeException()); else if(WARN) log.warn(NON_BATCH_API); final byte[][] keys = new byte[][] { key }; final byte[][] vals = new byte[][] { value }; final IResultHandler resultHandler = new IdentityHandler(); submit(0/* fromIndex */, 1/* toIndex */, keys, vals, BatchInsertConstructor.RETURN_OLD_VALUES, resultHandler); return ((ResultBuffer) resultHandler.getResult()).getResult(0); } @Override public byte[] putIfAbsent(final byte[] key, final byte[] value) { if (batchOnly) log.error(NON_BATCH_API,new RuntimeException()); else if(WARN) log.warn(NON_BATCH_API); final byte[][] keys = new byte[][] { key }; final byte[][] vals = new byte[][] { value }; final IResultHandler resultHandler = new IdentityHandler(); submit(0/* fromIndex */, 1/* toIndex */, keys, vals, BatchPutIfAbsentConstructor.RETURN_OLD_VALUES, resultHandler); return ((ResultBuffer) resultHandler.getResult()).getResult(0); } @Override public Object lookup(Object key) { key = getTupleSerializer().serializeKey(key); final byte[] val = lookup((byte[])key); // FIXME decode tuple to old value. throw new UnsupportedOperationException(); } @Override public byte[] lookup(final byte[] key) { if (batchOnly) log.error(NON_BATCH_API,new RuntimeException()); else if(WARN) log.warn(NON_BATCH_API); final byte[][] keys = new byte[][]{key}; final IResultHandler resultHandler = new IdentityHandler(); submit(0/* fromIndex */, 1/* toIndex */, keys, null/* vals */, BatchLookupConstructor.INSTANCE, resultHandler); return ((ResultBuffer) resultHandler.getResult()).getResult(0); } @Override public Object remove(Object key) { key = getTupleSerializer().serializeKey(key); final byte[] oldval = remove((byte[])key); // FIXME decode tuple to old value. throw new UnsupportedOperationException(); } @Override public byte[] remove(final byte[] key) { if (batchOnly) log.error(NON_BATCH_API,new RuntimeException()); else if(WARN) log.warn(NON_BATCH_API); final byte[][] keys = new byte[][]{key}; final IResultHandler resultHandler = new IdentityHandler(); submit(0/* fromIndex */, 1/* toIndex */, keys, null/* vals */, BatchRemoveConstructor.RETURN_OLD_VALUES, resultHandler); return ((ResultBuffer) resultHandler.getResult()).getValues().get(0); } /* * All of these methods need to divide up the operation across index * partitions. */ @Override public long rangeCount() { return rangeCount(null, null); } /** * Returns the sum of the range count for each index partition spanned by * the key range. */ @Override public long rangeCount(final byte[] fromKey, final byte[] toKey) { final LongAggregator handler = new LongAggregator(); final RangeCountProcedure proc = new RangeCountProcedure( false/* exact */, false/* deleted */, fromKey, toKey); submit(fromKey, toKey, proc, handler); return handler.getResult(); } /** * The exact range count is obtained by mapping a key-range scan over the * index partitions. The operation is parallelized. */ @Override final public long rangeCountExact(final byte[] fromKey, final byte[] toKey) { final LongAggregator handler = new LongAggregator(); final RangeCountProcedure proc = new RangeCountProcedure( true/* exact */, false/*deleted*/, fromKey, toKey); submit(fromKey, toKey, proc, handler); return handler.getResult(); } /** * The exact range count of deleted and undeleted tuples is obtained by * mapping a key-range scan over the index partitions. The operation is * parallelized. */ @Override final public long rangeCountExactWithDeleted(final byte[] fromKey, final byte[] toKey) { final LongAggregator handler = new LongAggregator(); final RangeCountProcedure proc = new RangeCountProcedure( true/* exact */, true/* deleted */, fromKey, toKey); submit(fromKey, toKey, proc, handler); return handler.getResult(); } @Override final public ITupleIterator rangeIterator() { return rangeIterator(null, null); } /** * An {@link ITupleIterator} that kinds the use of a series of * {@link ResultSet}s to cover all index partitions spanned by the key * range. */ @Override public ITupleIterator rangeIterator(final byte[] fromKey, final byte[] toKey) { return rangeIterator(fromKey, toKey, capacity, IRangeQuery.DEFAULT /* flags */, null/* filter */); } /** * Identifies the index partition(s) that are spanned by the key range query * and maps an iterator across each index partition. The iterator buffers * responses up to the specified capacity and a follow up iterator request * is automatically issued if the iterator has not exhausted the key range * on a given index partition. Once the iterator is exhausted on a given * index partition it is then applied to the next index partition spanned by * the key range. * * @todo If the return iterator implements {@link ITupleCursor} then this * will need be modified to defer request of the initial result set * until the caller uses first(), last(), seek(), hasNext(), or * hasPrior(). */ @Override public ITupleIterator rangeIterator(final byte[] fromKey, final byte[] toKey, int capacity, final int flags, final IFilter filter) { if (capacity == 0) { capacity = this.capacity; } /* * Does the iterator declare that it will not write back on the index? */ final boolean readOnly = ((flags & READONLY) != 0); if (readOnly && ((flags & REMOVEALL) != 0)) { throw new IllegalArgumentException(); } final boolean isReadConsistentTx; final long ts; if ((timestamp == ITx.UNISOLATED && readOnly) || (timestamp == ITx.READ_COMMITTED && readConsistent)) { try { // run as globally consistent read. ts = fed.getTransactionService().newTx(ITx.READ_COMMITTED); } catch (IOException ex) { throw new RuntimeException(ERR_NEW_TX, ex); } isReadConsistentTx = true; } else { ts = timestamp; isReadConsistentTx = false; } return new PartitionedTupleIterator(this, ts, isReadConsistentTx, fromKey, toKey, capacity, flags, filter); } /** * Utility method to split a set of ordered keys into partitions based the * index partitions defined for a scale-out index. * <p> * Find the partition for the first key. Check the last key, if it is in the * same partition then then this is the simplest case and we can just send * the data along. * <p> * Otherwise, perform a binary search on the remaining keys looking for the * index of the first key GTE the right separator key for that partition. * The batch for this partition is formed from all keys from the first key * for that partition up to but excluding the index position identified by * the binary search (if there is a match; if there is a miss, then the * binary search result needs to be converted into a key index and that will * be the last key for the current partition). * <p> * Examine the next key and repeat the process until all keys have been * allocated to index partitions. * <p> * Note: Split points MUST respect the "row" identity for a sparse row * store, but we get that constraint by maintaining the index partition * boundaries in agreement with the split point constraints for the index. * * @param ts * The timestamp for the {@link IMetadataIndex} view that will be * applied to choose the {@link Split}s. * @param fromIndex * The index of the first key in <i>keys</i> to be processed * (inclusive). * @param toIndex * The index of the last key in <i>keys</i> to be processed. * @param keys * An array of keys. Each key is an interpreted as an unsigned * byte[]. All keys must be non-null. The keys must be in sorted * order. * * @return The {@link Split}s that you can use to form requests based on * the identified first/last key and partition identified by this * process. * * @see Arrays#sort(Object[], int, int, java.util.Comparator) * * @see BytesUtil#compareBytes(byte[], byte[]) * * @todo Caching? This procedure performs the minimum #of lookups using * {@link IMetadataIndex#find(byte[])} since that operation will be an * RMI in a distributed federation. The find(byte[] key) operation is * difficult to cache since it locates the index partition that would * span the key and many, many different keys could fit into that same * index partition. The only effective cache technique may be an LRU * that scans ~10 caches locators to see if any of them is a match * before reaching out to the remote {@link IMetadataService}. Or * perhaps the locators can be cached in a local BTree and a miss * there would result in a read through to the remote * {@link IMetadataService} but then we have the problem of figuring * out when to release locators if the client is long-lived. */ @Override public LinkedList<Split> splitKeys(final long ts, final int fromIndex, final int toIndex, final byte[][] keys) { assert keys != null; assert fromIndex >= 0; assert fromIndex < toIndex; assert toIndex <= keys.length; final LinkedList<Split> splits = new LinkedList<Split>(); // start w/ the first key. int currentIndex = fromIndex; while (currentIndex < toIndex) { /* * This is partition spanning the current key (RMI) * * Note: Using the caller's timestamp here! */ final PartitionLocator locator = fed.getMetadataIndex(name, ts) .find(keys[currentIndex]); if (locator == null) throw new RuntimeException("No index partitions?: name=" + name); final byte[] rightSeparatorKey = locator.getRightSeparatorKey(); if (rightSeparatorKey == null) { /* * The last index partition does not have an upper bound and * will absorb any keys that order GTE to its left separator * key. */ assert isValidSplit( locator, currentIndex, toIndex, keys ); splits.add(new Split(locator, currentIndex, toIndex)); // done. currentIndex = toIndex; } else { /* * Otherwise this partition has an upper bound, so figure out * the index of the last key that would go into this partition. * * We do this by searching for the rightSeparator of the index * partition itself. */ int pos = BytesUtil.binarySearch(keys, currentIndex, toIndex - currentIndex, rightSeparatorKey); if (pos >= 0) { /* * There is a hit on the rightSeparator key. The index * returned by the binarySearch is the exclusive upper bound * for the split. The key at that index is excluded from the * split - it will be the first key in the next split. * * Note: There is a special case when the keys[] includes * duplicates of the key that corresponds to the * rightSeparator. This causes a problem where the * binarySearch returns the index of ONE of the keys that is * equal to the rightSeparator key and we need to back up * until we have found the FIRST ONE. * * Note: The behavior of the binarySearch is effectively * under-defined here and sometimes it will return the index * of the first key EQ to the rightSeparator while at other * times it will return the index of the second or greater * key that is EQ to the rightSeparatoer. */ while (pos > currentIndex) { if (BytesUtil.bytesEqual(keys[pos - 1], rightSeparatorKey)) { // keep backing up. pos--; continue; } break; } if (log.isDebugEnabled()) log.debug("Exact match on rightSeparator: pos=" + pos + ", key=" + BytesUtil.toString(keys[pos])); } else if (pos < 0) { /* * There is a miss on the rightSeparator key (it is not * present in the keys that are being split). In this case * the binary search returns the insertion point. We then * compute the exclusive upper bound from the insertion * point. */ pos = -pos - 1; assert pos > currentIndex && pos <= toIndex : "Expected pos in [" + currentIndex + ":" + toIndex + ") but pos=" + pos; } /* * Note: this test can be enabled if you are having problems * with KeyAfterPartition or KeyBeforePartition. It will go * through more effort to validate the constraints on the split. * However, due to the additional byte[] comparisons, this * SHOULD be disabled except when tracking a bug. */ // assert validSplit( locator, currentIndex, pos, keys ); splits.add(new Split(locator, currentIndex, pos)); currentIndex = pos; } } return splits; } @Override public LinkedList<Split> splitKeys(final long ts, final int fromIndex, final int toIndex, final KVO[] a) { /* * Change the shape of the data so that we can split it. */ final byte[][] keys = new byte[a.length][]; for (int i = 0; i < a.length; i++) { keys[i] = a[i].key; } return splitKeys(ts, fromIndex, toIndex, keys); } /** * Paranoia testing for generated splits. * * @param locator * @param fromIndex * @param toIndex * @param keys * @return */ private boolean isValidSplit(final PartitionLocator locator, final int fromIndex, final int toIndex, final byte[][] keys) { assert fromIndex <= toIndex : "fromIndex=" + fromIndex + ", toIndex=" + toIndex; assert fromIndex >= 0 : "fromIndex=" + fromIndex; assert toIndex <= keys.length : "toIndex=" + toIndex + ", keys.length=" + keys.length; // begin with the left separator on the index partition. byte[] lastKey = locator.getLeftSeparatorKey(); assert lastKey != null; for (int i = fromIndex; i < toIndex; i++) { final byte[] key = keys[i]; assert key != null; if (lastKey != null) { final int ret = BytesUtil.compareBytes(lastKey, key); assert ret <= 0 : "keys out of order: i=" + i + ", lastKey=" + BytesUtil.toString(lastKey) + ", key=" + BytesUtil.toString(key)+", keys="+BytesUtil.toString(keys); } lastKey = key; } // Note: Must be strictly LT the rightSeparator key (when present). { final byte[] key = locator.getRightSeparatorKey(); if (key != null) { int ret = BytesUtil.compareBytes(lastKey, key); assert ret < 0 : "keys out of order: lastKey=" + BytesUtil.toString(lastKey) + ", rightSeparator=" + BytesUtil.toString(key)+", keys="+BytesUtil.toString(keys); } } return true; } @Override public void staleLocator(final long ts, final PartitionLocator locator, final StaleLocatorException cause) { if (locator == null) throw new IllegalArgumentException(); if (ts != ITx.UNISOLATED && ts != ITx.READ_COMMITTED) { /* * Stale locator exceptions should not be thrown for these views. */ throw new RuntimeException( "Stale locator, but views should be consistent? timestamp=" + TimestampUtility.toString(ts)); } // notify the metadata index view that it has a stale locator. fed.getMetadataIndex(name, timestamp).staleLocator(locator); } @Override public Object submit(final byte[] key, final ISimpleIndexProcedure proc) { if (readConsistent && proc.isReadOnly() && TimestampUtility.isReadCommittedOrUnisolated(getTimestamp())) { /* * Use globally consistent reads for the mapped procedure. */ final long tx; try { tx = fed.getTransactionService().newTx(ITx.READ_COMMITTED); } catch (IOException ex) { throw new RuntimeException(ERR_NEW_TX, ex); } try { return submit(tx, key, proc); } finally { try { fed.getTransactionService().abort(tx); } catch (IOException ex) { // log error and ignore since the operation is complete. log.error(ERR_ABORT_TX + tx, ex); } } } else { /* * Timestamp is either a tx already or the caller is risking errors * with lightweight historical reads. */ return submit(timestamp, key, proc); } } /** * Maps an {@link IIndexProcedure} across a key range by breaking it down * into one task per index partition spanned by that key range. * <p> * Note: In order to avoid growing the task execution queue without bound, * an upper bound of {@link Options#CLIENT_MAX_PARALLEL_TASKS_PER_REQUEST} * tasks will be placed onto the queue at a time. More tasks will be * submitted once those tasks finish until all tasks have been executed. * When the task is not parallelizable the tasks will be submitted to the * corresponding index partitions at a time and in key order. */ @Override public void submit(final byte[] fromKey, final byte[] toKey, final IKeyRangeIndexProcedure proc, final IResultHandler resultHandler) { if (proc == null) throw new IllegalArgumentException(); if (readConsistent && proc.isReadOnly() && TimestampUtility.isReadCommittedOrUnisolated(getTimestamp())) { /* * Use globally consistent reads for the mapped procedure. */ final long tx; try { tx = fed.getTransactionService().newTx(ITx.READ_COMMITTED); } catch (IOException ex) { throw new RuntimeException(ERR_NEW_TX, ex); } try { submit(tx, fromKey, toKey, proc, resultHandler); } finally { try { fed.getTransactionService().abort(tx); } catch (IOException ex) { // log error and ignore since the operation is complete. log.error(ERR_ABORT_TX + tx, ex); } } } else { /* * Timestamp is either a tx already or the caller is risking errors * with lightweight historical reads. */ submit(timestamp, fromKey, toKey, proc, resultHandler); } } /** * The procedure will be transparently broken down and executed against each * index partitions spanned by its keys. If the <i>ctor</i> creates * instances of {@link IParallelizableIndexProcedure} then the procedure * will be mapped in parallel against the relevant index partitions. * <p> * Note: Unlike mapping an index procedure across a key range, this method * is unable to introduce a truely enourmous burden on the client's task * queue since the #of tasks arising is equal to the #of splits and bounded * by <code>n := toIndex - fromIndex</code>. * * @return The aggregated result of applying the procedure to the relevant * index partitions. */ @Override public void submit(final int fromIndex, final int toIndex, final byte[][] keys, final byte[][] vals, final AbstractKeyArrayIndexProcedureConstructor ctor, final IResultHandler aggregator) { if (ctor == null) { throw new IllegalArgumentException(); } // iff we created a read-historical tx in this method. final boolean isTx; // the timestamp that will be used for the operation. final long ts; { /* * Instantiate the procedure on all the data so we can figure out if * it is read-only and whether or not we need to create a read-only * transaction to run it. * * @todo This assumes that people write procedures that are * flyweight in how they encode the data in their ctor. If the don't * then there will be an overhead for this. */ final IKeyArrayIndexProcedure proc = ctor.newInstance(this, fromIndex, toIndex, keys, vals); if (readConsistent && proc.isReadOnly() && TimestampUtility .isReadCommittedOrUnisolated(getTimestamp())) { /* * Create a read-historical transaction from the last commit * point of the federation in order to provide consistent * reads for the mapped procedure. */ isTx = true; try { ts = fed.getTransactionService().newTx(ITx.READ_COMMITTED); } catch (IOException e) { throw new RuntimeException(ERR_NEW_TX, e); } } else { // might be a tx, but not one that we created here. isTx = false; ts = getTimestamp(); } } try { submit(ts, fromIndex, toIndex, keys, vals, ctor, aggregator); } finally { if (isTx) { try { fed.getTransactionService().abort(ts); } catch (IOException e) { /* * log error but do not rethrow since operation is over * anyway. */ log.error(ERR_ABORT_TX + ": " + ts, e); } } } } /** * Variant uses the caller's timestamp. * * @param ts * @param key * @param proc * @return */ abstract protected Object submit(final long ts, final byte[] key, final ISimpleIndexProcedure proc); /** * Variant uses the caller's timestamp. * * @param ts * @param fromKey * @param toKey * @param proc * @param resultHandler */ abstract protected void submit(final long ts, final byte[] fromKey, final byte[] toKey, final IKeyRangeIndexProcedure proc, final IResultHandler resultHandler); /** * Variant uses the caller's timestamp. * * @param ts * @param fromIndex * @param toIndex * @param keys * @param vals * @param ctor * @param aggregator */ abstract protected void submit(final long ts, final int fromIndex, final int toIndex, final byte[][] keys, final byte[][] vals, final AbstractKeyArrayIndexProcedureConstructor ctor, final IResultHandler aggregator); public <T extends IKeyArrayIndexProcedure, O, R, A> BlockingBuffer<KVO<O>[]> newWriteBuffer( final IResultHandler<R, A> resultHandler, final IDuplicateRemover<O> duplicateRemover, final AbstractKeyArrayIndexProcedureConstructor<T> ctor) { final AsynchronousIndexWriteConfiguration conf = getIndexMetadata() .getAsynchronousIndexWriteConfiguration(); final BlockingBuffer<KVO<O>[]> writeBuffer = new BlockingBuffer<KVO<O>[]>( // @todo array vs linked w/ capacity and fair vs unfair. new ArrayBlockingQueue<KVO<O>[]>(conf.getMasterQueueCapacity()), conf.getMasterChunkSize(),// conf.getMasterChunkTimeoutNanos(),// TimeUnit.NANOSECONDS,// true// ordered ); final IndexWriteTask.M<T, O, R, A> task = new IndexWriteTask.M<T, O, R, A>( this, // conf.getSinkIdleTimeoutNanos(),// conf.getSinkPollTimeoutNanos(),// conf.getSinkQueueCapacity(), // conf.getSinkChunkSize(), // conf.getSinkChunkTimeoutNanos(),// duplicateRemover,// ctor,// resultHandler,// fed.getIndexCounters(name).asynchronousStats, writeBuffer// ); /** * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/707"> * BlockingBuffer.close() does not unblock threads </a> */ // Wrap computation as FutureTask. @SuppressWarnings({ "unchecked", "rawtypes" }) final FutureTask<?> ft = new FutureTask(task); // Set Future on BlockingBuffer writeBuffer.setFuture(ft); // Submit computation for evaluation. fed.getExecutorService().submit(ft); return task.getBuffer(); } /** * Return a new {@link CounterSet} backed by the {@link ScaleOutIndexCounters} * for this scale-out index. */ @Override public CounterSet getCounters() { return getFederation().getIndexCounters(name).getCounters(); } }