/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.bigdata.service; import java.io.IOException; import java.rmi.RemoteException; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.Future; import java.util.concurrent.RejectedExecutionException; import com.bigdata.bfs.BigdataFileSystem; import com.bigdata.bop.engine.IQueryPeer; import com.bigdata.btree.IIndex; import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.ResultSet; import com.bigdata.btree.proc.IIndexProcedure; import com.bigdata.journal.IConcurrencyManager; import com.bigdata.journal.ITx; import com.bigdata.journal.NoSuchIndexException; import com.bigdata.journal.Options; import com.bigdata.mdi.IResourceMetadata; import com.bigdata.mdi.LocalPartitionMetadata; import com.bigdata.mdi.PartitionLocator; import com.bigdata.rawstore.IBlock; import com.bigdata.rawstore.IRawStore; import com.bigdata.resources.StaleLocatorException; import com.bigdata.resources.StoreManager; import com.bigdata.service.ndx.ClientIndexView; import com.bigdata.service.ndx.DataServiceTupleIterator; import com.bigdata.sparse.SparseRowStore; import cutthecrap.utils.striterators.IFilter; /** * <p> * The data service interface provides remote access to named indices, provides * for both unisolated and isolated operations on those indices, and exposes the * {@link ITxCommitProtocol} interface to the {@link ITransactionManagerService} * service for the coordination of distributed transactions. Clients normally * write to the {@link IIndex} interface. The {@link ClientIndexView} provides * an implementation of that interface supporting range partitioned scale-out * indices which transparently handles lookup of data services in the metadata * index and mapping of operations across the appropriate data services. * </p> * <p> * Indices are identified by name. Scale-out indices are broken into index * partitions, each of which is a named index hosted on a data service. The name * of an index partition is given by * {@link DataService#getIndexPartitionName(String, int)}. Clients are * <em>strongly</em> encouraged to use the {@link ClientIndexView} which * encapsulates lookup and distribution of operations on range partitioned * scale-out indices. * </p> * <p> * The data service exposes both fully isolated read-write transactions, * read-only transactions, lightweight read-historical operations, and * unisolated operations on named indices. These choices are captured by the * timestamp associated with the operation. When it is a transaction, this is * also known as the transaction identifier or <i>tx</i>. The following * distinctions are available: * <dl> * * <dt>Unisolated</dt> * * <dd> * <p> * Unisolated operation specify {@link ITx#UNISOLATED} as their transaction * identifier. Unisolated operations are ACID, but their scope is limited to the * commit group on the data service where the operation is executed. Unisolated * operations correspond more or less to read-committed semantics except that * writes are immediately visible to other operations in the same commit group. * </p> * <p> * Unisolated operations that allow writes obtain an exclusive lock on the live * version of the named index for the duration of the operation. Unisolated * operations that are declared as read-only read from the last committed state * of the named index and therefore do not compete with read-write unisolated * operations. This allows unisolated read operations to achieve higher * concurrency. The effect is as if the unisolated read operation runs before * the unisolated writes in a given commit group since the impact of those * writes are not visible to unisolated readers until the next commit point. * </p> * <p> * Unisolated write operations MAY be used to achieve "auto-commit" semantics * when distributed transactions are not required. Fully isolated transactions * are useful when multiple operations must be composed into a ACID unit. * </p> * <p> * While unisolated operations on a single data service are ACID, clients * generally operate against scale-out indices having multiple index partitions * hosted on multiple data services. Therefore client MUST NOT assume that an * unisolated operation described by the client against a scale-out index will * be ACID when that operation is distributed across the various index * partitions relevant to the client's request. In practice, this means that * contract for ACID unisolated operations is limited to either: (a) operations * where the data is located on a single data service instance; or (b) * unisolated operations that are inherently designed to achieve a * <em>consistent</em> result. Sometimes it is sufficient to configure a * scale-out index such that index partitions never split some logical unit - * for example, the {schema + primaryKey} for a {@link SparseRowStore}, thereby * obtaining an ACID guarentee since operations on a logical row will always * occur within the same index partition. * </p> * </dd> * * <dt>Light weight historical reads</dt> * * <dd>Historical reads are indicated using <code>tx</code>, where <i>tx</i> * is a timestamp and is associated with the closest commit point LTE to the * timestamp. A historical read is fully isolated but has very low overhead and * does NOT require the caller to open the transaction. The read will have a * consistent view of the data as of the most recent commit point not greater * than <i>tx</i>. Unlike a distributed read-only transaction, a historical * read does NOT impose a distributed read lock. While the operation will have * access to the necessary resources on the local data service, it is possible * that resources for the same timestamp will be concurrently released on other * data services. If you need to map a read operation across the distributed * database, the you must use a read only transaction which will assert the * necessary read-lock.</dd> * * <dt>Distributed transactions</dt> * * <dd>Distributed transactions are coordinated using an * {@link ITransactionManagerService} service and incur more overhead than both * unisolated and historical read operations. Transactions are assigned a start * time (the transaction identifier) when they begin and must be explicitly * closed by either an abort or a commit. Both read-only and read-write * transactions assert read locks which force the retention of resources * required for a consistent view as of the transaction start time until the * transaction is closed.</dd> * </dl> * </p> * <p> * Implementations of this interface MUST be thread-safe. Methods declared by * this interface MUST block for each operation. Client operations SHOULD be * buffered by a thread pool with a FIFO policy so that client requests may be * decoupled from data service operations and clients may achieve greater * parallelism. * </p> * * <h2>Index Partitions: Split, Join, and Move</h2> * * <p> * * Scale-out indices are broken tranparently down into index partitions. When a * scale-out index is initially registered, one or more index partitions are * created and registered on one or more data services. * </p> * * <p> * * Note that each index partitions is just an {@link IIndex} registered under * the name assigned by {@link DataService#getIndexPartitionName(String, int)} * and whose {@link IndexMetadata#getPartitionMetadata()} returns a description * of the resources required to compose a view of that index partition from the * resources located on a {@link DataService}. The {@link IDataService} will * respond for that index partition IFF there is an index under that name * registered on the {@link IDataService} as of the <i>timestamp</i> associated * with the request. If the index is not registered then a * {@link NoSuchIndexException} will be thrown. If the index was registered and * has since been split, joined or moved then a {@link StaleLocatorException} * will be thrown (this will occur only for index partitions of scale-out * indices). <strong>All methods on this and derived interfaces which are * defined for an index name and timestamp MUST conform to these semantics.</strong> * * </p> * * <p> * * As index partitions grow in size they may be <em>split</em> into 2 or more * index partitions covering the same key range as the original index partition. * When this happens a new index partition identifier is assigned by the * metadata service to each of the new index partitions and the old index * partition is retired in an atomic operation. A similar operation can * <em>move</em> an index partition to a different {@link IDataService} in * order to load balance a federation. Finally, when two index partitions shrink * in size, they maybe moved to the same {@link IDataService} and an atomic * <i>join</i> operation may re-combine them into a single index partition * spanning the same key range. * * </p> * * <p> * * Split, join, and move operations all result in the old index partition being * dropped on the {@link IDataService}. Clients having a stale * {@link PartitionLocator} record will attempt to reach the now defunct index * partition after it has been dropped and will receive a * {@link StaleLocatorException}. * * </p> * * * <h2>{@link StaleLocatorException}</h2> * * <p> * * {@link IDataService} clients MUST handle this exception by refreshing their * cached {@link PartitionLocator} for the key range associated with the index * partition which they wish to query and then re-issuing their request. By * following this simple rule the client will automatically handle index * partition splits, joins, and moves without error and in a manner which is * completely transparent to the application. Note that splits, joins, and moves * DO NOT alter the {@link PartitionLocator} for historical reads, only for * ongoing writes. This exception is generally (but not always) wrapped. * Applications typically DO NOT write directly to the {@link IDataService} * interface and therefore DO NOT need to worry about this. See * {@link ClientIndexView}, which automatically handles this exception. * * </p> * * <h2>{@link IOException}</h2> * * <p> * * All methods on this and derived interfaces can throw an {@link IOException}. * In all cases an <em>unwrapped</em> exception that is an instance of * {@link IOException} indicates an error in the Remote Method Invocation (RMI) * layer. * * </p> * * <h2>{@link ExecutionException} and {@link InterruptedException}</h2> * * <p> * * An <em>unwrapped</em> {@link ExecutionException} or * {@link InterruptedException} indicates a problem when running the request as * a task in the {@link IConcurrencyManager} on the {@link IDataService}. The * exception always wraps a root cause which may indicate the underlying * problem. Methods which do not declare these exceptions are not run under the * {@link IConcurrencyManager}. * * </p> * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ * * @todo add support for triggers? unisolated triggers must be asynchronous if * they will take actions with high latency (such as writing on a * different index partition, which could be remote). Low latency actions * might include emitting asynchronous messages. transactional triggers * can have more flexibility since they are under less of a latency * constraint. */ public interface IDataService extends ITxCommitProtocol, IService, IRemoteExecutor { /** * Register a named mutable index on the {@link DataService}. * <p> * Note: In order to register an index partition the * {@link IndexMetadata#getPartitionMetadata() partition metadata} property * MUST be set. The {@link LocalPartitionMetadata#getResources() resources} * property will then be overriden when the index is actually registered so * as to reflect the {@link IResourceMetadata} description of the journal on * which the index actually resides. * * @param name * The name that can be used to recover the index. In order to * create a partition of an index you must form the name of the * index partition using * {@link DataService#getIndexPartitionName(String, int)} (this * operation is generally performed by the * {@link IMetadataService} which manages scale-out indices). * * @param metadata * The metadata describing the index. * <p> * The {@link LocalPartitionMetadata#getResources()} property on * the {@link IndexMetadata#getPartitionMetadata()} SHOULD NOT be * set. The correct {@link IResourceMetadata}[] will be assigned * when the index is registered on the {@link IDataService}. * * @return <code>true</code> iff the index was created. <code>false</code> * means that the index was pre-existing, but the metadata specifics * for the index MAY differ from those specified. * * @todo exception if index exists? or modify to validate consistent decl * and exception iff not consistent. right now it just silently * succeeds if the index already exists. */ public void registerIndex(String name, IndexMetadata metadata) throws IOException, InterruptedException, ExecutionException; /** * Return the metadata for the named index. * * @param name * The index name. * @param timestamp * A transaction identifier, {@link ITx#UNISOLATED} for the * unisolated index view, {@link ITx#READ_COMMITTED}, or * <code>timestamp</code> for a historical view no later than * the specified timestamp. * * @return The metadata for the named index. * * @throws IOException */ public IndexMetadata getIndexMetadata(String name, long timestamp) throws IOException, InterruptedException, ExecutionException; /** * Drops the named index. * <p> * Note: In order to drop a partition of an index you must form the name of * the index partition using * {@link DataService#getIndexPartitionName(String, int)} (this operation is * generally performed by the {@link IMetadataService} which manages * scale-out indices). * * @param name * The index name. * * @exception IllegalArgumentException * if <i>name</i> does not identify a registered index. */ public void dropIndex(String name) throws IOException, InterruptedException, ExecutionException; /** * <p> * Streaming traversal of keys and/or values in a key range. * </p> * <p> * Note: In order to visit all keys in a range, clients are expected to * issue repeated calls in which the <i>fromKey</i> is incremented to the * successor of the last key visited until either an empty {@link ResultSet} * is returned or the {@link ResultSet#isLast()} flag is set, indicating * that all keys up to (but not including) the <i>startKey</i> have been * visited. See {@link ClientIndexView} (scale-out indices) and * {@link DataServiceTupleIterator} (unpartitioned indices), both of which * encapsulate this method. * </p> * <p> * Note: If the iterator can be determined to be read-only and it is * submitted as {@link ITx#UNISOLATED} then it will be run as * {@link ITx#READ_COMMITTED} to improve concurrency. * </p> * * @param tx * The transaction identifier -or- {@link ITx#UNISOLATED} IFF the * operation is NOT isolated by a transaction -or- * <code> - tx </code> to read from the most recent commit point * not later than the absolute value of <i>tx</i> (a fully * isolated read-only transaction using a historical start time). * @param name * The index name (required). * @param fromKey * The starting key for the scan (or <code>null</code> iff * there is no lower bound). * @param toKey * The first key that will not be visited (or <code>null</code> * iff there is no upper bound). * @param capacity * When non-zero, this is the maximum #of entries to process. * @param flags * One or more flags formed by bitwise OR of zero or more of the * constants defined by {@link IRangeQuery}. * @param filter * An optional object that may be used to layer additional * semantics onto the iterator. The filter will be constructed on * the server and in the execution context for the iterator, so * it will execute directly against the index for the maximum * efficiency. * * @exception InterruptedException * if the operation was interrupted. * @exception ExecutionException * If the operation caused an error. See * {@link ExecutionException#getCause()} for the underlying * error. */ public ResultSet rangeIterator(long tx, String name, byte[] fromKey, byte[] toKey, int capacity, int flags, IFilter filter) throws InterruptedException, ExecutionException, IOException; /** * <p> * Submit a procedure. * </p> * <p> * Unisolated operations SHOULD be used to achieve "auto-commit" semantics. * Fully isolated transactions are useful IFF multiple operations must be * composed into a ACID unit. * </p> * <p> * While unisolated batch operations on a single data service are ACID, * clients are required to locate all index partitions for the logical * operation and distribute their operation across the distinct data service * instances holding the affected index partitions. In practice, this means * that contract for ACID unisolated operations is limited to operations * where the data is located on a single data service instance. For ACID * operations that cross multiple data service instances the client MUST use * a fully isolated transaction. * </p> * * @param tx * The transaction identifier, {@link ITx#UNISOLATED} for an ACID * operation NOT isolated by a transaction, * {@link ITx#READ_COMMITTED} for a read-committed operation not * protected by a transaction (no global read lock), or any valid * commit time for a read-historical operation not protected by a * transaction (no global read lock). * @param name * The name of the index partition. * @param proc * The procedure to be executed. * * @return The {@link Future} from which the outcome of the procedure may be * obtained. * * @throws RejectedExecutionException * if the task can not be accepted for execution. * @throws IOException * if there is an RMI problem. */ public <T> Future<T> submit(long tx, String name, IIndexProcedure<T> proc) throws IOException; /** * {@inheritDoc} * <p> * Note: This interface is specialized by the {@link IDataService} for tasks * which need to gain access to the {@link IDataService} in order to gain * local access to index partitions, etc. Such tasks declare the * {@link IDataServiceCallable}. For example, scale-out joins use * this mechanism. * * @see IDataServiceCallable */ @Override public Future<? extends Object> submit(Callable<? extends Object> proc) throws RemoteException; /** * Read a low-level record from the described {@link IRawStore} described by * the {@link IResourceMetadata}. * * @param resource * The description of the resource containing that block. * @param addr * The address of the block in that resource. * * @return An object that may be used to read the block from the data * service. * * @throws IllegalArgumentException * if the resource is <code>null</code> * @throws IllegalArgumentException * if the addr is <code>0L</code> * @throws IllegalStateException * if the resource is not available. * @throws IllegalArgumentException * if the record identified by addr can not be read from the * resource. * * @deprecated This was a first try at adding support for reading low-level records * from a journal or index segment in support of the * {@link BigdataFileSystem}. * <p> * The API should provide a means to obtain a socket from which record * data may be streamed. The client sends the resource identifier * (UUID of the journal or index segment) and the address of the * record and the data service sends the record data. This is designed * for streaming reads of up to 64M or more (a record recorded on the * store as identified by the address). */ public IBlock readBlock(IResourceMetadata resource, long addr) throws IOException; /* * Methods in support of unit tests. * * @todo could be moved to their own interface. */ /** * Method sets a flag that will force overflow processing during the next * group commit and optionally forces a group commit <strong>(Note: This * method exists primarily for unit tests and benchmarking activities and * SHOULD NOT be used on a deployed federation as the overhead associated * with a compacting merge of each index partition can be significant). * </strong> * <p> * Normally there is no reason to invoke this method directly. Overflow * processing is triggered automatically on a bottom-up basis when the * extent of the live journal nears the {@link Options#MAXIMUM_EXTENT}. * * @param immediate * The purpose of this argument is to permit the caller to * trigger an overflow event even though there are no writes * being made against the data service. When <code>true</code> * the method will write a token record on the live journal in * order to provoke a group commit. In this case synchronous * overflow processing will have occurred by the time the method * returns. When <code>false</code> a flag is set and overflow * processing will occur on the next commit. * @param compactingMerge * The purpose of this flag is to permit the caller to indicate * that a compacting merge should be performed for all indices on * the data service (at least, all indices whose data are not * simply copied onto the new journal) during the next * synchronous overflow. Note that compacting merges of indices * are performed automatically from time to time so this flag * exists mainly for people who want to force a compacting merge * for some reason. * * @throws IOException * @throws InterruptedException * may be thrown if <i>immediate</i> is <code>true</code>. * @throws ExecutionException * may be thrown if <i>immediate</i> is <code>true</code>. */ public void forceOverflow(boolean immediate, boolean compactingMerge) throws IOException, InterruptedException, ExecutionException; /** * This attempts to pause the service accepting {@link ITx#UNISOLATED} * writes and then purges any resources that are no longer required based on * the {@link StoreManager.Options#MIN_RELEASE_AGE}. * <p> * Note: Resources are normally purged during synchronous overflow handling. * However, asynchronous overflow handling can cause resources to no longer * be needed as new index partition views are defined. This method MAY be * used to trigger a release before the next overflow event. * * @param timeout * The timeout (in milliseconds) that the method will await the * pause of the write service. * @param truncateJournal * When <code>true</code>, the live journal will be truncated * to its minimum extent (all writes will be preserved but there * will be no free space left in the journal). This may be used * to force the {@link DataService} to its minimum possible * footprint for the configured history retention policy. * * @return <code>true</code> if successful and <code>false</code> if the * write service could not be paused after the specified timeout. * * @param truncateJournal * When <code>true</code> the live journal will be truncated * such that no free space remains in the journal. * * @throws IOException * @throws InterruptedException */ public boolean purgeOldResources(long timeout, boolean truncateJournal) throws IOException, InterruptedException; /** * The #of asynchronous overflows that have taken place on this data service * (the counter is not restart safe). */ public long getAsynchronousOverflowCounter() throws IOException; /** * Return <code>true</code> iff the data service is currently engaged in * overflow processing. */ public boolean isOverflowActive() throws IOException; /** * Return the {@link IQueryPeer} running on this service. */ public IQueryPeer getQueryEngine() throws IOException; // /** // * Shutdown the service immediately and destroy any persistent data // * associated with the service. // * // * moved to {@link IService}? // */ // public void destroy() throws IOException; }