/*
* Created on Jan 17, 2008
*/
package com.bigdata.bfs;
import java.io.DataInput;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.Vector;
import org.apache.log4j.Logger;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexTypeEnum;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.proc.ISimpleIndexProcedure;
import com.bigdata.journal.IIndexManager;
import com.bigdata.journal.IResourceLock;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Journal;
import com.bigdata.mdi.MetadataIndex;
import com.bigdata.rawstore.IBlock;
import com.bigdata.rawstore.WormAddressManager;
import com.bigdata.relation.AbstractResource;
import com.bigdata.relation.IDatabase;
import com.bigdata.relation.RelationSchema;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.search.FullTextIndex;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.IDataService;
import com.bigdata.sparse.AutoIncIntegerCounter;
import com.bigdata.sparse.IRowStoreConstants;
import com.bigdata.sparse.ITPS;
import com.bigdata.sparse.ITPV;
import com.bigdata.sparse.LogicalRowSplitHandler;
import com.bigdata.sparse.Schema;
import com.bigdata.sparse.SparseRowStore;
import com.bigdata.util.Bytes;
import cutthecrap.utils.striterators.Resolver;
import cutthecrap.utils.striterators.Striterator;
/**
* A distributed file system with extensible metadata and atomic append
* implemented using the bigdata scale-out architecture. Files have a client
* assigned identifier, which is a Unicode string. The file identifier MAY be
* structured so as to look like a hierarchical file system using any desired
* convention. Files are versioned and historical versions MAY be accessed until
* the next compacting merge discards their data. File data is stored in large
* {@link #blockSize} blocks. Partial and even empty blocks are allowed and only
* the data written will be stored. <code>2^63-1</code> distinct blocks may be
* written per file version, making the maximum possible file size
* <code>536,870,912</code> exabytes. Files may be used as queues, in which
* case blocks containing new records are atomically appended while a map/reduce
* style master consumes the head block of the file.
* <p>
 * Efficient methods are offered for streaming and block oriented IO. All block
* read and write operations are atomic, including block append. Files may be
* easily written such that records never cross a block boundary by the
* expediency of flushing the output stream if a record would overflow the
* current block. A flush forces the atomic write of a partial block. Partial
* blocks are stored efficiently - only the bytes actually written are stored.
* Blocks are large enough that most applications can safely store a large
* number of logical records in each block. Files comprised of application
* defined logical records organized into a sequence of blocks are well-suited
* to map/reduce processing. They may be efficiently split at block boundaries
* and references to the blocks distributed to clients. Likewise, reduce clients
* can aggregate data into large files suitable for further map/reduce
* processing.
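 * <p>
 * A minimal sketch of the high-level API (the file identifier and content
 * below are illustrative, and <code>repo</code> is assumed to be a
 * {@link BigdataFileSystem} view obtained by the application):
 * <pre>
 * // register a new file version.
 * final Map&lt;String, Object&gt; metadata = new HashMap&lt;String, Object&gt;();
 * metadata.put(FileMetadataSchema.ID, "/demo/hello.txt");
 * metadata.put(FileMetadataSchema.CONTENT_TYPE, "text/plain");
 * final int version = repo.create(metadata);
 *
 * // stream character data onto the file version (atomic appends).
 * final Writer w = repo.writer("/demo/hello.txt", version, "UTF-8");
 * w.write("hello world");
 * w.close();
 *
 * // read it back.
 * final Reader r = repo.reader("/demo/hello.txt", version, "UTF-8");
 * </pre>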
* <p>
* The distributed file system uses two scale-out indices to support ACID
* operations on file metadata and atomic file append. These ACID guarantees
* arise from the use of unisolated operations on the respective indices and
* therefore apply only to the individual file metadata or file block
* operations. In particular, file metadata read and write are atomic and all
* individual file block IO (read, write, and append) operations are atomic.
* Atomicity is NOT guaranteed when performing more than a single file block IO
* operation, e.g., multiple appends MIGHT NOT write sequential blocks since
* other block operations could have intervened.
* <p>
* The content length of the file is not stored as file metadata. Instead it MAY
* be estimated by a range count of the index entries spanned by the file's
* data. The exact file size may be readily determined when reading small files
 * by the expediency of reading the entire file into a buffer - all reads are at
* least one block. Streaming processing is advised in all cases when handling
* large files, including when the file is to be delivered via HTTP.
* <p>
* The {@link #getFileMetadataIndex() metadata index} uses a {@link SparseRowStore}
* design, similar to Google's bigtable or Hadoop's HBase. All updates to file
* version metadata are atomic. The primary key in the metadata index for every
* file is its {@link FileMetadataSchema#ID}. In addition, each version of a file
* has a distinct {@link FileMetadataSchema#VERSION} property. File creation time,
* version creation time, and file version metadata update timestamps may be
* recovered from the timestamps associated with the properties in the metadata
* index. The use of the {@link FileMetadataSchema#CONTENT_TYPE} and
* {@link FileMetadataSchema#CONTENT_ENCODING} properties is enforced by the
* high-level {@link Document} interface. Applications are free to define
* additional properties.
* <p>
* Each time a file is created a new version number is assigned. The data index
* uses the {@link FileMetadataSchema#ID} as the first field in a compound key. The
* second field is the {@link FileMetadataSchema#VERSION} - a 32-bit integer. The
* remainder of the key is a 64-bit signed block identifier (2^63-1 distinct
* block identifiers). The block identifiers are strictly monotonic (e.g., one
* up) and their sequence orders the blocks into the logical byte order of the
* file.
* <p>
* Operations that create a new file actually create a new file version. The old
* file version will eventually be garbage collected depending on the policy in
* effect for compacting merges. Likewise, operations that delete a file simply
* mark the metadata for the file version as deleted and the file version will
* be eventually reclaimed. The high-level {@link #update(Document)} operation
* in fact simply creates a new file version.
* <p>
* <h2>Use cases</h2>
* <p>
* Use case: A REST-ful repository. Documents may be stored, updated, read,
* deleted, and searched using a full text index.
* <p>
* Use case: A map/reduce master reads document metadata using an index scan. It
* examines the data index's {@link MetadataIndex} (that is, the index that
* knows where each partition of the scale-out data index is stored) and
* determines which map clients are going to be "close" to each document and
* then hands off the document to one of those map clients.
* <p>
* Use case: The same as the use case above, but large files are being processed
* and there is a requirement to "break" the files into splits and hand off the
* splits. This can be achieved by estimating the file size using a range
* count and multiplying through by the block size. Blocks may be handed off to
* the clients in parallel (of course, clients need to deal with the hassle of
* processing files where records will cross split boundaries unless they always
* pad out with unused bytes to the next {@link #blockSize} boundary).
* <p>
 * Use case: A reduce client wants to write a very large file so it creates a
 * metadata record for the file and then does a series of atomic appends to the
* file. The file may grow arbitrarily large. Clients may begin to read from the
* file as soon as the first block has been flushed.
* <p>
* Use case: Queues MAY be built from the operations to atomically read or
* delete the first block for the file version. The "design pattern" is to have
* clients append blocks to the file version, taking care that logical rows
* never cross a block boundary (e.g., by flushing partial blocks). A master
* then reads the head block from the file version, distributing the logical
* records therein to consumers and providing fail safe processing in case
* consumers die or take too long. Once all records for the head block have been
* processed the master simply deletes the head block. This "pattern" is quite
* similar to map/reduce and, like map/reduce, requires that the consumer
* operations may be safely re-run.
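 * <p>
 * A sketch of that pattern using the block-oriented API (the file
 * identifier and <code>version</code> are illustrative and <code>repo</code>
 * is an assumed {@link BigdataFileSystem} view):
 * <pre>
 * // client: append a block containing one or more complete logical records.
 * repo.appendBlock("/queues/q1", version, record, 0, record.length);
 *
 * // master: read, process, and then delete the head block.
 * final byte[] head = repo.readHead("/queues/q1", version);
 * if (head != null) {
 *     process(head); // application-defined, safely re-runnable processing.
 *     repo.deleteHead("/queues/q1", version);
 * }
 * </pre>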
* <p>
* Use case: File replication, retention of deleted versions, and media indexing
* are administered by creating "zones" comprising one or more index partitions
* with a shared file identifier prefix, e.g., /tmp or /highly-available, or
* /deployment-text-index. All files in a given zone share the same policy for
* file replication, compacting merges (determining when a deleted or even a
* non-deleted file version will be discarded), and media indexing.
* <p>
* Use case: File rename is NOT a cheap operation. It essentially creates a new
* file version with the desired name and copies the data from the old file
* version to the new file version. Finally the old file version is "deleted".
 * This approach is necessary since files may be moved from one "zone" to another
* and since the file data must reside on the index partition(s) identified by
* its file version.
*
* FIXME write a JSON API that interoperates to the extent possible with GAE and
* HBASE.
*
* @todo implement "zones" and their various policies (replication, retention,
* and media indexing). access control could also be part of the zones.
*
* @todo should compression be applied? applications are obviously free to apply
* their own compression, but it could be convenient to stored compressed
* blocks. the caller could specify the compression method on a per block
* basis (we don't want to lookup the file metadata for this). the
* compression method would be written into a block header. blocks can
* always be decompressed by examining the header.
*
 * @todo there should be some constraints on the file identifier but in general
 * it represents a client-determined absolute file path name. It is
* certainly possible to use a flat file namespace, but you can just as
* readily use a hierarchical one. Unicode characters are supported in the
* file identifiers.
*
* @todo do we need a global lock mechanism to prevent concurrent high-level
* create/update/delete of the same file? a distributed lease-based lock
* system derived from jini or built ourselves? Can this be supported with
* the historical and not yet purged timestamped metadata for the file?
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class BigdataFileSystem extends
AbstractResource<IDatabase<BigdataFileSystem>> implements
IContentRepository, IRowStoreConstants {
final protected static Logger log = Logger.getLogger(BigdataFileSystem.class);
/**
* True iff the {@link #log} level is INFO or less.
*/
final protected static boolean INFO = log.isInfoEnabled();
/**
* True iff the {@link #log} level is DEBUG or less.
*/
final protected static boolean DEBUG = log.isDebugEnabled();
/**
* Configuration options.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public static interface Options extends com.bigdata.journal.Options, KeyBuilder.Options {
}
/**
* The #of offset bits.
*/
private final int offsetBits;
/** The size of a file block. */
private final int blockSize;
/**
* The #of bits in a 64-bit long integer identifier that are used to encode
* the byte offset of a record in the store as an unsigned integer.
*
* @see com.bigdata.journal.Options#OFFSET_BITS
* @see #getBlockSize()
*/
public final int getOffsetBits() {
return offsetBits;
}
/**
* The size of a file block. Block identifiers are 64-bit signed integers.
 * The maximum file length is <code>2^63 - 1</code> blocks (536,870,912
* Exabytes).
* <p>
* Note: The {@link BigdataFileSystem} makes the <strong>assumption</strong>
* that the {@link com.bigdata.journal.Options#OFFSET_BITS} is the #of
* offset bits configured for the {@link IDataService}s in the connected
* {@link IBigdataFederation} and computes the
* {@link BigdataFileSystem#getBlockSize()} based on that assumption. It is
* NOT possible to write blocks on the {@link BigdataFileSystem} whose size
* is greater than the maximum block size actually configured for the
* {@link IDataService}s in the connected {@link IBigdataFederation}.
*
* @see com.bigdata.journal.Options#OFFSET_BITS
* @see #getOffsetBits()
*/
public final int getBlockSize() {
return blockSize;
}
/**
* The maximum block identifier that can be assigned to a file version.
* <p>
* Note: This is limited to {@value Long#MAX_VALUE}-1 so that we can always
* form the key greater than any valid key for a file version. This is
* required by the atomic append logic when it seeks the next block
* identifier. See {@link AtomicBlockAppendProc}.
*/
protected static final long MAX_BLOCK = Long.MAX_VALUE - 1;
/**
* The basename of the index in which the file metadata are stored. The
* fully qualified name of the index uses {@link #getNamespace()} as a
* prefix.
* <p>
* Note: This is a {@link SparseRowStore} governed by the
* {@link FileMetadataSchema}.
*/
public static final String FILE_METADATA_INDEX_BASENAME = "fileMetadata";
/**
* The basename of the index in which the file data blocks are stored. The
* fully qualified name of the index uses {@link #getNamespace()} as a
* prefix.
* <p>
* Note: The entries in this index are a series of blocks for a file. Blocks
* are {@link #blockSize} bytes each and are assigned monotonically
* increasing block numbers by the atomic append operation. The final block
* may be smaller (there is no need to pad out the data with nulls). The
* keys are formed from two fields - a field containing the content
* identifier followed by an integer field containing the sequential block
* number. A range scan with a fromKey of the file identifier and a toKey
* computed using the successor of the file identifier will naturally visit
* all blocks in a file in sequence.
*/
public static final String FILE_DATA_INDEX_BASENAME = "fileData";
public static final FileMetadataSchema metadataSchema = new FileMetadataSchema();
private SparseRowStore fileMetadataIndex;
private IIndex fileDataIndex;
protected static void assertString(Map<String, Object> properties, String name) {
Object val = properties.get(name);
if (val == null)
throw new IllegalArgumentException(name + " is null");
if (!(val instanceof String))
throw new IllegalArgumentException(name + " must be String");
}
protected static void assertLong(Map<String, Object> properties, String name) {
Object val = properties.get(name);
if (val == null)
throw new IllegalArgumentException(name + " is null");
if (!(val instanceof Long))
throw new IllegalArgumentException(name + " must be Long");
}
/**
* Ctor specified by {@link DefaultResourceLocator}.
*
* @see Options
*/
public BigdataFileSystem(IIndexManager indexManager, String namespace,
Long timestamp, Properties properties) {
super(indexManager,namespace,timestamp,properties);
/*
* @todo This should probably be raised directly to a property reported
* by the federation. Right now it relies on the same default logic
* being replicated here and in AbstractJournal.
*/
offsetBits = Integer
.parseInt(properties
.getProperty(
Options.OFFSET_BITS,
Integer
.toString((indexManager instanceof Journal ? WormAddressManager.SCALE_UP_OFFSET_BITS
: WormAddressManager.SCALE_OUT_OFFSET_BITS))));
blockSize = WormAddressManager.getMaxByteCount(offsetBits) - 1;
if (INFO)
log.info("offsetBits=" + offsetBits + ", blockSize=" + blockSize);
}
/**
* The index in which the file metadata is stored (the index must exist).
*/
public SparseRowStore getFileMetadataIndex() {
if (fileMetadataIndex == null) {
throw new IllegalStateException();
}
return fileMetadataIndex;
}
/**
* The index in which the file blocks are stored (the index must exist).
*/
public IIndex getFileDataIndex() {
if (fileDataIndex == null) {
throw new IllegalStateException();
}
return fileDataIndex;
}
/**
 * <code>true</code> unless {@link #getTimestamp()} is {@link ITx#UNISOLATED}.
*/
public boolean isReadOnly() {
return getTimestamp() != ITx.UNISOLATED;
}
// final protected void assertWritable() {
//
// if(isReadOnly()) {
//
// throw new IllegalStateException("READ_ONLY");
//
// }
//
// }
/**
* Note: A commit is required in order for a read-committed view to have
* access to the registered indices. When running against an
* {@link IBigdataFederation}, {@link ITx#UNISOLATED} operations will take
* care of this for you. Otherwise you must do this yourself.
*/
@Override
public void create() {
assertWritable();
final IResourceLock resourceLock = acquireExclusiveLock();
try {
final Properties tmp = getProperties();
// final int branchingFactor = Integer.parseInt(tmp.getProperty(
// Options.BRANCHING_FACTOR, Options.DEFAULT_BRANCHING_FACTOR));
// set property that will let the contained relations locate their
// container.
tmp.setProperty(RelationSchema.CONTAINER, getNamespace());
super.create();
final IIndexManager indexManager = getIndexManager();
// setup metadata index.
{
/*
 * Note: This specifies a split handler that keeps the logical
* row together. This is a hard requirement. The atomic
* read/update guarantee depends on this.
*/
final String name = getNamespace()+"."+FILE_METADATA_INDEX_BASENAME;
final IndexMetadata md = new IndexMetadata(indexManager, tmp,
name, UUID.randomUUID(), IndexTypeEnum.BTree);
// Ensure that splits do not break logical rows.
md.setSplitHandler(LogicalRowSplitHandler.INSTANCE);
indexManager.registerIndex(md);
final IIndex ndx = indexManager.getIndex(name, getTimestamp());
fileMetadataIndex = new SparseRowStore(ndx);
}
// setup data index.
{
/*
 * @todo specify a split handler that tends to keep the blocks for a
* file together (soft requirement).
*/
final String name = getNamespace()+"."+FILE_DATA_INDEX_BASENAME;
final IndexMetadata md = new IndexMetadata(indexManager, tmp,
name, UUID.randomUUID(), IndexTypeEnum.BTree);
/*
* @todo unit tests for correct copying of blobs during overflow.
* See {@link IOverflowHandler}.
*/
md.setOverflowHandler(new BlobOverflowHandler());
// register the index.
indexManager.registerIndex(md);
fileDataIndex = indexManager.getIndex(name,getTimestamp());
}
} finally {
unlock(resourceLock);
}
}
@Override
public void destroy() {
assertWritable();
final IResourceLock resourceLock = acquireExclusiveLock();
try {
getIndexManager().dropIndex(getNamespace()+"."+FILE_METADATA_INDEX_BASENAME);
getIndexManager().dropIndex(getNamespace()+"."+FILE_DATA_INDEX_BASENAME);
super.destroy();
} finally {
unlock(resourceLock);
}
}
/**
* Creates a new file version from the specified metadata. The new file
* version will not have any blocks. You can use either stream-oriented or
* block oriented IO to write data on the newly created file version.
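 * <p>
 * For example (the identifier and content type are illustrative and
 * <code>repo</code> is this {@link BigdataFileSystem}):
 * <pre>
 * final Map&lt;String, Object&gt; metadata = new HashMap&lt;String, Object&gt;();
 * metadata.put(FileMetadataSchema.ID, "/demo/a.xml"); // required.
 * metadata.put(FileMetadataSchema.CONTENT_TYPE, "application/xml"); // optional.
 * final int version = repo.create(metadata); // the VERSION property is assigned automatically.
 * </pre>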
*
* @param metadata
* The file metadata.
*
* @return The new version identifier.
*/
public int create(Map<String, Object> metadata) {
if (metadata == null)
throw new IllegalArgumentException();
// check required properties.
assertString(metadata, FileMetadataSchema.ID);
// clone the map since it may be unmodifiable.
metadata = new HashMap<String, Object>(metadata);
// auto-increment the last defined version counter.
metadata.put(FileMetadataSchema.VERSION, AutoIncIntegerCounter.INSTANCE);
// write the metadata (atomic operation).
final ITPS tps = getFileMetadataIndex().write(metadataSchema, metadata,
AUTO_TIMESTAMP_UNIQUE, null/* filter */, null/*precondition*/);
final int version = (Integer) tps.get(FileMetadataSchema.VERSION).getValue();
if(INFO)
log.info("Created new version: id=" + metadata.get(FileMetadataSchema.ID)
+ ", version=" + version);
return version;
}
@Override
public int create(final Document doc) {
if (doc == null)
throw new IllegalArgumentException();
final String id = doc.getId();
if (id == null)
throw new RuntimeException("The " + FileMetadataSchema.ID
+ " property must be defined.");
final Map<String,Object> metadata = doc.asMap();
// /*
// * Verify content type was specified since we will write on the file
// * version.
// */
// assertString(metadata, MetadataSchema.CONTENT_TYPE);
/*
 * Create a new file version.
*/
final int version = create( metadata );
/*
* Copy data from the document.
*/
copyStream(id, version, doc.getInputStream());
return version;
}
/**
* Reads the document metadata for the current version of the specified
* file.
*
* @param id
* The file identifier.
*
* @return A read-only view of the file version that is capable of reading
* the content from the repository -or- <code>null</code> iff
* there is no current version for that file identifier.
*/
public Document read(String id) {
RepositoryDocumentImpl doc = new RepositoryDocumentImpl(this, id);
if (!doc.exists()) {
// no current version for that document.
if(INFO)
log.info("No current version: id="+id);
return null;
}
return doc;
}
/**
* Return the file metadata for the version of the file associated with the
* specified timestamp.
*
* @param id
* The file identifier.
* @param timestamp
* The timestamp.
*
* @return A read-only view of the logical row of metadata for that file as
* of that timestamp.
*
* @see ITPS
* @see SparseRowStore#read(Schema, Object, long, com.bigdata.sparse.INameFilter)
*/
public ITPS readMetadata(final String id, final long timestamp) {
return getFileMetadataIndex()
.read(metadataSchema, id, timestamp/* fromTime */,
timestamp + 1/* toTime */, null/* filter */);
}
/**
* Update the metadata for the current file version.
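 * <p>
 * For example, setting one property and deleting another on the current
 * version (the property name <code>"myProperty"</code> is illustrative and
 * <code>repo</code> is this {@link BigdataFileSystem}):
 * <pre>
 * final Map&lt;String, Object&gt; delta = new HashMap&lt;String, Object&gt;();
 * delta.put(FileMetadataSchema.CONTENT_TYPE, "text/html"); // set / replace.
 * delta.put("myProperty", null); // a null value deletes the property.
 * final Map&lt;String, Object&gt; current = repo.updateMetadata(id, delta);
 * </pre>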
*
* @param id
* The file identifier.
*
* @param metadata
* The properties to be written. A <code>null</code> value for
* a property will cause the corresponding property to be
* deleted. Properties not present in this map will NOT be
* modified.
*
* @return The complete metadata for the current file version.
*/
public Map<String,Object> updateMetadata(String id, Map<String,Object> metadata) {
// copy since the map might be unmodifiable.
metadata = new HashMap<String,Object>(metadata);
// set the id - this is required for the primary key.
metadata.put(FileMetadataSchema.ID, id);
// remove the version identifier if any - we do not want this modified!
metadata.remove(FileMetadataSchema.VERSION);
return getFileMetadataIndex().write(metadataSchema, metadata,
AUTO_TIMESTAMP_UNIQUE, null/* filter */,null/*precondition*/).asMap();
}
/**
* Create a new file version using the supplied file metadata.
* <p>
* Note: This is essentially a delete + create operation. Since the combined
* operation is NOT atomic it is possible that conflicts can arise when more
* than one client attempts to update a file concurrently.
*
* @param doc
* The file metadata.
*/
public int update(Document doc) {
final Map<String,Object> metadata = doc.asMap();
final String id = (String) metadata.get(FileMetadataSchema.ID);
// delete the existing file version (if any).
delete( id );
// create a new file version using that metadata.
return create( doc );
}
/**
* Note: A new file version is marked as deleted and then the file blocks
* for the old version are deleted from the data index. This sequence means
* (a) that clients attempting to read on the file using the high level API
* will not see the file as soon as its metadata is updated; (b) that the
* timestamp on the deleted version will be strictly LESS THAN the commit
* time(s) when the file blocks are deleted, so reading from the timestamp
* of the deleted version will let you see the deleted file blocks. This is
* a deliberate convenience - if we were to delete the file blocks first
* then we would not have ready access to a timestamp that would be before
* the first file block delete and hence sufficient to perform a historical
* read on the last state of the file before it was deleted.
*/
public long delete(String id) {
final RepositoryDocumentImpl doc = (RepositoryDocumentImpl) read(id);
if (!doc.exists()) {
// no current version.
log.warn("No current version: id=" + id);
return 0L;
}
final int version = doc.getVersion();
/*
* Mark the file version as deleted.
*
* Note: This only deletes the "version" property - the other properties
 * are not changed. However, the file version will be understood as
* "deleted" by this class.
*/
{
final Map<String, Object> metadata = new HashMap<String, Object>();
// primary key.
metadata.put(FileMetadataSchema.ID, id);
// delete marker.
metadata.put(FileMetadataSchema.VERSION, null);
getFileMetadataIndex().write(metadataSchema, metadata, AUTO_TIMESTAMP_UNIQUE,
null/* filter */, null/*precondition*/);
}
/*
* Delete blocks from the file version.
*
* Note: This is efficient in that it handles the delete on the data
* service for each index partition. However, if the data spans more
* than one index partition then the requests to delete the data on each
* index partition are issued in sequence. A range-delete procedure
* could be even more efficient since it can be parallelized when the
* operation spans more than one index partition.
*/
long blockCount = 0;
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
// the key for {file,version}
final byte[] fromKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.getKey();
// the key for {file,successor(version)}
final byte[] toKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version + 1)
.getKey();
final ITupleIterator itr = getFileDataIndex().rangeIterator(fromKey, toKey,
0/* capacity */, IRangeQuery.REMOVEALL, null/* filter */);
while (itr.hasNext()) {
itr.next();
blockCount++;
}
if(INFO)
log.info("Deleted " + blockCount + " blocks : id=" + id + ", version="
+ version);
/*
* There was a current version for the file. We have written a delete
* marker and also deleted any blocks for that file version.
*/
return blockCount;
}
/**
* Return an array describing all non-eradicated versions of a file.
* <p>
* This method returns all known version identifiers together with their
* timestamps, thereby making it possible to read either the metadata or the
* data for historical file versions - as long as the metadata and/or data
* has not yet been eradicated.
* <p>
* The file metadata and data blocks for historical version(s) of a file
* remain available until they are eradicated from their respective indices
 * by a compacting merge in which the history policies no longer preserve
* those data.
* <p>
 * In order to read the historical file metadata you need to know the
 * timestamp associated with the version identifier which you wish to read.
 * This should be the timestamp when that version was <em>deleted</em> MINUS
 * ONE in order to read the last valid metadata for the file version before
 * that file version was deleted.
* <p>
* Likewise, in order to read the historical version data you need to know
 * the version identifier which you wish to read as well as the timestamp.
* In this case, use the timestamp when that version was <em>deleted</em>
* in order to read the last committed state for the file version.
* <p>
* Historical file version metadata is eradicated atomically since the
* entire logical row will be hosted on the same index partition. Either the
 * file version metadata is available or it is not.
* <p>
* Historical file version data is eradicated one index partition at a time.
* If the file version spans more than one index partition then it may be
* possible to read some blocks from the file but not others.
* <p>
* Historical file version metadata and data will remain available until
 * their governing history policy is no longer satisfied. Therefore, when
* in doubt, you can consult the history policy in force for the file to
* determine whether or not its data may have been eradicated.
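 * <p>
 * For example, assuming <code>deleteTime</code> is the timestamp reported
 * for a delete marker (a tuple whose value is <code>null</code>) and
 * <code>version</code> is the version identifier that was deleted, a
 * sketch of the two historical reads described above is:
 * <pre>
 * // last valid metadata for the version, as of just before it was deleted.
 * final ITPS meta = repo.readMetadata(id, deleteTime - 1);
 *
 * // last committed data for the deleted version (historical read).
 * final InputStream is = repo.inputStream(id, version, -deleteTime);
 * </pre>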
*
* @param id
* The file identifier.
*
* @return An array containing (timestamp,version) tuples. Tuples where the
* {@link ITPV#getValue()} returns <code>null</code> give the
* timestamp at which a file version was <em>deleted</em>. Tuples
* where the {@link ITPV#getValue()} returns non-<code>null</code>
* give the timestamp at which a file version was <em>created</em>.
*
* @see #readMetadata(String, long), to read the file version metadata based
* on a timestamp.
*
* @see #inputStream(String, int, long), to read the file data as of a
* specific timestamp.
*
* @todo expose history policy for a file (from its zone metadata, which is
* replicated onto the index partition metadata). Make sure that the
* zone metadata is consistent for the file version metadata and file
* version data. This means looking up the {@link IndexMetadata} for
* the index partition in which the file data is stored.
*/
public ITPV[] getAllVersionInfo(String id) {
/*
* Query for all metadata for the file.
*/
ITPS tps = readMetadata(id,Long.MAX_VALUE);
Vector<ITPV> vec = new Vector<ITPV>();
/*
* Filter for only the version properties, skipping "delete" entries.
*/
Iterator<? extends ITPV> itr = tps.iterator();
while(itr.hasNext()) {
ITPV tpv = itr.next();
if(!tpv.getName().equals(FileMetadataSchema.VERSION)) {
// Not a version field.
continue;
}
vec.add(tpv);
}
return vec.toArray(new ITPV[vec.size()]);
}
/**
* @todo write tests.
*/
@SuppressWarnings("unchecked")
public Iterator<? extends DocumentHeader> getDocumentHeaders(String fromId,
String toId) {
return new Striterator(getFileMetadataIndex().rangeIterator(metadataSchema,
fromId, toId)).addFilter(new Resolver() {
private static final long serialVersionUID = 1L;
@Override
protected Object resolve(Object arg0) {
final ITPS tps = (ITPS) arg0;
final String id = (String) tps.get(
FileMetadataSchema.ID).getValue();
return new RepositoryDocumentImpl(
BigdataFileSystem.this, id, tps);
}
});
}
/**
* Efficient delete of file metadata and file data for all files and file
* versions spanned by the specified file identifiers. File versions are
* marked "deleted" before the file blocks are deleted so that you can
 * read on the historical file version with exactly the same semantics as
 * {@link #delete(String)}.
 *
 * @return The #of file blocks that were deleted.
*/
public long deleteAll(String fromId, String toId) {
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
// the key for {fromId}
final byte[] fromKey = keyBuilder.reset().appendText(fromId,
true/* unicode */, false/* successor */).getKey();
// the key for {successor(toId)}
final byte[] toKey = keyBuilder.reset().appendText(toId,
true/* unicode */, true/* successor */).getKey();
// delete file metadata
long ndeleted = 0;
{
/*
 * Delete the file version metadata for each document in the key
 * range by replacing its VERSION column value with a null value
 * (and updating the timestamp in the key).
 *
 * Note: the iterator must be fully consumed in order to visit (and
 * hence delete) each logical row in the key range.
 */
final ITupleIterator itr = getFileMetadataIndex().getIndex().rangeIterator(
fromKey,
toKey,
0/* capacity */,
IRangeQuery.CURSOR,
new FileVersionDeleter(
IRowStoreConstants.AUTO_TIMESTAMP_UNIQUE));
while (itr.hasNext()) {
itr.next();
}
}
// delete file blocks, counting the #of blocks removed.
{
final ITupleIterator itr = getFileDataIndex()
.rangeIterator(fromKey, toKey, 0/* capacity */,
IRangeQuery.REMOVEALL, null/* filter */);
while (itr.hasNext()) {
itr.next();
ndeleted++;
}
}
return ndeleted;
}
/**
* FIXME Integrate with {@link FullTextIndex} to providing indexing and
* search of file versions. Deleted file versions should be removed from the
* text index. There should be explicit metadata on the file version in
* order for it to be indexed. The text indexer will require content type
* and encoding information in order to handle indexing. Low-level output
* stream, writer, block write and block append operations will not trigger
* the indexer since it depends on the metadata index to know whether or not
* a file version should be indexed. However you could explicitly submit a
* file version for indexing.
* <p>
* Perhaps the best way to handle this is to queue document metadata up for
* a distributed full text indexing service. The service accepts metadata
* for documents from the queue and decides whether or not the document
* should be indexed based on its metadata and how the document should be
* processed if it is to be indexed. Those business rules would be
* registered with the full text indexing service. (Alternatively, they can
* be configured with the {@link BigdataFileSystem} and applied locally as
* the blocks of the file are written into the repository. That's certainly
* easier right off the bat.)
*
* @todo crawl or query job obtains a set of URLs, writing them onto a file.
* <p>
* m/r job downloads documents based on set of URLs, writing all
* documents into a single file version. text-based downloads can be
* record compressed and decompressed after the record is read. binary
 * downloads will be truncated at 64M and might be skipped altogether
 * if they exceed the block size (get images, but not wildly
* large files).
* <p>
* m/r job extracts a simplified html format from the source image,
* writing the result onto another file. this job will optionally
* split documents into "pages" by breaking where necessary at
* paragraph boundaries.
* <p>
* m/r job builds text index from simplified html format.
* <p>
* m/r job runs extractors on simplified html format, producing
* rdf/xml which is written onto another file. The rdf/xml for each
* harvested document is written as its own logical record, perhaps
* one record per block.
* <p>
* concurrent batch load of rdf/xml into scale-out knowledge base. the
* input is a single file comprised of blocks, each of which is an
* rdf/xml file.
*/
public Iterator<String> search(String query) {
throw new UnsupportedOperationException();
}
/*
* file data operations (read, atomic append).
*/
/**
* Returns an iterator that visits all block identifiers for the file
* version in sequence.
* <p>
* Note: This may be used to efficiently distribute blocks among a
* population of clients, e.g., in a map/reduce paradigm.
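 * <p>
 * For example, a sketch that hands off each block to a worker (the
 * <code>dispatch()</code> call is a placeholder for application logic;
 * the worker would later call {@link #readBlock(String, int, long)}):
 * <pre>
 * final Iterator&lt;Long&gt; itr = repo.blocks(id, version);
 * while (itr.hasNext()) {
 *     final long block = itr.next();
 *     dispatch(id, version, block);
 * }
 * </pre>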
*/
public Iterator<Long> blocks(String id, int version) {
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata()
.getKeyBuilder();
final byte[] fromKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.getKey();
final byte[] toKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version + 1)
.getKey();
// just the keys.
final int flags = IRangeQuery.KEYS;
// visits the keys for the file version in block order.
final ITupleIterator itr = getFileDataIndex().rangeIterator(fromKey, toKey,
0/* capacity */, flags, null/* filter */);
// resolve keys to block identifiers.
return new BlockIdentifierIterator( id, version, itr );
}
/**
* Copies blocks from one file version to another. The data in each block of
* the source file version is copied into a new block that is appended to
* the target file version. Empty blocks are copied. Partial blocks are NOT
* combined. The block identifiers are NOT preserved since atomic append is
* used to add blocks to the target file version.
*
* @param fromId
* @param fromVersion
* @param toId
* @param toVersion
*
* @return The #of blocks copied.
*
* FIXME This could be made more efficient by sending the copy operation to
* each index partition in turn. that would avoid having to copy the data
* first to the client and thence to the target index partition.
*/
public long copyBlocks(String fromId, int fromVersion, String toId,
int toVersion) {
final Iterator<Long> src = blocks(fromId,fromVersion);
long nblocks = 0L;
while(src.hasNext()) {
final long blockId = src.next();
// read block
final byte[] block = readBlock(fromId, fromVersion, blockId);
// write block.
appendBlock(toId, toVersion, block, 0, block.length);
nblocks++;
}
return nblocks;
}
/**
* Atomic write of a block for a file version.
* <p>
* Note: You can write any valid block identifier at any time. If the block
* exists then its data will be replaced.
* <p>
* Note: Writing blocks out of sequence can create "holes". Those holes may
* be filled by later writing the "missing" blocks.
* {@link #copyBlocks(String, int, String, int)} will renumber the blocks
* and produce a dense sequence of blocks.
* <p>
* Note: Atomic append will always write the successor of the largest block
* identifier already written on the file version. If you write block
* {@link #MAX_BLOCK} then it will no longer be possible to append blocks to
* that file version, but you can still write blocks using
* {@link #writeBlock(String, int, long, byte[], int, int)}.
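 * <p>
 * For example, a sketch that (re-)writes the first two blocks of a file
 * version (the payloads <code>b0</code> and <code>b1</code> are assumed to
 * be at most {@link #getBlockSize()} bytes each):
 * <pre>
 * repo.writeBlock(id, version, 0L, b0, 0, b0.length);
 * repo.writeBlock(id, version, 1L, b1, 0, b1.length);
 * </pre>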
*
* @param id
* The file identifier.
* @param version
* The file version.
* @param block
* The block identifier in [0:{@link #MAX_BLOCK}].
 * @param b
 * The buffer containing the bytes to be written. At most
 * {@link #blockSize} bytes may be written in a single block
 * write operation.
* @param off
* The offset of the 1st byte to be written.
* @param len
* The #of bytes to be written.
*
 * @return <code>true</code> iff the block was overwritten (i.e., if the
 * block already existed, in which case its contents were replaced).
*
 * @throws IllegalArgumentException
 * if <i>id</i> is <code>null</code> or an empty string.
 * @throws IllegalArgumentException
 * if <i>version</i> is negative.
 * @throws IllegalArgumentException
 * if <i>block</i> is negative.
 * @throws IllegalArgumentException
 * if <i>b</i> is <code>null</code>.
 * @throws IllegalArgumentException
 * if <i>off</i> is negative or greater than the length of the
 * byte[].
 * @throws IllegalArgumentException
 * if <i>len</i> is negative or <i>off+len</i> is greater
 * than the length of the byte[].
* @throws IllegalArgumentException
* if <i>len</i> is greater than {@link #blockSize}.
*
* @todo return the data for the old block instead in the case of an
* overwrite?
*/
public boolean writeBlock(String id, int version, long block, byte[] b, int off, int len) {
if (id == null || id.length() == 0)
throw new IllegalArgumentException();
if (version < 0)
throw new IllegalArgumentException();
if (block < 0L) {
/*
* Note: restriction implies 63-bit block identifier (no
* negative#s).
*/
throw new IllegalArgumentException();
}
if (block > MAX_BLOCK) {
throw new IllegalArgumentException();
}
if (b == null)
throw new IllegalArgumentException();
if (off < 0 || off > b.length)
throw new IllegalArgumentException("off="+off+", b.length="+b.length);
if (len < 0 || off + len > b.length)
throw new IllegalArgumentException("off="+off+", len="+len+", b.length="+b.length);
if(len>blockSize) {
throw new IllegalArgumentException();
}
// construct the atomic write operation.
final ISimpleIndexProcedure proc = new AtomicBlockWriteProc(this, id, version,
block, b, off, len);
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
// the key for the {file,version,block}
final byte[] key = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.append(block).getKey();
return (Boolean) getFileDataIndex().submit(key, proc);
}
/**
* Atomic delete of the first block of the file version.
*
* @param id
* The file identifier.
* @param version
* The version identifier.
*
* @return The block identifier of the deleted block -or- <code>-1L</code>
* if nothing was deleted.
*/
public long deleteHead(String id, int version) {
if (INFO)
log.info("id=" + id + ", version=" + version);
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata()
.getKeyBuilder();
// the key for {file,version}
final byte[] fromKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.getKey();
// the key for {file,successor(version)}
final byte[] toKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(
version + 1).getKey();
/*
 * The REMOVEALL flag together with a limit of ONE (1) is used to obtain
* an atomic delete of the first block for this file version.
*/
final ITupleIterator itr = getFileDataIndex()
.rangeIterator(fromKey, toKey,
1, // Note: limit is ONE block!
IRangeQuery.KEYS|IRangeQuery.REMOVEALL, null/* filter */);
if (!itr.hasNext()) {
log.warn("Nothing to delete: id=" + id + ", version=" + version);
return -1L;
}
/*
* Consume the iterator but note that the block was already deleted if
* this was a remote request.
*/
final long block = new BlockIdentifierIterator(id, version, itr).next();
if(INFO)
log.info("id="+id+", version="+version+" : deleted block="+block);
return block;
}
/**
* Atomic delete of a block for a file version.
*
* @param id
* The file identifier.
* @param version
* The version identifier.
 * @param block
 * The block identifier.
*
* @return <code>true</code> iff the block was deleted.
*/
public boolean deleteBlock(String id, int version, long block) {
if (id == null || id.length() == 0)
throw new IllegalArgumentException();
if (version < 0)
throw new IllegalArgumentException();
if (block < 0L) {
/*
* Note: restriction implies 63-bit block identifier (no
* negative#s).
*/
throw new IllegalArgumentException();
}
if (block > MAX_BLOCK) {
throw new IllegalArgumentException();
}
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
final byte[] key = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.append(block).getKey();
/*
* Note: The return value is just the serialized address of that block
* on the journal (8 bytes).
*/
final boolean deleted = getFileDataIndex().remove(key) != null;
return deleted;
}
/**
* Atomic read of the first block of the file version.
*
* @param id
* The file identifier.
* @param version
* The version identifier.
*
* @return The contents of the block -or- <code>null</code> iff there are
* no blocks for that file version. Note that an empty block will
* return an empty byte[] rather than <code>null</code>.
*/
public byte[] readHead(String id, int version) {
/*
 * Set up a range scan that will span all blocks for the file version. We
* are only interested in the first block, but this is how we get at its
* data using an atomic read.
*/
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
final byte[] fromKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.append(0L).getKey();
final byte[] toKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.append(Long.MAX_VALUE).getKey();
/*
* Resolve the requested block : keys and data.
*/
final ITupleIterator itr = getFileDataIndex()
.rangeIterator(fromKey, toKey, 1/* capacity */,
IRangeQuery.KEYS | IRangeQuery.VALS, null/* filter */);
if (!itr.hasNext()) {
if (INFO)
log.info("id=" + id + ", version=" + version + " : no blocks");
return null;
}
return readBlock(id, version, itr.next());
}
/**
* Atomic read of a block for a file version.
*
* @param id
* The file identifier.
* @param version
* The version identifier.
* @param block
* The block identifier.
*
* @return The contents of the block -or- <code>null</code> iff the block
* does not exist. Note that an empty block will return an empty
* byte[] rather than <code>null</code>.
*
* @todo offer a variant that returns an {@link InputStream}?
*/
public byte[] readBlock(String id, int version, long block) {
if (id == null)
throw new IllegalArgumentException();
/*
 * Set up a range scan that will span exactly the specified block.
*
* Note: This uses a range scan because a lookup will return the address
* of the block rather than its data!
*/
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
final byte[] fromKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.append(block).getKey();
final byte[] toKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.append(block + 1).getKey();
/*
* Resolve the requested block : keys and data.
*/
final ITupleIterator itr = getFileDataIndex()
.rangeIterator(fromKey, toKey, 1/* capacity */,
IRangeQuery.KEYS | IRangeQuery.VALS, null/* filter */);
if (!itr.hasNext()) {
if (INFO)
log.info("id=" + id + ", version=" + version + ", block="
+ block + " : does not exist");
return null;
}
return readBlock(id, version, itr.next());
}
/**
* Helper to read a block from an {@link ITuple}.
*
* @param id
* @param version
* @param tuple
* @return
*/
private byte[] readBlock(String id, int version, ITuple tuple) {
final byte[] key = tuple.getKey();
// decode the block identifier from the key.
// block = KeyBuilder.decodeLong(tuple.getKeyBuffer().array(),
// tuple.getKeyBuffer().pos() - Bytes.SIZEOF_LONG);
long block = KeyBuilder.decodeLong(key, key.length - Bytes.SIZEOF_LONG);
final long addr;
try {
DataInput in = tuple.getValueStream();
addr = in.readLong();
} catch (IOException e) {
throw new RuntimeException(e);
}
if (addr == 0L) {
/*
* Note: empty blocks are allowed and are recorded with 0L as
* their address.
*/
if(INFO)
log.info("id=" + id + ", version=" + version + ", block=" + block
+ " : empty block.");
return new byte[]{};
}
/*
* Read the block from the backing store.
*/
final IBlock tmp = tuple.readBlock(addr);
final int len = tmp.length();
if(INFO)
log.info("id=" + id + ", version=" + version + ", block=" + block
+ " : " + len + " bytes");
// @todo reuse buffers, but must return {byte[],off,len} tuple.
final byte[] data = new byte[len];
try {
final int nread = tmp.inputStream().read(data, 0, len);
if (nread != len) {
throw new RuntimeException("Expecting to read " + len
+ " bytes but read " + nread + " bytes");
}
} catch (IOException e) {
throw new RuntimeException(e);
}
return data;
}
/**
* Atomic append of a block to a file version.
*
* @param id
* The file identifier.
* @param version
* The file version.
* @param b
 * The buffer containing the data to be written.
* @param off
* The offset of the 1st byte to be written.
* @param len
* The #of bytes to be written in [0:{@link #blockSize}].
*
 * @return The block identifier for the written block.
*
 * @throws IllegalArgumentException
 * if <i>id</i> is <code>null</code> or an empty string.
 * @throws IllegalArgumentException
 * if <i>version</i> is negative.
 * @throws IllegalArgumentException
 * if <i>b</i> is <code>null</code>.
 * @throws IllegalArgumentException
 * if <i>off</i> is negative or greater than the length of the
 * byte[].
 * @throws IllegalArgumentException
 * if <i>len</i> is negative or <i>off+len</i> is greater
 * than the length of the byte[].
* @throws IllegalArgumentException
* if <i>len</i> is greater than {@link #blockSize}.
*/
public long appendBlock(String id, int version, byte[] b, int off, int len) {
if (id == null || id.length() == 0)
throw new IllegalArgumentException();
if (version < 0)
throw new IllegalArgumentException();
if (b == null)
throw new IllegalArgumentException();
if (off < 0 || off > b.length)
throw new IllegalArgumentException("off="+off+", b.length="+b.length);
if (len < 0 || off + len > b.length)
throw new IllegalArgumentException("off="+off+", len="+len+", b.length="+b.length);
if (len > blockSize) {
throw new IllegalArgumentException();
}
// construct the atomic append operation.
final ISimpleIndexProcedure proc = new AtomicBlockAppendProc(this, id,
version, b, off, len);
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
// the last possible key for this file
final byte[] key = keyBuilder.reset().appendText(id,
true/* unicode */, true/* successor */).append(version)
.append(-1L).getKey();
/*
* Figure out which index partition will absorb writes on the end of the
* file. We do this by finding the index partition that would contain
* the successor of the id and then considering its leftSeparator. If
* the leftSeparator is greater than the id then the id does not enter
* this index partition and we use the prior index partition. Otherwise
* the id enters this partition and we use it.
*
* Note: File versions allow us to avoid painful edge cases when a file
* has been deleted that spans more than one index partition. Since we
* never attempt to write on the deleted file version we are not faced
* with the problem of locating the largest index partition that
* actually has data for that file. When a large file has been deleted
* there can be EMPTY index partitions (containing only deleted entries)
* until the next compacting merge.
*/
return (Long) getFileDataIndex().submit(key, proc);
}
/**
* Return the maximum #of blocks in the file version. The return value
* includes any deleted but not yet eradicated blocks for the specified file
* version, so it represents an upper bound on the #of blocks that could be
* read for that file version.
* <p>
* Note: the block count only decreases when a compacting merge eradicates
* deleted blocks from an index partition. It will increase any time there
 * is a write on a block for the file version for which no entry (deleted
 * or otherwise) already exists. The only way to count the #of non-deleted
* blocks for a file version is to traverse the {@link #blocks(String, int)}
* iterator.
*
* @param id
* The file identifier.
* @param version
* The file version identifier.
*
* @return The #of blocks in that file.
*/
public long getBlockCount(String id, int version) {
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata()
.getKeyBuilder();
final byte[] fromKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.getKey();
final byte[] toKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version + 1)
.getKey();
final long nblocks = getFileDataIndex().rangeCount(fromKey, toKey);
if (INFO)
log.info("id=" + id + ", version=" + version + ", nblocks=" + nblocks);
return nblocks;
}
/**
* Return a {@link Writer} that will <em>append</em> character data on the
* file version. Characters written on the {@link Writer} will be converted
* to bytes using the specified encoding. Bytes will be buffered until the
* block is full and then written on the file version using an atomic
 * append. A {@link Writer#flush()} will force a non-empty partial block to
* be written immediately.
* <p>
* Note: Map/Reduce processing of a file version MAY be facilitated greatly
* by ensuring that "records" never cross a block boundary - this means that
* file versions can be split into blocks and blocks distributed to clients
* without any regard for the record structure within those blocks. The
* caller can prevent records from crossing block boundaries by the simple
* expediency of invoking {@link Writer#flush()} to force the atomic append
* of a (partial but non-empty) block to the file.
* <p>
* Since the characters are being converted to bytes, the caller MUST make
* {@link Writer#flush()} decisions with an awareness of the expansion rate
* of the specified encoding. For simplicity, it is easy to specify
* <code>UTF-16</code> in which case you can simply count two bytes
* written for each character written.
*
* @param id
* The file identifier.
* @param version
* The version identifier.
* @param encoding
* The character set encoding.
*
* @return The writer on which to write the character data.
*
* @throws UnsupportedEncodingException
*/
public Writer writer(String id, int version, String encoding)
throws UnsupportedEncodingException {
if(INFO)
log.info("id="+id+", version="+version+", encoding="+encoding);
return new OutputStreamWriter(outputStream(id, version), encoding);
}
/**
* Read character data from a file version.
*
* @param id
* The file identifier.
* @param version
* The version identifier.
* @param encoding
* The character set encoding.
*
* @return The reader from which you can read the character data.
*
* @throws UnsupportedEncodingException
*/
public Reader reader(String id, int version, String encoding) throws UnsupportedEncodingException {
if(INFO)
log.info("id="+id+", version="+version+", encoding="+encoding);
if (encoding == null) {
throw new IllegalArgumentException();
}
return new InputStreamReader(inputStream(id, version), encoding);
}
/**
* Read data from a file version.
* <p>
* Note: The input stream will remain coherent for the file version as of
* the time that the view on the file version is formed. Additional atomic
 * appends MAY be read, but that is NOT guaranteed. If the file is deleted
* and its data is expunged by a compacting merge during the read then the
* read MAY be truncated.
*
* @param id
* The file identifier.
* @param version
* The version identifier.
*
* @return An input stream from which the caller may read the data in the
* file -or- <code>null</code> if there is no data for that file
* version, including no deleted blocks pending garbage collection.
* An empty input stream MAY be returned since empty blocks are
* allowed. An empty stream will also be returned after a file
* version is deleted until the deleted blocks are eradicated from
* the file data index.
*/
public FileVersionInputStream inputStream(String id,int version) {
return inputStream(id, version, ITx.UNISOLATED);
}
/**
* Read data from a file version.
* <p>
* Some points about consistency and transaction identifiers.
* <ol>
*
 * <li> When using an {@link ITx#UNISOLATED} read, additional atomic writes
 * and atomic appends issued after the input stream view was formed MAY be
 * read, but that is NOT guaranteed - it depends on the buffering of the range
* iterator used to read blocks for the file version. Likewise, if the file
* is deleted and its data is expunged by a compacting merge during the read
* then the read MAY be truncated. </li>
*
* <li> It is possible to re-create historical states of a file version
* corresponding to a <em>commit point</em> for the
* {@link #getFileDataIndex() data index} provided that the relevant data has
* not been eradicated by a compacting merge. It is not possible to recover
* all states - merely committed states - since unisolated writes may be
* grouped together by group commit and therefore have the same commit
* point. </li>
*
* <li> It is possible to issue transactional read requests, but you must
* first open a transaction with an {@link ITransactionManagerService}. In general
* the use of full transactions is discouraged as the
* {@link BigdataFileSystem} is designed for high throughput and high
* concurrency with weaker isolation levels suitable for scale-out
* processing techniques including map/reduce.</li>
*
* </ol>
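 * <p>
 * For example, a sketch of a historical read as of an assumed, previously
 * recorded commit time <code>commitTime</code>:
 * <pre>
 * final InputStream is = repo.inputStream(id, version, -commitTime);
 * if (is != null) {
 *     // read the file version as of that commit point.
 * }
 * </pre>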
*
* @param id
* The file identifier.
* @param version
* The version identifier.
* @param tx
* The transaction identifier. This is generally either
* {@link ITx#UNISOLATED} to use an unisolated read -or-
* <code>- timestamp</code> to use a historical read for the
* most recent consistent state of the file data not later than
* <i>timestamp</i>.
*
* @return An input stream from which the caller may read the data in the
* file -or- <code>null</code> if there is no data for that file
* version, including no deleted blocks pending garbage collection.
* An empty input stream MAY be returned since empty blocks are
* allowed. An empty stream will also be returned after a file
* version is deleted until the deleted blocks are eradicated from
* the file data index.
*/
public FileVersionInputStream inputStream(String id, int version, long tx) {
if (INFO)
log.info("id=" + id + ", version=" + version + ", tx=" + tx);
/*
* Range count the file and version on the federation - this is the
* number of blocks of data for that file and version as of the start of
* this read operation. If the result is zero then there are no index
* partitions which span that file and version and we return null.
*
* Note: This step is skipped for historical and transactional reads
* since getBlockCount() does not accept the transaction identifier.
*/
if (tx == ITx.UNISOLATED && getBlockCount(id, version) == 0L) {
if (INFO)
log.info("No data: id=" + id + ", version=" + version);
return null;
}
/*
* Return an input stream that will progress through a range scan of the
* blocks for that file and version.
*/
final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder();
final byte[] fromKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version)
.getKey();
final byte[] toKey = keyBuilder.reset().appendText(id,
true/* unicode */, false/* successor */).append(version + 1)
.getKey();
/*
* The capacity is essentially the #of block addresses to transfer at a
* time, not the #of blocks. I've set a moderately low limit here since
* the blocks themselves need to be transferred as well, so there is
* little point in buffering too many block addresses.
*
* The addresses associated with a block identifier are updated when the
* block is re-written, so if you buffer a lot of block addresses here
* then updates to the blocks for the buffered identifiers will not be
* visible to the client.
*
* Finally, for very large files you may find that the block addresses
* grow stale (the resource on which they were written may be moved or
* deleted following a compacting merge), forcing a re-start of the read
* from the last visited block identifier.
*
* @todo handle automatic restart of the read from the next block
* identifier if we learn that the resource on which a block was written
* has been deleted.
*/
final int capacity = 1000;
// both keys and values.
final int flags = IRangeQuery.KEYS | IRangeQuery.VALS;
final ITupleIterator itr;
final IIndex dataIndex;
if (tx == ITx.UNISOLATED) {
dataIndex = getFileDataIndex();
} else {
/*
* Obtain the index view for that historical timestamp or isolated
* by the specified transaction.
*/
dataIndex = getIndexManager().getIndex(getNamespace()+"."+FILE_DATA_INDEX_BASENAME,tx);
}
itr = dataIndex
.rangeIterator(fromKey, toKey, capacity, flags, null/* filter */);
return new FileVersionInputStream(id, version, itr);
}
/**
* Return an output stream that will <em>append</em> on the file version.
* Bytes written on the output stream will be buffered until they are full
* blocks and then written on the file version using an atomic append. An
* {@link OutputStream#flush()} will force a non-empty partial block to be
* written immediately.
* <p>
* Note: Map/Reduce processing of a file version MAY be facilitated greatly
* by ensuring that "records" never cross a block boundary - this means that
* files can be split into blocks and blocks distributed to clients without
* any regard for the record structure within those blocks. The caller can
* prevent records from crossing block boundaries by the simple expediency
* of invoking {@link OutputStream#flush()} to force the atomic append of a
* (partial but non-empty) block to the file.
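 * <p>
 * A sketch of that technique (the record source is illustrative):
 * <pre>
 * final OutputStream os = repo.outputStream(id, version);
 * int bytesInBlock = 0;
 * for (byte[] record : records) {
 *     if (bytesInBlock + record.length &gt; repo.getBlockSize()) {
 *         os.flush(); // atomic append of the partial block.
 *         bytesInBlock = 0;
 *     }
 *     os.write(record);
 *     bytesInBlock += record.length;
 * }
 * os.close(); // flushes any remaining partial block.
 * </pre>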
*
* @param id
* The file identifier.
* @param version
* The version identifier.
*
* @return The output stream.
*/
public OutputStream outputStream(String id, int version) {
if(INFO)
log.info("id="+id+", version="+version);
return new FileVersionOutputStream(this, id, version);
}
/**
* Copies data from the input stream to the file version. The data is
* buffered into blocks. Each block is written on the file version using an
* atomic append. Writing an empty stream will cause an empty block to be
* appended (this ensures that read back will read an empty stream).
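 * <p>
 * For example (the source of the input stream is illustrative):
 * <pre>
 * final InputStream is = new FileInputStream("/tmp/source.dat");
 * final long nbytes = repo.copyStream(id, version, is);
 * </pre>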
*
* @param id
* The file identifier.
* @param version
* The version identifier.
* @param is
* The input stream (closed iff it is fully consumed).
*
* @return The #of bytes copied.
*/
public long copyStream(String id, int version, InputStream is) {
final FileVersionOutputStream os = (FileVersionOutputStream) outputStream(
id, version);
final long ncopied;
try {
ncopied = os.copyStream( is );
if (ncopied == 0) {
// force an empty block to be written.
appendBlock(id, version, new byte[]{}, 0, 0);
}
os.close();
} catch(IOException ex) {
throw new RuntimeException(ex);
}
return ncopied;
}
}