/* * Created on Jan 17, 2008 */ package com.bigdata.bfs; import java.io.DataInput; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Reader; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.UUID; import java.util.Vector; import org.apache.log4j.Logger; import com.bigdata.btree.IIndex; import com.bigdata.btree.IRangeQuery; import com.bigdata.btree.ITuple; import com.bigdata.btree.ITupleIterator; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.IndexTypeEnum; import com.bigdata.btree.keys.IKeyBuilder; import com.bigdata.btree.keys.KeyBuilder; import com.bigdata.btree.proc.ISimpleIndexProcedure; import com.bigdata.journal.IIndexManager; import com.bigdata.journal.IResourceLock; import com.bigdata.journal.ITx; import com.bigdata.journal.Journal; import com.bigdata.mdi.MetadataIndex; import com.bigdata.rawstore.IBlock; import com.bigdata.rawstore.WormAddressManager; import com.bigdata.relation.AbstractResource; import com.bigdata.relation.IDatabase; import com.bigdata.relation.RelationSchema; import com.bigdata.relation.locator.DefaultResourceLocator; import com.bigdata.search.FullTextIndex; import com.bigdata.service.IBigdataFederation; import com.bigdata.service.IDataService; import com.bigdata.sparse.AutoIncIntegerCounter; import com.bigdata.sparse.IRowStoreConstants; import com.bigdata.sparse.ITPS; import com.bigdata.sparse.ITPV; import com.bigdata.sparse.LogicalRowSplitHandler; import com.bigdata.sparse.Schema; import com.bigdata.sparse.SparseRowStore; import com.bigdata.util.Bytes; import cutthecrap.utils.striterators.Resolver; import cutthecrap.utils.striterators.Striterator; /** * A distributed file system with extensible metadata and atomic append * implemented using the bigdata scale-out architecture. Files have a client * assigned identifier, which is a Unicode string. The file identifier MAY be * structured so as to look like a hierarchical file system using any desired * convention. Files are versioned and historical versions MAY be accessed until * the next compacting merge discards their data. File data is stored in large * {@link #blockSize} blocks. Partial and even empty blocks are allowed and only * the data written will be stored. <code>2^63-1</code> distinct blocks may be * written per file version, making the maximum possible file size * <code>536,870,912</code> exabytes. Files may be used as queues, in which * case blocks containing new records are atomically appended while a map/reduce * style master consumes the head block of the file. * <p> * Efficient method are offered for streaming and block oriented IO. All block * read and write operations are atomic, including block append. Files may be * easily written such that records never cross a block boundary by the * expediency of flushing the output stream if a record would overflow the * current block. A flush forces the atomic write of a partial block. Partial * blocks are stored efficiently - only the bytes actually written are stored. * Blocks are large enough that most applications can safely store a large * number of logical records in each block. Files comprised of application * defined logical records organized into a sequence of blocks are well-suited * to map/reduce processing. 
They may be efficiently split at block boundaries * and references to the blocks distributed to clients. Likewise, reduce clients * can aggregate data into large files suitable for further map/reduce * processing. * <p> * The distributed file system uses two scale-out indices to support ACID * operations on file metadata and atomic file append. These ACID guarantees * arise from the use of unisolated operations on the respective indices and * therefore apply only to the individual file metadata or file block * operations. In particular, file metadata read and write are atomic and all * individual file block IO (read, write, and append) operations are atomic. * Atomicity is NOT guaranteed when performing more than a single file block IO * operation, e.g., multiple appends MIGHT NOT write sequential blocks since * other block operations could have intervened. * <p> * The content length of the file is not stored as file metadata. Instead it MAY * be estimated by a range count of the index entries spanned by the file's * data. The exact file size may be readily determined when reading small files * by the expediency of sucking the entire file into a buffer - all reads are at * least one block. Streaming processing is advised in all cases when handling * large files, including when the file is to be delivered via HTTP. * <p> * The {@link #getFileMetadataIndex() metadata index} uses a {@link SparseRowStore} * design, similar to Google's bigtable or Hadoop's HBase. All updates to file * version metadata are atomic. The primary key in the metadata index for every * file is its {@link FileMetadataSchema#ID}. In addition, each version of a file * has a distinct {@link FileMetadataSchema#VERSION} property. File creation time, * version creation time, and file version metadata update timestamps may be * recovered from the timestamps associated with the properties in the metadata * index. The use of the {@link FileMetadataSchema#CONTENT_TYPE} and * {@link FileMetadataSchema#CONTENT_ENCODING} properties is enforced by the * high-level {@link Document} interface. Applications are free to define * additional properties. * <p> * Each time a file is created a new version number is assigned. The data index * uses the {@link FileMetadataSchema#ID} as the first field in a compound key. The * second field is the {@link FileMetadataSchema#VERSION} - a 32-bit integer. The * remainder of the key is a 64-bit signed block identifier (2^63-1 distinct * block identifiers). The block identifiers are strictly monotonic (e.g., one * up) and their sequence orders the blocks into the logical byte order of the * file. * <p> * Operations that create a new file actually create a new file version. The old * file version will eventually be garbage collected depending on the policy in * effect for compacting merges. Likewise, operations that delete a file simply * mark the metadata for the file version as deleted and the file version will * be eventually reclaimed. The high-level {@link #update(Document)} operation * in fact simply creates a new file version. * <p> * <h2>Use cases</h2> * <p> * Use case: A REST-ful repository. Documents may be stored, updated, read, * deleted, and searched using a full text index. * <p> * Use case: A map/reduce master reads document metadata using an index scan. 
It * examines the data index's {@link MetadataIndex} (that is, the index that * knows where each partition of the scale-out data index is stored) and * determines which map clients are going to be "close" to each document and * then hands off the document to one of those map clients. * <p> * Use case: The same as the use case above, but large files are being processed * and there is a requirement to "break" the files into splits and hand off the * splits. This can be achieved by estimating the file size using a range * count and multiplying through by the block size. Blocks may be handed off to * the clients in parallel (of course, clients need to deal with the hassle of * processing files where records will cross split boundaries unless they always * pad out with unused bytes to the next {@link #blockSize} boundary). * <p> * Use case: A reduce client wants to write a very large file so it creates a * metadata record for the file and then does a series of atomic appends to the * file. The file may grow arbitrarily large. Clients may begin to read from the * file as soon as the first block has been flushed. * <p> * Use case: Queues MAY be built from the operations to atomically read or * delete the first block for the file version. The "design pattern" is to have * clients append blocks to the file version, taking care that logical rows * never cross a block boundary (e.g., by flushing partial blocks). A master * then reads the head block from the file version, distributing the logical * records therein to consumers and providing fail safe processing in case * consumers die or take too long. Once all records for the head block have been * processed the master simply deletes the head block. This "pattern" is quite * similar to map/reduce and, like map/reduce, requires that the consumer * operations may be safely re-run. * <p> * Use case: File replication, retention of deleted versions, and media indexing * are administered by creating "zones" comprising one or more index partitions * with a shared file identifier prefix, e.g., /tmp or /highly-available, or * /deployment-text-index. All files in a given zone share the same policy for * file replication, compacting merges (determining when a deleted or even a * non-deleted file version will be discarded), and media indexing. * <p> * Use case: File rename is NOT a cheap operation. It essentially creates a new * file version with the desired name and copies the data from the old file * version to the new file version. Finally the old file version is "deleted". * This approach is necessary since files may be moved from one "zone" to another * and since the file data must reside on the index partition(s) identified by * its file version. * * FIXME write a JSON API that interoperates to the extent possible with GAE and * HBASE. * * @todo implement "zones" and their various policies (replication, retention, * and media indexing). access control could also be part of the zones. * * @todo should compression be applied? applications are obviously free to apply * their own compression, but it could be convenient to store compressed * blocks. the caller could specify the compression method on a per block * basis (we don't want to look up the file metadata for this). the * compression method would be written into a block header. blocks can * always be decompressed by examining the header. * * @todo there should be some constraints on the file identifier but in general * it represents a client determined absolute file path name.
It is * certainly possible to use a flat file namespace, but you can just as * readily use a hierarchical one. Unicode characters are supported in the * file identifiers. * * @todo do we need a global lock mechanism to prevent concurrent high-level * create/update/delete of the same file? a distributed lease-based lock * system derived from jini or built ourselves? Can this be supported with * the historical and not yet purged timestamped metadata for the file? * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public class BigdataFileSystem extends AbstractResource<IDatabase<BigdataFileSystem>> implements IContentRepository, IRowStoreConstants { final protected static Logger log = Logger.getLogger(BigdataFileSystem.class); /** * True iff the {@link #log} level is INFO or less. */ final protected static boolean INFO = log.isInfoEnabled(); /** * True iff the {@link #log} level is DEBUG or less. */ final protected static boolean DEBUG = log.isDebugEnabled(); /** * Configuration options. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public static interface Options extends com.bigdata.journal.Options, KeyBuilder.Options { } /** * The #of offset bits. */ private final int offsetBits; /** The size of a file block. */ private final int blockSize; /** * The #of bits in a 64-bit long integer identifier that are used to encode * the byte offset of a record in the store as an unsigned integer. * * @see com.bigdata.journal.Options#OFFSET_BITS * @see #getBlockSize() */ public final int getOffsetBits() { return offsetBits; } /** * The size of a file block. Block identifiers are 64-bit signed integers. * The maximum file length is <code>2^63 - 1 </code> blocks ( 536,870,912 * Exabytes). * <p> * Note: The {@link BigdataFileSystem} makes the <strong>assumption</strong> * that the {@link com.bigdata.journal.Options#OFFSET_BITS} is the #of * offset bits configured for the {@link IDataService}s in the connected * {@link IBigdataFederation} and computes the * {@link BigdataFileSystem#getBlockSize()} based on that assumption. It is * NOT possible to write blocks on the {@link BigdataFileSystem} whose size * is greater than the maximum block size actually configured for the * {@link IDataService}s in the connected {@link IBigdataFederation}. * * @see com.bigdata.journal.Options#OFFSET_BITS * @see #getOffsetBits() */ public final int getBlockSize() { return blockSize; } /** * The maximum block identifier that can be assigned to a file version. * <p> * Note: This is limited to {@value Long#MAX_VALUE}-1 so that we can always * form the key greater than any valid key for a file version. This is * required by the atomic append logic when it seeks the next block * identifier. See {@link AtomicBlockAppendProc}. */ protected static final long MAX_BLOCK = Long.MAX_VALUE - 1; /** * The basename of the index in which the file metadata are stored. The * fully qualified name of the index uses {@link #getNamespace()} as a * prefix. * <p> * Note: This is a {@link SparseRowStore} governed by the * {@link FileMetadataSchema}. */ public static final String FILE_METADATA_INDEX_BASENAME = "fileMetadata"; /** * The basename of the index in which the file data blocks are stored. The * fully qualified name of the index uses {@link #getNamespace()} as a * prefix. * <p> * Note: The entries in this index are a series of blocks for a file. 
Blocks * are {@link #blockSize} bytes each and are assigned monotonically * increasing block numbers by the atomic append operation. The final block * may be smaller (there is no need to pad out the data with nulls). The * keys are formed from two fields - a field containing the content * identifier followed by an integer field containing the sequential block * number. A range scan with a fromKey of the file identifier and a toKey * computed using the successor of the file identifier will naturally visit * all blocks in a file in sequence. */ public static final String FILE_DATA_INDEX_BASENAME = "fileData"; public static final FileMetadataSchema metadataSchema = new FileMetadataSchema(); private SparseRowStore fileMetadataIndex; private IIndex fileDataIndex; protected static void assertString(Map<String, Object> properties, String name) { Object val = properties.get(name); if (val == null) throw new IllegalArgumentException(name + " is null"); if (!(val instanceof String)) throw new IllegalArgumentException(name + " must be String"); } protected static void assertLong(Map<String, Object> properties, String name) { Object val = properties.get(name); if (val == null) throw new IllegalArgumentException(name + " is null"); if (!(val instanceof Long)) throw new IllegalArgumentException(name + " must be Long"); } /** * Ctor specified by {@link DefaultResourceLocator}. * * @see Options */ public BigdataFileSystem(IIndexManager indexManager, String namespace, Long timestamp, Properties properties) { super(indexManager,namespace,timestamp,properties); /* * @todo This should probably be raised directly to a property reported * by the federation. Right now it relies on the same default logic * being replicated here and in AbstractJournal. */ offsetBits = Integer .parseInt(properties .getProperty( Options.OFFSET_BITS, Integer .toString((indexManager instanceof Journal ? WormAddressManager.SCALE_UP_OFFSET_BITS : WormAddressManager.SCALE_OUT_OFFSET_BITS)))); blockSize = WormAddressManager.getMaxByteCount(offsetBits) - 1; if (INFO) log.info("offsetBits=" + offsetBits + ", blockSize=" + blockSize); } /** * The index in which the file metadata is stored (the index must exist). */ public SparseRowStore getFileMetadataIndex() { if (fileMetadataIndex == null) { throw new IllegalStateException(); } return fileMetadataIndex; } /** * The index in which the file blocks are stored (the index must exist). */ public IIndex getFileDataIndex() { if (fileDataIndex == null) { throw new IllegalStateException(); } return fileDataIndex; } /** * <code>true</code> unless {{@link #getTimestamp()} is {@link ITx#UNISOLATED}. */ public boolean isReadOnly() { return getTimestamp() != ITx.UNISOLATED; } // final protected void assertWritable() { // // if(isReadOnly()) { // // throw new IllegalStateException("READ_ONLY"); // // } // // } /** * Note: A commit is required in order for a read-committed view to have * access to the registered indices. When running against an * {@link IBigdataFederation}, {@link ITx#UNISOLATED} operations will take * care of this for you. Otherwise you must do this yourself. */ @Override public void create() { assertWritable(); final IResourceLock resourceLock = acquireExclusiveLock(); try { final Properties tmp = getProperties(); // final int branchingFactor = Integer.parseInt(tmp.getProperty( // Options.BRANCHING_FACTOR, Options.DEFAULT_BRANCHING_FACTOR)); // set property that will let the contained relations locate their // container. 
tmp.setProperty(RelationSchema.CONTAINER, getNamespace()); super.create(); final IIndexManager indexManager = getIndexManager(); // setup metadata index. { /* * Note: This specifies a split handler that keeps the logical * row together. This is a hard requirement. The atomic * read/update guarantee depends on this. */ final String name = getNamespace()+"."+FILE_METADATA_INDEX_BASENAME; final IndexMetadata md = new IndexMetadata(indexManager, tmp, name, UUID.randomUUID(), IndexTypeEnum.BTree); // Ensure that splits do not break logical rows. md.setSplitHandler(LogicalRowSplitHandler.INSTANCE); indexManager.registerIndex(md); final IIndex ndx = indexManager.getIndex(name, getTimestamp()); fileMetadataIndex = new SparseRowStore(ndx); } // setup data index. { /* * @todo specify split handler that tends to keep the blocks for a * file together (soft requirement). */ final String name = getNamespace()+"."+FILE_DATA_INDEX_BASENAME; final IndexMetadata md = new IndexMetadata(indexManager, tmp, name, UUID.randomUUID(), IndexTypeEnum.BTree); /* * @todo unit tests for correct copying of blobs during overflow. * See {@link IOverflowHandler}. */ md.setOverflowHandler(new BlobOverflowHandler()); // register the index. indexManager.registerIndex(md); fileDataIndex = indexManager.getIndex(name,getTimestamp()); } } finally { unlock(resourceLock); } } @Override public void destroy() { assertWritable(); final IResourceLock resourceLock = acquireExclusiveLock(); try { getIndexManager().dropIndex(getNamespace()+"."+FILE_METADATA_INDEX_BASENAME); getIndexManager().dropIndex(getNamespace()+"."+FILE_DATA_INDEX_BASENAME); super.destroy(); } finally { unlock(resourceLock); } } /** * Creates a new file version from the specified metadata. The new file * version will not have any blocks. You can use either stream-oriented or * block oriented IO to write data on the newly created file version. * * @param metadata * The file metadata. * * @return The new version identifier. */ public int create(Map<String, Object> metadata) { if (metadata == null) throw new IllegalArgumentException(); // check required properties. assertString(metadata, FileMetadataSchema.ID); // clone the map since it may be unmodifiable. metadata = new HashMap<String, Object>(metadata); // auto-increment the last defined version counter. metadata.put(FileMetadataSchema.VERSION, AutoIncIntegerCounter.INSTANCE); // write the metadata (atomic operation). final ITPS tps = getFileMetadataIndex().write(metadataSchema, metadata, AUTO_TIMESTAMP_UNIQUE, null/* filter */, null/*precondition*/); final int version = (Integer) tps.get(FileMetadataSchema.VERSION).getValue(); if(INFO) log.info("Created new version: id=" + metadata.get(FileMetadataSchema.ID) + ", version=" + version); return version; } @Override public int create(final Document doc) { if (doc == null) throw new IllegalArgumentException(); final String id = doc.getId(); if (id == null) throw new RuntimeException("The " + FileMetadataSchema.ID + " property must be defined."); final Map<String,Object> metadata = doc.asMap(); // /* // * Verify content type was specified since we will write on the file // * version. // */ // assertString(metadata, MetadataSchema.CONTENT_TYPE); /* * Create a new file version. */ final int version = create( metadata ); /* * Copy data from the document. */ copyStream(id, version, doc.getInputStream()); return version; } /** * Reads the document metadata for the current version of the specified * file. * * @param id * The file identifier.
* * @return A read-only view of the file version that is capable of reading * the content from the repository -or- <code>null</code> iff * there is no current version for that file identifier. */ public Document read(String id) { RepositoryDocumentImpl doc = new RepositoryDocumentImpl(this, id); if (!doc.exists()) { // no current version for that document. if(INFO) log.info("No current version: id="+id); return null; } return doc; } /** * Return the file metadata for the version of the file associated with the * specified timestamp. * * @param id * The file identifier. * @param timestamp * The timestamp. * * @return A read-only view of the logical row of metadata for that file as * of that timestamp. * * @see ITPS * @see SparseRowStore#read(Schema, Object, long, com.bigdata.sparse.INameFilter) */ public ITPS readMetadata(final String id, final long timestamp) { return getFileMetadataIndex() .read(metadataSchema, id, timestamp/* fromTime */, timestamp + 1/* toTime */, null/* filter */); } /** * Update the metadata for the current file version. * * @param id * The file identifier. * * @param metadata * The properties to be written. A <code>null</code> value for * a property will cause the corresponding property to be * deleted. Properties not present in this map will NOT be * modified. * * @return The complete metadata for the current file version. */ public Map<String,Object> updateMetadata(String id, Map<String,Object> metadata) { // copy since the map might be unmodifyable. metadata = new HashMap<String,Object>(metadata); // set the id - this is required for the primary key. metadata.put(FileMetadataSchema.ID, id); // remove the version identifier if any - we do not want this modified! metadata.remove(FileMetadataSchema.VERSION); return getFileMetadataIndex().write(metadataSchema, metadata, AUTO_TIMESTAMP_UNIQUE, null/* filter */,null/*precondition*/).asMap(); } /** * Create a new file version using the supplied file metadata. * <p> * Note: This is essentially a delete + create operation. Since the combined * operation is NOT atomic it is possible that conflicts can arise when more * than one client attempts to update a file concurrently. * * @param doc * The file metadata. */ public int update(Document doc) { final Map<String,Object> metadata = doc.asMap(); final String id = (String) metadata.get(FileMetadataSchema.ID); // delete the existing file version (if any). delete( id ); // create a new file version using that metadata. return create( doc ); } /** * Note: A new file version is marked as deleted and then the file blocks * for the old version are deleted from the data index. This sequence means * (a) that clients attempting to read on the file using the high level API * will not see the file as soon as its metadata is updated; (b) that the * timestamp on the deleted version will be strictly LESS THAN the commit * time(s) when the file blocks are deleted, so reading from the timestamp * of the deleted version will let you see the deleted file blocks. This is * a deliberate convenience - if we were to delete the file blocks first * then we would not have ready access to a timestamp that would be before * the first file block delete and hence sufficient to perform a historical * read on the last state of the file before it was deleted. */ public long delete(String id) { final RepositoryDocumentImpl doc = (RepositoryDocumentImpl) read(id); if (!doc.exists()) { // no current version. 
log.warn("No current version: id=" + id); return 0L; } final int version = doc.getVersion(); /* * Mark the file version as deleted. * * Note: This only deletes the "version" property - the other properties * are not changed. However, the file version will be understood as * "deleted" by this class. */ { final Map<String, Object> metadata = new HashMap<String, Object>(); // primary key. metadata.put(FileMetadataSchema.ID, id); // delete marker. metadata.put(FileMetadataSchema.VERSION, null); getFileMetadataIndex().write(metadataSchema, metadata, AUTO_TIMESTAMP_UNIQUE, null/* filter */, null/*precondition*/); } /* * Delete blocks from the file version. * * Note: This is efficient in that it handles the delete on the data * service for each index partition. However, if the data spans more * than one index partition then the requests to delete the data on each * index partition are issued in sequence. A range-delete procedure * could be even more efficient since it can be parallelized when the * operation spans more than one index partition. */ long blockCount = 0; final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); // the key for {file,version} final byte[] fromKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .getKey(); // the key for {file,successor(version)} final byte[] toKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version + 1) .getKey(); final ITupleIterator itr = getFileDataIndex().rangeIterator(fromKey, toKey, 0/* capacity */, IRangeQuery.REMOVEALL, null/* filter */); while (itr.hasNext()) { itr.next(); blockCount++; } if(INFO) log.info("Deleted " + blockCount + " blocks : id=" + id + ", version=" + version); /* * There was a current version for the file. We have written a delete * marker and also deleted any blocks for that file version. */ return blockCount; } /** * Return an array describing all non-eradicated versions of a file. * <p> * This method returns all known version identifiers together with their * timestamps, thereby making it possible to read either the metadata or the * data for historical file versions - as long as the metadata and/or data * has not yet been eradicated. * <p> * The file metadata and data blocks for historical version(s) of a file * remain available until they are eradicated from their respective indices * by a compacting merge in which the history policies no longer preserve * those data. * <p> * In order to read the historical file metadata you need to know the * timestamp associated with the version identifier which you wish to read. * This should be the timestamp when that version was <em>deleted</em> MINUS * ONE in order to read the last valid metadata for the file version before * it was deleted. * <p> * Likewise, in order to read the historical version data you need to know * the version identifier which you wish to read as well as the timestamp. * In this case, use the timestamp when that version was <em>deleted</em> * in order to read the last committed state for the file version. * <p> * Historical file version metadata is eradicated atomically since the * entire logical row will be hosted on the same index partition. Either the * file version metadata is available or it is not. * <p> * Historical file version data is eradicated one index partition at a time. * If the file version spans more than one index partition then it may be * possible to read some blocks from the file but not others.
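 * <p>
 * For example, here is a minimal, unverified sketch of walking the version history to recover the metadata in effect just before a version was deleted (assuming <code>repo</code> is a reference to this {@link BigdataFileSystem} and <code>id</code> is an existing file identifier; error handling omitted):
 * <pre>
 * final ITPV[] history = repo.getAllVersionInfo(id);
 * for (ITPV tpv : history) {
 *     if (tpv.getValue() == null) {
 *         // A delete marker: the last valid metadata precedes this timestamp.
 *         final ITPS lastMetadata = repo.readMetadata(id, tpv.getTimestamp() - 1);
 *     }
 * }
 * </pre>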
* <p> * Historical file version metadata and data will remain available until * their governing history policy is no longer satisified. Therefore, when * in doubt, you can consult the history policy in force for the file to * determine whether or not its data may have been eradicated. * * @param id * The file identifier. * * @return An array containing (timestamp,version) tuples. Tuples where the * {@link ITPV#getValue()} returns <code>null</code> give the * timestamp at which a file version was <em>deleted</em>. Tuples * where the {@link ITPV#getValue()} returns non-<code>null</code> * give the timestamp at which a file version was <em>created</em>. * * @see #readMetadata(String, long), to read the file version metadata based * on a timestamp. * * @see #inputStream(String, int, long), to read the file data as of a * specific timestamp. * * @todo expose history policy for a file (from its zone metadata, which is * replicated onto the index partition metadata). Make sure that the * zone metadata is consistent for the file version metadata and file * version data. This means looking up the {@link IndexMetadata} for * the index partition in which the file data is stored. */ public ITPV[] getAllVersionInfo(String id) { /* * Query for all metadata for the file. */ ITPS tps = readMetadata(id,Long.MAX_VALUE); Vector<ITPV> vec = new Vector<ITPV>(); /* * Filter for only the version properties, skipping "delete" entries. */ Iterator<? extends ITPV> itr = tps.iterator(); while(itr.hasNext()) { ITPV tpv = itr.next(); if(!tpv.getName().equals(FileMetadataSchema.VERSION)) { // Not a version field. continue; } vec.add(tpv); } return vec.toArray(new ITPV[vec.size()]); } /** * @todo write tests. */ @SuppressWarnings("unchecked") public Iterator<? extends DocumentHeader> getDocumentHeaders(String fromId, String toId) { return new Striterator(getFileMetadataIndex().rangeIterator(metadataSchema, fromId, toId)).addFilter(new Resolver() { private static final long serialVersionUID = 1L; @Override protected Object resolve(Object arg0) { final ITPS tps = (ITPS) arg0; final String id = (String) tps.get( FileMetadataSchema.ID).getValue(); return new RepositoryDocumentImpl( BigdataFileSystem.this, id, tps); } }); } /** * Efficient delete of file metadata and file data for all files and file * versions spanned by the specified file identifiers. File versions are * marked "deleted" before the file blocks are deleted so that you can * read on historical file version with exactly the same semantics as * {@link #delete(String)}. */ public long deleteAll(String fromId, String toId) { final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); // the key for {fromId} final byte[] fromKey = keyBuilder.reset().appendText(fromId, true/* unicode */, false/* successor */).getKey(); // the key for {successor(toId)} final byte[] toKey = keyBuilder.reset().appendText(toId, true/* unicode */, true/* successor */).getKey(); // delete file metadata long ndeleted = 0; { /* * Delete the file version metadata for each document in the key * range by replacing its VERSION column value with a null value * (and updating the timestamp in the key). */ getFileMetadataIndex().getIndex().rangeIterator( fromKey, toKey, 0/* capacity */, IRangeQuery.CURSOR, new FileVersionDeleter( IRowStoreConstants.AUTO_TIMESTAMP_UNIQUE)); } // delete file blocks. 
{ final ITupleIterator itr = getFileDataIndex() .rangeIterator(fromKey, toKey, 0/* capacity */, IRangeQuery.REMOVEALL, null/* filter */); long blockCount = 0; while (itr.hasNext()) { itr.next(); blockCount++; } // tally the blocks deleted for this key range so that the count is reported to the caller. ndeleted += blockCount; } return ndeleted; } /** * FIXME Integrate with {@link FullTextIndex} to provide indexing and * search of file versions. Deleted file versions should be removed from the * text index. There should be explicit metadata on the file version in * order for it to be indexed. The text indexer will require content type * and encoding information in order to handle indexing. Low-level output * stream, writer, block write and block append operations will not trigger * the indexer since it depends on the metadata index to know whether or not * a file version should be indexed. However, you could explicitly submit a * file version for indexing. * <p> * Perhaps the best way to handle this is to queue document metadata up for * a distributed full text indexing service. The service accepts metadata * for documents from the queue and decides whether or not the document * should be indexed based on its metadata and how the document should be * processed if it is to be indexed. Those business rules would be * registered with the full text indexing service. (Alternatively, they can * be configured with the {@link BigdataFileSystem} and applied locally as * the blocks of the file are written into the repository. That's certainly * easier right off the bat.) * * @todo crawl or query job obtains a set of URLs, writing them onto a file. * <p> * m/r job downloads documents based on set of URLs, writing all * documents into a single file version. text-based downloads can be * record compressed and decompressed after the record is read. binary * downloads will be truncated at 64M and might be skipped altogether * if they exceed the block size (get images, but not wildly * large files). * <p> * m/r job extracts a simplified html format from the source image, * writing the result onto another file. this job will optionally * split documents into "pages" by breaking where necessary at * paragraph boundaries. * <p> * m/r job builds text index from simplified html format. * <p> * m/r job runs extractors on simplified html format, producing * rdf/xml which is written onto another file. The rdf/xml for each * harvested document is written as its own logical record, perhaps * one record per block. * <p> * concurrent batch load of rdf/xml into scale-out knowledge base. the * input is a single file comprised of blocks, each of which is an * rdf/xml file. */ public Iterator<String> search(String query) { throw new UnsupportedOperationException(); } /* * file data operations (read, atomic append). */ /** * Returns an iterator that visits all block identifiers for the file * version in sequence. * <p> * Note: This may be used to efficiently distribute blocks among a * population of clients, e.g., in a map/reduce paradigm. */ public Iterator<Long> blocks(String id, int version) { final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata() .getKeyBuilder(); final byte[] fromKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .getKey(); final byte[] toKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version + 1) .getKey(); // just the keys. final int flags = IRangeQuery.KEYS; // visits the keys for the file version in block order.
final ITupleIterator itr = getFileDataIndex().rangeIterator(fromKey, toKey, 0/* capacity */, flags, null/* filter */); // resolve keys to block identifiers. return new BlockIdentifierIterator( id, version, itr ); } /** * Copies blocks from one file version to another. The data in each block of * the source file version is copied into a new block that is appended to * the target file version. Empty blocks are copied. Partial blocks are NOT * combined. The block identifiers are NOT preserved since atomic append is * used to add blocks to the target file version. * * @param fromId * @param fromVersion * @param toId * @param toVersion * * @return The #of blocks copied. * * FIXME This could be made more efficient by sending the copy operation to * each index partition in turn. That would avoid having to copy the data * first to the client and thence to the target index partition. */ public long copyBlocks(String fromId, int fromVersion, String toId, int toVersion) { final Iterator<Long> src = blocks(fromId,fromVersion); long nblocks = 0L; while(src.hasNext()) { final long blockId = src.next(); // read block final byte[] block = readBlock(fromId, fromVersion, blockId); // write block. appendBlock(toId, toVersion, block, 0, block.length); nblocks++; } return nblocks; } /** * Atomic write of a block for a file version. * <p> * Note: You can write any valid block identifier at any time. If the block * exists then its data will be replaced. * <p> * Note: Writing blocks out of sequence can create "holes". Those holes may * be filled by later writing the "missing" blocks. * {@link #copyBlocks(String, int, String, int)} will renumber the blocks * and produce a dense sequence of blocks. * <p> * Note: Atomic append will always write the successor of the largest block * identifier already written on the file version. If you write block * {@link #MAX_BLOCK} then it will no longer be possible to append blocks to * that file version, but you can still write blocks using * {@link #writeBlock(String, int, long, byte[], int, int)}. * * @param id * The file identifier. * @param version * The file version. * @param block * The block identifier in [0:{@link #MAX_BLOCK}]. * @param b * The buffer containing the bytes to be written (at most * {@link #blockSize} bytes). * @param off * The offset of the 1st byte to be written. * @param len * The #of bytes to be written. * * @return <code>true</code> iff the block was overwritten (i.e., if the * block already exists, in which case its contents were replaced). * * @throws IllegalArgumentException * if <i>id</i> is <code>null</code> or an empty string. * @throws IllegalArgumentException * if <i>version</i> is negative. * @throws IllegalArgumentException * if <i>block</i> is negative. * @throws IllegalArgumentException * if <i>b</i> is <code>null</code>. * @throws IllegalArgumentException * if <i>off</i> is negative or greater than the length of the * byte[]. * @throws IllegalArgumentException * if <i>len</i> is negative or <i>off+len</i> is greater * than the length of the byte[]. * @throws IllegalArgumentException * if <i>len</i> is greater than {@link #blockSize}. * * @todo return the data for the old block instead in the case of an * overwrite?
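 * <p>
 * A minimal usage sketch (the <code>repo</code>, <code>id</code>, <code>version</code> and <code>getRecord()</code> names below are illustrative assumptions, not part of this class):
 * <pre>
 * final byte[] record = getRecord(); // at most blockSize bytes.
 * // write (or overwrite) block ZERO (0) of the file version.
 * final boolean overwritten = repo.writeBlock(id, version, 0L, record, 0, record.length);
 * </pre>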
*/ public boolean writeBlock(String id, int version, long block, byte[] b, int off, int len) { if (id == null || id.length() == 0) throw new IllegalArgumentException(); if (version < 0) throw new IllegalArgumentException(); if (block < 0L) { /* * Note: restriction implies 63-bit block identifier (no * negative #s). */ throw new IllegalArgumentException(); } if (block > MAX_BLOCK) { throw new IllegalArgumentException(); } if (b == null) throw new IllegalArgumentException(); if (off < 0 || off > b.length) throw new IllegalArgumentException("off="+off+", b.length="+b.length); if (len < 0 || off + len > b.length) throw new IllegalArgumentException("off="+off+", len="+len+", b.length="+b.length); if(len>blockSize) { throw new IllegalArgumentException(); } // construct the atomic write operation. final ISimpleIndexProcedure proc = new AtomicBlockWriteProc(this, id, version, block, b, off, len); final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); // the key for the {file,version,block} final byte[] key = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .append(block).getKey(); return (Boolean) getFileDataIndex().submit(key, proc); } /** * Atomic delete of the first block of the file version. * * @param id * The file identifier. * @param version * The version identifier. * * @return The block identifier of the deleted block -or- <code>-1L</code> * if nothing was deleted. */ public long deleteHead(String id, int version) { if (INFO) log.info("id=" + id + ", version=" + version); final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata() .getKeyBuilder(); // the key for {file,version} final byte[] fromKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .getKey(); // the key for {file,successor(version)} final byte[] toKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append( version + 1).getKey(); /* * The REMOVEALL flag together with a limit of ONE (1) is used to obtain * an atomic delete of the first block for this file version. */ final ITupleIterator itr = getFileDataIndex() .rangeIterator(fromKey, toKey, 1, // Note: limit is ONE block! IRangeQuery.KEYS|IRangeQuery.REMOVEALL, null/* filter */); if (!itr.hasNext()) { log.warn("Nothing to delete: id=" + id + ", version=" + version); return -1L; } /* * Consume the iterator but note that the block was already deleted if * this was a remote request. */ final long block = new BlockIdentifierIterator(id, version, itr).next(); if(INFO) log.info("id="+id+", version="+version+" : deleted block="+block); return block; } /** * Atomic delete of a block for a file version. * * @param id * The file identifier. * @param version * The version identifier. * @param block * The block identifier in [0:{@link #MAX_BLOCK}]. * * @return <code>true</code> iff the block was deleted. */ public boolean deleteBlock(String id, int version, long block) { if (id == null || id.length() == 0) throw new IllegalArgumentException(); if (version < 0) throw new IllegalArgumentException(); if (block < 0L) { /* * Note: restriction implies 63-bit block identifier (no * negative #s).
*/ throw new IllegalArgumentException(); } if (block > MAX_BLOCK) { throw new IllegalArgumentException(); } final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); final byte[] key = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .append(block).getKey(); /* * Note: The return value is just the serialized address of that block * on the journal (8 bytes). */ final boolean deleted = getFileDataIndex().remove(key) != null; return deleted; } /** * Atomic read of the first block of the file version. * * @param id * The file identifier. * @param version * The version identifier. * * @return The contents of the block -or- <code>null</code> iff there are * no blocks for that file version. Note that an empty block will * return an empty byte[] rather than <code>null</code>. */ public byte[] readHead(String id, int version) { /* * Set up a range scan that will span all blocks for the file version. We * are only interested in the first block, but this is how we get at its * data using an atomic read. */ final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); final byte[] fromKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .append(0L).getKey(); final byte[] toKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .append(Long.MAX_VALUE).getKey(); /* * Resolve the requested block : keys and data. */ final ITupleIterator itr = getFileDataIndex() .rangeIterator(fromKey, toKey, 1/* capacity */, IRangeQuery.KEYS | IRangeQuery.VALS, null/* filter */); if (!itr.hasNext()) { if (INFO) log.info("id=" + id + ", version=" + version + " : no blocks"); return null; } return readBlock(id, version, itr.next()); } /** * Atomic read of a block for a file version. * * @param id * The file identifier. * @param version * The version identifier. * @param block * The block identifier. * * @return The contents of the block -or- <code>null</code> iff the block * does not exist. Note that an empty block will return an empty * byte[] rather than <code>null</code>. * * @todo offer a variant that returns an {@link InputStream}? */ public byte[] readBlock(String id, int version, long block) { if (id == null) throw new IllegalArgumentException(); /* * Set up a range scan that will span exactly the specified block. * * Note: This uses a range scan because a lookup will return the address * of the block rather than its data! */ final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); final byte[] fromKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .append(block).getKey(); final byte[] toKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .append(block + 1).getKey(); /* * Resolve the requested block : keys and data. */ final ITupleIterator itr = getFileDataIndex() .rangeIterator(fromKey, toKey, 1/* capacity */, IRangeQuery.KEYS | IRangeQuery.VALS, null/* filter */); if (!itr.hasNext()) { if (INFO) log.info("id=" + id + ", version=" + version + ", block=" + block + " : does not exist"); return null; } return readBlock(id, version, itr.next()); } /** * Helper to read a block from an {@link ITuple}. * * @param id * @param version * @param tuple * @return */ private byte[] readBlock(String id, int version, ITuple tuple) { final byte[] key = tuple.getKey(); // decode the block identifier from the key.
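// Note: keys are formed as {unicode sort key(id), int32 version, int64 block}, so the trailing Bytes.SIZEOF_LONG bytes of the key encode the block identifier.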
// block = KeyBuilder.decodeLong(tuple.getKeyBuffer().array(), // tuple.getKeyBuffer().pos() - Bytes.SIZEOF_LONG); long block = KeyBuilder.decodeLong(key, key.length - Bytes.SIZEOF_LONG); final long addr; try { DataInput in = tuple.getValueStream(); addr = in.readLong(); } catch (IOException e) { throw new RuntimeException(e); } if (addr == 0L) { /* * Note: empty blocks are allowed and are recorded with 0L as * their address. */ if(INFO) log.info("id=" + id + ", version=" + version + ", block=" + block + " : empty block."); return new byte[]{}; } /* * Read the block from the backing store. */ final IBlock tmp = tuple.readBlock(addr); final int len = tmp.length(); if(INFO) log.info("id=" + id + ", version=" + version + ", block=" + block + " : " + len + " bytes"); // @todo reuse buffers, but must return {byte[],off,len} tuple. final byte[] data = new byte[len]; try { final int nread = tmp.inputStream().read(data, 0, len); if (nread != len) { throw new RuntimeException("Expecting to read " + len + " bytes but read " + nread + " bytes"); } } catch (IOException e) { throw new RuntimeException(e); } return data; } /** * Atomic append of a block to a file version. * * @param id * The file identifier. * @param version * The file version. * @param b * The buffer containing the data to be written. * @param off * The offset of the 1st byte to be written. * @param len * The #of bytes to be written in [0:{@link #blockSize}]. * * @return The block identifier for the written block. * * @throws IllegalArgumentException * if <i>id</i> is <code>null</code> or an empty string. * @throws IllegalArgumentException * if <i>version</i> is negative. * @throws IllegalArgumentException * if <i>b</i> is <code>null</code>. * @throws IllegalArgumentException * if <i>off</i> is negative or greater than the length of the * byte[]. * @throws IllegalArgumentException * if <i>len</i> is negative or <i>off+len</i> is greater * than the length of the byte[]. * @throws IllegalArgumentException * if <i>len</i> is greater than {@link #blockSize}. */ public long appendBlock(String id, int version, byte[] b, int off, int len) { if (id == null || id.length() == 0) throw new IllegalArgumentException(); if (version < 0) throw new IllegalArgumentException(); if (b == null) throw new IllegalArgumentException(); if (off < 0 || off > b.length) throw new IllegalArgumentException("off="+off+", b.length="+b.length); if (len < 0 || off + len > b.length) throw new IllegalArgumentException("off="+off+", len="+len+", b.length="+b.length); if (len > blockSize) { throw new IllegalArgumentException(); } // construct the atomic append operation. final ISimpleIndexProcedure proc = new AtomicBlockAppendProc(this, id, version, b, off, len); final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); // the last possible key for this file final byte[] key = keyBuilder.reset().appendText(id, true/* unicode */, true/* successor */).append(version) .append(-1L).getKey(); /* * Figure out which index partition will absorb writes on the end of the * file. We do this by finding the index partition that would contain * the successor of the id and then considering its leftSeparator. If * the leftSeparator is greater than the id then the id does not enter * this index partition and we use the prior index partition. Otherwise * the id enters this partition and we use it. * * Note: File versions allow us to avoid painful edge cases when a file * has been deleted that spans more than one index partition.
Since we * never attempt to write on the deleted file version we are not faced * with the problem of locating the largest index partition that * actually has data for that file. When a large file has been deleted * there can be EMPTY index partitions (containing only deleted entries) * until the next compacting merge. */ return (Long) getFileDataIndex().submit(key, proc); } /** * Return the maximum #of blocks in the file version. The return value * includes any deleted but not yet eradicated blocks for the specified file * version, so it represents an upper bound on the #of blocks that could be * read for that file version. * <p> * Note: the block count only decreases when a compacting merge eradicates * deleted blocks from an index partition. It will increase any time there * is a write on a block for the file version for which neither a delete nor * an undeleted entry exists. The only way to count the #of non-deleted * blocks for a file version is to traverse the {@link #blocks(String, int)} * iterator. * * @param id * The file identifier. * @param version * The file version identifier. * * @return The #of blocks in that file. */ public long getBlockCount(String id, int version) { final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata() .getKeyBuilder(); final byte[] fromKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .getKey(); final byte[] toKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version + 1) .getKey(); final long nblocks = getFileDataIndex().rangeCount(fromKey, toKey); if (INFO) log.info("id=" + id + ", version=" + version + ", nblocks=" + nblocks); return nblocks; } /** * Return a {@link Writer} that will <em>append</em> character data on the * file version. Characters written on the {@link Writer} will be converted * to bytes using the specified encoding. Bytes will be buffered until the * block is full and then written on the file version using an atomic * append. An {@link Writer#flush()} will force a non-empty partial block to * be written immediately. * <p> * Note: Map/Reduce processing of a file version MAY be facilitated greatly * by ensuring that "records" never cross a block boundary - this means that * file versions can be split into blocks and blocks distributed to clients * without any regard for the record structure within those blocks. The * caller can prevent records from crossing block boundaries by the simple * expediency of invoking {@link Writer#flush()} to force the atomic append * of a (partial but non-empty) block to the file. * <p> * Since the characters are being converted to bytes, the caller MUST make * {@link Writer#flush()} decisions with an awareness of the expansion rate * of the specified encoding. For simplicity, it is easy to specify * <code>UTF-16</code> in which case you can simply count two bytes * written for each character written. * * @param id * The file identifier. * @param version * The version identifier. * @param encoding * The character set encoding. * * @return The writer on which to write the character data. * * @throws UnsupportedEncodingException */ public Writer writer(String id, int version, String encoding) throws UnsupportedEncodingException { if(INFO) log.info("id="+id+", version="+version+", encoding="+encoding); return new OutputStreamWriter(outputStream(id, version), encoding); } /** * Read character data from a file version. * * @param id * The file identifier. * @param version * The version identifier. 
* @param encoding * The character set encoding. * * @return The reader from which you can read the character data. * * @throws UnsupportedEncodingException */ public Reader reader(String id, int version, String encoding) throws UnsupportedEncodingException { if(INFO) log.info("id="+id+", version="+version+", encoding="+encoding); if (encoding == null) { throw new IllegalStateException(); } return new InputStreamReader(inputStream(id, version), encoding); } /** * Read data from a file version. * <p> * Note: The input stream will remain coherent for the file version as of * the time that the view on the file version is formed. Additional atomic * appends MAY be read, but that is NOT guaranteed. If the file is deleted * and its data is expunged by a compacting merge during the read then the * read MAY be truncated. * * @param id * The file identifier. * @param version * The version identifier. * * @return An input stream from which the caller may read the data in the * file -or- <code>null</code> if there is no data for that file * version, including no deleted blocks pending garbage collection. * An empty input stream MAY be returned since empty blocks are * allowed. An empty stream will also be returned after a file * version is deleted until the deleted blocks are eradicated from * the file data index. */ public FileVersionInputStream inputStream(String id,int version) { return inputStream(id, version, ITx.UNISOLATED); } /** * Read data from a file version. * <p> * Some points about consistency and transaction identifiers. * <ol> * * <li> When using an {@link ITx#UNISOLATED} read, additional atomic writes and * atomic appends issued after the input stream view was formed MAY be read, * but that is NOT guaranteed - it depends on the buffering of the range * iterator used to read blocks for the file version. Likewise, if the file * is deleted and its data is expunged by a compacting merge during the read * then the read MAY be truncated. </li> * * <li> It is possible to re-create historical states of a file version * corresponding to a <em>commit point</em> for the * {@link #getFileDataIndex() data index} provided that the relevant data has * not been eradicated by a compacting merge. It is not possible to recover * all states - merely committed states - since unisolated writes may be * grouped together by group commit and therefore have the same commit * point. </li> * * <li> It is possible to issue transactional read requests, but you must * first open a transaction with an {@link ITransactionManagerService}. In general * the use of full transactions is discouraged as the * {@link BigdataFileSystem} is designed for high throughput and high * concurrency with weaker isolation levels suitable for scale-out * processing techniques including map/reduce.</li> * * </ol> * * @param id * The file identifier. * @param version * The version identifier. * @param tx * The transaction identifier. This is generally either * {@link ITx#UNISOLATED} to use an unisolated read -or- * <code>- timestamp</code> to use a historical read for the * most recent consistent state of the file data not later than * <i>timestamp</i>. * * @return An input stream from which the caller may read the data in the * file -or- <code>null</code> if there is no data for that file * version, including no deleted blocks pending garbage collection. * An empty input stream MAY be returned since empty blocks are * allowed.
An empty stream will also be returned after a file * version is deleted until the deleted blocks are eradicated from * the file data index. */ public FileVersionInputStream inputStream(String id, int version, long tx) { if (INFO) log.info("id=" + id + ", version=" + version + ", tx=" + tx); /* * Range count the file and version on the federation - this is the * number of blocks of data for that file and version as of the start of * this read operation. If the result is zero then there are no index * partitions which span that file and version and we return null. * * Note: This step is skipped for historical and transactional reads * since getBlockCount() does not accept the transaction identifier. */ if (tx == ITx.UNISOLATED && getBlockCount(id, version) == 0L) { if (INFO) log.info("No data: id=" + id + ", version=" + version); return null; } /* * Return an input stream that will progress through a range scan of the * blocks for that file and version. */ final IKeyBuilder keyBuilder = getFileDataIndex().getIndexMetadata().getKeyBuilder(); final byte[] fromKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version) .getKey(); final byte[] toKey = keyBuilder.reset().appendText(id, true/* unicode */, false/* successor */).append(version + 1) .getKey(); /* * The capacity is essentially the #of block addresses to transfer at a * time, not the #of blocks. I've set a moderately low limit here since * the blocks themselves need to be transferred as well, so there is * little point in buffering too many block addresses. * * The addresses associated with a block identifier are updated when the * block is re-written, so if you buffer a lot of block addresses here * then updates to the blocks for the buffered identifiers will not be * visible to the client. * * Finally, for very large files you may find that the block addresses * grow stale (the resource on which they were written may be moved or * deleted following a compacting merge), forcing a re-start of the read * from the last visited block identifier. * * @todo handle automatic restart of the read from the next block * identifier if we learn that the resource on which a block was written * has been deleted. */ final int capacity = 1000; // both keys and values. final int flags = IRangeQuery.KEYS | IRangeQuery.VALS; final ITupleIterator itr; final IIndex dataIndex; if (tx == ITx.UNISOLATED) { dataIndex = getFileDataIndex(); } else { /* * Obtain the index view for that historical timestamp or isolated * by the specified transaction. */ dataIndex = getIndexManager().getIndex(getNamespace()+"."+FILE_DATA_INDEX_BASENAME,tx); } itr = dataIndex .rangeIterator(fromKey, toKey, capacity, flags, null/* filter */); return new FileVersionInputStream(id, version, itr); } /** * Return an output stream that will <em>append</em> on the file version. * Bytes written on the output stream will be buffered until they are full * blocks and then written on the file version using an atomic append. An * {@link OutputStream#flush()} will force a non-empty partial block to be * written immediately. * <p> * Note: Map/Reduce processing of a file version MAY be facilitated greatly * by ensuring that "records" never cross a block boundary - this means that * files can be split into blocks and blocks distributed to clients without * any regard for the record structure within those blocks. 
The caller can * prevent records from crossing block boundaries by the simple expediency * of invoking {@link OutputStream#flush()} to force the atomic append of a * (partial but non-empty) block to the file. * * @param id * The file identifier. * @param version * The version identifier. * * @return The output stream. */ public OutputStream outputStream(String id, int version) { if(INFO) log.info("id="+id+", version="+version); return new FileVersionOutputStream(this, id, version); } /** * Copies data from the input stream to the file version. The data is * buffered into blocks. Each block is written on the file version using an * atomic append. Writing an empty stream will cause an empty block to be * appended (this ensures that read back will read an empty stream). * * @param id * The file identifier. * @param version * The version identifier. * @param is * The input stream (closed iff it is fully consumed). * * @return The #of bytes copied. */ public long copyStream(String id, int version, InputStream is) { final FileVersionOutputStream os = (FileVersionOutputStream) outputStream( id, version); final long ncopied; try { ncopied = os.copyStream( is ); if (ncopied == 0) { // force an empty block to be written. appendBlock(id, version, new byte[]{}, 0, 0); } os.close(); } catch(IOException ex) { throw new RuntimeException(ex); } return ncopied; } }
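/*
 * A rough end-to-end usage sketch (illustration only, not part of the API; the
 * repository reference and the file identifier below are assumptions, and
 * exception handling is omitted): create a file version, append character
 * data, and read it back.
 *
 * final BigdataFileSystem repo = ...; // e.g., located via the DefaultResourceLocator.
 * final Map<String, Object> metadata = new HashMap<String, Object>();
 * metadata.put(FileMetadataSchema.ID, "/tmp/example.txt");
 * metadata.put(FileMetadataSchema.CONTENT_TYPE, "text/plain");
 * metadata.put(FileMetadataSchema.CONTENT_ENCODING, "UTF-8");
 * final int version = repo.create(metadata); // new file version, no blocks yet.
 * final Writer w = repo.writer("/tmp/example.txt", version, "UTF-8");
 * w.write("hello world\n");
 * w.flush(); // forces an atomic append of the (partial) block.
 * w.close();
 * final Reader r = repo.reader("/tmp/example.txt", version, "UTF-8");
 */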