/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.bigdata.journal; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.nio.channels.AsynchronousCloseException; import java.nio.channels.ClosedChannelException; import java.nio.channels.FileChannel; import java.util.Map; import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Exchanger; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import com.bigdata.BigdataStatics; import com.bigdata.btree.BTree.Counter; import com.bigdata.counters.AbstractStatisticsCollector; import com.bigdata.counters.CounterSet; import com.bigdata.counters.Instrument; import com.bigdata.counters.OneShotInstrument; import com.bigdata.io.DirectBufferPool; import com.bigdata.io.FileChannelUtility; import com.bigdata.io.IBufferAccess; import com.bigdata.io.IReopenChannel; import com.bigdata.journal.WORMStrategy.StoreCounters; import com.bigdata.rawstore.IRawStore; import com.bigdata.resources.StoreManager.ManagedJournal; import com.bigdata.util.Bytes; /** * Disk-based journal strategy. * <p> * Writes are buffered in a write cache. The cache is flushed when it would * overflow. As a result only large sequential writes are performed on the * store. Reads read through the write cache for consistency. * <p> * Note: This is used to realize both the {@link BufferMode#Disk} and the * {@link BufferMode#Temporary} {@link BufferMode}s. When configured for the * {@link BufferMode#Temporary} mode: the root blocks will not be written onto * the disk, writes will not be forced, and the backing file will be created the * first time the {@link DiskOnlyStrategy} attempts to write through to the * disk. For many scenarios, the backing file will never be created unless the * write cache overflows. This provides very low latency on start-up, the same * MRMW capability, and allows very large temporary stores. * * FIXME Examine behavior when write caching is enabled/disabled for the OS. * This has a profound impact. Asynchronous writes of multiple buffers, and the * use of smaller buffers, may be absolutely required when the write cache is * disabled. It may be that swapping sets in because the Windows write cache is being * overworked, in which case doing incremental and async IO would help. Compare * with behavior on server platforms.
See * http://support.microsoft.com/kb/259716, * http://www.accucadd.com/TechNotes/Cache/WriteBehindCache.htm, * http://msdn2.microsoft.com/en-us/library/aa365165.aspx, * http://www.jasonbrome.com/blog/archives/2004/04/03/writecache_enabled.html, * http://support.microsoft.com/kb/811392, * http://mail-archives.apache.org/mod_mbox/db-derby-dev/200609.mbox/%3C44F820A8.6000000@sun.com%3E * * <pre> * /sbin/hdparm -W 0 /dev/hda 0 Disable write caching * /sbin/hdparm -W 1 /dev/hda 1 Enable write caching * </pre> * * @todo report whether or not the on-disk write cache is enabled for each * platform in {@link AbstractStatisticsCollector}. Offer guidance on how * to disable that write cache. * * @todo The flush of the write cache could be made asynchronous if we had two * write buffers, but that increases the complexity significantly. It * would have to be synchronous if invoked from {@link #force(boolean)} in * any case (or rather force would have to flush all buffers). * <p> * Reconsider a 2nd buffer so that we can avoid waiting on the writes to * disk. Use * {@link Executors#newSingleThreadExecutor(java.util.concurrent.ThreadFactory)} * to obtain the 2nd (daemon) thread and an {@link Exchanger}. * <p> * Consider the generalization where a WriteCache encapsulates the logic * that exists in this class and where we have a {@link BlockingQueue} of * available write caches. There is one "writable" writeCache object at * any given time, unless we are blocked waiting for one to show up on the * availableQueue. When a WriteCache is full it is placed onto a * writeQueue. A thread reads from the writeQueue and performs writes, * placing empty WriteCache objects onto the availableQueue. Sync places * the current writeCache on the writeQueue and then waits on the * writeQueue to be empty. Large objects could be wrapped and written out * using the same mechanisms but should not become "available" again after * they are written (an illustrative sketch of this hand-off appears below). * <p> * Consider that a WriteCache also doubles as a read cache IF we create * write cache objects encapsulating reads that we read directly from the * disk rather than from a WriteCache. In this case we might do a larger * read so as to populate more of the WriteCache object in the hope that * we will have more hits in that part of the journal. * <p> * Modify force to use an atomic handoff of the write cache so that the * net result is atomic from the perspective of the caller. This may * require locking on the write cache so that we wait until concurrent * writes have finished before flushing to the disk or I may be able to * use nextOffset to make an atomic determination of the range of the * buffer to be forced, create a view of that range, and use the view to * force to disk so that the position and limits are not changed by force * nor by concurrent writers - this may also be a problem for the Direct * mode and the Mapped mode, at least if they use a write cache. * <p> * Async cache writes are also useful if the disk cache is turned off and * could gain importance in offering tighter control over IO guarantees. * * @todo test verifying that large records are written directly and that the * write cache is properly flushed beforehand. * * @todo test verifying that the write cache can be disabled. * * @todo test verifying that {@link #writeCacheOffset} is restored correctly on * restart (i.e., you can continue to append to the store after restart and * the result is valid).
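 *
 * <p>
 * An illustrative sketch (an assumption, not part of this class) of the
 * queue-based WriteCache hand-off described in the note above. The names
 * [availableQueue], [writeQueue], [current] and [drain] are hypothetical:
 *
 * <pre>
 * final BlockingQueue&lt;WriteCache&gt; availableQueue = new LinkedBlockingQueue&lt;WriteCache&gt;();
 * final BlockingQueue&lt;WriteCache&gt; writeQueue = new LinkedBlockingQueue&lt;WriteCache&gt;();
 *
 * // Writer side: when the current cache fills up, hand it to the drain
 * // thread and take an empty cache, blocking until one is recycled.
 * writeQueue.put(current);
 * current = availableQueue.take();
 *
 * // Drain thread: flush each full cache onto the disk, then recycle it.
 * final Runnable drain = new Runnable() {
 *     public void run() {
 *         try {
 *             for (;;) {
 *                 final WriteCache full = writeQueue.take();
 *                 full.flush();             // one large sequential write
 *                 availableQueue.put(full); // becomes "available" again
 *             }
 *         } catch (InterruptedException ex) {
 *             // allow shutdown
 *         }
 *     }
 * };
 * </pre>
 *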
* * @todo test verifying that the buffer position and limit are updated correctly * by {@link #write(ByteBuffer)} regardless of the code path. * * @todo Retro fit the concept of a write cache into the * {@link DirectBufferStrategy} so that we defer writes onto the disk * until (a) a threshold of data has been buffered; or (b) * {@link #force(boolean)} is invoked. Note that the implementation will * be a bit different since the Direct mode is already fully buffered so * we do not need to allocate a separate writeCache. However, we will * still need to track the {@link #writeCacheOffset} and maintain a * {@link #writeCacheIndex}. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ * * @see BufferMode#Disk * @see BufferMode#Temporary * * @deprecated This has been replaced by {@link WORMStrategy}. */ public class DiskOnlyStrategy extends AbstractBufferStrategy implements IDiskBasedStrategy { /** * The file. */ private final File file; /** * The mode used to open that file. */ private final String fileMode; /** * <code>true</code> iff configured as a {@link BufferMode#Temporary} store. */ private final boolean temporaryStore; /** * The backing file for a {@link BufferMode#Temporary} store is not opened * until the {@link #writeCache} is flushed to disk for the first time. In * these scenarios this field will be <code>false</code> until the * {@link #writeCache} is flushed and <code>true</code> thereafter. For * {@link BufferMode#Disk}, this field is initially <code>true</code>. * <p> * The value of this field determines the behavior of * {@link #reopenChannel()}. */ private boolean fileOpened; /** * The IO interface for the file - <strong>use * {@link #getRandomAccessFile()} rather than this field</strong>. */ /*private*/ /*final*/ RandomAccessFile raf; /** * The size of the journal header, including MAGIC, version, and both root * blocks. This is used as an offset when computing the address of a record * in an underlying file and is ignored by buffer modes that are not backed * by a file (e.g., transient) or that are memory mapped (since the map is * setup to skip over the header) */ private final int headerSize; /** * Extent of the file. This value should be valid since we obtain an * exclusive lock on the file when we open it. */ private long extent; private long userExtent; // /** // * Optional read cache. // * <p> // * Note: When enabled, records are entered iff there is a miss on a read. // * Written records are NOT entered into the read cache since (when the // * {@link #writeCache} is enabled), recently written records are already in // * the {@link #writeCache}. // * <p> // * Note: The higher-level data structures use the {@link LRUNexus}, which // * provides a read cache of the decompressed records. For this reason there // * is little reason to enable this lower-level read cache. // */ // private LRUCache<Long, byte[]> readCache = null; // // /** // * The maximum size of a record that may enter the {@link #readCache}. // * Records larger than this are not cached. // */ // private int readCacheMaxRecordSize = 0; /** * Optional {@link WriteCache}. */ final private WriteCache writeCache; /** * The next offset at which data in the {@link #writeCache} will be written * on the disk. The offset is relative to the start of the user data space. * Offset zero(0) addresses the first byte after the root blocks. 
* * @see FileMetadata#nextOffset * @see AbstractBufferStrategy#nextOffset */ private long writeCacheOffset; private class WriteCache { /** * The buffer used to absorb writes that are destined for the disk. Writes * are simply appended into this buffer until it would overflow. On * overflow, {@link #flushWriteCache()} is invoked to flush the data to the * disk (without synchronizing the disk). If a record is too large to fit * into this buffer, then the write cache is flushed and the record is * written directly on the disk. * <p> * Note: We must clone the data since the * {@link IRawStore#write(ByteBuffer)} contract says that the caller can * reuse the buffer once we return. In order to minimize heap churn we simply * copy the data into {@link #buf}, a {@link ByteBuffer} that * buffers recently written records. Writes are deferred until the buffer * would overflow and then all buffered records are written at once onto the disk. * <p> * In order to ensure consistency we read through the {@link #buf} in * {@link #read(long)}. Otherwise a {@link #write(ByteBuffer)} could return * and a subsequent read on the record while it is in the * {@link #buf} would "miss" causing us to read through to the disk * (which would not have the correct data). * <p> * Note: The write cache design assumes an "append only" store. In * particular, it assumes that the application data records are written * in a purely sequential manner at the end of the file (the root blocks are * outside of the application data). Either the write cache must be disabled * or a different design must be used if you are using a store where records * may be deleted and recycled. * <p> * The write cache offers a 27% performance gain when compared to the same * condition without the write cache as measured by * {@link AbstractMRMWTestCase}. */ private IBufferAccess buf; /** * An index into the write cache used for read through on the cache. The * keys are the addresses that would be used to read the corresponding * record. The values are the position in {@link #buf} where that record * is buffered. A cache miss means that you need to read the record from * the disk. */ final private Map<Long,Integer> writeCacheIndex; // /** // * The starting position in the buffer for data that has not been // * written to the disk. // * // * @see Task // */ // private int start = 0; /** * Create a {@link WriteCache} from a caller-supplied buffer. * <p> * Note: {@link FileChannel} IO is in fact performed using a direct * {@link ByteBuffer}. When the caller supplies a {@link ByteBuffer} * that is allocated on the Java heap as opposed to in native memory a * temporary direct {@link ByteBuffer} will be allocated for the IO * operation. The JVM can fail to release this temporary direct * {@link ByteBuffer}, resulting in a memory leak. For this reason, the * write cache should be a direct {@link ByteBuffer} and the same direct * {@link ByteBuffer} instance should be shared when overflow causes the * live journal to be replaced by a new live journal. This bug * forces us to pass in the write cache {@link ByteBuffer} directly via * the {@link ManagedJournal} constructor. * * @see http://bugs.sun.com/bugdatabase/view_bug.do;jsessionid=8fab76d1d4479fffffffffa5abfb09c719a30?bug_id=6210541 * * @param writeCache * The {@link IBufferAccess} for the direct {@link ByteBuffer} to * be used as the write cache (required). */ public WriteCache(final IBufferAccess writeCache) { if (writeCache == null) throw new IllegalArgumentException(); // save reference to the write cache.
this.buf = writeCache; // the capacity of the buffer in bytes. final int capacity = writeCache.buffer().capacity(); /* * Discard anything in the buffer, resetting the position to zero, * the mark to zero, and the limit to the capacity. */ writeCache.buffer().clear(); /* * An estimate of the #of records that might fit within the write * cache. This is based on an assumption that the "average" record * is 1k. This is used solely to assign the initial capacity to the * writeCacheIndex. */ final int indexDefaultCapacity = capacity / (1 * Bytes.kilobyte32); // allocate and initialize the write cache index. writeCacheIndex = new ConcurrentHashMap<Long, Integer>(indexDefaultCapacity); } /** * The current position in the buffer. */ final int position() { return buf.buffer().position(); } /** * The capacity of the buffer. */ final int capacity() { return buf.buffer().capacity(); } void flush() { // #of bytes to write on the disk. final int nbytes = buf.buffer().position(); if (nbytes == 0) return; // limit := position; position := 0; buf.buffer().flip(); // write the data on the disk file. writeOnDisk(buf.buffer(), writeCacheOffset, true/*append*/); // position := 0; limit := capacity. buf.buffer().clear(); // clear the index since all records were flushed to disk. writeCacheIndex.clear(); } /** * Write the record on the cache. * * @param addr * The address assigned to that record in the journal. * * @param data * The record. */ void write(final long addr, final ByteBuffer data) { // the position() at which the record is cached. final int position = buf.buffer().position(); // copy the record into the cache. buf.buffer().put(data); // add the record to the write cache index for read(addr). writeCacheIndex.put(Long.valueOf(addr), Integer.valueOf(position)); } /** * Read a record from the write cache. * * @param addr * The address assigned to that record in the journal. * @param nbytes * The length of the record (decoded from the address by the * caller). * * @return A read-write view onto the record in the write cache buffer * -or- <code>null</code> iff the record does not lie within * this {@link WriteCache}. * <p> * Note: The caller MUST copy the data from the view since * concurrent operations may result in the write cache being * flushed and the view overwritten with new data. * <p> * Note: A read-write view is returned in order to support * {@link DiskOnlyStrategy#update(long, int, ByteBuffer)} * for those cases when the record to be updated is still in * the {@link WriteCache}. */ ByteBuffer read(final long addr, final int nbytes) { /* * The return value is the position in the writeCache where that * record starts and [null] if the record is not in the writeCache. */ final Integer writeCachePosition = writeCacheIndex.get(addr); if (writeCachePosition == null) { // The record is not in this write cache. return null; } // the start of the record in writeCache. final int pos = writeCachePosition; // create a view with same offset, limit and position. final ByteBuffer tmp = buf.buffer().duplicate(); // adjust the view to just the record of interest. tmp.limit(pos + nbytes); tmp.position(pos); /* * Return a slice using that view - this restricts the caller to only * those bytes exposed by the slice. */ return tmp.slice(); } } /** * Need to override commit to ensure the writeCache is flushed prior to * writing the root block. * * For the DiskOnlyStrategy flushing the writeCache also ensures the backing * file is created if the file is temporary.
* * Note that the internal call to flush the writeCache must be synchronized * or concurrent writers to the cache will cause problems. */ @Override public void commit() { if (writeCache != null) { synchronized (this) { flushWriteCache(); } } super.commit(); } /** * Writes the {@link #writeCache} through to the disk and its position is * reset to zero. * <p> * The caller MUST be synchronized on <i>this</i>. */ void flushWriteCache() { if (writeCache == null) return; writeCache.flush(); // storeCounters.ncacheFlush++; } final public int getHeaderSize() { return headerSize; } final public File getFile() { return file; } /** * Note: This MAY be <code>null</code>. If {@link BufferMode#Temporary} * is used then it WILL be <code>null</code> until the {@link #writeCache} * is flushed to disk for the first time. */ final public RandomAccessFile getRandomAccessFile() { return raf; } /** * Note: This MAY be <code>null</code>. If {@link BufferMode#Temporary} * is used then it WILL be <code>null</code> until the {@link #writeCache} * is flushed to disk for the first time. */ final public FileChannel getChannel() { final RandomAccessFile raf = getRandomAccessFile(); if (raf == null) return null; return raf.getChannel(); } // /** // * Counters for {@link IRawStore} access, including operations that read or // * write through to the underlying media. // * // * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> // * @version $Id$ // * // * @todo report elapsed time and average latency for force, reopen, and // * writeRootBlock. // * // * @todo counters need to be atomic if we want to avoid the possibility of // * concurrent <code>x++</code> operations failing to correctly // * increment <code>x</code> for each request. // */ // public static class StoreCounters { // // /** // * #of read requests. // */ // public long nreads; // // /** // * #of read requests that are satisfied by our write cache (vs the // * OS or disk level write cache). // */ // public long ncacheRead; // // /** // * #of read requests that read through to the backing file. // */ // public long ndiskRead; // // /** // * #of bytes read. // */ // public long bytesRead; // // /** // * #of bytes that have been read from the disk. // */ // public long bytesReadFromDisk; // // /** // * The size of the largest record read. // */ // public long maxReadSize; // // /** // * Total elapsed time for reads. // */ // public long elapsedReadNanos; // // /** // * Total elapsed time checking the disk write cache for records to be // * read. // */ // public long elapsedCacheReadNanos; // // /** // * Total elapsed time for reading on the disk. // */ // public long elapsedDiskReadNanos; // // /** // * #of write requests. // */ // public long nwrites; // // /** // * #of write requests that are absorbed by our write cache (vs the OS or // * disk level write cache). // */ // public long ncacheWrite; // // /** // * #of times the write cache was flushed to disk. // */ // public long ncacheFlush; // // /** // * #of write requests that write through to the backing file. // */ // public long ndiskWrite; // // /** // * The size of the largest record written. // */ // public long maxWriteSize; // // /** // * #of bytes written. // */ // public long bytesWritten; // // /** // * #of bytes that have been written on the disk. // */ // public long bytesWrittenOnDisk; // // /** // * Total elapsed time for writes. 
// */ // public long elapsedWriteNanos; // // /** // * Total elapsed time writing records into the cache (does not count // * time to flush the cache when it is full or to write records that do // * not fit in the cache directly to the disk). // */ // public long elapsedCacheWriteNanos; // // /** // * Total elapsed time for writing on the disk. // */ // public long elapsedDiskWriteNanos; // // /** // * #of times the data were forced to the disk. // */ // public long nforce; // // /** // * #of times the length of the file was changed (typically, extended). // */ // public long ntruncate; // // /** // * #of times the file has been reopened after it was closed by an // * interrupt. // */ // public long nreopen; // // /** // * #of times one of the root blocks has been written. // */ // public long nwriteRootBlock; // // /** // * Initialize a new set of counters. // */ // public StoreCounters() { // // } // // /** // * Copy ctor. // * @param o // */ // public StoreCounters(final StoreCounters o) { // // add( o ); // // } // // /** // * Adds counters to the current counters. // * // * @param o // */ // public void add(final StoreCounters o) { // // nreads += o.nreads; // ncacheRead += o.ncacheRead; // ndiskRead += o.ndiskRead; // bytesRead += o.bytesRead; // bytesReadFromDisk += o.bytesReadFromDisk; // maxReadSize += o.maxReadSize; // elapsedReadNanos += o.elapsedReadNanos; // elapsedCacheReadNanos += o.elapsedCacheReadNanos; // elapsedDiskReadNanos += o.elapsedDiskReadNanos; // // nwrites += o.nwrites; // ncacheWrite += o.ncacheWrite; // ncacheFlush += o.ncacheFlush; // ndiskWrite += o.ndiskWrite; // maxWriteSize += o.maxWriteSize; // bytesWritten += o.bytesWritten; // bytesWrittenOnDisk += o.bytesWrittenOnDisk; // elapsedWriteNanos += o.elapsedWriteNanos; // elapsedCacheWriteNanos += o.elapsedCacheWriteNanos; // elapsedDiskWriteNanos += o.elapsedDiskWriteNanos; // // nforce += o.nforce; // ntruncate += o.ntruncate; // nreopen += o.nreopen; // nwriteRootBlock += o.nwriteRootBlock; // // } // // /** // * Returns a new {@link StoreCounters} containing the current counter values // * minus the given counter values. // * // * @param o // * // * @return // */ // public StoreCounters subtract(final StoreCounters o) { // // // make a copy of the current counters. // final StoreCounters t = new StoreCounters(this); // // // subtract out the given counters. 
// t.nreads -= o.nreads; // t.ncacheRead -= o.ncacheRead; // t.ndiskRead -= o.ndiskRead; // t.bytesRead -= o.bytesRead; // t.bytesReadFromDisk -= o.bytesReadFromDisk; // t.maxReadSize -= o.maxReadSize; // t.elapsedReadNanos -= o.elapsedReadNanos; // t.elapsedCacheReadNanos -= o.elapsedCacheReadNanos; // t.elapsedDiskReadNanos -= o.elapsedDiskReadNanos; // // t.nwrites -= o.nwrites; // t.ncacheWrite -= o.ncacheWrite; // t.ncacheFlush -= o.ncacheFlush; // t.ndiskWrite -= o.ndiskWrite; // t.maxWriteSize -= o.maxWriteSize; // t.bytesWritten -= o.bytesWritten; // t.bytesWrittenOnDisk -= o.bytesWrittenOnDisk; // t.elapsedWriteNanos -= o.elapsedWriteNanos; // t.elapsedCacheWriteNanos -= o.elapsedCacheWriteNanos; // t.elapsedDiskWriteNanos -= o.elapsedDiskWriteNanos; // // t.nforce -= o.nforce; // t.ntruncate -= o.ntruncate; // t.nreopen -= o.nreopen; // t.nwriteRootBlock -= o.nwriteRootBlock; // // return t; // // } // // synchronized public CounterSet getCounters() { // // if (root == null) { // // root = new CounterSet(); // // // IRawStore API // { // // /* // * reads // */ // // root.addCounter("nreads", new Instrument<Long>() { // public void sample() { // setValue(nreads); // } // }); // // root.addCounter("bytesRead", new Instrument<Long>() { // public void sample() { // setValue(bytesRead); // } // }); // // root.addCounter("readSecs", new Instrument<Double>() { // public void sample() { // final double elapsedReadSecs = (elapsedReadNanos / 1000000000.); // setValue(elapsedReadSecs); // } // }); // // root.addCounter("bytesReadPerSec", // new Instrument<Double>() { // public void sample() { // final double readSecs = (elapsedReadNanos / 1000000000.); // final double bytesReadPerSec = (readSecs == 0L ? 0d // : (bytesRead / readSecs)); // setValue(bytesReadPerSec); // } // }); // // root.addCounter("maxReadSize", new Instrument<Long>() { // public void sample() { // setValue(maxReadSize); // } // }); // // /* // * writes // */ // // root.addCounter("nwrites", new Instrument<Long>() { // public void sample() { // setValue(nwrites); // } // }); // // root.addCounter("bytesWritten", new Instrument<Long>() { // public void sample() { // setValue(bytesWritten); // } // }); // // root.addCounter("writeSecs", new Instrument<Double>() { // public void sample() { // final double writeSecs = (elapsedWriteNanos / 1000000000.); // setValue(writeSecs); // } // }); // // root.addCounter("bytesWrittenPerSec", // new Instrument<Double>() { // public void sample() { // final double writeSecs = (elapsedWriteNanos / 1000000000.); // final double bytesWrittenPerSec = (writeSecs == 0L ? 0d // : (bytesWritten / writeSecs)); // setValue(bytesWrittenPerSec); // } // }); // // root.addCounter("maxWriteSize", new Instrument<Long>() { // public void sample() { // setValue(maxWriteSize); // } // }); // // } // // /* // * write cache statistics // */ // { // // final CounterSet writeCache = root.makePath("writeCache"); // // /* // * read // */ // writeCache.addCounter("nread", new Instrument<Long>() { // public void sample() { // setValue(ncacheRead); // } // }); // // writeCache.addCounter("readHitRate", new Instrument<Double>() { // public void sample() { // setValue(nreads == 0L ? 0d : (double) ncacheRead // / nreads); // } // }); // // writeCache.addCounter("readSecs", new Instrument<Double>() { // public void sample() { // setValue(elapsedCacheReadNanos / 1000000000.); // } // }); // // /* // * write // */ // // // #of writes on the write cache. 
// writeCache.addCounter("nwrite", new Instrument<Long>() { // public void sample() { // setValue(ncacheWrite); // } // }); // // /* // * % of writes that are buffered vs writing through to the // * disk. // * // * Note: This will be 1.0 unless you are writing large // * records. Large records are written directly to the disk // * rather than first into the write cache. When this happens // * the writeHitRate on the cache can be less than one. // */ // writeCache.addCounter("writeHitRate", new Instrument<Double>() { // public void sample() { // setValue(nwrites == 0L ? 0d : (double) ncacheWrite // / nwrites); // } // }); // // writeCache.addCounter("writeSecs", new Instrument<Double>() { // public void sample() { // setValue(elapsedCacheWriteNanos / 1000000000.); // } // }); // // // #of times the write cache was flushed to the disk. // writeCache.addCounter("nflush", new Instrument<Long>() { // public void sample() { // setValue(ncacheFlush); // } // }); // // } // // // disk statistics // { // final CounterSet disk = root.makePath("disk"); // // /* // * read // */ // // disk.addCounter("nreads", new Instrument<Long>() { // public void sample() { // setValue(ndiskRead); // } // }); // // disk.addCounter("bytesRead", new Instrument<Long>() { // public void sample() { // setValue(bytesReadFromDisk); // } // }); // // disk.addCounter("bytesPerRead", new Instrument<Double>() { // public void sample() { // final double bytesPerDiskRead = (ndiskRead == 0 ? 0d // : (bytesReadFromDisk / (double)ndiskRead)); // setValue(bytesPerDiskRead); // } // }); // // disk.addCounter("readSecs", new Instrument<Double>() { // public void sample() { // final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.); // setValue(diskReadSecs); // } // }); // // disk.addCounter("bytesReadPerSec", // new Instrument<Double>() { // public void sample() { // final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.); // final double bytesReadPerSec = (diskReadSecs == 0L ? 0d // : bytesReadFromDisk / diskReadSecs); // setValue(bytesReadPerSec); // } // }); // // disk.addCounter("secsPerRead", new Instrument<Double>() { // public void sample() { // final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.); // final double readLatency = (diskReadSecs == 0 ? 0d // : diskReadSecs / ndiskRead); // setValue(readLatency); // } // }); // // /* // * write // */ // // disk.addCounter("nwrites", new Instrument<Long>() { // public void sample() { // setValue(ndiskWrite); // } // }); // // disk.addCounter("bytesWritten", new Instrument<Long>() { // public void sample() { // setValue(bytesWrittenOnDisk); // } // }); // // disk.addCounter("bytesPerWrite", new Instrument<Double>() { // public void sample() { // final double bytesPerDiskWrite = (ndiskWrite == 0 ? 0d // : (bytesWrittenOnDisk / (double)ndiskWrite)); // setValue(bytesPerDiskWrite); // } // }); // // disk.addCounter("writeSecs", new Instrument<Double>() { // public void sample() { // final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.); // setValue(diskWriteSecs); // } // }); // // disk.addCounter("bytesWrittenPerSec", // new Instrument<Double>() { // public void sample() { // final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.); // final double bytesWrittenPerSec = (diskWriteSecs == 0L ? 
0d // : bytesWrittenOnDisk // / diskWriteSecs); // setValue(bytesWrittenPerSec); // } // }); // // disk.addCounter("secsPerWrite", new Instrument<Double>() { // public void sample() { // final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.); // final double writeLatency = (diskWriteSecs == 0 ? 0d // : diskWriteSecs / ndiskWrite); // setValue(writeLatency); // } // }); // // /* // * other // */ // // disk.addCounter("nforce", new Instrument<Long>() { // public void sample() { // setValue(nforce); // } // }); // // disk.addCounter("nextend", new Instrument<Long>() { // public void sample() { // setValue(ntruncate); // } // }); // // disk.addCounter("nreopen", new Instrument<Long>() { // public void sample() { // setValue(nreopen); // } // }); // // disk.addCounter("rootBlockWrites", new Instrument<Long>() { // public void sample() { // setValue(nwriteRootBlock); // } // }); // // } // // } // // return root; // // } // private CounterSet root; // // /** // * Human readable representation of the counters. // */ // public String toString() { // // return getCounters().toString(); // // } // // } // class StoreCounters /** * Performance counters for this class. */ private StoreCounters storeCounters = new StoreCounters(); /** * Returns the performance counters for the store. */ public StoreCounters getStoreCounters() { return storeCounters; } /** * Replaces the {@link StoreCounters} object. * * @param storeCounters * The new {@link Counter}s. * * @throws IllegalArgumentException * if the argument is <code>null</code>. */ public void setStoreCounters(final StoreCounters storeCounters) { if (storeCounters == null) throw new IllegalArgumentException(); synchronized (this) { this.storeCounters = storeCounters; // if (root != null) { // // root.attach(storeCounters.getCounters(), true/* replace */); // // } } } /** * Return interesting information about the write cache and file operations. */ public CounterSet getCounters() { final CounterSet root = new CounterSet(); root.addCounter("nextOffset", new Instrument<Long>() { public void sample() { setValue(nextOffset.get()); } }); root.addCounter("extent", new Instrument<Long>() { public void sample() { setValue(extent); } }); root.attach(storeCounters.getCounters()); /* * other. */ { final CounterSet writeCache = root.makePath("writeCache"); { final WriteCache tmp = DiskOnlyStrategy.this.writeCache; // add counter for the write cache capacity. writeCache.addCounter("capacity", new OneShotInstrument<Long>( tmp == null ? 0L : tmp.capacity())); } } // /* // * read cache. // */ // { // // final CounterSet readCache = root.makePath("readCache"); // // { // // final LRUCache tmp = DiskOnlyStrategy.this.readCache; // // readCache.addCounter("capacity", new OneShotInstrument<Long>( // (long) (tmp == null ? 
0 : tmp.capacity()))); // // } // // readCache.addCounter("testCount", new Instrument<Long>() { // // @Override // protected void sample() { // // final LRUCache tmp = DiskOnlyStrategy.this.readCache; // // if (tmp == null) // return; // // setValue(tmp.getTestCount()); // // } // }); // // readCache.addCounter("successCount", new Instrument<Long>() { // // @Override // protected void sample() { // // final LRUCache tmp = DiskOnlyStrategy.this.readCache; // // if (tmp == null) // return; // // setValue(tmp.getSuccessCount()); // // } // }); // // readCache.addCounter("insertCount", new Instrument<Long>() { // // @Override // protected void sample() { // // final LRUCache tmp = DiskOnlyStrategy.this.readCache; // // if (tmp == null) // return; // // setValue(tmp.getInsertCount()); // // } // }); // // readCache.addCounter("hitRatio", new Instrument<Double>() { // // @Override // protected void sample() { // // final LRUCache tmp = DiskOnlyStrategy.this.readCache; // // if (tmp == null) // return; // // setValue(tmp.getHitRatio()); // // } // }); // // } return root; } /** * * @param maximumExtent * @param fileMetadata */ DiskOnlyStrategy(final long maximumExtent, final FileMetadata fileMetadata) { super(fileMetadata.extent, maximumExtent, fileMetadata.offsetBits, fileMetadata.nextOffset, fileMetadata.getBufferMode(), fileMetadata.readOnly); this.file = fileMetadata.file; this.fileMode = fileMetadata.fileMode; this.temporaryStore = (fileMetadata.getBufferMode()==BufferMode.Temporary); this.raf = fileMetadata.raf; this.fileOpened = raf != null; if (!temporaryStore && !fileOpened) { throw new RuntimeException( "File not open and not a temporary store"); } this.extent = fileMetadata.extent; this.headerSize = FileMetadata.headerSize0; this.userExtent = extent - headerSize; /* * Enable the write cache? * * Note: Do NOT enable the write cache if the file is being opened in a * read-only mode. * * Note: If the file has been closed for writes (closeTime != 0L), then * the file is read-only regardless of the mode in which it was opened. * * Note: NIO always (at least up to Java 6) allocates a "temporary" * direct byte buffer for disk read/write operations on a heap buffer * AND there is a bug in the release of those buffers. Therefore do NOT * pass in a heap byte buffer for the write cache!!! */ if (fileMetadata.writeCacheEnabled && !fileMetadata.readOnly && fileMetadata.closeTime == 0L) { final IBufferAccess tmp; try { /* * Note: a timeout here is not such a good idea. It could be * triggered by a GC pause with the resulting temp store then * lacking a write cache. */ tmp = DirectBufferPool.INSTANCE.acquire(); } catch (InterruptedException e) { throw new RuntimeException(e); } if (log.isInfoEnabled()) log.info("Enabling writeCache: capacity=" + tmp.buffer().capacity()); writeCache = new WriteCache(tmp); } else { writeCache = null; } // the offset at which the next record would be written on the file. 
writeCacheOffset = fileMetadata.nextOffset; // if (fileMetadata.readCacheCapacity > 0) { // // if(log.isInfoEnabled()) // log.info("Enabling read cache: capacity=" // + fileMetadata.readCacheCapacity + ", maxRecordSize=" // + fileMetadata.readCacheMaxRecordSize); // // if (fileMetadata.readCacheMaxRecordSize <= 0) // throw new IllegalArgumentException(); // // this.readCacheMaxRecordSize = fileMetadata.readCacheMaxRecordSize; // // this.readCache = new LRUCache<Long, byte[]>( // fileMetadata.readCacheCapacity); // // } } final public boolean isStable() { return true; } public boolean isFullyBuffered() { return false; } /** * {@link #flushWriteCache() flushes} the {@link #writeCache} before syncing * the disk. */ public void force(final boolean metadata) { assertOpen(); synchronized(this) { // flush all pending writes to disk. flushWriteCache(); } try { if(!temporaryStore) { // sync the disk. getChannel().force(metadata); } } catch (IOException ex) { throw new RuntimeException(ex); } storeCounters.nforce++; } /** * Closes the file immediately (without flushing any pending writes). */ synchronized public void close() { /* * Note: this clears the [open] flag. It is important to do this first * so that we do not re-open the channel once it has been closed. */ super.close(); // Release the write cache. releaseWriteCache(); // if(readCache != null) { // // if (log.isInfoEnabled()) // log.info("readCache: " + readCache.getStatistics()); // // // Discard the LRU cache. // readCache = null; // // } try { if (raf != null) { // FileLockUtility.closeFile(file,raf); synchronized (this) { if (raf != null && raf.getChannel().isOpen()) { raf.close(); } } } } catch (IOException ex) { throw new RuntimeException(ex); } if (!bufferMode.isStable() && file.exists()) { if (!file.delete()) { log.warn("Unable to delete temporary file: " + file); } } } public void deleteResources() { if (isOpen()) { throw new IllegalStateException(); } if( fileOpened && file.exists() && ! file.delete() ) { log.warn("Could not delete file: " + file.getAbsoluteFile()); } } final public long getExtent() { return extent; } final public long getUserExtent() { return userExtent; } /** * Note: {@link ClosedChannelException} and * {@link AsynchronousCloseException} can get thrown out of this method * (wrapped as {@link RuntimeException}s) if a reader task is interrupted. */ public ByteBuffer read(final long addr) { final long begin = System.nanoTime(); if (addr == 0L) throw new IllegalArgumentException(ERR_ADDRESS_IS_NULL); final long offset = getOffset(addr); final int nbytes = getByteCount(addr); if (nbytes == 0) { throw new IllegalArgumentException(ERR_RECORD_LENGTH_ZERO); } if (offset + nbytes > nextOffset.get()) { throw new IllegalArgumentException(ERR_ADDRESS_NOT_WRITTEN); } // if (readCache != null) { // // /* // * Test the read cache first and return the record from the read // * cache if it is found there. // */ // // final byte[] data = readCache.get(addr); // // if (data != null) { // // return ByteBuffer.wrap(data).asReadOnlyBuffer(); // // } // // } /* * Allocate a new buffer of the exact capacity. * * Note: we do this even if we are reading from the writeCache since the * writeCache may be flushed and re-written while the caller is holding * onto the returned buffer. If the buffer were a view onto the * writeCache, then this would cause the data in the returned view to * change! 
*/ final ByteBuffer dst = ByteBuffer.allocate(nbytes); /* * We need to synchronize before we test the write cache since otherwise * the cache contents could change asynchronously. * * FIXME The synchronization block also covers the case when we have to * read through to the disk. Ideally we would not have to remain * synchronized for that operation, but inconsistencies have been * observed when synchronization is not maintained during the read. I am * not sure why this is required, but corrupt data can otherwise be * demonstrated by AbstractMRMWTestCase. * * Note: this issue appears to be an interaction with the OS or hardware * disk cache as the problem is generally demonstrated only after the * cache has been given some time to "clear". I have seen this problem * using Java 1.5.0_07 (-server -Xms1g -Xmx1g * -XX:MaxDirectMemorySize=256M) and Windows/XP service pack 2 but I * have not tested on other platforms yet. * * Performance is somewhat better if you do not synchronize this block * of code. However, the differences are not that extreme. As measured * by AbstractMRMWTestCase (timeout=10, nclients=20, percentReaders=.8) * the performance is: * * write 3.3, read 11.4 mb/s with synchronized(this) * * write 3.6, read 13.2 mb/s without synchronized(this) * * FIXME Also of interest, the JRockit VM corresponding to 1.5.0_06 * performs significantly worse on the same test. Check out some other * VM and OS versions and see what is going on here! * * @todo If you are NOT synchronized here then NIO READ operations can * be concurrent with WRITEs on the channel and there are methods on * this class that DO NOT retry writes if the channel is concurrently * closed! Those methods would need to be modified to retry in order for * this class to remain thread-safe. ( @todo DirectBuffer probably has * the same problem.) (@todo this is probably the explanation for the * need for the synchronized block that is documented immediately * above.) [@todo now that FileChannelUtility supports transparent * re-opening I should try to remove the synchronization for writers vs * readers and see if I can get better throughput.] */ synchronized (this) { if (nbytes > storeCounters.maxReadSize) { storeCounters.maxReadSize = nbytes; } /* * Check the write cache for this address. */ if (writeCache != null) { final long beginCache = System.nanoTime(); ByteBuffer tmp = writeCache.read(addr, nbytes); if (tmp != null) { /* * Copy the data into the newly allocated buffer. */ // copy the data into [dst]. dst.put(tmp); // flip buffer for reading. dst.flip(); /* * Update counters while synchronized. */ storeCounters.nreads++; storeCounters.bytesRead+=nbytes; // storeCounters.ncacheRead++; storeCounters.elapsedReadNanos+=(System.nanoTime()-begin); // return the new buffer. return dst; } else { // storeCounters.elapsedCacheReadNanos+=(System.nanoTime()-beginCache); } } /* * read through to the disk. */ final long beginDisk = System.nanoTime(); // the offset into the disk file. final long pos = offset + headerSize; // for (int ntries = 0; ntries < 3; ntries++) { // // if (ntries > 0) { // // /* // * Note: clear if we are retrying since the buffer may have // * been modified by a partial read. // */ // // dst.clear(); // // } try { storeCounters.ndiskRead += FileChannelUtility.readAll(opener, dst, pos); // successful read - exit the loop. // break; // } catch (ClosedByInterruptException ex) { // // /* // * This indicates that this thread was interrupted. We // * always abort in this case.
// */ // // throw new RuntimeException(ex); // // } catch (AsynchronousCloseException ex) { // // /* // * The channel was closed asynchronously while blocking // * during the read. If the buffer strategy still thinks that // * it is open then we re-open the channel and re-read. // */ // // if(reopenChannel()) continue; // // throw new RuntimeException(ex); // // } catch (ClosedChannelException ex) { // // /* // * The channel is closed. If the buffer strategy still // * thinks that it is open then we re-open the channel and // * re-read. // */ // // if(reopenChannel()) continue; // // throw new RuntimeException(ex); } catch (IOException ex) { throw new RuntimeException(ex); } // } // flip for reading. dst.flip(); /* * Update counters while synchronized. */ storeCounters.nreads++; storeCounters.bytesRead+=nbytes; storeCounters.bytesReadFromDisk+=nbytes; storeCounters.elapsedReadNanos+=(System.nanoTime()-begin); storeCounters.elapsedDiskReadNanos+=(System.nanoTime()-beginDisk); // if (readCache != null && nbytes < readCacheMaxRecordSize) { // // /* // * Note: make sure that the record is not in the cache (we have // * to do this again since we were not synchronized on [this] // * when we tested at the start of this method). // */ // if (readCache.get(addr) == null) { // // /* // * Put a copy of the record in the read cache. // */ // // // new byte[] for the read cache. // final byte[] data = new byte[nbytes]; // // // copy contents into the new byte[]. // dst.get(data); // // // flip the buffer again so that it is read for re-reading. // dst.flip(); // // // put the record into the read cache. // readCache.put(addr, data, false/* dirty */); // // } // // } // return the buffer. return dst; } // synchronized(this) } /** * Used to re-open the {@link FileChannel} in this class. */ private final IReopenChannel<FileChannel> opener = new IReopenChannel<FileChannel>() { public String toString() { return file.toString(); } public FileChannel reopenChannel() throws IOException { return DiskOnlyStrategy.this.reopenChannel(); } }; /** * This method transparently re-opens the channel for the backing file. * <p> * Note: This method is synchronized so that concurrent readers do not try * to all open the store at the same time. * * @todo This method is ONLY invoked by readers. It should be used for * writers as well. Note that this method WILL NOT be invoked by * {@link FileChannelUtility} if the channel was closed by an * interrupt in the current thread (a different exception is thrown). */ synchronized private FileChannel reopenChannel() throws IOException { assertOpen(); if (raf != null && raf.getChannel().isOpen()) { /* The channel is still open. If you are allowing concurrent reads * on the channel, then this could indicate that two readers each * found the channel closed and that one was able to re-open the * channel before the other such that the channel was open again * by the time the 2nd reader got here. */ return raf.getChannel(); } if(temporaryStore && !fileOpened) { /* * The backing file has not been opened. * * Note: Without this case this method would create the backing * store for a Temporary store if anyone happened to invoke it. In * fact, this method will never get invoked for a Temporary store * without a backing store since the reads never read against the * channel because it does not exist. So, really, this is just here * to be paranoid. */ throw new AssertionError("TemporaryStore not yet open: "+file); } // open the file. 
this.raf = new RandomAccessFile(file, fileMode); if (log.isInfoEnabled()) log.info("(Re-)opened file: " + file); try { /* * Request a shared file lock. */ final boolean readOnly = "r".equals(fileMode); if (raf.getChannel() .tryLock(0, Long.MAX_VALUE, readOnly/* shared */) == null) { /* * Note: A null return indicates that someone else holds the * lock. This can happen if the platform does not support shared * locks or if someone requested an exclusive file lock. */ try { raf.close(); } catch (Throwable t) { // ignore. } throw new IOException("File already locked? file=" + file); } } catch (IOException ex) { /* * Note: This is true of NFS volumes. This is Ok and should be * ignored. However the backing file is not protected against * accidental deletes or overwrites. */ if (log.isInfoEnabled()) log.info("FileLock not supported: file=" + file, ex); } storeCounters.nreopen++; return raf.getChannel(); } private long allocate(final int nbytes) { if (isReadOnly()) throw new IllegalStateException(ERR_READ_ONLY); if (nbytes <= 0) throw new IllegalArgumentException(ERR_BAD_RECORD_SIZE); final long addr; // address in the store. synchronized(this) { /* * The offset at which the record will be written on the disk file * (not adjusted for the root blocks). */ final long offset = nextOffset.get(); /* * Make sure that the allocated region of the file exists. */ overflow(offset, nbytes); /* * Formulate the address that can be used to recover that record. */ addr = toAddr(nbytes, offset); /* * Increment the offset of the next address to be assigned by the * #of bytes in the record. */ nextOffset.addAndGet(nbytes); } return addr; } // /** // * FIXME The {@link #update(long, int, ByteBuffer)} API was introduced to // * support touch ups of the leaves generated by the // * {@link IndexSegmentBuilder} and the notional support for writable blocks, // * which was never realized (blobs should be send to the file system). At // * this time, update() is only used by that class and the // * {@link IndexSegmentBuilder} could use double-buffer the leaves or just // * write them out directly onto the output store. Update SHOULD BE REMOVED // * from the API since it allows non-append semantics and thus makes it much // * more complicated to implement write pipelines for journal level failover. // */ // public void update(final long addr, final int off, final ByteBuffer data) { // // if (addr == 0L) // throw new IllegalArgumentException(ERR_ADDRESS_IS_NULL); // // if (off < 0) // throw new IllegalArgumentException("Offset is negative"); // // if (data == null) // throw new IllegalArgumentException(ERR_BUFFER_NULL); // // if (isReadOnly()) // throw new IllegalStateException(ERR_READ_ONLY); // // // The offset of the record in the store (not adjusted for the root blocks). // final long addrOffset = getOffset(addr); // // // The size of the record (NOT the #of bytes to be written). // final int addrByteCount = getByteCount(addr); // // if (addrOffset + addrByteCount > nextOffset) { // // throw new IllegalArgumentException(ERR_ADDRESS_NOT_WRITTEN); // // } // // // #of bytes to be updated on the pre-existing record. // final int nbytes = data.remaining(); // // if (nbytes == 0) // throw new IllegalArgumentException(ERR_BUFFER_EMPTY); // // if (off + nbytes > addrByteCount) { // // throw new IllegalArgumentException(ERR_BUFFER_OVERRUN); // // } // // final long begin = System.nanoTime(); // // synchronized(this) { // // try { // // if (writeCache != null) { // // /* // * Check the writeCache. 
If the record is found in the write // * cache then we just update the slice of the record // * corresponding to the caller's request. This is a common // * use case and results in no IO. // */ // // final long beginCache = System.nanoTime(); // // try { // // final ByteBuffer view = writeCache.read(addr,addrByteCount); // // if (view != null) { // // // adjust the limit on the record in the write // // cache. // view.limit(off + nbytes); // // // adjust the position on the record in the write // // cache. // view.position(off); // // // copy the caller's data onto the record in the // // write // // cache. // view.put(data); // // // count this as a cache write. // storeCounters.ncacheWrite++; // // // Done. // return; // // } // // } finally { // // // track the write cache time. // storeCounters.elapsedCacheWriteNanos += (System.nanoTime() - beginCache); // // } // // } // // /* // * Either the writeCache is disabled or the record was not found // * in the write cache so just write the record directly on the // * disk. // * // * Note: for this case we might be able to move the write // * outside of the synchronized() block IFF we also cloned the // * data (since the caller is allowed to modify the buffer as // * soon as write() returns). // * // * Note: We MUST NOT update the writeCacheOffset since we are // * probably writing behind the end of the file (this is contrary // * to a normal write write is an append at the end of the file). // */ // // writeOnDisk(data, addrOffset + off/* adjustedOffset */, false/* append */); // // } finally { // // /* // * Update counters while we are synchronized. If done outside of // * the synchronization block then we need to use AtomicLongs // * rather than primitive longs. // */ // // storeCounters.nwrites++; // storeCounters.bytesWritten += nbytes; // storeCounters.elapsedWriteNanos += (System.nanoTime() - begin); // // if(nbytes > storeCounters.maxWriteSize) { // // storeCounters.maxWriteSize = nbytes; // // } // // } // // } // synchronized // // } public long write(final ByteBuffer data) { if (data == null) throw new IllegalArgumentException(ERR_BUFFER_NULL); if (isReadOnly()) throw new IllegalStateException(ERR_READ_ONLY); // #of bytes to store. final int nbytes = data.remaining(); if (nbytes == 0) throw new IllegalArgumentException(ERR_BUFFER_EMPTY); final long begin = System.nanoTime(); final long addr; // address in the store. synchronized(this) { /* * Allocate address for a new record with [nbytes] of data. */ addr = allocate(nbytes); /* * The offset at which the record will be written on the disk file * (not adjusted for the root blocks). */ final long offset = getOffset(addr); if (writeCache != null) { /* * Flush the writeCache if the record would cause it to * overflow. */ if (nbytes + writeCache.position() > writeCache.capacity()) { flushWriteCache(); } /* * This record is to big for the write cache so we write the * record directly on the disk. */ if (nbytes > writeCache.capacity()) { writeOnDisk(data, offset, true/*append*/); } else { /* * Queue up the write in the writeCache. */ final long beginCache = System.nanoTime(); writeCache.write(addr, data); // storeCounters.ncacheWrite++; // // storeCounters.elapsedCacheWriteNanos+=(System.nanoTime()-beginCache); } } else { /* * The writeCache is disabled so just write the record directly * on the disk. 
* * Note: for this case we might be able to move the write * outside of the synchronized() block IFF we also cloned the * data (since the caller is allowed to modify the buffer as * soon as write() returns). * * Note: We update the writeCacheOffset even when the writeCache * is disabled just to keep it consistent. This allows for the * possibility that the writeCache could be enabled and disabled * at will. */ writeOnDisk(data,offset, true/*append*/); } /* * Update counters while we are synchronized. If done outside of the * synchronization block then we need to use AtomicLongs rather than * primitive longs. */ storeCounters.nwrites++; storeCounters.bytesWritten+=nbytes; storeCounters.elapsedWriteNanos+=(System.nanoTime() - begin); if(nbytes > storeCounters.maxWriteSize) { storeCounters.maxWriteSize = nbytes; } } // synchronized return addr; } /** * Make sure that the file is large enough to accept a write of <i>nbytes</i> * starting at <i>offset</i> bytes into the file. * <p> * Note: The caller MUST be synchronized on <i>this</i>. * * @param offset * The offset into the file (NOT adjusted for the root blocks). * @param nbytes * The #of bytes to be written at that offset. */ private void overflow(final long offset, final int nbytes) { final long needed = (offset + nbytes) - userExtent; if (needed > 0) { if (!overflow(needed)) { throw new OverflowException(); } } } /** * Create/open the backing file for a {@link BufferMode#Temporary} store iff * it has not been created/opened. */ final private void createBackingFile() { if (!fileOpened && temporaryStore) { try { // // open the file for the first time (create). // raf = FileLockUtility.openFile(file, fileMode, // bufferMode != BufferMode.Mapped/*useTryLock*/); // note: set true so that reopenChannel will create the file. fileOpened = true; reopenChannel(); if (log.isInfoEnabled()) log.info("Opened backing file for temporary store: " + file); } catch (IOException e) { throw new RuntimeException("Could not open temp file: file=" + file, e); } } } /** * Write the data on the disk (synchronous). * <p> * Note: The caller MUST be synchronized on <i>this</i>. * <p> * Note: This updates {@link #writeCacheOffset} as well (but only if the * write is an append). * <p> * Note: It is possible for {@link #update(long, int, ByteBuffer)} to force * a non-append write that is beyond the {@link #writeCacheOffset}. This * will occur if the record that is being updated is too large for the * {@link #writeCache} while there are also records buffered by this write * cache. * * @param data * The data. The bytes from the current * {@link ByteBuffer#position()} to the * {@link ByteBuffer#limit()} will be written and the * {@link ByteBuffer#position()} will be advanced to the * {@link ByteBuffer#limit()} . The caller may subsequently * modify the contents of the buffer without side effects (i.e., * the data are copied onto the disk). * @param offset * The offset in the file at which the data will be written. * @param append * <code>true</code> iff the write is an append (most record * writes are appends). * * @todo When integrating the new WriteCache, this method will still have to * make sure that the backing file exists and handle overflow of the * file (file extension). */ private void writeOnDisk(final ByteBuffer data, final long offset, final boolean append) { final long begin = System.nanoTime(); createBackingFile(); final int nbytes = data.remaining(); // make sure that the file is large enough. 
overflow(offset, nbytes); /* * The position in the file at which the record will be written * (this is adjusted for the root blocks). */ final long pos = offset + headerSize; try { /* * Write bytes in [data] from position to limit onto the channel. * * Note: Since the caller is synchronized on [this] it SHOULD NOT be * possible for a reader is to be interrupted during a concurrent * NIO operation and thus the channel SHOULD NOT be asynchronously * closed while we are writing on it. */ storeCounters.ndiskWrite += FileChannelUtility.writeAll(getChannel(), data, pos); } catch (IOException ex) { throw new RuntimeException(ex); } // update the next offset at which data will be written on the disk. if(append) { writeCacheOffset += nbytes; } final long elapsed = (System.nanoTime() - begin); storeCounters.bytesWrittenOnDisk += nbytes; storeCounters.elapsedDiskWriteNanos += elapsed; if (false&&BigdataStatics.debug) { /* * Note: There are only two places where the journal writes on the * disk using this backing buffer implementation. Here and when it * updates the root blocks. It only syncs the disk at the commit. */ System.err.println("wrote on disk: bytes=" + nbytes + ", elapsed=" + TimeUnit.NANOSECONDS.toMillis(elapsed) + "ms; totals: write=" + TimeUnit.NANOSECONDS .toMillis(storeCounters.elapsedDiskWriteNanos) + "ms, read=" + TimeUnit.NANOSECONDS .toMillis(storeCounters.elapsedDiskReadNanos) + "ms"); } } public ByteBuffer readRootBlock(final boolean rootBlock0) { if(!isOpen()) throw new IllegalStateException(); final ByteBuffer tmp = ByteBuffer .allocate(RootBlockView.SIZEOF_ROOT_BLOCK); try { /* * Note: Synchronized on [this] to prevent concurrent NIO requests * which might lead to the channel being closed asynchronously. */ // synchronized (this) { FileChannelUtility.readAll(opener, tmp, rootBlock0 ? FileMetadata.OFFSET_ROOT_BLOCK0 : FileMetadata.OFFSET_ROOT_BLOCK1); // } tmp.position(0); // resets the position. } catch (IOException ex) { throw new RuntimeException(ex); } return tmp; } public void writeRootBlock(final IRootBlockView rootBlock, final ForceEnum forceOnCommit) { /* * Note: Root blocks are written for a temporary store in support of * rollback(). */ // if(temporaryStore) { // // /* // * Note: There are NO ROOT BLOCKS for a temporary store. Root blocks // * are only useful for stores that can be re-opened, and you can not // * re-open a temporary store - the backing file is always deleted // * when the store is closed. The AbstractJournal still formats the // * root blocks and retains a reference to the current root block, // * but it is NOT written onto the file. // */ // // return; // // } if (rootBlock == null) throw new IllegalArgumentException(); try { final ByteBuffer data = rootBlock.asReadOnlyBuffer(); final long pos = rootBlock.isRootBlock0() ? FileMetadata.OFFSET_ROOT_BLOCK0 : FileMetadata.OFFSET_ROOT_BLOCK1; /* * Note: Synchronized on [this] to prevent concurrent NIO requests * which might lead to the channel being closed asynchronously. 
*/ synchronized(this) { FileChannelUtility.writeAll(getChannel(), data, pos); } if (forceOnCommit != ForceEnum.No) { force(forceOnCommit == ForceEnum.ForceMetadata); } } catch (IOException ex) { throw new RuntimeException(ex); } if (log.isDebugEnabled()) log.debug("wrote root block: "+rootBlock); storeCounters.nwriteRootBlock++; } synchronized public void truncate(final long newExtent) { final long newUserExtent = newExtent - headerSize; if (newUserExtent < getNextOffset() ) { throw new IllegalArgumentException(ERR_TRUNCATE); } if(newUserExtent == getUserExtent()) { // NOP. return; } /* * Note: This handles the case for a Temporary store where the write * cache is the same size as the initial extent and everything written * so far has been absorbed by the write cache. */ createBackingFile(); try { // extend (or truncate) the file. getRandomAccessFile().setLength(newExtent); /* * Since we just changed the file length we force the data to disk * and update the file metadata. This is a relatively expensive * operation but we want to make sure that we do not lose track of * a change in the length of the file. * * @todo an alternative would be to set a marker on the buffer such * that the next force() also forced the metadata to disk. */ if (!temporaryStore) { force(true); } storeCounters.ntruncate++; if(log.isInfoEnabled()) log.info("newLength=" + cf.format(newExtent) + ", file=" + file); if(log.isInfoEnabled()) log.info(getCounters().toString()); } catch(IOException ex) { /* * I've seen an IOException "The handle is invalid" tossed here (just * once). A bit of searching around suggests that perhaps the * RandomAccessFile was concurrently closed? Something to look out * for if it happens again. [@todo probably a concurrent reader was * interrupted, in which case this method should just try the * setLength() operation again.] [@todo the MRMW test can throw this * during test shutdown, which simulates interrupt of NIO * operations]. */ throw new RuntimeException(ex); } this.userExtent = newUserExtent; this.extent = newExtent; } synchronized public long transferTo(final RandomAccessFile out) throws IOException { if (out == null) throw new IllegalArgumentException(); /* * Note: Force the write cache to the disk so that all the data we want * to transfer from channel to channel are actually on the source * channel! * * Note: This also handles the case for a Temporary store where the * backing file has not even been created yet. */ flushWriteCache(); return super.transferFromDiskTo(this, out); } /** * Extended to discard the write cache. * <p> * Note: The file is NOT closed and re-opened in a read-only mode in order * to avoid causing difficulties for concurrent readers. */ public void closeForWrites() { // sets the [readOnly] flag. super.closeForWrites(); // discard the write cache. releaseWriteCache(); } synchronized private final void releaseWriteCache() { final IBufferAccess tmp = writeCache == null ? null : writeCache.buf; if (tmp == null) return; try { tmp.release(); } catch (InterruptedException e) { throw new RuntimeException(e); } finally { writeCache.buf = null; } } public void delete(long addr) { // void behaviour } public void setNextOffset(long lastOffset) { // void for standard Disk strategy } public void setCommitRecordIndex(CommitRecordIndex commitRecordIndex) { // NOP } }
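
/*
 * Illustrative usage sketch (not part of this class; the variable names are
 * hypothetical). Given an already-constructed DiskOnlyStrategy [store], a
 * record is absorbed by the write cache via write(ByteBuffer), flushed and
 * synced via force(boolean), and read back (through the write cache) via
 * read(long):
 *
 *     final ByteBuffer rec = ByteBuffer.wrap(new byte[] { 1, 2, 3 });
 *     final long addr = store.write(rec);        // buffered in the write cache
 *     store.force(false);                        // flush the write cache and sync the disk
 *     final ByteBuffer copy = store.read(addr);  // reads through the write cache
 */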