/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.journal;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.AsynchronousCloseException;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.FileChannel;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Exchanger;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import com.bigdata.BigdataStatics;
import com.bigdata.btree.BTree.Counter;
import com.bigdata.counters.AbstractStatisticsCollector;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.Instrument;
import com.bigdata.counters.OneShotInstrument;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.FileChannelUtility;
import com.bigdata.io.IBufferAccess;
import com.bigdata.io.IReopenChannel;
import com.bigdata.journal.WORMStrategy.StoreCounters;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.resources.StoreManager.ManagedJournal;
import com.bigdata.util.Bytes;
/**
* Disk-based journal strategy.
* <p>
* Writes are buffered in a write cache. The cache is flushed when it would
* overflow. As a result only large sequential writes are performed on the
* store. Reads read through the write cache for consistency.
* <p>
* Note: This is used to realize both the {@link BufferMode#Disk} and the
* {@link BufferMode#Temporary} {@link BufferMode}s. When configured for the
* {@link BufferMode#Temporary} mode: the root blocks will not be written onto
* the disk, writes will not be forced, and the backing file will be created the
* first time the {@link DiskOnlyStrategy} attempts to write through to the
* disk. For many scenarios, the backing file will never be created unless the
* write cache overflows. This provides very low latency on start-up, the same
* MRMW capability, and allows very large temporary stores.
*
* FIXME Examine behavior when write caching is enabled/disabled for the OS.
* This has a profound impact. Asynchronous writes of multiple buffers, and the
* use of smaller buffers, may be absolutely required when the write cache is disabled.
* It may be that swapping sets in because the Windows write cache is being
* overworked, in which case doing incremental and async IO would help. Compare
* with behavior on server platforms. See
* http://support.microsoft.com/kb/259716,
* http://www.accucadd.com/TechNotes/Cache/WriteBehindCache.htm,
* http://msdn2.microsoft.com/en-us/library/aa365165.aspx,
* http://www.jasonbrome.com/blog/archives/2004/04/03/writecache_enabled.html,
* http://support.microsoft.com/kb/811392,
* http://mail-archives.apache.org/mod_mbox/db-derby-dev/200609.mbox/%3C44F820A8.6000000@sun.com%3E
*
* <pre>
* /sbin/hdparm -W 0 /dev/hda 0 Disable write caching
* /sbin/hdparm -W 1 /dev/hda 1 Enable write caching
* </pre>
*
* @todo report whether or not the on-disk write cache is enabled for each
* platform in {@link AbstractStatisticsCollector}. Offer guidance on how
* to disable that write cache.
*
* @todo The flush of the write cache could be made asynchronous if we had two
* write buffers, but that increases the complexity significantly. It
* would have to be synchronous if invoked from {@link #force(boolean)} in
* any case (or rather force would have to flush all buffers).
* <p>
* Reconsider a 2nd buffer so that we can avoid waiting on the writes to
* disk. Use
* {@link Executors#newSingleThreadExecutor(java.util.concurrent.ThreadFactory)}
* to obtain the 2nd (daemon) thread and an {@link Exchanger}.
* <p>
* Consider the generalization where a WriteCache encapsulates the logic
* that exists in this class and where we have a {@link BlockingQueue} of
* available write caches. There is one "writable" writeCache object at
* any given time, unless we are blocked waiting for one to show up on the
* availableQueue. When a WriteCache is full it is placed onto a
* writeQueue. A thread reads from the writeQueue and performs writes,
* placing empty WriteCache objects onto the availableQueue. Sync places
* the current writeCache on the writeQueue and then waits on the
* writeQueue to be empty. Large objects could be wrapped and written out
* using the same mechanisms but should not become "available" again after
* they are written.
* <p>
* Consider that a WriteCache also doubles as a read cache IF we create
* write cache objects encapsulating reads that we read directly from the
* disk rather than from a WriteCache. In this case we might do a larger
* read so as to populate more of the WriteCache object in the hope that
* we will have more hits in that part of the journal.
* <p>
* modify force to use an atomic handoff of the write cache so that the
* net result is atomic from the perspective of the caller. This may
* require locking on the write cache so that we wait until concurrent
* writes have finished before flushing to the disk or I may be able to
* use nextOffset to make an atomic determination of the range of the
* buffer to be forced, create a view of that range, and use the view to
* force to disk so that the position and limits are not changed by force
* nor by concurrent writers - this may also be a problem for the Direct
* mode and the Mapped mode, at least if they use a write cache.
* <p>
* Async cache writes are also useful if the disk cache is turned off and
* could gain importance in offering tighter control over IO guarantees.
*
* @todo test verifying that large records are written directly and that the
* write cache is properly flushed beforehand.
*
* @todo test verifying that the write cache can be disabled.
*
* @todo test verifying that {@link #writeCacheOffset} is restored correctly on
* restart (i.e., you can continue to append to the store after restart and
* the result is valid).
*
* @todo test verifying that the buffer position and limit are updated correctly
* by {@link #write(ByteBuffer)} regardless of the code path.
*
* @todo Retro fit the concept of a write cache into the
* {@link DirectBufferStrategy} so that we defer writes onto the disk
* until (a) a threshold of data has been buffered; or (b)
* {@link #force(boolean)} is invoked. Note that the implementation will
* be a bit different since the Direct mode is already fully buffered so
* we do not need to allocate a separate writeCache. However, we will
* still need to track the {@link #writeCacheOffset} and maintain a
* {@link #writeCacheIndex}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* @see BufferMode#Disk
* @see BufferMode#Temporary
*
* @deprecated This has been replaced by {@link WORMStrategy}.
*/
public class DiskOnlyStrategy extends AbstractBufferStrategy implements
IDiskBasedStrategy {
/**
 * The backing file.
 */
private final File file;
/**
 * The mode used to open that file.
 */
private final String fileMode;
/**
 * <code>true</code> iff configured as a {@link BufferMode#Temporary} store.
 */
private final boolean temporaryStore;
/**
 * The backing file for a {@link BufferMode#Temporary} store is not opened
 * until the {@link #writeCache} is flushed to disk for the first time. In
 * these scenarios this field will be <code>false</code> until the
 * {@link #writeCache} is flushed and <code>true</code> thereafter. For
 * {@link BufferMode#Disk}, this field is initially <code>true</code>.
 * <p>
 * The value of this field determines the behavior of
 * {@link #reopenChannel()}.
 */
private boolean fileOpened;
/**
 * The IO interface for the file - <strong>use
 * {@link #getRandomAccessFile()} rather than this field</strong>.
 * <p>
 * NOTE(review): intentionally neither <code>private</code> nor
 * <code>final</code> (note the commented-out modifiers) - presumably so the
 * channel can be replaced when it is re-opened after an interrupt; confirm
 * against {@link #reopenChannel()}.
 */
/*private*/ /*final*/ RandomAccessFile raf;
/**
 * The size of the journal header, including MAGIC, version, and both root
 * blocks. This is used as an offset when computing the address of a record
 * in an underlying file and is ignored by buffer modes that are not backed
 * by a file (e.g., transient) or that are memory mapped (since the map is
 * setup to skip over the header)
 */
private final int headerSize;
/**
 * Extent of the file. This value should be valid since we obtain an
 * exclusive lock on the file when we open it.
 */
private long extent;
// NOTE(review): presumably the extent available for user data, i.e. the
// extent less the header size - not assigned in this chunk; confirm at the
// assignment site.
private long userExtent;
// /**
// * Optional read cache.
// * <p>
// * Note: When enabled, records are entered iff there is a miss on a read.
// * Written records are NOT entered into the read cache since (when the
// * {@link #writeCache} is enabled), recently written records are already in
// * the {@link #writeCache}.
// * <p>
// * Note: The higher-level data structures use the {@link LRUNexus}, which
// * provides a read cache of the decompressed records. For this reason there
// * is little reason to enable this lower-level read cache.
// */
// private LRUCache<Long, byte[]> readCache = null;
//
// /**
// * The maximum size of a record that may enter the {@link #readCache}.
// * Records larger than this are not cached.
// */
// private int readCacheMaxRecordSize = 0;
/**
 * Optional {@link WriteCache} - <code>null</code> iff the write cache is
 * disabled.
 */
final private WriteCache writeCache;
/**
 * The next offset at which data in the {@link #writeCache} will be written
 * on the disk. The offset is relative to the start of the user data space.
 * Offset zero(0) addresses the first byte after the root blocks.
 *
 * @see FileMetadata#nextOffset
 * @see AbstractBufferStrategy#nextOffset
 */
private long writeCacheOffset;
private class WriteCache {
/**
 * The buffer used to absorb writes that are destined for the disk. Writes
 * are simply appended into this buffer until it would overflow. On
 * overflow, {@link #flushWriteCache()} is invoked to flush the data to the
 * disk (without synchronizing the disk). If a record is too large to fit
 * into this buffer, then the write cache is flushed and the record is
 * written directly on the disk.
 * <p>
 * Note: We must clone the data since the
 * {@link IRawStore#write(ByteBuffer)} contract says that the caller can
 * reuse the buffer once we return. In order to minimize heap churn we
 * simply copy the data into {@link #buf}, a {@link ByteBuffer} that
 * buffers recently written records. Writes are deferred until the buffer
 * would overflow, at which point all buffered records are written at once
 * onto the disk.
 * <p>
 * In order to ensure consistency we read through the {@link #buf} in
 * {@link #read(long, int)}. Otherwise a {@link #write(long, ByteBuffer)}
 * could return and a subsequent read on the record while it is in the
 * {@link #buf} would "miss" causing us to read through to the disk
 * (which would not have the correct data).
 * <p>
 * Note: The write cache design assumes an "append only" store. In
 * particular, it assumes that the application data records are written in
 * a purely sequential manner on the end of the file (the root blocks are
 * outside of the application data). Either the write cache must be
 * disabled or a different design must be used if you are using a store
 * where records may be deleted and recycled.
 * <p>
 * The write cache offers a 27% performance gain when compared to the same
 * condition without the write cache as measured by
 * {@link AbstractMRMWTestCase}.
 */
private IBufferAccess buf;
/**
 * An index into the write cache used for read through on the cache. The
 * keys are the addresses that would be used to read the corresponding
 * record. The values are the position in {@link #buf} where that record
 * is buffered. A cache miss means that you need to read the record from
 * the disk.
 */
final private Map<Long,Integer> writeCacheIndex;
// /**
// * The starting position in the buffer for data that has not been
// * written to the disk.
// *
// * @see Task
// */
// private int start = 0;
/**
 * Create a {@link WriteCache} from a caller supplied buffer.
 * <p>
 * Note: {@link FileChannel} IO is in fact performed using a direct
 * {@link ByteBuffer}. When the caller supplies a {@link ByteBuffer}
 * that is allocated on the Java heap as opposed to in native memory a
 * temporary direct {@link ByteBuffer} will be allocated for the IO
 * operation. The JVM can fail to release this temporary direct
 * {@link ByteBuffer}, resulting in a memory leak. For this reason, the
 * write cache should be a direct {@link ByteBuffer} and the same direct
 * {@link ByteBuffer} instance should be shared when overflow causes the
 * live journal overflow, being replaced by a new live journal. This bug
 * forces us to pass in the write cache {@link ByteBuffer} directly via
 * the {@link ManagedJournal} constructor.
 *
 * @see http://bugs.sun.com/bugdatabase/view_bug.do;jsessionid=8fab76d1d4479fffffffffa5abfb09c719a30?bug_id=6210541
 *
 * @param writeCache
 *            The buffer to be used as the write cache (required - the
 *            code below rejects <code>null</code>).
 *
 * @throws IllegalArgumentException
 *             if <i>writeCache</i> is <code>null</code>.
 */
public WriteCache(final IBufferAccess writeCache) {
if (writeCache == null)
throw new IllegalArgumentException();
// save reference to the write cache.
this.buf = writeCache;
// the capacity of the buffer in bytes.
final int capacity = writeCache.buffer().capacity();
/*
 * Discard anything in the buffer, resetting the position to zero,
 * the mark to zero, and the limit to the capacity.
 */
writeCache.buffer().clear();
/*
 * An estimate of the #of records that might fit within the write
 * cache. This is based on an assumption that the "average" record
 * is 1k. This is used solely to assign the initial capacity to the
 * writeCacheIndex.
 */
final int indexDefaultCapacity = capacity / (1 * Bytes.kilobyte32);
// allocate and initialize the write cache index.
writeCacheIndex = new ConcurrentHashMap<Long, Integer>(indexDefaultCapacity);
}
/**
 * The current position in the buffer (the #of bytes buffered and not yet
 * flushed to the disk).
 */
final int position() {
return buf.buffer().position();
}
/**
 * The capacity of the buffer in bytes.
 */
final int capacity() {
return buf.buffer().capacity();
}
/**
 * Flush all buffered records onto the disk at {@link #writeCacheOffset}
 * and reset the cache to empty. A NOP if nothing is buffered. Does NOT
 * force the disk.
 */
void flush() {
// #of bytes to write on the disk.
final int nbytes = buf.buffer().position();
if (nbytes == 0) return;
// limit := position; position := 0;
buf.buffer().flip();
// write the data on the disk file.
writeOnDisk(buf.buffer(), writeCacheOffset, true/*append*/);
// position := 0; limit := capacity.
buf.buffer().clear();
// clear the index since all records were flushed to disk.
writeCacheIndex.clear();
}
/**
 * Write the record on the cache.
 * <p>
 * NOTE(review): there is no bounds check here - presumably the caller has
 * already verified that the record fits in the remaining space (flushing
 * the cache first if necessary); confirm against the outer write method.
 *
 * @param addr
 *            The address assigned to that record in the journal.
 *
 * @param data
 *            The record.
 */
void write(final long addr, final ByteBuffer data) {
// the position() at which the record is cached.
final int position = buf.buffer().position();
// copy the record into the cache.
buf.buffer().put(data);
// add the record to the write cache index for read(addr).
writeCacheIndex.put(Long.valueOf(addr), Integer.valueOf(position));
}
/**
 * Read a record from the write cache.
 *
 * @param addr
 *            The address assigned to that record in the journal.
 * @param nbytes
 *            The length of the record (decoded from the address by the
 *            caller).
 *
 * @return A read-write view onto the record in the write cache buffer
 *         -or- <code>null</code> iff the record does not lie within
 *         this {@link WriteCache}.
 *         <p>
 *         Note: The caller MUST copy the data from the view since
 *         concurrent operations may result in the write cache being
 *         flushed and the view overwritten with new data.
 *         <p>
 *         Note: A read-write view is returned in order to support
 *         {@link DiskOnlyStrategy#update(long, int, ByteBuffer)}
 *         for those cases when the record to be updated is still in
 *         the {@link WriteCache}.
 */
ByteBuffer read(final long addr, final int nbytes) {
/*
 * The return value is the position in the writeCache where that
 * record starts and [null] if the record is not in the writeCache.
 */
final Integer writeCachePosition = writeCacheIndex.get(addr);
if (writeCachePosition == null) {
// The record is not in this write cache.
return null;
}
// the start of the record in writeCache.
final int pos = writeCachePosition;
// create a view with same offset, limit and position.
final ByteBuffer tmp = buf.buffer().duplicate();
// adjust the view to just the record of interest.
tmp.limit(pos + nbytes);
tmp.position(pos);
/*
 * Return a slice using that view - this restricts the caller to only
 * those bytes exposed by the slice.
 */
return tmp.slice();
}
}
/**
 * Overridden to flush the {@link #writeCache} through to the disk BEFORE
 * the base class writes the root block.
 * <p>
 * For this strategy, flushing the write cache also has the side effect of
 * creating the backing file if it does not exist yet (the
 * {@link BufferMode#Temporary} case).
 * <p>
 * Note: The flush is performed while synchronized on <i>this</i> since
 * concurrent writers on the cache would otherwise cause problems.
 */
@Override
public void commit() {
    final WriteCache cache = writeCache;
    if (cache != null) {
        // Atomically flush any buffered writes onto the disk.
        synchronized (this) {
            flushWriteCache();
        }
    }
    // Allow the base class to complete the commit protocol.
    super.commit();
}
/**
 * Flushes the {@link #writeCache} through to the disk, leaving the cache
 * empty (its position reset to zero). A NOP when the write cache is
 * disabled.
 * <p>
 * The caller MUST be synchronized on <i>this</i>.
 */
void flushWriteCache() {
    final WriteCache cache = writeCache;
    if (cache != null) {
        cache.flush();
    }
    // storeCounters.ncacheFlush++;
}
/**
 * The size of the journal header in bytes (MAGIC, version, and both root
 * blocks) - see {@link #headerSize}.
 */
final public int getHeaderSize() {
return headerSize;
}
/**
 * The backing {@link File}. Note that for {@link BufferMode#Temporary}
 * stores the file may not have been created on disk yet - see
 * {@link #getRandomAccessFile()}.
 */
final public File getFile() {
return file;
}
/**
 * Note: This MAY be <code>null</code>. If {@link BufferMode#Temporary}
 * is used then it WILL be <code>null</code> until the {@link #writeCache}
 * is flushed to disk for the first time.
 *
 * @return The {@link RandomAccessFile} for the backing store -or-
 *         <code>null</code> iff the backing file has not been opened yet.
 */
final public RandomAccessFile getRandomAccessFile() {
return raf;
}
/**
 * The {@link FileChannel} for the backing file.
 * <p>
 * Note: This MAY be <code>null</code>. If {@link BufferMode#Temporary}
 * is used then it WILL be <code>null</code> until the {@link #writeCache}
 * is flushed to disk for the first time.
 */
final public FileChannel getChannel() {
    // Local renamed so it does not shadow the [raf] field.
    final RandomAccessFile backingFile = getRandomAccessFile();
    return backingFile == null ? null : backingFile.getChannel();
}
// /**
// * Counters for {@link IRawStore} access, including operations that read or
// * write through to the underlying media.
// *
// * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
// * @version $Id$
// *
// * @todo report elapsed time and average latency for force, reopen, and
// * writeRootBlock.
// *
// * @todo counters need to be atomic if we want to avoid the possibility of
// * concurrent <code>x++</code> operations failing to correctly
// * increment <code>x</code> for each request.
// */
// public static class StoreCounters {
//
// /**
// * #of read requests.
// */
// public long nreads;
//
// /**
// * #of read requests that are satisfied by our write cache (vs the
// * OS or disk level write cache).
// */
// public long ncacheRead;
//
// /**
// * #of read requests that read through to the backing file.
// */
// public long ndiskRead;
//
// /**
// * #of bytes read.
// */
// public long bytesRead;
//
// /**
// * #of bytes that have been read from the disk.
// */
// public long bytesReadFromDisk;
//
// /**
// * The size of the largest record read.
// */
// public long maxReadSize;
//
// /**
// * Total elapsed time for reads.
// */
// public long elapsedReadNanos;
//
// /**
// * Total elapsed time checking the disk write cache for records to be
// * read.
// */
// public long elapsedCacheReadNanos;
//
// /**
// * Total elapsed time for reading on the disk.
// */
// public long elapsedDiskReadNanos;
//
// /**
// * #of write requests.
// */
// public long nwrites;
//
// /**
// * #of write requests that are absorbed by our write cache (vs the OS or
// * disk level write cache).
// */
// public long ncacheWrite;
//
// /**
// * #of times the write cache was flushed to disk.
// */
// public long ncacheFlush;
//
// /**
// * #of write requests that write through to the backing file.
// */
// public long ndiskWrite;
//
// /**
// * The size of the largest record written.
// */
// public long maxWriteSize;
//
// /**
// * #of bytes written.
// */
// public long bytesWritten;
//
// /**
// * #of bytes that have been written on the disk.
// */
// public long bytesWrittenOnDisk;
//
// /**
// * Total elapsed time for writes.
// */
// public long elapsedWriteNanos;
//
// /**
// * Total elapsed time writing records into the cache (does not count
// * time to flush the cache when it is full or to write records that do
// * not fit in the cache directly to the disk).
// */
// public long elapsedCacheWriteNanos;
//
// /**
// * Total elapsed time for writing on the disk.
// */
// public long elapsedDiskWriteNanos;
//
// /**
// * #of times the data were forced to the disk.
// */
// public long nforce;
//
// /**
// * #of times the length of the file was changed (typically, extended).
// */
// public long ntruncate;
//
// /**
// * #of times the file has been reopened after it was closed by an
// * interrupt.
// */
// public long nreopen;
//
// /**
// * #of times one of the root blocks has been written.
// */
// public long nwriteRootBlock;
//
// /**
// * Initialize a new set of counters.
// */
// public StoreCounters() {
//
// }
//
// /**
// * Copy ctor.
// * @param o
// */
// public StoreCounters(final StoreCounters o) {
//
// add( o );
//
// }
//
// /**
// * Adds counters to the current counters.
// *
// * @param o
// */
// public void add(final StoreCounters o) {
//
// nreads += o.nreads;
// ncacheRead += o.ncacheRead;
// ndiskRead += o.ndiskRead;
// bytesRead += o.bytesRead;
// bytesReadFromDisk += o.bytesReadFromDisk;
// maxReadSize += o.maxReadSize;
// elapsedReadNanos += o.elapsedReadNanos;
// elapsedCacheReadNanos += o.elapsedCacheReadNanos;
// elapsedDiskReadNanos += o.elapsedDiskReadNanos;
//
// nwrites += o.nwrites;
// ncacheWrite += o.ncacheWrite;
// ncacheFlush += o.ncacheFlush;
// ndiskWrite += o.ndiskWrite;
// maxWriteSize += o.maxWriteSize;
// bytesWritten += o.bytesWritten;
// bytesWrittenOnDisk += o.bytesWrittenOnDisk;
// elapsedWriteNanos += o.elapsedWriteNanos;
// elapsedCacheWriteNanos += o.elapsedCacheWriteNanos;
// elapsedDiskWriteNanos += o.elapsedDiskWriteNanos;
//
// nforce += o.nforce;
// ntruncate += o.ntruncate;
// nreopen += o.nreopen;
// nwriteRootBlock += o.nwriteRootBlock;
//
// }
//
// /**
// * Returns a new {@link StoreCounters} containing the current counter values
// * minus the given counter values.
// *
// * @param o
// *
// * @return
// */
// public StoreCounters subtract(final StoreCounters o) {
//
// // make a copy of the current counters.
// final StoreCounters t = new StoreCounters(this);
//
// // subtract out the given counters.
// t.nreads -= o.nreads;
// t.ncacheRead -= o.ncacheRead;
// t.ndiskRead -= o.ndiskRead;
// t.bytesRead -= o.bytesRead;
// t.bytesReadFromDisk -= o.bytesReadFromDisk;
// t.maxReadSize -= o.maxReadSize;
// t.elapsedReadNanos -= o.elapsedReadNanos;
// t.elapsedCacheReadNanos -= o.elapsedCacheReadNanos;
// t.elapsedDiskReadNanos -= o.elapsedDiskReadNanos;
//
// t.nwrites -= o.nwrites;
// t.ncacheWrite -= o.ncacheWrite;
// t.ncacheFlush -= o.ncacheFlush;
// t.ndiskWrite -= o.ndiskWrite;
// t.maxWriteSize -= o.maxWriteSize;
// t.bytesWritten -= o.bytesWritten;
// t.bytesWrittenOnDisk -= o.bytesWrittenOnDisk;
// t.elapsedWriteNanos -= o.elapsedWriteNanos;
// t.elapsedCacheWriteNanos -= o.elapsedCacheWriteNanos;
// t.elapsedDiskWriteNanos -= o.elapsedDiskWriteNanos;
//
// t.nforce -= o.nforce;
// t.ntruncate -= o.ntruncate;
// t.nreopen -= o.nreopen;
// t.nwriteRootBlock -= o.nwriteRootBlock;
//
// return t;
//
// }
//
// synchronized public CounterSet getCounters() {
//
// if (root == null) {
//
// root = new CounterSet();
//
// // IRawStore API
// {
//
// /*
// * reads
// */
//
// root.addCounter("nreads", new Instrument<Long>() {
// public void sample() {
// setValue(nreads);
// }
// });
//
// root.addCounter("bytesRead", new Instrument<Long>() {
// public void sample() {
// setValue(bytesRead);
// }
// });
//
// root.addCounter("readSecs", new Instrument<Double>() {
// public void sample() {
// final double elapsedReadSecs = (elapsedReadNanos / 1000000000.);
// setValue(elapsedReadSecs);
// }
// });
//
// root.addCounter("bytesReadPerSec",
// new Instrument<Double>() {
// public void sample() {
// final double readSecs = (elapsedReadNanos / 1000000000.);
// final double bytesReadPerSec = (readSecs == 0L ? 0d
// : (bytesRead / readSecs));
// setValue(bytesReadPerSec);
// }
// });
//
// root.addCounter("maxReadSize", new Instrument<Long>() {
// public void sample() {
// setValue(maxReadSize);
// }
// });
//
// /*
// * writes
// */
//
// root.addCounter("nwrites", new Instrument<Long>() {
// public void sample() {
// setValue(nwrites);
// }
// });
//
// root.addCounter("bytesWritten", new Instrument<Long>() {
// public void sample() {
// setValue(bytesWritten);
// }
// });
//
// root.addCounter("writeSecs", new Instrument<Double>() {
// public void sample() {
// final double writeSecs = (elapsedWriteNanos / 1000000000.);
// setValue(writeSecs);
// }
// });
//
// root.addCounter("bytesWrittenPerSec",
// new Instrument<Double>() {
// public void sample() {
// final double writeSecs = (elapsedWriteNanos / 1000000000.);
// final double bytesWrittenPerSec = (writeSecs == 0L ? 0d
// : (bytesWritten / writeSecs));
// setValue(bytesWrittenPerSec);
// }
// });
//
// root.addCounter("maxWriteSize", new Instrument<Long>() {
// public void sample() {
// setValue(maxWriteSize);
// }
// });
//
// }
//
// /*
// * write cache statistics
// */
// {
//
// final CounterSet writeCache = root.makePath("writeCache");
//
// /*
// * read
// */
// writeCache.addCounter("nread", new Instrument<Long>() {
// public void sample() {
// setValue(ncacheRead);
// }
// });
//
// writeCache.addCounter("readHitRate", new Instrument<Double>() {
// public void sample() {
// setValue(nreads == 0L ? 0d : (double) ncacheRead
// / nreads);
// }
// });
//
// writeCache.addCounter("readSecs", new Instrument<Double>() {
// public void sample() {
// setValue(elapsedCacheReadNanos / 1000000000.);
// }
// });
//
// /*
// * write
// */
//
// // #of writes on the write cache.
// writeCache.addCounter("nwrite", new Instrument<Long>() {
// public void sample() {
// setValue(ncacheWrite);
// }
// });
//
// /*
// * % of writes that are buffered vs writing through to the
// * disk.
// *
// * Note: This will be 1.0 unless you are writing large
// * records. Large records are written directly to the disk
// * rather than first into the write cache. When this happens
// * the writeHitRate on the cache can be less than one.
// */
// writeCache.addCounter("writeHitRate", new Instrument<Double>() {
// public void sample() {
// setValue(nwrites == 0L ? 0d : (double) ncacheWrite
// / nwrites);
// }
// });
//
// writeCache.addCounter("writeSecs", new Instrument<Double>() {
// public void sample() {
// setValue(elapsedCacheWriteNanos / 1000000000.);
// }
// });
//
// // #of times the write cache was flushed to the disk.
// writeCache.addCounter("nflush", new Instrument<Long>() {
// public void sample() {
// setValue(ncacheFlush);
// }
// });
//
// }
//
// // disk statistics
// {
// final CounterSet disk = root.makePath("disk");
//
// /*
// * read
// */
//
// disk.addCounter("nreads", new Instrument<Long>() {
// public void sample() {
// setValue(ndiskRead);
// }
// });
//
// disk.addCounter("bytesRead", new Instrument<Long>() {
// public void sample() {
// setValue(bytesReadFromDisk);
// }
// });
//
// disk.addCounter("bytesPerRead", new Instrument<Double>() {
// public void sample() {
// final double bytesPerDiskRead = (ndiskRead == 0 ? 0d
// : (bytesReadFromDisk / (double)ndiskRead));
// setValue(bytesPerDiskRead);
// }
// });
//
// disk.addCounter("readSecs", new Instrument<Double>() {
// public void sample() {
// final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.);
// setValue(diskReadSecs);
// }
// });
//
// disk.addCounter("bytesReadPerSec",
// new Instrument<Double>() {
// public void sample() {
// final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.);
// final double bytesReadPerSec = (diskReadSecs == 0L ? 0d
// : bytesReadFromDisk / diskReadSecs);
// setValue(bytesReadPerSec);
// }
// });
//
// disk.addCounter("secsPerRead", new Instrument<Double>() {
// public void sample() {
// final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.);
// final double readLatency = (diskReadSecs == 0 ? 0d
// : diskReadSecs / ndiskRead);
// setValue(readLatency);
// }
// });
//
// /*
// * write
// */
//
// disk.addCounter("nwrites", new Instrument<Long>() {
// public void sample() {
// setValue(ndiskWrite);
// }
// });
//
// disk.addCounter("bytesWritten", new Instrument<Long>() {
// public void sample() {
// setValue(bytesWrittenOnDisk);
// }
// });
//
// disk.addCounter("bytesPerWrite", new Instrument<Double>() {
// public void sample() {
// final double bytesPerDiskWrite = (ndiskWrite == 0 ? 0d
// : (bytesWrittenOnDisk / (double)ndiskWrite));
// setValue(bytesPerDiskWrite);
// }
// });
//
// disk.addCounter("writeSecs", new Instrument<Double>() {
// public void sample() {
// final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.);
// setValue(diskWriteSecs);
// }
// });
//
// disk.addCounter("bytesWrittenPerSec",
// new Instrument<Double>() {
// public void sample() {
// final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.);
// final double bytesWrittenPerSec = (diskWriteSecs == 0L ? 0d
// : bytesWrittenOnDisk
// / diskWriteSecs);
// setValue(bytesWrittenPerSec);
// }
// });
//
// disk.addCounter("secsPerWrite", new Instrument<Double>() {
// public void sample() {
// final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.);
// final double writeLatency = (diskWriteSecs == 0 ? 0d
// : diskWriteSecs / ndiskWrite);
// setValue(writeLatency);
// }
// });
//
// /*
// * other
// */
//
// disk.addCounter("nforce", new Instrument<Long>() {
// public void sample() {
// setValue(nforce);
// }
// });
//
// disk.addCounter("nextend", new Instrument<Long>() {
// public void sample() {
// setValue(ntruncate);
// }
// });
//
// disk.addCounter("nreopen", new Instrument<Long>() {
// public void sample() {
// setValue(nreopen);
// }
// });
//
// disk.addCounter("rootBlockWrites", new Instrument<Long>() {
// public void sample() {
// setValue(nwriteRootBlock);
// }
// });
//
// }
//
// }
//
// return root;
//
// }
// private CounterSet root;
//
// /**
// * Human readable representation of the counters.
// */
// public String toString() {
//
// return getCounters().toString();
//
// }
//
// } // class StoreCounters
/**
 * Performance counters for this class.
 * <p>
 * Note: declared <code>volatile</code> because the reference is replaced
 * by {@link #setStoreCounters(StoreCounters)} while holding the monitor,
 * but read by {@link #getStoreCounters()} without synchronization. Without
 * <code>volatile</code> there is no happens-before edge between the write
 * and the unsynchronized read, so a reader could observe a stale
 * reference indefinitely.
 */
private volatile StoreCounters storeCounters = new StoreCounters();
/**
 * Returns the performance counters for the store.
 *
 * @return The current {@link StoreCounters} (never <code>null</code>).
 */
public StoreCounters getStoreCounters() {
    return storeCounters;
}
/**
 * Replaces the {@link StoreCounters} object.
 *
 * @param storeCounters
 *            The new {@link StoreCounters}.
 *
 * @throws IllegalArgumentException
 *             if the argument is <code>null</code>.
 */
public void setStoreCounters(final StoreCounters storeCounters) {
    if (storeCounters == null) {
        throw new IllegalArgumentException();
    }
    // Replace the reference while holding the monitor.
    synchronized (this) {
        this.storeCounters = storeCounters;
    }
}
/**
 * Return interesting information about the write cache and file operations.
 */
public CounterSet getCounters() {
    final CounterSet root = new CounterSet();
    // The next offset at which a record would be written on the store.
    root.addCounter("nextOffset", new Instrument<Long>() {
        public void sample() {
            setValue(nextOffset.get());
        }
    });
    // The current extent of the backing file.
    root.addCounter("extent", new Instrument<Long>() {
        public void sample() {
            setValue(extent);
        }
    });
    // Attach the per-store performance counters.
    root.attach(storeCounters.getCounters());
    // Write cache counters (capacity only; zero when the cache is disabled).
    final CounterSet writeCacheCounterSet = root.makePath("writeCache");
    final WriteCache cache = DiskOnlyStrategy.this.writeCache;
    writeCacheCounterSet.addCounter("capacity", new OneShotInstrument<Long>(
            cache == null ? 0L : cache.capacity()));
    return root;
}
/**
 * Create or re-open the disk-based store.
 *
 * @param maximumExtent
 *            The maximum extent to which the store may grow.
 * @param fileMetadata
 *            Metadata for the backing file: current extent, offset bits,
 *            next free offset, buffer mode, file mode, and whether the
 *            file is read-only and/or the write cache is enabled.
 *
 * @throws RuntimeException
 *             if the file is not open and this is not a temporary store,
 *             or if interrupted while acquiring the direct buffer for the
 *             write cache.
 */
DiskOnlyStrategy(final long maximumExtent, final FileMetadata fileMetadata) {
    super(fileMetadata.extent, maximumExtent, fileMetadata.offsetBits,
            fileMetadata.nextOffset, fileMetadata.getBufferMode(),
            fileMetadata.readOnly);
    this.file = fileMetadata.file;
    this.fileMode = fileMetadata.fileMode;
    // A Temporary store defers creation of the backing file; see
    // createBackingFile().
    this.temporaryStore = (fileMetadata.getBufferMode()==BufferMode.Temporary);
    this.raf = fileMetadata.raf;
    this.fileOpened = raf != null;
    if (!temporaryStore && !fileOpened) {
        throw new RuntimeException(
                "File not open and not a temporary store");
    }
    this.extent = fileMetadata.extent;
    this.headerSize = FileMetadata.headerSize0;
    // The user extent excludes the root-block header region.
    this.userExtent = extent - headerSize;
    /*
     * Enable the write cache?
     *
     * Note: Do NOT enable the write cache if the file is being opened in a
     * read-only mode.
     *
     * Note: If the file has been closed for writes (closeTime != 0L), then
     * the file is read-only regardless of the mode in which it was opened.
     *
     * Note: NIO always (at least up to Java 6) allocates a "temporary"
     * direct byte buffer for disk read/write operations on a heap buffer
     * AND there is a bug in the release of those buffers. Therefore do NOT
     * pass in a heap byte buffer for the write cache!!!
     */
    if (fileMetadata.writeCacheEnabled && !fileMetadata.readOnly
            && fileMetadata.closeTime == 0L) {
        final IBufferAccess tmp;
        try {
            /*
             * Note: a timeout here is not such a good idea. It could be
             * triggered by a GC pause with the resulting temp store then
             * lacking a write cache.
             */
            tmp = DirectBufferPool.INSTANCE.acquire();
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
        if (log.isInfoEnabled())
            log.info("Enabling writeCache: capacity=" + tmp.buffer().capacity());
        writeCache = new WriteCache(tmp);
    } else {
        // Read-only (or closed for writes): no write cache.
        writeCache = null;
    }
    // the offset at which the next record would be written on the file.
    writeCacheOffset = fileMetadata.nextOffset;
}
/**
 * Always <code>true</code>: this strategy is backed by a file on disk.
 */
final public boolean isStable() {
    return true;
}
/**
 * Always <code>false</code>: records are read from the disk (or the write
 * cache) on demand rather than being fully held in memory.
 */
public boolean isFullyBuffered() {
    return false;
}
/**
 * {@link #flushWriteCache() flushes} the {@link #writeCache} before syncing
 * the disk.
 *
 * @param metadata
 *            When <code>true</code>, also forces the file metadata to
 *            stable storage.
 */
public void force(final boolean metadata) {
    assertOpen();
    synchronized(this) {
        // flush all pending writes to disk.
        flushWriteCache();
    }
    try {
        if(!temporaryStore) {
            // sync the disk (skipped for a temporary store, which is
            // never required to survive restart).
            getChannel().force(metadata);
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // Note: incremented outside of synchronization, consistent with the
    // other counter updates in this class.
    storeCounters.nforce++;
}
/**
 * Closes the file immediately (without flushing any pending writes).
 * <p>
 * Releases the write cache buffer back to its pool, closes the channel,
 * and deletes the backing file when the buffer mode is not stable (e.g.,
 * a temporary store).
 */
synchronized public void close() {
    /*
     * Note: this clears the [open] flag. It is important to do this first
     * so that we do not re-open the channel once it has been closed.
     */
    super.close();
    // Release the write cache (returns its direct buffer to the pool).
    releaseWriteCache();
    try {
        if (raf != null) {
            // Note: the method is already synchronized; this inner block
            // is redundant but harmless.
            synchronized (this) {
                if (raf != null && raf.getChannel().isOpen()) {
                    raf.close();
                }
            }
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // A non-stable mode does not survive close(): delete the backing file
    // if it was ever created (best effort; failure is only logged).
    if (!bufferMode.isStable() && file.exists()) {
        if (!file.delete()) {
            log.warn("Unable to delete temporary file: " + file);
        }
    }
}
/**
 * Deletes the backing file if one was ever opened.
 *
 * @throws IllegalStateException
 *             if the store is still open.
 */
public void deleteResources() {

    if (isOpen()) {
        throw new IllegalStateException();
    }

    if (fileOpened && file.exists()) {

        // Best effort: log rather than fail if the delete is refused.
        if (!file.delete()) {

            log.warn("Could not delete file: " + file.getAbsoluteFile());

        }

    }

}
/**
 * The current extent of the backing file (including the root-block
 * header).
 */
final public long getExtent() {
    return extent;
}
/**
 * The extent available for user records (the file extent less the
 * root-block header).
 */
final public long getUserExtent() {
    return userExtent;
}
/**
 * Read a record from the store.
 * <p>
 * The record is first sought in the {@link #writeCache} (if enabled) and
 * otherwise read through to the disk. In both cases the data are copied
 * into a newly allocated buffer so the caller never holds a view onto the
 * (mutable) write cache.
 * <p>
 * Note: {@link ClosedChannelException} and
 * {@link AsynchronousCloseException} can get thrown out of this method
 * (wrapped as {@link RuntimeException}s) if a reader task is interrupted.
 *
 * @param addr
 *            The address of the record.
 *
 * @return A buffer containing a copy of the record's data, positioned at
 *         zero.
 *
 * @throws IllegalArgumentException
 *             if the address is 0L, the record length is zero, or the
 *             address has not been written on the store.
 */
public ByteBuffer read(final long addr) {

    final long begin = System.nanoTime();

    if (addr == 0L)
        throw new IllegalArgumentException(ERR_ADDRESS_IS_NULL);

    final long offset = getOffset(addr);

    final int nbytes = getByteCount(addr);

    if (nbytes == 0) {

        throw new IllegalArgumentException(ERR_RECORD_LENGTH_ZERO);

    }

    if (offset + nbytes > nextOffset.get()) {

        throw new IllegalArgumentException(ERR_ADDRESS_NOT_WRITTEN);

    }

    /*
     * Allocate a new buffer of the exact capacity.
     *
     * Note: we do this even if we are reading from the writeCache since the
     * writeCache may be flushed and re-written while the caller is holding
     * onto the returned buffer. If the buffer were a view onto the
     * writeCache, then this would cause the data in the returned view to
     * change!
     */
    final ByteBuffer dst = ByteBuffer.allocate(nbytes);

    /*
     * We need to synchronize before we test the write cache since otherwise
     * the cache contents could change asynchronously.
     *
     * FIXME The synchronization block also covers the case when we have to
     * read through to the disk. Ideally we would not have to remain
     * synchronized for that operation, but inconsistencies have been
     * observed when synchronization is not maintained during the read
     * (demonstrated by AbstractMRMWTestCase, apparently interacting with
     * the OS or hardware disk cache). A likely explanation is that without
     * the lock, NIO READ operations can be concurrent with WRITEs on the
     * channel and there are methods on this class that DO NOT retry writes
     * if the channel is concurrently closed. Measured cost of holding the
     * lock was modest (write 3.3 vs 3.6 mb/s, read 11.4 vs 13.2 mb/s).
     */
    synchronized (this)
    {
        if (nbytes > storeCounters.maxReadSize) {

            // track the largest read issued against the store.
            storeCounters.maxReadSize = nbytes;

        }

        /*
         * Check the write cache for this address.
         */
        if (writeCache != null) {

            final ByteBuffer tmp = writeCache.read(addr, nbytes);

            if (tmp != null) {

                // copy the data into [dst].
                dst.put(tmp);

                // flip buffer for reading.
                dst.flip();

                /*
                 * Update counters while synchronized.
                 */
                storeCounters.nreads++;
                storeCounters.bytesRead += nbytes;
                storeCounters.elapsedReadNanos += (System.nanoTime() - begin);

                // return the new buffer.
                return dst;

            }

        }

        /*
         * Read through to the disk.
         */
        final long beginDisk = System.nanoTime();

        // the offset into the disk file (adjusted for the root blocks).
        final long pos = offset + headerSize;

        try {

            // Note: FileChannelUtility transparently re-opens the channel
            // via [opener] if it was closed asynchronously.
            storeCounters.ndiskRead += FileChannelUtility.readAll(opener, dst,
                    pos);

        } catch (IOException ex) {

            throw new RuntimeException(ex);

        }

        // flip for reading.
        dst.flip();

        /*
         * Update counters while synchronized.
         */
        storeCounters.nreads++;
        storeCounters.bytesRead += nbytes;
        storeCounters.bytesReadFromDisk += nbytes;
        storeCounters.elapsedReadNanos += (System.nanoTime() - begin);
        storeCounters.elapsedDiskReadNanos += (System.nanoTime() - beginDisk);

        // return the buffer.
        return dst;

    } // synchronized(this)

}
/**
 * Used to re-open the {@link FileChannel} in this class.
 * <p>
 * Note: Passed to {@link FileChannelUtility} so that reads can
 * transparently re-open a channel that was closed asynchronously.
 */
private final IReopenChannel<FileChannel> opener = new IReopenChannel<FileChannel>() {

    // reports the backing file in log/error messages.
    public String toString() {

        return file.toString();

    }

    public FileChannel reopenChannel() throws IOException {

        // delegate to the outer class, which synchronizes.
        return DiskOnlyStrategy.this.reopenChannel();

    }

};
/**
 * This method transparently re-opens the channel for the backing file.
 * <p>
 * Note: This method is synchronized so that concurrent readers do not try
 * to all open the store at the same time.
 *
 * @return The (possibly re-opened) channel for the backing file.
 *
 * @throws IOException
 *             if the file cannot be re-opened.
 *
 * @todo This method is ONLY invoked by readers. It should be used for
 *       writers as well. Note that this method WILL NOT be invoked by
 *       {@link FileChannelUtility} if the channel was closed by an
 *       interrupt in the current thread (a different exception is thrown).
 */
synchronized private FileChannel reopenChannel() throws IOException {
    assertOpen();
    if (raf != null && raf.getChannel().isOpen()) {
        /* The channel is still open. If you are allowing concurrent reads
         * on the channel, then this could indicate that two readers each
         * found the channel closed and that one was able to re-open the
         * channel before the other such that the channel was open again
         * by the time the 2nd reader got here.
         */
        return raf.getChannel();
    }
    if(temporaryStore && !fileOpened) {
        /*
         * The backing file has not been opened.
         *
         * Note: Without this case this method would create the backing
         * store for a Temporary store if anyone happened to invoke it. In
         * fact, this method will never get invoked for a Temporary store
         * without a backing store since the reads never read against the
         * channel because it does not exist. So, really, this is just here
         * to be paranoid.
         */
        throw new AssertionError("TemporaryStore not yet open: "+file);
    }
    // open the file.
    this.raf = new RandomAccessFile(file, fileMode);
    if (log.isInfoEnabled())
        log.info("(Re-)opened file: " + file);
    try {
        /*
         * Request a file lock (shared when read-only, exclusive when
         * opened read/write).
         */
        final boolean readOnly = "r".equals(fileMode);
        if (raf.getChannel()
                .tryLock(0, Long.MAX_VALUE, readOnly/* shared */) == null) {
            /*
             * Note: A null return indicates that someone else holds the
             * lock. This can happen if the platform does not support shared
             * locks or if someone requested an exclusive file lock.
             */
            try {
                raf.close();
            } catch (Throwable t) {
                // ignore - already on a failure path.
            }
            // NOTE(review): this IOException is caught by the catch block
            // below and merely logged, after [raf] was closed above - the
            // method then returns a closed channel. Looks like a latent
            // bug; confirm the intended behavior.
            throw new IOException("File already locked? file=" + file);
        }
    } catch (IOException ex) {
        /*
         * Note: This is true of NFS volumes. This is Ok and should be
         * ignored. However the backing file is not protected against
         * accidental deletes or overwrites.
         */
        if (log.isInfoEnabled())
            log.info("FileLock not supported: file=" + file, ex);
    }
    storeCounters.nreopen++;
    return raf.getChannel();
}
/**
 * Allocate an address at which a record of <i>nbytes</i> may be written,
 * extending the file if necessary.
 *
 * @param nbytes
 *            The #of bytes in the record (must be positive).
 *
 * @return The address assigned to the record.
 *
 * @throws IllegalStateException
 *             if the store is read-only.
 * @throws IllegalArgumentException
 *             if <i>nbytes</i> is non-positive.
 */
private long allocate(final int nbytes) {
    if (isReadOnly())
        throw new IllegalStateException(ERR_READ_ONLY);
    if (nbytes <= 0)
        throw new IllegalArgumentException(ERR_BAD_RECORD_SIZE);
    final long addr; // address in the store.
    synchronized(this) {
        /*
         * The offset at which the record will be written on the disk file
         * (not adjusted for the root blocks).
         */
        final long offset = nextOffset.get();
        /*
         * Make sure that the allocated region of the file exists (extends
         * the file if required) BEFORE the offset is advanced.
         */
        overflow(offset, nbytes);
        /*
         * Formulate the address that can be used to recover that record.
         */
        addr = toAddr(nbytes, offset);
        /*
         * Increment the offset of the next address to be assigned by the
         * #of bytes in the record.
         */
        nextOffset.addAndGet(nbytes);
    }
    return addr;
}
// /**
// * FIXME The {@link #update(long, int, ByteBuffer)} API was introduced to
// * support touch ups of the leaves generated by the
// * {@link IndexSegmentBuilder} and the notional support for writable blocks,
// * which was never realized (blobs should be send to the file system). At
// * this time, update() is only used by that class and the
// * {@link IndexSegmentBuilder} could use double-buffer the leaves or just
// * write them out directly onto the output store. Update SHOULD BE REMOVED
// * from the API since it allows non-append semantics and thus makes it much
// * more complicated to implement write pipelines for journal level failover.
// */
// public void update(final long addr, final int off, final ByteBuffer data) {
//
// if (addr == 0L)
// throw new IllegalArgumentException(ERR_ADDRESS_IS_NULL);
//
// if (off < 0)
// throw new IllegalArgumentException("Offset is negative");
//
// if (data == null)
// throw new IllegalArgumentException(ERR_BUFFER_NULL);
//
// if (isReadOnly())
// throw new IllegalStateException(ERR_READ_ONLY);
//
// // The offset of the record in the store (not adjusted for the root blocks).
// final long addrOffset = getOffset(addr);
//
// // The size of the record (NOT the #of bytes to be written).
// final int addrByteCount = getByteCount(addr);
//
// if (addrOffset + addrByteCount > nextOffset) {
//
// throw new IllegalArgumentException(ERR_ADDRESS_NOT_WRITTEN);
//
// }
//
// // #of bytes to be updated on the pre-existing record.
// final int nbytes = data.remaining();
//
// if (nbytes == 0)
// throw new IllegalArgumentException(ERR_BUFFER_EMPTY);
//
// if (off + nbytes > addrByteCount) {
//
// throw new IllegalArgumentException(ERR_BUFFER_OVERRUN);
//
// }
//
// final long begin = System.nanoTime();
//
// synchronized(this) {
//
// try {
//
// if (writeCache != null) {
//
// /*
// * Check the writeCache. If the record is found in the write
// * cache then we just update the slice of the record
// * corresponding to the caller's request. This is a common
// * use case and results in no IO.
// */
//
// final long beginCache = System.nanoTime();
//
// try {
//
// final ByteBuffer view = writeCache.read(addr,addrByteCount);
//
// if (view != null) {
//
// // adjust the limit on the record in the write
// // cache.
// view.limit(off + nbytes);
//
// // adjust the position on the record in the write
// // cache.
// view.position(off);
//
// // copy the caller's data onto the record in the
// // write
// // cache.
// view.put(data);
//
// // count this as a cache write.
// storeCounters.ncacheWrite++;
//
// // Done.
// return;
//
// }
//
// } finally {
//
// // track the write cache time.
// storeCounters.elapsedCacheWriteNanos += (System.nanoTime() - beginCache);
//
// }
//
// }
//
// /*
// * Either the writeCache is disabled or the record was not found
// * in the write cache so just write the record directly on the
// * disk.
// *
// * Note: for this case we might be able to move the write
// * outside of the synchronized() block IFF we also cloned the
// * data (since the caller is allowed to modify the buffer as
// * soon as write() returns).
// *
// * Note: We MUST NOT update the writeCacheOffset since we are
// * probably writing behind the end of the file (this is contrary
// * to a normal write write is an append at the end of the file).
// */
//
// writeOnDisk(data, addrOffset + off/* adjustedOffset */, false/* append */);
//
// } finally {
//
// /*
// * Update counters while we are synchronized. If done outside of
// * the synchronization block then we need to use AtomicLongs
// * rather than primitive longs.
// */
//
// storeCounters.nwrites++;
// storeCounters.bytesWritten += nbytes;
// storeCounters.elapsedWriteNanos += (System.nanoTime() - begin);
//
// if(nbytes > storeCounters.maxWriteSize) {
//
// storeCounters.maxWriteSize = nbytes;
//
// }
//
// }
//
// } // synchronized
//
// }
/**
 * Write a record on the store.
 * <p>
 * The record is appended: an address is allocated under the monitor and
 * the data are either queued on the {@link #writeCache} or, when the
 * record is larger than the cache (or the cache is disabled), written
 * directly on the disk.
 *
 * @param data
 *            The data (required, non-empty). Bytes from the position to
 *            the limit are written; the position is advanced to the limit.
 *
 * @return The address assigned to the record.
 *
 * @throws IllegalArgumentException
 *             if <i>data</i> is <code>null</code> or empty.
 * @throws IllegalStateException
 *             if the store is read-only.
 */
public long write(final ByteBuffer data) {

    if (data == null)
        throw new IllegalArgumentException(ERR_BUFFER_NULL);

    if (isReadOnly())
        throw new IllegalStateException(ERR_READ_ONLY);

    // #of bytes to store.
    final int nbytes = data.remaining();

    if (nbytes == 0)
        throw new IllegalArgumentException(ERR_BUFFER_EMPTY);

    final long begin = System.nanoTime();

    final long addr; // address in the store.

    synchronized (this) {

        /*
         * Allocate address for a new record with [nbytes] of data.
         */
        addr = allocate(nbytes);

        /*
         * The offset at which the record will be written on the disk file
         * (not adjusted for the root blocks).
         */
        final long offset = getOffset(addr);

        if (writeCache != null) {

            /*
             * Flush the writeCache if the record would cause it to
             * overflow.
             */
            if (nbytes + writeCache.position() > writeCache.capacity()) {

                flushWriteCache();

            }

            if (nbytes > writeCache.capacity()) {

                /*
                 * This record is too big for the write cache so we write
                 * the record directly on the disk.
                 */
                writeOnDisk(data, offset, true/* append */);

            } else {

                /*
                 * Queue up the write in the writeCache.
                 */
                writeCache.write(addr, data);

            }

        } else {

            /*
             * The writeCache is disabled so just write the record directly
             * on the disk.
             *
             * Note: for this case we might be able to move the write
             * outside of the synchronized() block IFF we also cloned the
             * data (since the caller is allowed to modify the buffer as
             * soon as write() returns).
             *
             * Note: We update the writeCacheOffset even when the writeCache
             * is disabled just to keep it consistent. This allows for the
             * possibility that the writeCache could be enabled and disabled
             * at will.
             */
            writeOnDisk(data, offset, true/* append */);

        }

        /*
         * Update counters while we are synchronized. If done outside of the
         * synchronization block then we need to use AtomicLongs rather than
         * primitive longs.
         */
        storeCounters.nwrites++;
        storeCounters.bytesWritten += nbytes;
        storeCounters.elapsedWriteNanos += (System.nanoTime() - begin);

        if (nbytes > storeCounters.maxWriteSize) {

            storeCounters.maxWriteSize = nbytes;

        }

    } // synchronized

    return addr;

}
/**
 * Make sure that the file is large enough to accept a write of
 * <i>nbytes</i> starting at <i>offset</i> bytes into the file.
 * <p>
 * Note: The caller MUST be synchronized on <i>this</i>.
 *
 * @param offset
 *            The offset into the file (NOT adjusted for the root blocks).
 * @param nbytes
 *            The #of bytes to be written at that offset.
 *
 * @throws OverflowException
 *             if the file could not be extended by the required amount.
 */
private void overflow(final long offset, final int nbytes) {

    // #of bytes by which the write would run past the user extent.
    final long shortfall = offset + nbytes - userExtent;

    if (shortfall <= 0) {

        // The file is already large enough.
        return;

    }

    if (!overflow(shortfall)) {

        throw new OverflowException();

    }

}
/**
 * Create/open the backing file for a {@link BufferMode#Temporary} store iff
 * it has not been created/opened.
 */
final private void createBackingFile() {

    if (fileOpened || !temporaryStore) {

        // Already open, or not a temporary store: nothing to do.
        return;

    }

    try {

        // note: set true so that reopenChannel will create the file.
        fileOpened = true;

        reopenChannel();

        if (log.isInfoEnabled())
            log.info("Opened backing file for temporary store: "
                    + file);

    } catch (IOException e) {

        throw new RuntimeException("Could not open temp file: file="
                + file, e);

    }

}
/**
 * Write the data on the disk (synchronous).
 * <p>
 * Note: The caller MUST be synchronized on <i>this</i>.
 * <p>
 * Note: This updates {@link #writeCacheOffset} as well (but only if the
 * write is an append).
 * <p>
 * Note: It is possible for the (currently commented out) update() method
 * to force a non-append write that is beyond the {@link #writeCacheOffset}.
 * This will occur if the record that is being updated is too large for the
 * {@link #writeCache} while there are also records buffered by this write
 * cache.
 *
 * @param data
 *            The data. The bytes from the current
 *            {@link ByteBuffer#position()} to the
 *            {@link ByteBuffer#limit()} will be written and the
 *            {@link ByteBuffer#position()} will be advanced to the
 *            {@link ByteBuffer#limit()} . The caller may subsequently
 *            modify the contents of the buffer without side effects (i.e.,
 *            the data are copied onto the disk).
 * @param offset
 *            The offset in the file at which the data will be written.
 * @param append
 *            <code>true</code> iff the write is an append (most record
 *            writes are appends).
 *
 * @todo When integrating the new WriteCache, this method will still have to
 *       make sure that the backing file exists and handle overflow of the
 *       file (file extension).
 */
private void writeOnDisk(final ByteBuffer data, final long offset, final boolean append) {
    final long begin = System.nanoTime();
    // Lazily create the backing file for a temporary store.
    createBackingFile();
    final int nbytes = data.remaining();
    // make sure that the file is large enough.
    overflow(offset, nbytes);
    /*
     * The position in the file at which the record will be written
     * (this is adjusted for the root blocks).
     */
    final long pos = offset + headerSize;
    try {
        /*
         * Write bytes in [data] from position to limit onto the channel.
         *
         * Note: Since the caller is synchronized on [this] it SHOULD NOT be
         * possible for a reader is to be interrupted during a concurrent
         * NIO operation and thus the channel SHOULD NOT be asynchronously
         * closed while we are writing on it.
         */
        storeCounters.ndiskWrite += FileChannelUtility.writeAll(getChannel(),
                data, pos);
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    // update the next offset at which data will be written on the disk.
    if(append) {
        writeCacheOffset += nbytes;
    }
    final long elapsed = (System.nanoTime() - begin);
    storeCounters.bytesWrittenOnDisk += nbytes;
    storeCounters.elapsedDiskWriteNanos += elapsed;
    // Debug trace; disabled via the constant-false guard.
    if (false&&BigdataStatics.debug) {
        /*
         * Note: There are only two places where the journal writes on the
         * disk using this backing buffer implementation. Here and when it
         * updates the root blocks. It only syncs the disk at the commit.
         */
        System.err.println("wrote on disk: bytes="
                + nbytes
                + ", elapsed="
                + TimeUnit.NANOSECONDS.toMillis(elapsed)
                + "ms; totals: write="
                + TimeUnit.NANOSECONDS
                        .toMillis(storeCounters.elapsedDiskWriteNanos)
                + "ms, read="
                + TimeUnit.NANOSECONDS
                        .toMillis(storeCounters.elapsedDiskReadNanos)
                + "ms");
    }
}
/**
 * Read the indicated root block from the backing file.
 *
 * @param rootBlock0
 *            When <code>true</code> read root block ZERO, otherwise root
 *            block ONE.
 *
 * @return A buffer containing the root block, positioned at zero.
 *
 * @throws IllegalStateException
 *             if the store is not open.
 */
public ByteBuffer readRootBlock(final boolean rootBlock0) {

    if (!isOpen())
        throw new IllegalStateException();

    // Allocate a buffer sized for exactly one root block.
    final ByteBuffer buf = ByteBuffer
            .allocate(RootBlockView.SIZEOF_ROOT_BLOCK);

    // The offset of the requested root block in the file.
    final long pos = rootBlock0 ? FileMetadata.OFFSET_ROOT_BLOCK0
            : FileMetadata.OFFSET_ROOT_BLOCK1;

    try {

        // Note: [opener] transparently re-opens the channel if needed.
        FileChannelUtility.readAll(opener, buf, pos);

        // rewind so the caller can read from the start of the record.
        buf.position(0);

    } catch (IOException ex) {

        throw new RuntimeException(ex);

    }

    return buf;

}
/**
 * Write the given root block on the backing file and optionally force the
 * write (and the file metadata) to stable storage.
 * <p>
 * Note: Root blocks are written for a temporary store in support of
 * rollback().
 *
 * @param rootBlock
 *            The root block (required).
 * @param forceOnCommit
 *            Governs whether the write is forced to disk and whether the
 *            file metadata are forced as well.
 *
 * @throws IllegalArgumentException
 *             if <i>rootBlock</i> is <code>null</code>.
 */
public void writeRootBlock(final IRootBlockView rootBlock,
        final ForceEnum forceOnCommit) {
    if (rootBlock == null)
        throw new IllegalArgumentException();
    try {
        final ByteBuffer data = rootBlock.asReadOnlyBuffer();
        // Select the file offset for root block ZERO vs root block ONE.
        final long pos = rootBlock.isRootBlock0() ? FileMetadata.OFFSET_ROOT_BLOCK0
                : FileMetadata.OFFSET_ROOT_BLOCK1;
        /*
         * Note: Synchronized on [this] to prevent concurrent NIO requests
         * which might lead to the channel being closed asynchronously.
         */
        synchronized(this) {
            FileChannelUtility.writeAll(getChannel(), data, pos);
        }
        if (forceOnCommit != ForceEnum.No) {
            // sync to disk; also sync the file metadata if requested.
            force(forceOnCommit == ForceEnum.ForceMetadata);
        }
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    if (log.isDebugEnabled())
        log.debug("wrote root block: "+rootBlock);
    storeCounters.nwriteRootBlock++;
}
/**
 * Extend (or truncate) the backing file to the given total extent.
 *
 * @param newExtent
 *            The new total file extent (including the root-block header).
 *
 * @throws IllegalArgumentException
 *             if the new user extent would be less than the next assigned
 *             offset (data would be lost).
 */
synchronized public void truncate(final long newExtent) {
    final long newUserExtent = newExtent - headerSize;
    if (newUserExtent < getNextOffset() ) {
        throw new IllegalArgumentException(ERR_TRUNCATE);
    }
    if(newUserExtent == getUserExtent()) {
        // NOP.
        return;
    }
    /*
     * Note: This handles the case for a Temporary store where the write
     * cache is the same size as the initial extent and everything written
     * so far has been absorbed by the write cache.
     */
    createBackingFile();
    try {
        // extend (or truncate) the file.
        getRandomAccessFile().setLength(newExtent);
        /*
         * Since we just changed the file length we force the data to disk
         * and update the file metadata. this is a relatively expensive
         * operation but we want to make sure that we do not loose track of
         * a change in the length of the file.
         *
         * @todo an alternative would be to set a marker on the buffer such
         * that the next force() also forced the metadata to disk.
         */
        if (!temporaryStore) {
            force(true);
        }
        storeCounters.ntruncate++;
        if(log.isInfoEnabled())
            log.info("newLength=" + cf.format(newExtent) + ", file="
                    + file);
        if(log.isInfoEnabled())
            log.info(getCounters().toString());
    } catch(IOException ex) {
        /*
         * I've see an IOException "The handle is invalid" tossed here (just
         * once). A bit of searching around suggests that perhaps the
         * RandomAccessFile was concurrently closed? Something to look out
         * for if it happens again. [@todo probably a concurrent reader was
         * interrupted, in which case this method should just try the
         * setLength() operation again.] [@todo the MRMW test can throw this
         * during test shutdown, which simulates interrupt of NIO
         * operations].
         */
        throw new RuntimeException(ex);
    }
    // Update the in-memory extents only after the file was resized.
    this.userExtent = newUserExtent;
    this.extent = newExtent;
}
/**
 * Transfer the store's data onto the given file.
 *
 * @param out
 *            The target file (required).
 *
 * @return The #of bytes transferred (as reported by the superclass).
 *
 * @throws IllegalArgumentException
 *             if <i>out</i> is <code>null</code>.
 * @throws IOException
 *             if the transfer fails.
 */
synchronized public long transferTo(final RandomAccessFile out)
        throws IOException {

    if (out == null) {
        throw new IllegalArgumentException();
    }

    /*
     * Push any buffered writes through to the disk so that everything we
     * want to transfer from channel to channel is actually on the source
     * channel. This also covers the Temporary store case where the backing
     * file has not even been created yet.
     */
    flushWriteCache();

    final long nbytes = super.transferFromDiskTo(this, out);

    return nbytes;

}
/**
 * Extended to discard the write cache.
 * <p>
 * Note: The file is NOT closed and re-opened in a read-only mode in order
 * to avoid causing difficulties for concurrent readers.
 */
public void closeForWrites() {
    // sets the [readOnly] flag.
    super.closeForWrites();
    // discard the write cache (returns its direct buffer to the pool).
    releaseWriteCache();
}
/**
 * Release the direct buffer backing the {@link #writeCache} (if any) back
 * to its pool and clear the cache's buffer reference.
 * <p>
 * Note: Safe to invoke more than once; a NOP once the buffer has been
 * released (or if the write cache was never enabled).
 *
 * @throws RuntimeException
 *             wrapping an {@link InterruptedException} if interrupted
 *             while releasing the buffer; the thread's interrupt status is
 *             restored before the exception is thrown.
 */
synchronized private final void releaseWriteCache() {

    final IBufferAccess tmp = writeCache == null ? null : writeCache.buf;

    if (tmp == null)
        return;

    try {

        tmp.release();

    } catch (InterruptedException e) {

        // Restore the interrupt status before wrapping the exception so
        // callers can still observe the interrupt.
        Thread.currentThread().interrupt();

        throw new RuntimeException(e);

    } finally {

        // Clear the reference so the buffer is not released twice.
        writeCache.buf = null;

    }

}
/**
 * NOP: deletes are ignored by this strategy (storage is not reclaimed).
 */
public void delete(long addr) {
    // void behaviour
}
/**
 * NOP for the standard Disk strategy: the next offset is advanced by
 * {@link #allocate(int)} rather than being set externally.
 */
public void setNextOffset(long lastOffset) {
    // void for standard Disk strategy
}
/**
 * NOP: this strategy does not retain a reference to the commit record
 * index.
 */
public void setCommitRecordIndex(CommitRecordIndex commitRecordIndex) {
    // NOP
}
}