/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.btree;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.lang.ref.WeakReference;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.UUID;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.Instrument;
import com.bigdata.counters.OneShotInstrument;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.FileChannelUtility;
import com.bigdata.io.IBufferAccess;
import com.bigdata.io.IReopenChannel;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.RootBlockException;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.rawstore.AbstractRawStore;
import com.bigdata.resources.StoreManager;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.EventType;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.ResourceService;
/**
* A read-only store backed by a file containing a single {@link IndexSegment}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class IndexSegmentStore extends AbstractRawStore {
/**
 * Logger shared by all {@link IndexSegmentStore} instances.
 */
protected static final Logger log = Logger
.getLogger(IndexSegmentStore.class);
/**
 * The mode that will be used to open the {@link #file}. This is always
 * <code>"r"</code> (read-only): {@link #write(ByteBuffer)} and
 * {@link #force(boolean)} are not supported by this class.
 */
protected static final String mode = "r";
/**
 * The file containing the index segment (never <code>null</code>).
 */
protected final File file;
/**
 * For reporting via {@link #getResourceMetadata()}. This is created by the
 * constructor from the {@link #file} and the segment UUID and commit time
 * found in the checkpoint record.
 */
final private SegmentMetadata segmentMetadata;
/**
 * Used to correctly decode region-based addresses. The
 * {@link IndexSegmentBuilder} encodes region-based addresses using
 * {@link IndexSegmentRegion}. Those addresses are then transparently
 * decoded by this class. The {@link IndexSegment} itself knows nothing
 * about this entire sleight of hand.
 * <p>
 * Note: Don't deallocate. It is small and holds useful metadata such as the
 * #of index entries that we would always like to have on hand.
 */
private final IndexSegmentAddressManager addressManager;
/**
 * Optional store cache for the bloom filter, index metadata, and the B+Tree
 * nodes and leaves (MAY be <code>null</code>).
 * <p>
 * Note: The constructor currently always assigns <code>null</code> to this
 * field.
 */
// @see BLZG-1501 (remove LRUNexus)
@Deprecated
private final ConcurrentMap<Long, Object> storeCache;
/**
 * An optional <strong>direct</strong> {@link ByteBuffer} containing a disk
 * image of the nodes in the {@link IndexSegment}.
 * <p>
 * Note: This buffer is acquired from the {@link DirectBufferPool} and MUST
 * be released back to that pool. {@link #_close()} releases the buffer and
 * clears this reference.
 * <p>
 * Note: While some nodes will be held in memory by the hard reference queue
 * the use of this buffer means that reading a node that has fallen off of
 * the queue does not require any IO.
 */
private volatile IBufferAccess buf_nodes;
/**
 * The random access file used to read the index segment. This is
 * transparently re-opened if closed by an interrupt during an NIO
 * operation.
 * <p>
 * A shared {@link FileLock} is requested. If the platform and the volume
 * either DO NOT support {@link FileLock} or support <em>shared</em>
 * {@link FileLock}s then you will be able to open the same
 * {@link IndexSegmentStore} in multiple applications. However, if the
 * platform does not support shared locks then the lock request is converted
 * (by Java) into an exclusive {@link FileLock} and you will not be able to
 * open the {@link IndexSegmentStore} in more than one application at a
 * time.
 * <p>
 * Note: A shared {@link FileLock} makes it impossible to delete an
 * {@link IndexSegmentStore} that is in use. {@link FileLock}s are
 * automatically released when the {@link FileChannel} is closed or the
 * application dies. Using an advisory lock is NOT a good idea as it can
 * leave lock files in place which make it impossible to restart a data
 * service after an abnormal termination. For that reason it is better to
 * NOT use advisory locks on platforms and volumes which do not support
 * {@link FileLock}.
 * <p>
 * Note: This field is assigned by {@link #reopenChannel()} and cleared by
 * {@link #_close()}.
 *
 * @see #reopenChannel()
 */
private volatile RandomAccessFile raf;
/**
*
* @see #raf
*/
/**
 * A read-only view of the checkpoint record for the index segment. The
 * record is read from the backing file by the constructor.
 * <p>
 * Note: Don't deallocate. It is small and holds useful metadata such as the
 * #of index entries that we would always like to have on hand.
 */
private final IndexSegmentCheckpoint checkpoint;
/**
 * The metadata record for the index segment.
 * <p>
 * Note: Don't deallocate. Relatively small and it holds some important
 * metadata. By reading this during the ctor we do not have to force the
 * entire index segment to be loaded just to access the index metadata.
 */
private final IndexMetadata indexMetadata;
/**
 * Counters specific to the {@link IndexSegmentStore}.
 * <p>
 * Note: These are plain <code>long</code> fields. The open/close counters
 * are incremented while holding the store's lock, but the read counters are
 * incremented by {@link IndexSegmentStore#read(long)} without holding that
 * lock, so under concurrent reads they should be treated as approximate.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 */
private static class IndexSegmentStoreCounters {
/**
 * The #of times the store was (re-)opened.
 */
long openCount;
/**
 * The #of times the store was closed.
 */
long closeCount;
/**
 * A read on a node (whether or not it is buffered).
 */
long nodesRead;
/**
 * A read on a node when the nodes are not buffered.
 */
long nodesReadFromDisk;
/**
 * A read on a leaf (always reads through to the disk). This also counts
 * reads of any other record which is not in the node region.
 */
long leavesReadFromDisk;
}
/**
 * Counters specific to the {@link IndexSegmentStore}. The values are
 * reported by {@link #getCounters()}.
 */
private final IndexSegmentStoreCounters counters = new IndexSegmentStoreCounters();
// final protected void assertOpen() {
//
// if (!open) {
//
// throw new IllegalStateException();
//
// }
//
// }
/**
 * Return the object used to correctly decode region-based addresses. The
 * {@link IndexSegmentBuilder} encodes addresses relative to an
 * {@link IndexSegmentRegion} and this store decodes them transparently, so
 * the {@link IndexSegment} itself never needs to know about the encoding.
 *
 * @return The address manager for this store.
 */
public final IndexSegmentAddressManager getAddressManager() {

    return this.addressManager;

}
/**
 * Return the read-only view of the checkpoint record that was read from
 * the index segment file when this store was constructed.
 *
 * @return The checkpoint record.
 */
public final IndexSegmentCheckpoint getCheckpoint() {

    return this.checkpoint;

}
/**
 * Return the {@link UUID} reported by the {@link SegmentMetadata} for this
 * store (the segment UUID taken from the checkpoint record).
 *
 * @return The store's {@link UUID}.
 */
public final UUID getUUID() {

    final UUID uuid = segmentMetadata.getUUID();

    return uuid;

}
/**
 * Return the {@link IndexMetadata} record for the {@link IndexSegment}.
 * The record is read once by the constructor and retained.
 * <p>
 * Note: The {@link IndexMetadata#getPartitionMetadata()} always reports
 * that {@link LocalPartitionMetadata#getResources()} is <code>null</code>.
 * The index partition view is defined by the {@link BTree} on the
 * {@link AbstractJournal}, and an {@link IndexSegment} generally
 * participates in MANY such views - one per commit point on each
 * {@link AbstractJournal} where the {@link IndexSegment} is part of an
 * index partition view.
 *
 * @return The index metadata record.
 */
public final IndexMetadata getIndexMetadata() {

    return this.indexMetadata;

}
/**
 * True iff the store is open. This is set by the constructor and by
 * {@link #reopen()} and cleared by {@link #_close()}.
 */
private volatile boolean open = false;
/**
 * Optional. When defined, {@link Event}s are reported out.
 */
protected final IBigdataFederation<?> fed;
/**
 * The open/close {@link Event} started by {@link #reopen()} when
 * {@link #fed} is defined, and ended (then cleared) by {@link #_close()}.
 * <code>null</code> when no such event is in progress.
 */
private volatile Event openCloseEvent;
/**
 * Open a read-only store containing an {@link IndexSegment} without
 * loading the {@link IndexSegment} itself. No {@link IBigdataFederation}
 * is associated with the store, so no {@link Event}s are reported.
 * <p>
 * Note: If an exception is thrown then the backing file will be closed.
 * <p>
 * Note: Normally access to {@link IndexSegmentStore}s is mediated by the
 * {@link StoreManager} which imposes a canonicalizing weak value cache to
 * ensure that we do not double-open an {@link IndexSegmentStore}.
 *
 * @param file
 *            The file containing the index segment.
 *
 * @see #loadIndexSegment()
 *
 * @throws IllegalArgumentException
 *             if <i>file</i> is <code>null</code>.
 * @throws RuntimeException
 *             if there is a problem.
 * @throws RootBlockException
 *             if the root block is invalid.
 */
public IndexSegmentStore(final File file) {

    // delegate: no federation.
    this(file, null);

}
/**
* Constructor variant that accepts an {@link IBigdataFederation} reference
* and will report out {@link Event}s.
*
* @param file
* @param fed
*/
public IndexSegmentStore(final File file, final IBigdataFederation<?> fed) {
if (file == null)
throw new IllegalArgumentException();
this.file = file;
// MAY be null.
this.fed = fed;
/*
* Mark as open so that we can use reopenChannel() and read(long addr)
* to read other data (the root node/leaf).
*/
this.open = true;
try {
// open the file.
reopenChannel();
// read the checkpoint record from the file.
this.checkpoint = new IndexSegmentCheckpoint(raf);
// for reporting via getResourceMetadata and toString().
this.segmentMetadata = new SegmentMetadata(file,
checkpoint.segmentUUID, checkpoint.commitTime);
// handles transparent decoding of offsets within regions.
this.addressManager = new IndexSegmentAddressManager(checkpoint);
// optional store cache (set before reading metadata/bloomfilter).
// @see BLZG-1501 (remove LRUNexus)
this.storeCache = null;
// this.storeCache = LRUNexus.getCache(this);
// Read the metadata record.
this.indexMetadata = readMetadata();
} catch (IOException ex) {
_close();
throw new RuntimeException(ex);
}
if (log.isInfoEnabled())
log.info(checkpoint.toString());
}
/**
 * Safety net which closes the {@link IndexSegmentStore} if it is still
 * open when the object is finalized.
 * <p>
 * Note: The {@link IndexSegment} has hard reference to the
 * {@link IndexSegmentStore} but not the other way around. Therefore an
 * {@link IndexSegment} will be swept before its store is finalized.
 */
protected void finalize() throws Exception {

    if (!open) {

        // nothing to do.
        return;

    }

    if (log.isInfoEnabled()) {

        log.info("Closing IndexSegmentStore: " + getFile());

    }

    _close();

}
/**
 * Return the string representation of the backing {@link #file}.
 * <p>
 * Note: Only depends on final fields, so this is safe to invoke whether or
 * not the store is open.
 */
public String toString() {

    final String s = file.toString();

    return s;

}
/**
 * Return the {@link SegmentMetadata} describing this store.
 * <p>
 * Note: Only depends on final fields, so this is safe to invoke whether or
 * not the store is open.
 */
public IResourceMetadata getResourceMetadata() {

    return this.segmentMetadata;

}
/**
 * Re-open a (possibly closed) store. This operation should succeed if the
 * backing file is still accessible. The method is a NOP if the store is
 * already open by the time the caller obtains the {@link #lock}.
 * <p>
 * Note: Only the file channel is re-opened; the checkpoint and index
 * metadata records read by the constructor are NOT re-read here.
 * <p>
 * Note: If an exception is thrown then the backing file will be closed.
 *
 * @throws RuntimeException
 *             if there is a problem, including a
 *             {@link FileNotFoundException}. Every {@link Throwable}
 *             raised while re-opening is wrapped in a
 *             {@link RuntimeException} whose message names the file.
 *
 * @see #close()
 */
public void reopen() {
lock.lock();
try {
if (open) {
/*
* The store was already open by the time we got the lock.
*
* Note: IndexSegment#readNodeOrLeaf() does not have a lock
* before it invokes this method so the backing store can easily
* have been concurrently re-opened once that thread gains the
* lock.
*/
// throw new IllegalStateException("Already open.");
return;
}
try {
/*
* Mark as open so that we can use read(long addr) to read other
* data (the root node/leaf).
*
* Note: This MUST be set before reopenChannel() is invoked since
* reopenChannel() calls back into reopen() when [open] is false.
*/
this.open = true;
// open the file channel (the lock is reentrant).
reopenChannel();
counters.openCount++;
if (fed != null) {
// start the open/close event; it is ended by _close().
openCloseEvent = new Event(fed, new EventResource(
indexMetadata, file),
EventType.IndexSegmentStoreOpenClose).start();
}
} catch (Throwable t) {
// clean up (closes the file and clears [open]).
_close();
// re-throw the exception.
throw new RuntimeException(
"Could not (re-) open: file=" + file, t);
}
} finally {
lock.unlock();
}
}
/**
 * Load the {@link IndexSegment}. The class named by
 * {@link IndexMetadata#getBTreeClassName()} (an {@link IndexSegment} or
 * derived class) MUST provide a public constructor with the following
 * signature: <code>
 * 
 * <i>className</i>(IndexSegmentStore store)
 * 
 * </code>
 * <p>
 * If an {@link IndexSegment} was already loaded from this store and is
 * still reachable then that instance is re-opened and returned rather than
 * creating a new one.
 * <p>
 * Note: Normally access to {@link IndexSegment}s is mediated by the
 * {@link StoreManager} which imposes a canonicalizing weak value cache to
 * ensure that we do not double-open an {@link IndexSegment}.
 * 
 * @return The {@link IndexSegment} or derived class loaded from this
 *         store.
 * 
 * @throws RuntimeException
 *             wrapping any exception thrown while instantiating the
 *             {@link IndexSegment}.
 */
public IndexSegment loadIndexSegment() {

    /*
     * This is grabbed before we request the lock in an attempt to close a
     * possible concurrency window where the finalizer on the index segment
     * might run while we are acquiring the lock. By grabbing a hard
     * reference here we ensure that the finalizer will not run while we are
     * acquiring the lock. Who knows if this will ever make a difference.
     */
    IndexSegment seg = ref == null ? null : ref.get();

    lock.lock();

    try {

        /*
         * If we did not get the hard reference above then we need to try
         * again now that we have the lock.
         */
        seg = seg != null ? seg : ref == null ? null : ref.get();

        if (seg != null) {

            // existing instance: ensure "open" and hand it back.
            seg.reopen();

            return seg;

        }

        try {

            // locate the (IndexSegmentStore) constructor by reflection.
            final Class<?> cl = Class.forName(indexMetadata
                    .getBTreeClassName());

            final Constructor<?> ctor = cl
                    .getConstructor(IndexSegmentStore.class);

            seg = (IndexSegment) ctor.newInstance(this);

            /*
             * Attach the counters maintained by AbstractBTree to those
             * reported for the IndexSegmentStore.
             * 
             * Note: These counters are only allocated when the
             * IndexSegment object is created and this is where we enforce
             * a 1:1 correspondence between an IndexSegmentStore and the
             * IndexSegment loaded from that store. However, the index can
             * be closed and re-opened so we still need to replace any
             * counters which we find during attach().
             */
            getCounters().attach(seg.getBtreeCounters().getCounters(),
                    true/* replace */);

            // set the canonicalizing weak reference to the open seg.
            ref = new WeakReference<IndexSegment>(seg);

            return seg;

        } catch (Exception ex) {

            throw new RuntimeException(ex);

        }

    } finally {

        lock.unlock();

    }

}
/**
 * A canonicalizing weak reference for the {@link IndexSegment} that can be
 * loaded from this store. This is <code>null</code> until
 * {@link #loadIndexSegment()} has created the {@link IndexSegment}.
 */
private volatile WeakReference<IndexSegment> ref = null;
/**
 * A lock used to make open and close operations atomic. The lock is
 * reentrant; methods of this class which hold it invoke other methods
 * which acquire it again.
 */
protected final ReentrantLock lock = new ReentrantLock();
/**
 * Return <code>true</code> iff the store is currently marked as open.
 */
public final boolean isOpen() {

    return this.open;

}
/**
 * Always returns <code>true</code>: an {@link IndexSegmentStore} does not
 * support writes.
 */
public final boolean isReadOnly() {

    return true;

}
/**
 * Always returns <code>true</code>: the store is backed by a file.
 */
public final boolean isStable() {

    return true;

}
/**
 * Always returns <code>false</code>. Even when the nodes are fully
 * buffered the leaves are not, so the store as a whole is never fully
 * buffered.
 *
 * @see #isNodesFullyBuffered()
 */
public final boolean isFullyBuffered() {

    return false;

}
/**
 * Return <code>true</code> iff the store is open and the nodes of the
 * {@link IndexSegment} are held in the nodes buffer. The answer is
 * computed while holding the {@link #lock} and is consistent as of the
 * time that this method examines the state of the
 * {@link IndexSegmentStore}.
 */
public boolean isNodesFullyBuffered() {

    lock.lock();

    try {

        if (!isOpen()) {

            return false;

        }

        return buf_nodes != null;

    } finally {

        lock.unlock();

    }

}
/**
 * Return the file containing the index segment.
 */
public final File getFile() {

    return this.file;

}
/**
 * Closes the file and releases the internal buffers. Closing a store which
 * is already closed quietly succeeds. A closed store may be restored using
 * {@link #reopen()} for as long as the backing file remains available, and
 * a read on a closed {@link IndexSegmentStore} will transparently
 * {@link #reopen()} it under the same condition. Use {@link #destroy()}
 * for an atomic "close and delete" operation.
 */
public void close() {

    lock.lock();

    try {

        if (log.isInfoEnabled())
            log.info(file.toString());

        if (!isOpen()) {

            // already closed.
            return;

        }

        _close();

    } finally {

        lock.unlock();

    }

}
/**
 * Method is safe to invoke whether or not the store is "open" and will
 * always close {@link #raf} (if open), release the nodes buffer (if
 * allocated), end the open/close {@link Event} (if any), and set
 * {@link #open} to <code>false</code>. An {@link IOException} when closing
 * the file, and any {@link Throwable} when releasing the buffer or ending
 * the event, is trapped and logged and is NOT re-thrown.
 * <p>
 * Note: The close counter is incremented on every invocation, including
 * invocations made when the store was not open.
 */
private void _close() {
lock.lock();
try {
if (raf != null) {
try {
raf.close();
} catch (IOException ex) {
log.error("Problem closing file: " + file, ex);
// ignore exception.
}
// clear the reference so that reopenChannel() will open a new file.
raf = null;
}
if (buf_nodes != null) {
try {
// release the buffer back to the pool.
buf_nodes.release();
} catch (Throwable t) {
// log error but continue anyway.
log.error(this, t);
} finally {
// clear reference since buffer was released.
buf_nodes = null;
}
}
open = false;
counters.closeCount++;
if (openCloseEvent != null) {
try {
// report the end of the open/close event started by reopen().
openCloseEvent.end();
} catch (Throwable t) {
log.error(this, t);
} finally {
openCloseEvent = null;
}
}
if (log.isInfoEnabled())
log.info("Closed: file=" + getFile());
} finally {
lock.unlock();
}
}
/**
 * Deletes the backing file. The store MUST be closed before this method is
 * invoked.
 *
 * @throws IllegalStateException
 *             if the store is open.
 * @throws RuntimeException
 *             if the backing file could not be deleted.
 *
 * @see #destroy()
 */
public void deleteResources() {

    lock.lock();

    try {

        if (open)
            throw new IllegalStateException("Store is open: file=" + file);

        // Note: There is no store cache to clear.
        // @see BLZG-1501 (remove LRUNexus)

        if (!file.delete()) {

            throw new RuntimeException("Could not delete: "
                    + file.getAbsolutePath());

        }

    } finally {

        lock.unlock();

    }

}
/**
 * Atomically closes the store (iff open) and then deletes the backing
 * file. Both steps are performed while holding the {@link #lock}.
 *
 * @throws RuntimeException
 *             if the backing file could not be deleted.
 */
public void destroy() {

    lock.lock();

    try {

        final boolean wasOpen = isOpen();

        if (wasOpen) {

            close();

        }

        deleteResources();

    } finally {

        lock.unlock();

    }

}
/**
 * Not supported: the store is read-only.
 *
 * @throws UnsupportedOperationException
 *             always.
 */
public final long write(final ByteBuffer data) {

    throw new UnsupportedOperationException();

}
/**
 * Not supported: the store is read-only.
 *
 * @throws UnsupportedOperationException
 *             always.
 */
public final void force(final boolean metadata) {

    throw new UnsupportedOperationException();

}
/**
 * Return the length recorded in the checkpoint record for the index
 * segment.
 */
public final long size() {

    return this.checkpoint.length;

}
/**
 * Build and return the counters for this store. A new {@link CounterSet}
 * is created by each invocation. It reports:
 * <ul>
 * <li><code>file</code> - the backing file;</li>
 * <li><code>checkpoint</code> - one-shot values from the checkpoint
 * record;</li>
 * <li><code>metadata</code> - one-shot values from the index
 * metadata;</li>
 * <li><code>store</code> - values sampled on demand from the live state
 * of the store.</li>
 * </ul>
 */
public CounterSet getCounters() {

    final CounterSet root = new CounterSet();

    root.addCounter("file", new OneShotInstrument<String>(file.toString()));

    // checkpoint (counters are all oneshot).
    final CounterSet checkpointSet = root.makePath("checkpoint");

    checkpointSet.addCounter("segment UUID", new OneShotInstrument<String>(
            checkpoint.segmentUUID.toString()));

    // length in bytes of the file.
    checkpointSet.addCounter("length", new OneShotInstrument<String>(
            Long.toString(checkpoint.length)));

    checkpointSet.addCounter("#nodes", new OneShotInstrument<String>(
            Long.toString(checkpoint.nnodes)));

    checkpointSet.addCounter("#leaves", new OneShotInstrument<String>(
            Long.toString(checkpoint.nleaves)));

    checkpointSet.addCounter("#entries", new OneShotInstrument<String>(
            Long.toString(checkpoint.nentries)));

    checkpointSet.addCounter("height", new OneShotInstrument<String>(
            Long.toString(checkpoint.height)));

    // metadata (all oneshot).
    final CounterSet metadataSet = root.makePath("metadata");

    metadataSet.addCounter("name", new OneShotInstrument<String>(
            indexMetadata.getName()));

    metadataSet.addCounter("index UUID", new OneShotInstrument<String>(
            indexMetadata.getIndexUUID().toString()));

    // dynamic counters (sampled each time they are read).
    final CounterSet storeSet = root.makePath("store");

    storeSet.addCounter("nodesBuffered", new Instrument<Boolean>() {
        protected void sample() {
            setValue(buf_nodes != null);
        }
    });

    storeSet.addCounter("openCount", new Instrument<String>() {
        protected void sample() {
            setValue(Long.toString(counters.openCount));
        }
    });

    storeSet.addCounter("closeCount", new Instrument<String>() {
        protected void sample() {
            setValue(Long.toString(counters.closeCount));
        }
    });

    storeSet.addCounter("nodesRead", new Instrument<String>() {
        protected void sample() {
            setValue(Long.toString(counters.nodesRead));
        }
    });

    storeSet.addCounter("nodeReadFromDisk", new Instrument<String>() {
        protected void sample() {
            setValue(Long.toString(counters.nodesReadFromDisk));
        }
    });

    storeSet.addCounter("leavesReadFromDisk", new Instrument<String>() {
        protected void sample() {
            setValue(Long.toString(counters.leavesReadFromDisk));
        }
    });

    return root;

}
// private CounterSet counterSet;
/**
 * Read a record from the {@link IndexSegmentStore}. A request in the node
 * region is served from the nodes buffer when the nodes have been
 * buffered. Every other request (an unbuffered node, a leaf, a blob, or
 * any other raw record) reads through to the backing file.
 * <p>
 * Note: An LRU disk cache is a poor choice for the leaves. Since the btree
 * already maintains a cache of the recently touched leaf objects, a recent
 * read against the disk is the best indication that we have that we will
 * NOT want to read that region again soon.
 *
 * @param addr
 *            The address of the record.
 *
 * @return A buffer containing the record.
 */
public ByteBuffer read(final long addr) {

    /*
     * True IFF the starting address lies entirely within the region
     * dedicated to the B+Tree nodes.
     */
    final boolean isNodeAddr = addressManager.isNodeAddr(addr);

    if (log.isDebugEnabled()) {

        log.debug("addr=" + addr + "(" + addressManager.toString(addr)
                + "), isNodeAddr="+isNodeAddr);

    }

    // abs. offset of the record in the file.
    final long offset = addressManager.getOffset(addr);

    // length of the record.
    final int length = addressManager.getByteCount(addr);

    if (!isNodeAddr) {

        /*
         * A leaf, a blob, or some other raw allocation on the
         * IndexSegmentStore: always read from the file.
         */

        // @todo over estimates leaves read since can be blob, etc. as well.
        counters.leavesReadFromDisk++;

        return readFromFile(offset, length);

    }

    // a node.
    counters.nodesRead++;

    /*
     * Note: In order to read from [buf_nodes] we MUST be holding the [lock]
     * since it could otherwise be concurrently returned to the
     * DirectBufferPool by _close(). The unlocked test below just avoids
     * taking the lock when there is clearly no buffer; the test is
     * repeated once the lock is held.
     *
     * Note: Because this takes the global [lock] it forces reads against
     * the nodes buffer to be single threaded. If the lock does become
     * contended, then it should be replaced by a ReadWriteLock. The
     * readLock would be used to read on the buffer. The writeLock would be
     * used to allocate or release the buffer.
     */
    if (buf_nodes != null) {

        lock.lock();

        try {

            if (buf_nodes != null) {

                return readFromBuffer(offset, length);

            }

        } finally {

            lock.unlock();

        }

    }

    // The nodes are not buffered: the data need to be read from the file.
    counters.nodesReadFromDisk++;

    return readFromFile(offset, length);

}
/**
 * Read a node record out of the buffered image of the nodes region.
 * <p>
 * When the nodes buffer is a direct buffer the record is copied into a new
 * <code>byte[]</code> and the wrapped copy is returned, since the direct
 * buffer could be released at any time. Otherwise a slice of the buffer
 * covering just the record is returned. In neither case is the returned
 * buffer read-only: the B+Tree code requires access to the backing
 * <code>byte[]</code>.
 * <p>
 * Note: The caller MUST be synchronized such that {@link #buf_nodes} is in
 * a known state (either allocated or released).
 * 
 * @param offset
 *            The absolute offset of the record in the file.
 * @param length
 *            The length of the record in bytes.
 * 
 * @return A buffer whose position is zero and whose limit is the #of bytes
 *         in the record.
 */
final private ByteBuffer readFromBuffer(final long offset, final int length) {

    /*
     * Take an independent view of the shared buffer. Creation of the views
     * is serialized on the shared buffer so that its position and limit
     * can not change while the slice is taken.
     */
    final ByteBuffer shared = buf_nodes.buffer();

    final ByteBuffer view;

    synchronized (shared) {

        view = shared.slice();

    }

    // the offset of the record relative to the start of the nodes region.
    final long relOffset = offset - checkpoint.offsetNodes;

    // Note: the limit is set before the position.
    view.limit((int) (relOffset + length));

    view.position((int) relOffset);

    if (!view.isDirect()) {

        // heap buffer: hand back a slice showing only the record.
        return view.slice();

    }

    /*
     * Direct buffer: the backing array is not accessible and the buffer
     * can be released back to the pool, so copy out just the record.
     */
    final byte[] copy = new byte[length];

    view.slice().get(copy);

    return ByteBuffer.wrap(copy);

}
/**
 * Read a record from the backing file into a newly allocated heap buffer.
 * 
 * @param offset
 *            The offset of the first byte to be read.
 * @param length
 *            The #of bytes to be read.
 * 
 * @return The buffer, flipped so that the caller may read the record.
 * 
 * @throws RuntimeException
 *             wrapping any {@link IOException}.
 */
final private ByteBuffer readFromFile(final long offset, final int length) {

    // Allocate buffer: limit = capacity; pos = 0.
    final ByteBuffer record = ByteBuffer.allocate(length);

    try {

        // read into [record] - does not modify the channel's position().
        FileChannelUtility.readAll(opener, record, offset);

    } catch (IOException ex) {

        throw new RuntimeException(ex);

    }

    // successful read from file; flip buffer for reading by caller.
    record.flip();

    return record;

}
/**
 * Read from the backing file directly into the caller's buffer.
 * <p>
 * Note: This is package private in order to expose it to the
 * {@link IndexSegmentMultiBlockIterator}.
 * 
 * @param offset
 *            The offset of the first byte to be read.
 * @param dst
 *            The buffer into which the data will be read. Bytes will be
 *            read into the buffer starting at the current position and up
 *            to the limit.
 * 
 * @throws IOException
 *             if the read fails.
 */
final void readFromFile(final long offset, final ByteBuffer dst)
        throws IOException {

    // Note: does not modify the channel's position().
    FileChannelUtility.readAll(this.opener, dst, offset);

}
/**
 * Adapter handed to {@link FileChannelUtility} for reads on the backing
 * file. It delegates to {@link IndexSegmentStore#reopenChannel()} to
 * obtain the {@link FileChannel} and to
 * {@link IndexSegmentStore#toString()} for its string representation.
 */
private final IReopenChannel<FileChannel> opener = new IReopenChannel<FileChannel>() {

    @Override
    public String toString() {

        return IndexSegmentStore.this.toString();

    }

    @Override
    public FileChannel reopenChannel() throws IOException {

        return IndexSegmentStore.this.reopenChannel();

    }

};
/**
 * This method transparently re-opens the channel for the backing file.
 * <p>
 * Since the {@link IndexSegment} is a read-only data structure, all of the
 * in-memory state remains valid and we only need to re-open the
 * {@link FileChannel} to the backing store and retry. In particular, we do
 * not need to re-read the root node, {@link IndexMetadata},
 * {@link BloomFilter}, etc. All we have to do is re-open the
 * {@link FileChannel}.
 * <p>
 * Note: The (re-)open is performed while holding the {@link #lock} so that
 * concurrent readers do not try to all open the store at the same time.
 * Further, this is the only method other than {@link #_close()} that can
 * set {@link #raf}. Since both this method and {@link #_close()} hold the
 * {@link #lock} the state of that field is well known inside of the
 * locked region of this method.
 * <p>
 * Note: {@link OverlappingFileLockException}s can arise when there are
 * concurrent requests to obtain a shared lock on the same file. Personally,
 * I think that this is a bug since the lock requests are shared and should
 * be processed without deadlock. However, the code handles this case by
 * proceeding without the lock - exactly as it would handle the case where a
 * shared lock was not available. This is still somewhat fragile since if
 * someone does not test the {@link FileLock} and was in fact granted an
 * exclusive lock when they requested a shared lock then this code will be
 * unwilling to send the resource. There are two ways to make that work out
 * - either we DO NOT use {@link FileLock} for read-only files (index
 * segments) or we ALWAYS discard the {@link FileLock} if it is not shared
 * when we requested a shared lock and proceed without a lock. For this
 * reason, the behavior of this class and {@link ResourceService} MUST
 * match.
 * <p>
 * Note: If the file lock is held by someone else then the newly opened
 * file is closed, {@link #raf} is cleared, and an {@link IOException} is
 * thrown to the caller. A closed channel is never returned.
 * 
 * @see ResourceService
 * @see http://blogs.sun.com/DaveB/entry/new_improved_in_java_se1
 * @see http://forums.sun.com/thread.jspa?threadID=5324314.
 * 
 * @return The open {@link FileChannel}.
 * 
 * @throws IOException
 *             if the backing file can not be opened or if it is already
 *             locked by someone else.
 */
final private FileChannel reopenChannel() throws IOException {

    /*
     * Fast path: avoid taking the lock when the backing channel is already
     * open. [raf] is volatile and is copied into a local so that it can
     * not be cleared between the null test and the use.
     */
    {
        final RandomAccessFile tmp = raf;
        if (tmp != null) {
            final FileChannel channel = tmp.getChannel();
            if (channel.isOpen()) {
                // The channel is still open.
                return channel;
            }
        }
    }

    lock.lock();

    try {

        /*
         * Note: closing an IndexSegmentStore DOES NOT prevent it from being
         * transparently reopened. reopen() marks the store as open and
         * then calls back into this method (the lock is reentrant).
         */
        if (!open) {

            reopen();

        }

        if (raf != null && raf.getChannel().isOpen()) {

            /*
             * The channel is still open. If you are allowing concurrent
             * reads on the channel, then this could indicate that two
             * readers each found the channel closed and that one was able
             * to re-open the channel before the other such that the channel
             * was open again by the time the 2nd reader got here. This is
             * also the case when reopen() (above) opened the channel.
             */
            return raf.getChannel();

        }

        // open the file.
        this.raf = new RandomAccessFile(file, mode);

        if (log.isInfoEnabled())
            log.info("(Re-)opened file: " + file);

        /*
         * Set iff tryLock() reports that someone else holds a conflicting
         * lock. The failure is raised AFTER the try/catch below so that it
         * can not be mistaken for (and swallowed as) the "file locks are
         * not supported" case.
         */
        boolean lockedByOther = false;

        try {

            /*
             * Request a shared file lock.
             */
            final FileLock fileLock = raf.getChannel().tryLock(0,
                    Long.MAX_VALUE, true/* shared */);

            if (fileLock == null) {

                /*
                 * Note: A null return indicates that someone else holds the
                 * lock. This can happen if the platform does not support
                 * shared locks or if someone requested an exclusive file
                 * lock.
                 */
                lockedByOther = true;

            } else if (!fileLock.isShared()) {

                /*
                 * DO NOT hold an exclusive lock for an index segment store
                 * file!
                 * 
                 * Note: On platforms where shared locks are not support the
                 * JDK will escalate to an exclusive lock. That would
                 * interfere with our ability to MOVE index segments around
                 * using the ResourceService so we make sure that we don't
                 * hold an exclusive file lock here.
                 */
                fileLock.release();

            }

        } catch (OverlappingFileLockException ex) {

            /*
             * Note: OverlappingFileLockException can be thrown when there
             * are concurrent requests to obtain the same shared lock. I
             * consider this a JDK bug. It should be possible to service
             * both requests without deadlock.
             * 
             * Note: I had seen this exception occasionally even before we
             * started using the ResourceService to MOVE index segments
             * around. That also looks like a JDK bug since we only request
             * the FileLock in this method, we know that the channel is
             * closed, and this method holds the [lock]. Ergo, it should not
             * be possible to have overlapping requests (concurrent
             * requests).
             */
            if (log.isInfoEnabled())
                log
                        .info("Will proceed without lock: file=" + file + " : "
                                + ex);

        } catch (IOException ex) {

            /*
             * Note: This is true of NFS volumes. This is Ok and should be
             * ignored. However the backing file is not protected against
             * accidental deletes or overwrites.
             */
            if (log.isInfoEnabled())
                log.info("FileLock not supported: file=" + getFile(), ex);

        }

        if (lockedByOther) {

            // Do not leave a closed file behind in [raf].
            try {
                raf.close();
            } catch (Throwable t) {
                // ignore.
            }

            raf = null;

            throw new IOException("File already locked: file=" + getFile());

        }

        return raf.getChannel();

    } finally {

        lock.unlock();

    }

}
/**
 * Attempts to read the index nodes into {@link #buf_nodes}.
 * <p>
 * Note: If the nodes could not be buffered then reads against the nodes
 * will read through to the backing file.
 * <p>
 * The caller MUST hold the store's <code>lock</code>. The method is a NOP
 * if the nodes are already buffered. If the node extent is larger than the
 * capacity of the buffers in the {@link DirectBufferPool} then a warning is
 * logged and the nodes are not buffered.
 * <p>
 * Any problem while acquiring the buffer or reading the nodes is logged and
 * NOT propagated. If the thread was interrupted while waiting for a buffer
 * then the interrupt status of the thread is restored before returning so
 * that the caller can still observe the interrupt.
 *
 * @throws IllegalMonitorStateException
 *             if the current thread does not hold the lock.
 * @throws IllegalStateException
 *             if the checkpoint record reports zero nodes or a zero offset
 *             for the nodes region.
 * @throws IOException
 *             declared by the signature; failures inside the buffering
 *             section are caught and logged rather than thrown.
 */
protected void bufferIndexNodes() throws IOException {
    if (!lock.isHeldByCurrentThread()) {
        throw new IllegalMonitorStateException();
    }
    if (buf_nodes != null) {
        // already buffered.
        return;
    }
    if (checkpoint.nnodes == 0) {
        throw new IllegalStateException();
    }
    if (checkpoint.offsetNodes == 0L) {
        throw new IllegalStateException();
    }
    if (checkpoint.extentNodes > DirectBufferPool.INSTANCE.getBufferCapacity()) {
        /*
         * The buffer would be too small to contain the nodes.
         */
        log.warn("Node extent exceeds buffer capacity: index="
                + getIndexMetadata().getName() + ", file=" + file + ", "
                + "extent=" + checkpoint.extentNodes + ", bufferCapacity="
                + DirectBufferPool.INSTANCE.getBufferCapacity());
        return;
    }
    /*
     * This code is designed to be robust. If anything goes wrong then we
     * make certain that the direct buffer is released back to the pool, log
     * any errors, and return to the caller. While the nodes will not be
     * buffered if there is an error thrown in this section, if the backing
     * file is Ok then they can still be read directly from the backing
     * file.
     */
    try {
        /*
         * Attempt to allocate a buffer to hold the disk image of the nodes.
         *
         * FIXME There should be a direct buffer pool instance specifically
         * configured to buffer the index segment nodes. This will make it
         * possible to buffer the nodes even when the buffer size required
         * is not a good match for the buffer size used as the write cache
         * for the journal. We need to report counters for all buffer pools
         * in order to accurately track the memory overhead for each
         * purpose. [make sure to replace all references to the default
         * INSTANCE with the specialized pool and make sure that we have the
         * chance to configure the pool before it is placed into service.]
         *
         * Actually, we can probably do just as well using a Java heap
         * byte[]. It is only the disk IO which is improved by the direct
         * NIO buffer. All in-memory access to the data is better for a Java
         * heap byte[].
         */
        buf_nodes = DirectBufferPool.INSTANCE.acquire(100/* ms */,
                TimeUnit.MILLISECONDS);
        if (log.isInfoEnabled())
            log.info("Buffering nodes: #nodes=" + checkpoint.nnodes
                    + ", #bytes=" + checkpoint.extentNodes + ", file=" + file);
        /*
         * #of bytes to read. The cast is safe since the extent was verified
         * above to be no larger than the buffer capacity.
         */
        final ByteBuffer tmp = buf_nodes.buffer();
        tmp.limit((int) checkpoint.extentNodes);
        // attempt to read the nodes into the buffer.
        FileChannelUtility.readAll(opener, tmp,
                checkpoint.offsetNodes);
        tmp.flip();
    } catch (Throwable t1) {
        /*
         * If we could not obtain a buffer without blocking, or if there was
         * ANY problem reading the data into the buffer, then release the
         * buffer and return. The nodes will not be buffered, but if the
         * file is Ok then the index will simply read through to the disk
         * for the nodes.
         */
        if (buf_nodes != null) {
            try {
                // release buffer back to the pool.
                buf_nodes.release();
            } catch (Throwable t) {
                // log error and continue.
                log.error(this, t);
            } finally {
                // make sure the reference is cleared.
                buf_nodes = null;
            }
        }
        // log error and continue.
        log.error(this, t1);
        if (t1 instanceof InterruptedException) {
            /*
             * Do not swallow the interrupt. The flag is restored last so
             * that it can not interfere with releasing the buffer or with
             * logging the error above.
             */
            Thread.currentThread().interrupt();
        }
    }
}
/**
 * Materializes the bloom filter from its record in the backing file.
 * <p>
 * The record address is taken from the checkpoint. When a store cache is
 * available it is consulted before touching the file, and a filter which
 * had to be read from the file is offered to the cache before it is
 * returned. An {@link IOException} raised while reading the record is
 * rethrown wrapped in a {@link RuntimeException}.
 *
 * @return The bloom filter -or- <code>null</code> if the bloom filter was
 *         not constructed when the {@link IndexSegment} was built.
 */
protected BloomFilter readBloomFilter() throws IOException {
    final long addr = checkpoint.addrBloom;
    if (addr == 0L) {
        // The checkpoint does not reference a bloom filter record.
        return null;
    }

    // Cache lookup (only when a cache is configured).
    final BloomFilter cached = storeCache == null ? null
            : (BloomFilter) storeCache.get(addr);
    if (cached != null) {
        return cached;
    }

    if (log.isInfoEnabled())
        log.info("reading bloom filter: "+addressManager.toString(addr));

    // Decode the location and size of the record.
    final long offset = addressManager.getOffset(addr);
    final int nbytes = addressManager.getByteCount(addr);

    // Heap buffer sized to hold exactly the record.
    final ByteBuffer data = ByteBuffer.allocate(nbytes);
    data.limit(nbytes);
    data.position(0);
    try {
        // read into [data] - does not modify the channel's position().
        FileChannelUtility.readAll(opener, data, offset);
        // prepare the buffer to be read by the deserializer.
        data.flip();
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    assert data.position() == 0;
    assert data.limit() == nbytes;

    final BloomFilter filter = (BloomFilter) SerializerUtil.deserialize(data);
    if (log.isInfoEnabled())
        log.info("Read bloom filter: bytesOnDisk=" + nbytes );

    // Offer the filter to the cache (does not replace an existing entry).
    if (storeCache != null) {
        storeCache.putIfAbsent(addr, filter);
    }
    return filter;
}
/**
 * Materializes the {@link IndexMetadata} record from the backing file (this
 * is invoked by the ctor).
 * <p>
 * The record address is taken from the checkpoint and is asserted to be
 * non-zero. When a store cache is available it is consulted before touching
 * the file, and a record which had to be read from the file is offered to
 * the cache before it is returned. An {@link IOException} raised while
 * reading the record is rethrown wrapped in a {@link RuntimeException}.
 *
 * @return The {@link IndexMetadata} record.
 */
final private IndexMetadata readMetadata() throws IOException {
    final long addr = checkpoint.addrMetadata;
    assert addr != 0L;

    // Cache lookup (only when a cache is configured).
    final IndexMetadata cached = storeCache == null ? null
            : (IndexMetadata) storeCache.get(addr);
    if (cached != null) {
        return cached;
    }

    if (log.isInfoEnabled())
        log.info("reading metadata: "+addressManager.toString(addr));

    // Decode the location and size of the record.
    final long offset = addressManager.getOffset(addr);
    final int nbytes = addressManager.getByteCount(addr);

    // Heap buffer sized to hold exactly the record.
    final ByteBuffer data = ByteBuffer.allocate(nbytes);
    data.limit(nbytes);
    data.position(0);
    try {
        // read into [data] - does not modify the channel's position().
        FileChannelUtility.readAll(opener, data, offset);
        // prepare the buffer to be read by the deserializer.
        data.flip();
    } catch (IOException ex) {
        throw new RuntimeException(ex);
    }
    assert data.position() == 0;
    assert data.limit() == nbytes;

    final IndexMetadata metadata = (IndexMetadata) SerializerUtil
            .deserialize(data);
    if (log.isInfoEnabled())
        log.info("Read metadata: " + metadata);

    // Offer the record to the cache (does not replace an existing entry).
    if (storeCache != null) {
        storeCache.putIfAbsent(addr, metadata);
    }
    return metadata;
}
/*
* IAddressManager
*/
/**
 * Delegates to the store's address manager.
 *
 * @param addr
 *            The address.
 *
 * @return The byte count which the address manager reports for that
 *         address.
 */
final public int getByteCount(final long addr) {
    return addressManager.getByteCount(addr);
}
/**
 * Delegates to the store's address manager.
 *
 * @param addr
 *            The address.
 *
 * @return The offset which the address manager reports for that address.
 */
final public long getOffset(final long addr) {
    return addressManager.getOffset(addr);
}
/**
 * Delegates to the store's address manager.
 *
 * @param addr
 *            The address.
 *
 * @return The physical address which the address manager reports for that
 *         address.
 */
final public long getPhysicalAddress(final long addr) {
    return addressManager.getPhysicalAddress(addr);
}
// final public void packAddr(DataOutput out, long addr) throws IOException {
// addressManager.packAddr(out, addr);
// }
/**
 * Delegates to the store's address manager.
 *
 * @param nbytes
 *            The byte count.
 * @param offset
 *            The offset.
 *
 * @return The address which the address manager forms from the given byte
 *         count and offset.
 */
final public long toAddr(final int nbytes, final long offset) {
    return addressManager.toAddr(nbytes, offset);
}
/**
 * Delegates to the store's address manager.
 *
 * @param addr
 *            The address.
 *
 * @return The string representation which the address manager produces
 *         for that address.
 */
final public String toString(final long addr) {
    return addressManager.toString(addr);
}
// final public long unpackAddr(DataInput in) throws IOException {
// return addressManager.unpackAddr(in);
// }
}