// IndexSegmentStore.java example
//
// Explorer
// blazegraph-master
// - database-master
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.bigdata.btree;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.lang.ref.WeakReference;
import java.lang.reflect.Constructor;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.UUID;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;

import org.apache.log4j.Logger;

import com.bigdata.counters.CounterSet;
import com.bigdata.counters.Instrument;
import com.bigdata.counters.OneShotInstrument;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.FileChannelUtility;
import com.bigdata.io.IBufferAccess;
import com.bigdata.io.IReopenChannel;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.RootBlockException;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.rawstore.AbstractRawStore;
import com.bigdata.resources.StoreManager;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.EventType;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.service.ResourceService;

/**
 * A read-only store backed by a file containing a single {@link IndexSegment}.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class IndexSegmentStore extends AbstractRawStore {

    /**
     * Logger.
     */
    protected static final Logger log = Logger
            .getLogger(IndexSegmentStore.class);

    /**
     * The mode that will be used to open the {@link #file} .
     */
    protected static final String mode = "r"; 
    
    /**
     * The file containing the index segment.
     */
    protected final File file;

    /**
     * For reporting via {@link #getResourceMetadata()}.
     */
    final private SegmentMetadata segmentMetadata;
    
    /**
     * Used to correctly decode region-based addresses. The
     * {@link IndexSegmentBuilder} encodes region-based addresses using
     * {@link IndexSegmentRegion}. Those addresses are then transparently
     * decoded by this class. The {@link IndexSegment} itself knows nothing
     * about this entire sleight of hand.
     * <p>
     * Note: Don't deallocate. It is small and holds useful metadata such as the
     * #of index entries that we would always like to have on hand.
     */
    private final IndexSegmentAddressManager addressManager;

    /**
     * Optional store cache for the bloom filter, index metadata, and the B+Tree
     * nodes and leaves (MAY be <code>null</code>).
     */
    //  @see BLZG-1501 (remove LRUNexus)
    @Deprecated
    private final ConcurrentMap<Long, Object> storeCache;
    
    /**
     * An optional <strong>direct</strong> {@link ByteBuffer} containing a disk
     * image of the nodes in the {@link IndexSegment}.
     * <p>
     * Note: This buffer is acquired from the {@link DirectBufferPool} and MUST
     * be released back to that pool.
     * <p>
     * Note: While some nodes will be held in memory by the hard reference queue
     * the use of this buffer means that reading a node that has fallen off of
     * the queue does not require any IO.
     */
    private volatile IBufferAccess buf_nodes;
    
    /**
     * The random access file used to read the index segment. This is
     * transparently re-opened if closed by an interrupt during an NIO
     * operation.
     * <p>
     * A shared {@link FileLock} is requested. If the platform and the volume
     * either DO NOT support {@link FileLock} or support <em>shared</em>
     * {@link FileLock}s then you will be able to open the same
     * {@link IndexSegmentStore} in multiple applications. However, if the
     * platform does not support shared locks then the lock request is converted
     * (by Java) into an exclusive {@link FileLock} and you will not be able to
     * open the {@link IndexSegmentStore} in more than one application at a
     * time.
     * <p>
     * Note: A shared {@link FileLock} makes it impossible to delete an
     * {@link IndexSegmentStore} that is in use. {@link FileLock}s are
     * automatically released when the {@link FileChannel} is closed or the
     * application dies. Using an advisory lock is NOT a good idea as it can
     * leave lock files in place which make it impossible to restart a data
     * service after an abnormal termination. For that reason it is better to
     * NOT use advisory locks on platforms and volumes which do not support
     * {@link FileLock}.
     * 
     * @see #reopenChannel()
     */
    private volatile RandomAccessFile raf;

    /**
     * 
     * @see #raf
     */
    
    /**
     * A read-only view of the checkpoint record for the index segment.
     * <p>
     * Note: Don't deallocate. It is small and holds useful metadata such as the
     * #of index entries that we would always like to have on hand.
     */
    private final IndexSegmentCheckpoint checkpoint;

    /**
     * The metadata record for the index segment.
     * <p>
     * Note: Don't deallocate. Relatively small and it holds some important
     * metadata. By reading this during the ctor we do not have to force the
     * entire index segment to be loaded just to access the index metadata.
     */
    private final IndexMetadata indexMetadata;

    /**
     * Counters specific to the {@link IndexSegmentStore}.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
     */
    private static class IndexSegmentStoreCounters {

        /*
         * NOTE(review): these are plain (non-volatile, non-atomic) fields.
         * At least [nodesRead] is incremented by read(long) before the store
         * lock is taken, so concurrent readers can presumably lose updates.
         * The values look like best-effort statistics only - confirm before
         * relying on exact counts.
         */

        /**
         * The #of times the store was (re-)opened.
         */
        long openCount;

        /**
         * The #of times the store was closed.
         */
        long closeCount;

        /**
         * A read on a node (whether or not it is buffered).
         */
        long nodesRead;

        /**
         * A read on a node when the nodes are not buffered.
         */
        long nodesReadFromDisk;

        /**
         * A read on a leaf (always reads through to the disk).
         */
        long leavesReadFromDisk;

    }

    /**
     * Counters specific to the {@link IndexSegmentStore}.
     */
    private final IndexSegmentStoreCounters counters = new IndexSegmentStoreCounters();
    
//    final protected void assertOpen() {
//
//        if (!open) {
//            
//            throw new IllegalStateException();
//            
//        }
//
//    }
    
    /**
     * Used to correctly decode region-based addresses. The
     * {@link IndexSegmentBuilder} encodes region-based addresses using
     * {@link IndexSegmentRegion}. Those addresses are then transparently
     * decoded by this class. The {@link IndexSegment} itself knows nothing
     * about this entire sleight of hand.
     */
    public final IndexSegmentAddressManager getAddressManager() {
        // Final field assigned by the constructor; returned as-is.
        final IndexSegmentAddressManager manager = addressManager;
        return manager;
    }
    
    /**
     * A read-only view of the checkpoint record for the index segment.
     */
    public final IndexSegmentCheckpoint getCheckpoint() {
        // Final field read from the file by the constructor; returned as-is.
        final IndexSegmentCheckpoint record = checkpoint;
        return record;
    }

    public final UUID getUUID() {
        // The UUID is reported by the segment metadata built in the constructor.
        final UUID segmentUuid = segmentMetadata.getUUID();
        return segmentUuid;
    }
    
    /**
     * The {@link IndexMetadata} record for the {@link IndexSegment}.
     * <p>
     * Note: The {@link IndexMetadata#getPartitionMetadata()} always reports
     * that {@link LocalPartitionMetadata#getResources()} is <code>null</code>.
     * This is because the {@link BTree} on the {@link AbstractJournal} defines
     * the index partition view and each {@link IndexSegment} generally
     * participates in MANY views - one per commit point on each
     * {@link AbstractJournal} where the {@link IndexSegment} is part of an
     * index partition view.
     */
    public final IndexMetadata getIndexMetadata() {
        // Final field read by the constructor; no need to load the segment.
        final IndexMetadata metadata = indexMetadata;
        return metadata;
    }

    /**
     * True iff the store is open.
     */
    private volatile boolean open = false;

    /**
     * Optional.  When defined, {@link Event}s are reported out.
     */
    protected final IBigdataFederation<?> fed; // MAY be null (see the constructors).
    /*
     * The event spanning one open/close cycle of the store. It is started by
     * reopen() when [fed] is non-null, and ended and cleared by _close().
     * It is null whenever no such event is outstanding.
     */
    private volatile Event openCloseEvent;
    
    /**
     * Open a read-only store containing an {@link IndexSegment}, but does not
     * load the {@link IndexSegment} from the store.
     * <p>
     * Note: If an exception is thrown then the backing file will be closed.
     * <p>
     * Note: Normally access to {@link IndexSegmentStore}s is mediated by the
     * {@link StoreManager} which imposes a canonicalizing weak value cache to
     * ensure that we do not double-open an {@link IndexSegmentStore}.
     * 
     * @param file
     *            The file
     * 
     * @see #loadIndexSegment()
     * 
     * @throws RuntimeException
     *             if there is a problem.
     * @throws RootBlockException
     *             if the root block is invalid.
     */
    public IndexSegmentStore(final File file) {
        // Delegate to the primary constructor without a federation.
        this(file, null/* fed */);
    }

    /**
     * Constructor variant that accepts an {@link IBigdataFederation} reference
     * and will report out {@link Event}s.
     * 
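     * @param file
     *            The file.
     * @param fed
     *            The federation (optional). When non-<code>null</code>,
     *            {@link Event}s are reported out.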
     */
    public IndexSegmentStore(final File file, final IBigdataFederation<?> fed) {

        if (file == null)
            throw new IllegalArgumentException();

        this.file = file;

        // MAY be null.
        this.fed = fed;
        
        /*
         * Mark as open so that we can use reopenChannel() and read(long addr)
         * to read other data (the root node/leaf).
         */
        this.open = true;

        try {

            // open the file.
            reopenChannel();

            // read the checkpoint record from the file.
            this.checkpoint = new IndexSegmentCheckpoint(raf);

            // for reporting via getResourceMetadata and toString().
            this.segmentMetadata = new SegmentMetadata(file,
                    checkpoint.segmentUUID, checkpoint.commitTime);
            
            // handles transparent decoding of offsets within regions.
            this.addressManager = new IndexSegmentAddressManager(checkpoint);

            // optional store cache (set before reading metadata/bloomfilter).
            // @see BLZG-1501 (remove LRUNexus)
            this.storeCache = null;
//            this.storeCache = LRUNexus.getCache(this);
            
            // Read the metadata record.
            this.indexMetadata = readMetadata();

        } catch (IOException ex) {

            _close();

            throw new RuntimeException(ex);

        }

        if (log.isInfoEnabled())
            log.info(checkpoint.toString());
        
    }

    /**
     * Closes out the {@link IndexSegmentStore} iff it is still open.
     * <p>
     * Note: The {@link IndexSegment} has a hard reference to the
     * {@link IndexSegmentStore} but not the other way around. Therefore an
     * {@link IndexSegment} will be swept before its store is finalized.
     */
    @Override
    protected void finalize() throws Exception {

        // Nothing to release unless the store is still flagged as open.
        if (!open) {
            return;
        }

        if (log.isInfoEnabled()) {
            log.info("Closing IndexSegmentStore: " + getFile());
        }

        _close();

    }

    @Override
    public String toString() {
        // Note: Only depends on final fields (the backing file).
        final String path = file.toString();
        return path;
    }

    public IResourceMetadata getResourceMetadata() {
        // Note: Only depends on final fields (the segment metadata).
        final IResourceMetadata metadata = segmentMetadata;
        return metadata;
    }
    
    /**
     * Re-open a (possibly closed) store. This operation should succeed if the
     * backing file is still accessible.
     * <p>
     * Note: If an exception is thrown then the backing file will be closed.
     * 
     * @throws RootBlockException
     *             if the root block is invalid.
     * @throws RuntimeException
     *             if there is a problem, including a
     *             {@link FileNotFoundException}.
     * 
     * @see #close()
     */
    public void reopen() {

        lock.lock();
        try {

            /*
             * Guard: the store may already have been re-opened by another
             * thread by the time this thread obtained the lock. Per the
             * original note, IndexSegment#readNodeOrLeaf() invokes this
             * method without holding a lock. That case is not an error.
             */
            if (open) {
                return;
            }

            try {

                // Flag as open first so that read(long addr) can be used to
                // read other data (the root node/leaf).
                this.open = true;

                // (Re-)open the file channel.
                reopenChannel();

                counters.openCount++;

                // Report the open/close event iff a federation was supplied.
                if (fed != null) {

                    final EventResource resource = new EventResource(
                            indexMetadata, file);

                    openCloseEvent = new Event(fed, resource,
                            EventType.IndexSegmentStoreOpenClose).start();

                }

            } catch (Throwable t) {

                // Release whatever was acquired before reporting the failure.
                _close();

                throw new RuntimeException(
                        "Could not (re-) open: file=" + file, t);

            }

        } finally {

            lock.unlock();

        }

    }

    /**
     * Load the {@link IndexSegment}. The {@link IndexSegment} (or derived
     * class) MUST provide a public constructor with the following signature:
     * <code>
     * 
     * <i>className</i>(IndexSegmentStore store)
     * 
     * </code>
     * <p>
     * Note: Normally access to {@link IndexSegment}s is mediated by the
     * {@link StoreManager} which imposes a canonicalizing weak value cache to
     * ensure that we do not double-open an {@link IndexSegment}.
     * 
     * @return The {@link IndexSegment} or derived class loaded from this
     *         store.
     */
    public IndexSegment loadIndexSegment() {

        /*
         * Take a hard reference BEFORE requesting the lock. This is an
         * attempt to close a window in which the finalizer of the index
         * segment might run while this thread is waiting for the lock.
         */
        final WeakReference<IndexSegment> early = ref;
        IndexSegment seg = (early != null) ? early.get() : null;

        lock.lock();
        try {

            // No hard reference yet: look again now that the lock is held.
            if (seg == null && ref != null) {
                seg = ref.get();
            }

            if (seg != null) {

                // An instance already exists: ensure it is "open" and reuse it.
                seg.reopen();

                return seg;

            }

            try {

                // Resolve the (possibly derived) IndexSegment class by name.
                @SuppressWarnings("rawtypes")
                final Class cl = Class.forName(indexMetadata
                        .getBTreeClassName());

                // The class MUST expose a public ctor accepting this store.
                @SuppressWarnings({ "rawtypes", "unchecked" })
                final Constructor ctor = cl
                        .getConstructor(new Class[] { IndexSegmentStore.class });

                seg = (IndexSegment) ctor
                        .newInstance(new Object[] { this });

                /*
                 * Attach the counters maintained by the B+Tree to those
                 * reported for this store. The index can be closed and
                 * re-opened, hence existing counters are replaced.
                 */
                getCounters().attach(seg.getBtreeCounters().getCounters(),
                        true/*replace*/);

                // Publish the canonicalizing weak reference to the new seg.
                ref = new WeakReference<IndexSegment>(seg);

                return seg;

            } catch (Exception ex) {

                throw new RuntimeException(ex);

            }

        } finally {

            lock.unlock();

        }

    }
    /**
     * A canonicalizing weak reference for the {@link IndexSegment} that can be
     * loaded from this store.
     */
    private volatile WeakReference<IndexSegment> ref = null;

    /**
     * A lock used to make open and close operations atomic.
     */
    protected final ReentrantLock lock = new ReentrantLock();
    
    public final boolean isOpen() {
        // Reads the volatile flag maintained by the ctor, reopen() and _close().
        final boolean isCurrentlyOpen = open;
        return isCurrentlyOpen;
    }
   
    public final boolean isReadOnly() {
        // Always reports true.
        return true;
    }
    
    public final boolean isStable() {
        // Always reports true.
        return true;
    }

    /**
     * Return <code>false</code> since the leaves are not fully buffered even
     * if the nodes are fully buffered.
     */
    public final boolean isFullyBuffered() {
        // Always false: see isNodesFullyBuffered() for the node region.
        return false;
    }
    
    /**
     * Return <code>true</code> if the nodes of the {@link IndexSegment} are
     * fully buffered in memory. The result is consistent as of the time that
     * this method examines the state of the {@link IndexSegmentStore}.
     */
    public boolean isNodesFullyBuffered() {

        // Hold the lock so that the two pieces of state are read together.
        lock.lock();
        try {

            if (!isOpen()) {
                return false;
            }

            return buf_nodes != null;

        } finally {

            lock.unlock();

        }

    }
    
    public final File getFile() {
        // The backing file given to the constructor.
        final File backingFile = file;
        return backingFile;
    }
    
    /**
     * Closes the file and releases the internal buffers. This operation will
     * quietly succeed if the {@link IndexSegmentStore} is already closed. This
     * operation may be reversed by {@link #reopen()} as long as the backing
     * file remains available. A read on a closed {@link IndexSegmentStore} will
     * transparently {@link #reopen()} the store as long as the backing file
     * remains available. {@link #destroy()} provides an atomic "close and
     * delete" operation.
     */
    public void close() {

        lock.lock();
        try {

            if (log.isInfoEnabled()) {
                log.info(file.toString());
            }

            // Quietly succeed when the store is already closed.
            if (!isOpen()) {
                return;
            }

            _close();

        } finally {

            lock.unlock();

        }

    }
        
    /**
     * Method is safe to invoke whether or not the store is "open" and will
     * always close {@link #raf} (if open), release various buffers, and set
     * {@link #open} to <code>false</code>. All exceptions are trapped, a log
     * message is written, and the exception is NOT re-thrown.
     */
    private void _close() {

        // Taken here as well since callers (the constructor, finalize()) do
        // not necessarily hold the lock. [lock] is a ReentrantLock, so callers
        // that already hold it (close(), reopen()) simply re-enter.
        lock.lock();

        try {

            // Step 1: close the backing file (if any) and drop the reference.
            if (raf != null) {

                try {

                    raf.close();

                } catch (IOException ex) {

                    log.error("Problem closing file: " + file, ex);

                    // ignore exception.

                }

                raf = null;
                
            }

            // Step 2: hand the node buffer (if any) back to its owner.
            if (buf_nodes != null) {

                try {

                    // release the buffer back to the pool.
                    buf_nodes.release();

                } catch (Throwable t) {

                    // log error but continue anyway.
                    log.error(this, t);

                } finally {

                    // clear reference since buffer was released.
                    buf_nodes = null;

                }

            }

            // Step 3: flag the store as closed.
            open = false;

            // NOTE(review): incremented on every invocation, including when
            // the store was not open on entry - confirm that is intended.
            counters.closeCount++;

            // Step 4: end the open/close event started by reopen() (if any).
            if (openCloseEvent != null) {

                try {
                    openCloseEvent.end();
                } catch (Throwable t) {
                    log.error(this, t);
                } finally {
                    openCloseEvent = null;
                }

            }
            
            if (log.isInfoEnabled())
                log.info("Closed: file=" + getFile());

        } finally {

            lock.unlock();

        }
        
    }
    
    public void deleteResources() {

        /*
         * Deletes the backing file. The store MUST already be closed,
         * otherwise an IllegalStateException is thrown. A RuntimeException
         * is thrown if the file could not be deleted.
         */

        lock.lock();
        try {

            if (open) {
                throw new IllegalStateException();
            }

            // @see BLZG-1501 (remove LRUNexus): there is no store cache left
            // to delete here.

            final boolean deleted = file.delete();

            if (!deleted) {

                throw new RuntimeException("Could not delete: "
                        + file.getAbsolutePath());

            }

        } finally {

            lock.unlock();

        }

    }

    /**
     * Atomically closes the store (iff open) and then deletes the backing file.
     */
    public void destroy() {

        // The lock is reentrant, so close() and deleteResources() can take it
        // again while it is held here for both steps.
        lock.lock();
        try {

            final boolean wasOpen = isOpen();

            if (wasOpen) {
                close();
            }

            deleteResources();

        } finally {

            lock.unlock();

        }

    }

    public final long write(final ByteBuffer data) {
        // Not supported: the store is read-only (see isReadOnly()).
        throw new UnsupportedOperationException();
    }

    public final void force(final boolean metadata) {
        // Not supported: the store is read-only (see isReadOnly()).
        throw new UnsupportedOperationException();
    }
    
    public final long size() {
        // Reported from the checkpoint record rather than from the file system.
        final long length = checkpoint.length;
        return length;
    }

    public CounterSet getCounters() {

        /*
         * Builds a NEW CounterSet on every call. The one-shot counters
         * capture values from the checkpoint and the index metadata, while
         * the counters under "store" are sampled from live state.
         */
        final CounterSet root = new CounterSet();

        final String path = file.toString();
        root.addCounter("file", new OneShotInstrument<String>(path));

        // checkpoint (counters are all oneshot).
        final CounterSet checkpointSet = root.makePath("checkpoint");

        checkpointSet.addCounter("segment UUID",
                new OneShotInstrument<String>(
                        checkpoint.segmentUUID.toString()));

        // length in bytes of the file.
        checkpointSet.addCounter("length", new OneShotInstrument<String>(
                Long.toString(checkpoint.length)));

        checkpointSet.addCounter("#nodes", new OneShotInstrument<String>(
                Long.toString(checkpoint.nnodes)));

        checkpointSet.addCounter("#leaves", new OneShotInstrument<String>(
                Long.toString(checkpoint.nleaves)));

        checkpointSet.addCounter("#entries", new OneShotInstrument<String>(
                Long.toString(checkpoint.nentries)));

        checkpointSet.addCounter("height", new OneShotInstrument<String>(
                Long.toString(checkpoint.height)));

        // metadata (all oneshot).
        final CounterSet metadataSet = root.makePath("metadata");

        metadataSet.addCounter("name", new OneShotInstrument<String>(
                indexMetadata.getName()));

        metadataSet.addCounter("index UUID", new OneShotInstrument<String>(
                indexMetadata.getIndexUUID().toString()));

        // dynamic counters.
        final CounterSet storeSet = root.makePath("store");

        storeSet.addCounter("nodesBuffered", new Instrument<Boolean>() {
            protected void sample() {
                setValue(buf_nodes != null);
            }
        });

        storeSet.addCounter("openCount", new Instrument<String>() {
            protected void sample() {
                setValue(Long.toString(counters.openCount));
            }
        });

        storeSet.addCounter("closeCount", new Instrument<String>() {
            protected void sample() {
                setValue(Long.toString(counters.closeCount));
            }
        });

        storeSet.addCounter("nodesRead", new Instrument<String>() {
            protected void sample() {
                setValue(Long.toString(counters.nodesRead));
            }
        });

        storeSet.addCounter("nodeReadFromDisk", new Instrument<String>() {
            protected void sample() {
                setValue(Long.toString(counters.nodesReadFromDisk));
            }
        });

        storeSet.addCounter("leavesReadFromDisk", new Instrument<String>() {
            protected void sample() {
                setValue(Long.toString(counters.leavesReadFromDisk));
            }
        });

        return root;

    }
//    private CounterSet counterSet;

    /**
     * Read a record from the {@link IndexSegmentStore}. If the request is in
     * the node region and the nodes have been buffered then this uses a slice
     * on the node buffer. Otherwise this reads through to the backing file.
     * <p>
     * Note: An LRU disk cache is a poor choice for the leaves. Since the btree
     * already maintains a cache of the recently touched leaf objects, a recent
     * read against the disk is the best indication that we have that we will
     * NOT want to read that region again soon.
     */
    public ByteBuffer read(final long addr) {

        // Does the address fall inside the region reserved for B+Tree nodes?
        final boolean nodeRegion = addressManager.isNodeAddr(addr);

        if (log.isDebugEnabled()) {
            log.debug("addr=" + addr + "(" + addressManager.toString(addr)
                    + "), isNodeAddr=" + nodeRegion);
        }

        // Decode the absolute file offset and the byte length of the record.
        final long offset = addressManager.getOffset(addr);
        final int length = addressManager.getByteCount(addr);

        if (!nodeRegion) {

            /*
             * A leaf, a blob, or some other raw allocation on the store.
             * These records are always read from the backing file.
             */

            // @todo over estimates leaves read since can be blob, etc. as well.
            counters.leavesReadFromDisk++;

            return readFromFile(offset, length);

        }

        // The record is a node.
        counters.nodesRead++;

        /*
         * Try the in-memory node buffer first.
         *
         * Note: [buf_nodes] may only be read while holding [lock] since
         * _close() could otherwise concurrently hand the buffer back to the
         * DirectBufferPool. The unlocked test is just a cheap filter; the
         * reference is tested again once the lock is held.
         *
         * Note: Taking the global [lock] makes reads against the node buffer
         * single threaded. If that ever shows up as contention, then replace
         * it with a ReadWriteLock (readLock to read on the buffer, writeLock
         * to allocate or release the buffer).
         */
        if (buf_nodes != null) {

            lock.lock();

            try {

                if (buf_nodes != null) {

                    return readFromBuffer(offset, length);

                }

            } finally {

                lock.unlock();

            }

        }

        // Nodes are not buffered (or the buffer was just released): hit the disk.
        counters.nodesReadFromDisk++;

        return readFromFile(offset, length);

    }

    /**
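     * The [addr] addresses a node and the data are buffered. Return the record
     * without disturbing the position or limit of the shared node buffer, so
     * that concurrent reads do not modify the buffer state. When the node
     * buffer is a direct buffer the record is copied into a new byte[] and
     * the wrapped copy is returned; otherwise a slice of the buffer is
     * returned. The returned buffer is NOT read-only.
     * <p>
     * Note: The caller MUST be synchronized such that {@link #buf_nodes} is
     * in a known state (either allocated or released).
     * 
     * @param offset
     *            The absolute offset of the record in the backing file.
     * @param length
     *            The length of the record in bytes.
     * @return A buffer exposing exactly the bytes of the record (position is
     *         zero, limit is <i>length</i>).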
     */
    final private ByteBuffer readFromBuffer(final long offset, final int length) {

        // The shared buffer holding the disk image of the nodes.
        final ByteBuffer shared = buf_nodes.buffer();

        /*
         * Obtain a private view of the shared buffer. Creation of the views
         * is serialized on the shared buffer so that its position and limit
         * can not change while the slice is taken. Each reader then works
         * with the independent position/limit of its own view.
         *
         * Note: The view is NOT made read-only. Whether or not the data get
         * copied is decided below based on whether the buffer is direct.
         */
        final ByteBuffer view;

        synchronized (shared) {

            view = shared.slice();

        }

        // Offset of the record relative to the start of the node buffer.
        final long relative = offset - checkpoint.offsetNodes;

        // Window the view onto the record: limit first, then position.
        final int end = (int) (relative + length);
        final int start = (int) relative;

        view.limit(end);
        view.position(start);

        /*
         * Slice showing only the desired record: position() is zero(0) and
         * limit() is the #of bytes in the record.
         */
        final ByteBuffer record = view.slice();

        if (!view.isDirect()) {

            // Heap buffer: the slice can be handed out as is.
            return record;

        }

        /*
         * Note: We MUST NOT return a view of the direct buffer since the
         * direct buffer could be released at any time. Instead the record is
         * copied into a byte[] which is then wrapped and returned.
         *
         * Note: The result is deliberately NOT read-only. The B+Tree code
         * requires access to the backing byte[]; a read-only view would force
         * the NodeSerializer to allocate another byte[] and another view.
         * (The caller gets a copy, so writing on it only hurts the caller.)
         */
        final byte[] copy = new byte[length];

        record.get(copy);

        return ByteBuffer.wrap(copy);

    }
    
    /**
     * Read the record from the file.
     */
    final private ByteBuffer readFromFile(final long offset, final int length) {

        // Heap buffer sized for exactly one record: pos = 0, limit = capacity.
        final ByteBuffer record = ByteBuffer.allocate(length);

        try {

            // Read at [offset] (does not modify the channel's position()).
            FileChannelUtility.readAll(opener, record, offset);

        } catch (IOException ex) {

            // Callers of this method do not handle checked I/O exceptions.
            throw new RuntimeException(ex);

        }

        // The read succeeded: flip the buffer so the caller can read from it.
        record.flip();

        return record;

    }

    /**
     * Read from the file into the caller's buffer.
     * <p>
     * Note: This is package private in order to expose it to the
     * {@link IndexSegmentMultiBlockIterator}.
     * 
     * @param offset
     *            The offset of the first byte to be read.
     * @param dst
     *            The buffer into which the data will be read. Bytes will be
     *            read into the buffer starting at the current position and up
     *            to the limit.
     * 
     * @throws IOException
     */
    final void readFromFile(final long offset, final ByteBuffer dst)
            throws IOException {

        /*
         * Read at [offset] into the caller's buffer. The channel's own
         * position() is not modified and the channel is obtained through
         * [opener]. The buffer is NOT flipped here; that is left to the
         * caller.
         */
        FileChannelUtility.readAll(opener, dst, offset);

    }
    
    private final IReopenChannel<FileChannel> opener = new IReopenChannel<FileChannel>() {

        /**
         * Hands out the channel for the backing file by delegating to the
         * store's own reopenChannel() method.
         */
        public FileChannel reopenChannel() throws IOException {

            final FileChannel channel = IndexSegmentStore.this.reopenChannel();

            return channel;

        }

        /**
         * Reports as the enclosing store.
         */
        public String toString() {

            final String s = IndexSegmentStore.this.toString();

            return s;

        }

    };
    
    /**
     * This method transparently re-opens the channel for the backing file.
     * <p>
     * Since the {@link IndexSegment} is a read-only data structure, all of the
     * in-memory state remains valid and we only need to re-open the
     * {@link FileChannel} to the backing store and retry. In particular, we do
     * not need to re-read the root node, {@link IndexMetadata},
     * {@link BloomFilter}, etc. All we have to do is re-open the
     * {@link FileChannel}.
     * <p>
     * Note: This method is internally synchronized so that concurrent readers
     * do not try to all open the store at the same time. Further, this is the
     * only method other than {@link #_close()} that can set {@link #raf}. Since
     * both this method and {@link #_close()} are synchronized the state of that
     * field is well known inside of this method.
     * <p>
     * Note: {@link OverlappingFileLockException}s can arise when there are
     * concurrent requests to obtain a shared lock on the same file. Personally,
     * I think that this is a bug since the lock requests are shared and should
     * be processed without deadlock. However, the code handles this case by
     * proceeding without the lock - exactly as it would handle the case where a
     * shared lock was not available. This is still somewhat fragile since if
     * someone does not test the {@link FileLock} and was in fact granted an
     * exclusive lock when they requested a shared lock, then this code will be
     * unwilling to send the resource. There are two ways to make that work out
     * - either we DO NOT use {@link FileLock} for read-only files (index
     * segments) or we ALWAYS discard the {@link FileLock} if it is not shared
     * when we requested a shared lock and proceed without a lock. For this
     * reason, the behavior of this class and {@link ResourceService} MUST
     * match.
     * 
     * @see ResourceService
     * @see http://blogs.sun.com/DaveB/entry/new_improved_in_java_se1
     * @see http://forums.sun.com/thread.jspa?threadID=5324314.
     * 
     * @return The {@link FileChannel}.
     * 
     * @throws IOException
     *             if the backing file can not be locked.
     */
    final private FileChannel reopenChannel() throws IOException {

        /*
         * Fast path: test the current channel without taking the lock so
         * that readers do not serialize on [lock] while the backing channel
         * is already open. The state is tested again below once the lock is
         * held.
         */
        {
            final RandomAccessFile tmp = raf;
            if (tmp != null) {
                final FileChannel channel = tmp.getChannel();
                if (channel.isOpen()) {
                    // The channel is still open.
                    return channel;
                }
            }
        }

        lock.lock();
        try {

            /*
             * Note: closing an IndexSegmentStore DOES NOT prevent it from
             * being transparently reopened.
             */
            if (!open) {

                reopen();

            }

            if (raf != null && raf.getChannel().isOpen()) {

                /*
                 * The channel is open. With concurrent readers this can mean
                 * that two readers each found the channel closed and that the
                 * other one re-opened it before this reader obtained the lock.
                 */

                return raf.getChannel();

            }

            // open the file.
            this.raf = new RandomAccessFile(file, mode);

            if (log.isInfoEnabled())
                log.info("(Re-)opened file: " + file);

            /*
             * Set iff tryLock() reported that someone else holds a
             * conflicting lock.
             *
             * Note: The corresponding IOException MUST be thrown outside of
             * the try/catch below. That try/catch handles IOException in
             * order to tolerate file systems without lock support. If the
             * "already locked" exception is thrown inside of it then it is
             * swallowed and the caller is handed the channel of a file which
             * has just been closed.
             */
            boolean lockedByOther = false;

            try {

                /*
                 * Request a shared file lock.
                 */
                final FileLock fileLock = raf.getChannel().tryLock(0,
                        Long.MAX_VALUE, true/* shared */);

                if (fileLock == null) {

                    /*
                     * Note: A null return indicates that someone else holds
                     * the lock. This can happen if the platform does not
                     * support shared locks or if someone requested an
                     * exclusive file lock.
                     */

                    lockedByOther = true;

                } else if (!fileLock.isShared()) {

                    /*
                     * DO NOT hold an exclusive lock for an index segment
                     * store file!
                     *
                     * Note: On platforms where shared locks are not supported
                     * the JDK will escalate to an exclusive lock. That would
                     * interfere with our ability to MOVE index segments
                     * around using the ResourceService so we make sure that
                     * we don't hold an exclusive file lock here.
                     */

                    fileLock.release();

                }

            } catch (OverlappingFileLockException ex) {

                /*
                 * Note: OverlappingFileLockException can be thrown when there
                 * are concurrent requests to obtain the same shared lock. I
                 * consider this a JDK bug. It should be possible to service
                 * both requests without deadlock.
                 *
                 * Note: I had seen this exception occasionally even before we
                 * started using the ResourceService to MOVE index segments
                 * around. That also looks like a JDK bug since we only
                 * request the FileLock in this method, we know that the
                 * channel is closed, and this section runs while holding
                 * [lock]. Ergo, it should not be possible to have overlapping
                 * requests (concurrent requests).
                 */

                if (log.isInfoEnabled())
                    log
                            .info("Will proceed without lock: file=" + file + " : "
                                    + ex);

            } catch (IOException ex) {

                /*
                 * Note: This is true of NFS volumes. This is Ok and should be
                 * ignored. However the backing file is not protected against
                 * accidental deletes or overwrites.
                 */

                if (log.isInfoEnabled())
                    log.info("FileLock not supported: file=" + getFile(), ex);

            }

            if (lockedByOther) {

                // Do not leave the file open if we are refusing to use it.
                try {
                    raf.close();
                } catch (Throwable t) {
                    // ignore.
                }

                throw new IOException("File already locked: file=" + getFile());

            }

            return raf.getChannel();

        } finally {

            lock.unlock();

        }

    }

    /**
     * Attempts to read the index nodes into {@link #buf_nodes}.
     * <p>
     * Note: If the nodes could not be buffered then reads against the nodes
     * will read through to the backing file.
     */
    protected void bufferIndexNodes() throws IOException {

        // The caller MUST hold [lock] since this method assigns [buf_nodes].
        if (!lock.isHeldByCurrentThread()) {

            throw new IllegalMonitorStateException();

        }

        if (buf_nodes != null) {

            // already buffered.
            return;

        }

        if (checkpoint.nnodes == 0) {

            throw new IllegalStateException("No nodes to buffer: file=" + file);

        }

        if (checkpoint.offsetNodes == 0L) {

            throw new IllegalStateException("No node region: file=" + file);

        }

        if (checkpoint.extentNodes > DirectBufferPool.INSTANCE.getBufferCapacity()) {

            /*
             * The buffer would be too small to contain the nodes.
             */

            log.warn("Node extent exceeds buffer capacity: index="
                    + getIndexMetadata().getName() + ", file=" + file + ", "
                    + "extent=" + checkpoint.extentNodes + ", bufferCapacity="
                    + DirectBufferPool.INSTANCE.getBufferCapacity());

            return;

        }

        /*
         * This code is designed to be robust. If anything goes wrong then we
         * make certain that the direct buffer is released back to the pool, log
         * any errors, and return to the caller. While the nodes will not be
         * buffered if there is an error thrown in this section, if the backing
         * file is Ok then they can still be read directly from the backing
         * file.
         */
        try {

            /*
             * Attempt to allocate a buffer to hold the disk image of the nodes.
             * 
             * FIXME There should be a direct buffer pool instance specifically
             * configured to buffer the index segment nodes. This will make it
             * possible to buffer the nodes even when the buffer size required
             * is not a good match for the buffer size used as the write cache
             * for the journal. We need to report counters for all buffer pools
             * in order to accurately track the memory overhead for each
             * purpose. [make sure to replace all references to the default
             * INSTANCE with the specialized pool and make sure that we have the
             * chance to configure the pool before it is placed into service.]
             * 
             * Actually, we can probably do just as well using a Java heap
             * byte[]. It is only the disk IO which is improved by the direct
             * NIO buffer. All in-memory access to the data is better for a Java
             * heap byte[].
             */

            buf_nodes = DirectBufferPool.INSTANCE.acquire(100/* ms */,
                    TimeUnit.MILLISECONDS);

            if (log.isInfoEnabled())
                log.info("Buffering nodes: #nodes=" + checkpoint.nnodes
                        + ", #bytes=" + checkpoint.extentNodes + ", file=" + file);

            // Restrict the buffer to the #of bytes to read.
            final ByteBuffer tmp = buf_nodes.buffer();
            tmp.limit((int)checkpoint.extentNodes);

            // attempt to read the nodes into the buffer.
            FileChannelUtility.readAll(opener, tmp,
                    checkpoint.offsetNodes);

            // Flip so that readers see [0:extentNodes).
            tmp.flip();

        } catch (Throwable t1) {

            /*
             * If we could not obtain a buffer without blocking, or if there was
             * ANY problem reading the data into the buffer, then release the
             * buffer and return. The nodes will not be buffered, but if the
             * file is Ok then the index will simply read through to the disk
             * for the nodes.
             */

            if (buf_nodes != null) {

                try {

                    // release buffer back to the pool.
                    buf_nodes.release();

                } catch (Throwable t) {

                    // log error and continue.
                    log.error(this, t);

                } finally {

                    // make sure the reference is cleared.
                    buf_nodes = null;

                }

            }

            // log error and continue.
            log.error(this, t1);

            if (t1 instanceof InterruptedException) {

                /*
                 * The interrupt status is cleared when an InterruptedException
                 * is thrown. Since the exception is not propagated, restore
                 * the status so the caller can still observe the interrupt.
                 */
                Thread.currentThread().interrupt();

            }

        }

    }

    /**
     * Reads the bloom filter directly from the file.
     * 
     * @return The bloom filter -or- <code>null</code> if the bloom filter was
     *         not constructed when the {@link IndexSegment} was built.
     */
    protected BloomFilter readBloomFilter() throws IOException {

        // Address of the bloom filter record (0L iff there is none).
        final long addr = checkpoint.addrBloom;

        if (addr == 0L) {

            // No bloom filter record in this index segment.
            return null;

        }

        // Try the cache first (when there is one).
        final BloomFilter cached = storeCache == null ? null
                : (BloomFilter) storeCache.get(addr);

        if (cached != null) {

            return cached;

        }

        if (log.isInfoEnabled())
            log.info("reading bloom filter: "+addressManager.toString(addr));

        // Decode the location of the record in the backing file.
        final long off = addressManager.getOffset(addr);

        final int len = addressManager.getByteCount(addr);

        // A freshly allocated buffer already has pos = 0 and limit = len.
        final ByteBuffer buf = ByteBuffer.allocate(len);

        try {

            // read into [buf] - does not modify the channel's position().
            FileChannelUtility.readAll(opener, buf, off);

        } catch (IOException ex) {

            throw new RuntimeException(ex);

        }

        // Flip buffer for reading.
        buf.flip();

        assert buf.position() == 0;
        assert buf.limit() == len;

        final BloomFilter filter = (BloomFilter) SerializerUtil.deserialize(buf);

        if (log.isInfoEnabled())
            log.info("Read bloom filter: bytesOnDisk=" + len );

        if (storeCache != null) {

            // Publish to the cache unless another thread got there first.
            storeCache.putIfAbsent(addr, filter);

        }

        // Note: always the instance which was just read.
        return filter;

    }

    /**
     * Reads the {@link IndexMetadata} record directly from the file (this is
     * invoked by the ctor).
     */
    final private IndexMetadata readMetadata() throws IOException {

        // Address of the IndexMetadata record.
        final long addr = checkpoint.addrMetadata;

        assert addr != 0L;

        // Try the cache first (when there is one).
        final IndexMetadata cached = storeCache == null ? null
                : (IndexMetadata) storeCache.get(addr);

        if (cached != null) {

            return cached;

        }

        if (log.isInfoEnabled())
            log.info("reading metadata: "+addressManager.toString(addr));

        // Decode the location of the record in the backing file.
        final long off = addressManager.getOffset(addr);

        final int len = addressManager.getByteCount(addr);

        // A freshly allocated buffer already has pos = 0 and limit = len.
        final ByteBuffer buf = ByteBuffer.allocate(len);

        try {

            // read into [buf] - does not modify the channel's position().
            FileChannelUtility.readAll(opener, buf, off);

        } catch (IOException ex) {

            throw new RuntimeException(ex);

        }

        // Flip buffer for reading.
        buf.flip();

        assert buf.position() == 0;
        assert buf.limit() == len;

        final IndexMetadata metadata = (IndexMetadata) SerializerUtil
                .deserialize(buf);

        if (log.isInfoEnabled())
            log.info("Read metadata: " + metadata);

        if (storeCache != null) {

            // Publish to the cache unless another thread got there first.
            storeCache.putIfAbsent(addr, metadata);

        }

        // Note: always the instance which was just read.
        return metadata;

    }

    /*
     * IAddressManager
     */
    
    final public int getByteCount(final long addr) {

        // Delegates to the store's address manager.
        return addressManager.getByteCount(addr);

    }

    final public long getOffset(final long addr) {

        // Delegates to the store's address manager.
        return addressManager.getOffset(addr);

    }

    final public long getPhysicalAddress(final long addr) {

        // Delegates to the store's address manager.
        return addressManager.getPhysicalAddress(addr);

    }
    
//    final public void packAddr(DataOutput out, long addr) throws IOException {
//        addressManager.packAddr(out, addr);
//    }

    final public long toAddr(final int nbytes, final long offset) {

        // Delegates to the store's address manager.
        return addressManager.toAddr(nbytes, offset);

    }

    final public String toString(final long addr) {

        // Delegates to the store's address manager.
        return addressManager.toString(addr);

    }

//    final public long unpackAddr(DataInput in) throws IOException {
//        return addressManager.unpackAddr(in);
//    }

}