// WriteCacheService.java (source listing: blazegraph-master / database-master)
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
/*
 * Created on Feb 10, 2010
 */

package com.bigdata.io.writecache;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.Channel;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;

import org.apache.log4j.Logger;

import com.bigdata.counters.CounterSet;
import com.bigdata.ha.HAPipelineGlue;
import com.bigdata.ha.QuorumPipeline;
import com.bigdata.io.ChecksumUtility;
import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.IBufferAccess;
import com.bigdata.io.IReopenChannel;
import com.bigdata.io.writecache.WriteCache.ReadCache;
import com.bigdata.io.writecache.WriteCache.RecordMetadata;
import com.bigdata.journal.AbstractBufferStrategy;
import com.bigdata.journal.IBufferStrategy;
import com.bigdata.journal.IRootBlockView;
import com.bigdata.journal.RWStrategy;
import com.bigdata.journal.WORMStrategy;
import com.bigdata.quorum.Quorum;
import com.bigdata.quorum.QuorumMember;
import com.bigdata.rawstore.IAddressManager;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rwstore.RWStore;
import com.bigdata.util.ChecksumError;
import com.bigdata.util.DaemonThreadFactory;
import com.bigdata.util.InnerCause;
import com.bigdata.util.concurrent.Computable;
import com.bigdata.util.concurrent.Memoizer;

/**
 * A {@link WriteCacheService} is provisioned with some number of
 * {@link WriteCache} buffers and a writer thread. Callers populate
 * {@link WriteCache} instances. When they are full, they are transferred to a
 * queue which is drained by the thread writing on the local disk. Hooks are
 * provided to wait until the current write set has been written (e.g., at a
 * commit point when the cached writes must be written through to the backing
 * channel). This implementation supports high availability using a write
 * replication pipeline.
 * <p>
 * A pool of {@link WriteCache} instances is used. Readers test all of the
 * {@link WriteCache} using a shared {@link ConcurrentMap} and will return
 * immediately the desired record or <code>null</code> if the record is not in
 * any of the {@link WriteCache} instances. Write caches remain available to
 * readers until they need to be recycled as the current write cache (the one
 * servicing new writes).
 * <p>
 * The {@link WriteCacheService} maintains a dirty list of {@link WriteCache}
 * instances. A single thread handles writes onto the disk and onto the write
 * replication pipeline (for HA). When the caller calls flush() on the write
 * cache service, the current write cache is transferred to the dirty list and
 * the caller then waits until the write cache instances now on the dirty list
 * have been serviced. In order to simplify the design and to provide boundary
 * conditions for HA decision making, writers block during
 * {@link #flush(boolean, long, TimeUnit)}.
 * <p>
 * Instances of this class are used by both the {@link RWStrategy} and the
 * {@link WORMStrategy}. These classes differ in how they allocate space on the
 * backing file and in the concurrency which they permit for writers.
 * <dl>
 * <dt>{@link WORMStrategy}</dt>
 * <dd>The {@link WORMStrategy} serializes all calls to
 * {@link #writeChk(long, ByteBuffer, int)} since it must guarantee the precise
 * offset at which each record is written onto the backing file. As a
 * consequence of its design, each {@link WriteCache} is a single contiguous
 * chunk of data and is transferred directly to a known offset on the disk. This
 * append only strategy makes for excellent transfer rates to the disk.</dd>
 * <dt>{@link RWStrategy}</dt>
 * <dd>The {@link RWStrategy} only needs to serialize the decision making about
 * the offset at which the records are allocated. Since the records may be
 * allocated at any location in the backing file, each {@link WriteCache}
 * results in a scattered write on the disk.</dd>
 * </dl>
 * Both the {@link WORMStrategy} and the {@link RWStrategy} implementations need
 * to also establish a read-write lock to prevent changes in the file extent
 * from causing corrupt data for concurrent read or write operations on the
 * file. See {@link #writeChk(long, ByteBuffer, int)} for more information on
 * this issue (it is a workaround for a JVM bug).
 * 
 * <h2>Checksums</h2>
 * 
 * The WORM and RW buffer strategy implementations, the WriteCacheService, and
 * the WriteCache all know whether or not checksums are in use. When they are,
 * the buffer strategy computes the checksum and passes it down (otherwise it
 * passes down a 0, which will be ignored since checksums are not enabled). The
 * WriteCache adjusts its capacity by -4 when checksums are enabled and adds the
 * checksum when transferring the caller's data into the WriteCache. On read,
 * the WriteCache will verify the checksum if it exists and returns a new
 * allocation backed by a byte[] showing only the caller's record.
 * <p>
 * {@link IAddressManager#getByteCount(long)} must be the actual on the disk
 * record length, not the size of the record when it reaches the application
 * layer. This on the disk length is the adjusted size after optional
 * compression and with the optional checksum. Applications which assume that
 * lengthOf(addr) == byte[].length will break, but that's life.
 * 
 * <h2>ReadCache</h2>
 * 
 * Without a hotList the readCache is managed naively by clearing any new
 * readCache. This potentially results in frequently accessed records being lost
 * to the cache.
 * 
 * <h2>HotCache</h2>
 * 
 * With the HotCache, the hot records of an evicted readCache are transferred
 * to the hotList and 'old' hotCaches are added to the end of the readList. A
 * pattern is needed to pluck a reserve hotCache from the readList so that it is
 * always possible to transfer hot records from the readList.
 * <p>
 * Start with hotCache AND hotReserve.
 * 
 * If new reserve needed, because existing one is now used, try and compress new
 * readCache into current hotCache - if won't fit, then call resetWith and lose
 * those writes, cycle again, moving front hotCache to readList and compress
 * that one.
 * <p>
 * LIMIT: If we begin with full caches with above threshold hitCounts then the
 * whole list will cycle around until we hit original cache which will contain
 * records with zero hitCounts - for practical purposes ignoring any concurrent
 * reads.
 * 
 * @see WriteCache
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 * 
 * @todo There needs to be a unit test which verifies overwrite of a record in
 *       the {@link WriteCache} (a write at the same offset in the backing file,
 *       but at a different position in the {@link WriteCache} buffer). It is
 *       possible for this to occur with the {@link RWStore} if a record is
 *       written, deleted, and then immediately reallocated. Whether or not this
 *       is a likely event depends on how aggressively the {@link RWStore}
 *       reallocates addresses which were allocated and then deleted within the
 *       same native transaction.
 * 
 * @todo When compression is enabled, it is applied above the level of the
 *       {@link WriteCache} and {@link WriteCacheService} (which after all
 *       require the caller to pass in the checksum of the compressed record).
 *       It is an open question as to whether the caller or the store handles
 *       record compression. Note that the B+Tree leaf and node records may
 *       require an uncompressed header to allow fixup of the priorAddr and
 *       nextAddr fields.
 */
abstract public class WriteCacheService implements IWriteCache {

    protected static final Logger log = Logger.getLogger(WriteCacheService.class);

    /**
     * Logger for HA events.
     */
    private static final Logger haLog = Logger.getLogger("com.bigdata.ha");

    /**
     * <code>true</code> until the service is {@link #close() closed}.
     */
//  private volatile boolean open = true;
    private final AtomicBoolean open = new AtomicBoolean(true);

    /**
     * <code>true</code> iff record level checksums are enabled.
     */
    final private boolean useChecksum;

    /**
     * A single threaded service which writes dirty {@link WriteCache}s onto the
     * backing store.
     */
    final private ExecutorService localWriteService;

    /**
     * The {@link Future} of the task running on the {@link #localWriteService}.
     * 
     * @see WriteTask
     * @see #reset()
     */
    private Future<Void> localWriteFuture;

    /**
     * The {@link Future} of the task running on the {@link #remoteWriteService}
     * .
     * <p>
     * Note: Since this is <em>volatile</em> you MUST guard against concurrent
     * clear to <code>null</code> by {@link #reset()}.
     * 
     * @see WriteTask
     * @see #reset()
     */
    private volatile Future<?> remoteWriteFuture = null;

    /**
     * A list of clean buffers. By clean, we mean not needing to be written.
     * Once a dirty write cache has been flushed, it is placed onto the
     * {@link #cleanList}. Clean buffers can be taken at any time for use as the
     * current buffer.
     */
    final private LinkedBlockingDeque<WriteCache> cleanList;

    /**
     * Lock for the {@link #cleanList} allows us to notice when it becomes empty
     * and not-empty.
     */
    final private ReentrantLock cleanListLock = new ReentrantLock();

    /**
     * Condition <code>!cleanList.isEmpty()</code>
     * <p>
     * Note: If you wake up from this condition you MUST also test {@link #halt}.
     */
    final private Condition cleanListNotEmpty = cleanListLock.newCondition();

    /**
     * The read lock allows concurrent {@link #acquireForWriter()}s while the
     * write lock prevents {@link #acquireForWriter()} when we must either reset
     * the {@link #current} cache buffer or change the {@link #current}
     * reference. E.g., {@link #flush(boolean, long, TimeUnit)}.
     * <p>
     * Note: {@link #read(long)} is non-blocking. It does NOT use this lock!!!
     */
    final private ReentrantReadWriteLock lock = new ReentrantReadWriteLock();

    /**
     * A list of dirty buffers. Writes from these may be combined, but not
     * across {@link #flush(boolean)}.
     */
    final private BlockingQueue<WriteCache> dirtyList;

    /**
     * Lock for the {@link #dirtyList} allows us to notice when it becomes empty
     * and not-empty.
     */
    final private ReentrantLock dirtyListLock = new ReentrantLock();

    /**
     * Lock used to put cache buffers onto the {@link #dirtyList}. This lock is
     * required in order for {@link #flush(boolean, long, TimeUnit)} to have
     * atomic semantics, otherwise new cache buffers could be added to the dirty
     * list. This lock is distinct from the {@link #lock} because we do not want
     * to yield that lock when awaiting the {@link #dirtyListEmpty} condition.
     * <p>
     * Note: If you wake up from this condition you MUST also test {@link #halt}.
     * 
     * @see #dirtyListLock.
     */
    final private Condition dirtyListEmpty = dirtyListLock.newCondition();

    /**
     * Condition signaled whenever content is added to the dirty list.
     * <p>
     * Note: If you wake up from this condition you MUST also test {@link #halt}.
     */
    final private Condition dirtyListChange = dirtyListLock.newCondition();

    /**
     * Used to compact sparsely utilized {@link WriteCache}.
     */
    private final AtomicReference<WriteCache> compactingCacheRef = new AtomicReference<WriteCache>();
    
    /**
     * Maintained to guarantee that compaction is possible. This is always a
     * clean cache. 
     */
    private final AtomicReference<WriteCache> compactingReserveRef = new AtomicReference<WriteCache>();
    
    /**
     * Disable {@link WriteCache} compaction when <code>false</code>.
     * <p>
     * Note: This is set to <code>false</code> when
     * {@link #compactionThreshold} is 100.
     */
    private final boolean compactionEnabled;
    
    /**
     * The minimum percentage of empty space that could be recovered before we
     * will attempt to compact a {@link WriteCache} buffer (in [0:100]).
     */
    private final int compactionThreshold = 20; 
    
    /**
     * The current buffer. Modification of this value and reset of the current
     * {@link WriteCache} are protected by the write lock of {@link #lock()}.
     */
    final private AtomicReference<WriteCache> current = new AtomicReference<WriteCache>();

    /**
     * The current read cache.
     */
    final private AtomicReference<ReadCache> readCache = new AtomicReference<ReadCache>();

    /**
     * Flag set if {@link WriteTask} encounters an error. The cause is set
     * on {@link #firstCause} as well.
     * <p>
     * Note: Error handling MUST cause the write cache service buffers to be
     * {@link #reset()} and make sure the HA write pipeline is correctly
     * configured. This is handled by a high-level abort() on the journal. It is
     * NOT Ok to simply re-try writes of partly filled buffers since they may
     * already have been partly written to the disk. A high-level abort() is
     * necessary to ensure that we discard any bad writes. The abort() will need
     * to propagate to all members of the {@link Quorum} so they are all reset
     * to the last commit point and have reconfigured write cache services and
     * write pipelines.
     */
    private volatile boolean halt = false;

    /**
     * The first cause of an error within the asynchronous
     * {@link WriteTask}.
     */
    private final AtomicReference<Throwable> firstCause = new AtomicReference<Throwable>();

    /**
     * The capacity of the cache buffers. This is assumed to be the same for
     * each buffer.
     */
    final private int capacity;

//  /**
//   * Object knows how to (re-)open the backing channel.
//   */
//  final private IReopenChannel<? extends Channel> opener;

    /**
     * A map from the offset of the record on the backing file to the cache
     * buffer on which that record was written.
     */
    final private ConcurrentMap<Long/* offset */, WriteCache> serviceMap;

    /**
     * An immutable array of the {@link WriteCache} buffer objects owned by the
     * {@link WriteCacheService} (in contrast to those owned by the caller but
     * placed onto the {@link #dirtyList} by
     * {@link #writeChk(long, ByteBuffer, int)}).
     */
    final private WriteCache[] writeBuffers;
    
    /**
     * An immutable array of the {@link WriteCache} buffer objects owned by the
     * {@link WriteCacheService}.  These buffers are used for the readCache.
     */
    final private ReadCache[] readBuffers;
    
    /**
     * Debug arrays to chase down write/removal errors.
     * 
     * Toggle comment appropriately to activate/deactivate
     */
	// final long[] addrsUsed = new long[4024 * 1024];
	// private int addrsUsedCurs = 0;
	// final char[] addrActions = new char[addrsUsed.length];
	// final int[] addrLens = new int[addrsUsed.length];
	private final long[] addrsUsed = null;
	private int addrsUsedCurs = 0;
	private final char[] addrActions = null;
	private final int[] addrLens = null;
    
    /**
     * The backing reader that can be used when a cache read misses.
     */
    final private IBackingReader reader;
    
    /**
     * The current file extent.
     */
    final private AtomicLong fileExtent = new AtomicLong(-1L);

//  /**
//   * The environment in which this object participates
//   */
//  protected final Environment environment;

    /**
     * The object which manages {@link Quorum} state changes on the behalf of
     * this service.
     */
    final private Quorum<HAPipelineGlue, QuorumMember<HAPipelineGlue>> quorum;

//    /**
//     * The {@link UUID} of the highly available service.
//     */
//    final private UUID serviceId;
    
    /**
     * The {@link Quorum} token under which this {@link WriteCacheService}
     * instance is valid. This is fixed for the life cycle of the
     * {@link WriteCacheService}. This ensures that all writes are buffered
     * under a consistent quorum meet.
     */
    final private long quorumToken;
    
    /**
     * The replication factor reported by the {@link Quorum} when this service
     * was constructed, or ONE (1) when no {@link Quorum} was supplied (the
     * service is not highly available).
     */
    final private int replicationFactor;
    
    /**
     * Accessor for the {@link Quorum} that was handed to the constructor.
     * 
     * @return The object which manages {@link Quorum} state changes on the
     *         behalf of this service. The constructor accepts a
     *         <code>null</code> quorum (not HA), in which case this returns
     *         <code>null</code>.
     */
    protected Quorum<HAPipelineGlue, QuorumMember<HAPipelineGlue>> getQuorum() {

        return this.quorum;

    }

    /**
     * Provisions the write cache buffers, the read cache / hot cache buffers
     * and starts the single threaded service which drains the
     * {@link #dirtyList} onto the backing channel.
     * <p>
     * Note: The <code>compactionThreshold</code> parameter shadows the field
     * of the same name. That field is <code>final</code> and carries a constant
     * initializer, so the value passed here is only used locally: to decide
     * {@link #compactionEnabled}, for logging, and for the
     * {@link WriteCacheServiceCounters}.
     * 
     * @param nwriteBuffers
     *            The #of {@link WriteCache} buffers (must be positive).
     * @param minCleanListSize
     *            The #of {@link WriteCache} buffers which should be held back
     *            from the {@link #dirtyList} -or- ZERO (0) to use a default
     *            value of <code>max(4, 5% of nwriteBuffers)</code>. Values
     *            greater than <i>nwriteBuffers</i> are clamped to
     *            <i>nwriteBuffers</i>. When compaction is enabled the dirty
     *            list threshold is
     *            <code>max(1, nwriteBuffers - minCleanListSize - 2)</code>
     *            which allows for: (1) the [compactingCache] and (1) the
     *            compaction reserve. When compaction is disabled the dirty
     *            list threshold is always ONE (1).
     * @param nreadBuffers
     *            The #of {@link ReadCache} buffers shared between the
     *            {@link #readList} and the {@link #hotList} (must be
     *            non-negative).
     * @param prefixWrites
     *            When <code>true</code>, the {@link WriteCacheService} is
     *            supporting an RWS mode store and each {@link WriteCache}
     *            buffer will directly encode the fileOffset of each record
     *            written onto the {@link WriteCache}. When <code>false</code>,
     *            the {@link WriteCacheService} is supporting a WORM mode store
     *            and the {@link WriteCache} buffers contain the exact data to
     *            be written onto the backing store. (Within this constructor
     *            the value is only reported in the log.)
     * @param compactionThreshold
     *            The minimum percentage of space that could be reclaimed before
     *            we will attempt to coalesce the records in a
     *            {@link WriteCache} buffer, in (0:100]. When <code>100</code>,
     *            compaction is explicitly disabled.
     *            <p>
     *            Note: Compaction is also disabled when {@link #canCompact()}
     *            returns <code>false</code>.
     * @param hotCacheSize
     *            The #of read buffers to dedicate to the {@link #hotList}. The
     *            hot list is only enabled when this is greater than 2 and less
     *            than 80% of <i>nreadBuffers</i>; otherwise it is disabled
     *            (size ZERO).
     * @param hotCacheThreshold
     *            The threshold above which readCache records are transferred to
     *            the hotCache.
     * @param useChecksum
     *            <code>true</code> iff record level checksums are enabled.
     * @param fileExtent
     *            The current extent of the backing file (non-negative).
     * @param opener
     *            The object which knows how to (re-)open the channel to which
     *            cached writes are flushed (required).
     * @param quorum
     *            The object which manages {@link Quorum} state changes on the
     *            behalf of this service -or- <code>null</code> if the service
     *            is not highly available.
     * @param reader
     *            The backing reader that can be used when a cache read misses.
     * 
     * @throws IllegalArgumentException
     *             if any of the documented argument constraints is violated.
     * @throws InterruptedException
     *             declared by this constructor; presumably raised while
     *             allocating the {@link WriteCache} buffers (NOTE(review):
     *             confirm against {@link #newWriteCache}).
     */
    public WriteCacheService(final int nwriteBuffers, int minCleanListSize,
            final int nreadBuffers,
            final boolean prefixWrites, final int compactionThreshold,
            final int hotCacheSize, final int hotCacheThreshold,
            final boolean useChecksum, final long fileExtent,
            final IReopenChannel<? extends Channel> opener, final Quorum quorum,
            final IBackingReader reader)
            throws InterruptedException {

        if (nwriteBuffers <= 0)
            throw new IllegalArgumentException("nwriteBuffers="
                    + nwriteBuffers);

        // Fail fast rather than with a NegativeArraySizeException below.
        if (nreadBuffers < 0)
            throw new IllegalArgumentException("nreadBuffers=" + nreadBuffers);

        if (minCleanListSize == 0) { // default

            /*
             * Setup a reasonable default if no value was specified. Just need
             * to make sure we have a few spare buffers to prevent latency on
             * acquiring a clean buffer for writing.
             * 
             * The default here is 5% of the write cache buffers. This is based
             * on historical experience that we do better with 50MB of dirty
             * list when there are 2000 write cache buffers, which is 2.5%. It
             * seems a reasonable thing to give over 5%. If you want more write
             * elision, then just increase the number of write cache buffers.
             * 95% of them will be used to defer writes and elide writes. 5% of
             * them will be available to drive the disk with random write IOs.
             * 
             * See BLZG-1589 (Modify the default behavior for setting the
             * clear/dirty list threshold)
             */

            minCleanListSize = Math.max(4, (int) (nwriteBuffers * .05));

        }

        // Never hold back more buffers than actually exist.
        if (minCleanListSize > nwriteBuffers)
            minCleanListSize = nwriteBuffers;

        if (minCleanListSize < 0)
            throw new IllegalArgumentException("minCleanListSize="
                    + minCleanListSize);

        if (compactionThreshold <= 0)
            throw new IllegalArgumentException("compactionThreshold="
                    + compactionThreshold);

        if (compactionThreshold > 100)
            throw new IllegalArgumentException("compactionThreshold="
                    + compactionThreshold);

        if (fileExtent < 0L)
            throw new IllegalArgumentException("fileExtent=" + fileExtent);

        if (opener == null)
            throw new IllegalArgumentException("opener is null");

//        if (quorum == null)
//            throw new IllegalArgumentException();

        this.useChecksum = useChecksum;

        /**
         * FIXME WCS compaction fails!
         * 
         * CORRECTION, it is NOT clearly established that WCS compaction fails
         * although some failures appear to correlate with it being enabled.
         * It may be that with compaction enabled other errors are more likely
         * that are not directly associated with the compaction; for example
         * as a result of denser data content.
         * 
         * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/674" >
         *      WCS write cache compaction causes errors in RWS postHACommit()
         *      </a>
         */
        this.compactionEnabled = canCompact() && compactionThreshold < 100;

        if (log.isInfoEnabled())
            log.info("Compaction Enabled: " + compactionEnabled
                    + " @ threshold=" + compactionThreshold);

//      this.opener = opener;

        // the token under which the write cache service was established.
        if ((this.quorum = quorum) != null) {
            this.quorumToken = quorum.token();
            this.replicationFactor = quorum.replicationFactor();
        } else {
            // Not HA.
            this.quorumToken = Quorum.NO_QUORUM;
            this.replicationFactor = 1;
        }

        this.reader = reader;

        dirtyList = new LinkedBlockingQueue<WriteCache>();

        cleanList = new LinkedBlockingDeque<WriteCache>();

        writeBuffers = new WriteCache[nwriteBuffers];

        /*
         * Configure the desired dirtyListThreshold.
         */
        if (compactionEnabled) {
            /*
             * Setup the RWS dirtyListThreshold.
             * 
             * allow for compacting cache and reserve
             */
            m_dirtyListThreshold = Math.max(1, nwriteBuffers - minCleanListSize - 2);
        } else {
            /*
             * Note: We always want a threshold of ONE (1) for the WORM since:
             * 1) We can not compact cache buffers for that store mode.
             * 2) We still want to write data to the file even if it will
             *    never be read (as in the case of "deleted" data in same
             *    transaction as it was allocated).
             */
            m_dirtyListThreshold = 1;
        }
        assert m_dirtyListThreshold >= 1;
        assert m_dirtyListThreshold <= writeBuffers.length;

        // Setup ReadCache
        this.readListSize = nreadBuffers;
        this.readList = new LinkedBlockingDeque<ReadCache>();

        readBuffers = new ReadCache[nreadBuffers];

        // pre-allocate all ReadCache
        for (int i = 0; i < readBuffers.length; i++) {
            readBuffers[i] = new ReadCache(null);
        }

        /*
         * Hot cache setup
         * 
         * Let's aim for a 1/10 of the readCache, but hotListSize must be at
         * least 3 to function
         */
        {
            if (hotCacheSize < (readListSize * 0.8) && hotCacheSize > 2) {
                hotListSize = hotCacheSize;
            } else {
                // hot cache disabled.
                hotListSize = 0;
            }
        }
        hotList = new LinkedBlockingDeque<ReadCache>();

        this.hotCacheThreshold = hotCacheThreshold;

        // pre-populate hotList and readList
        for (int i = 0; i < hotListSize; i++) {
            hotList.add(readBuffers[i]);
        }

        for (int i = hotListSize; i < readListSize; i++) {
            readList.add(readBuffers[i]);
        }

        // set initial read cache (each poll() yields null if its list is empty)
        hotCache = hotList.poll();
        hotReserve = hotList.poll();
        readCache.set(readList.poll());
        {
            final ReadCache curReadCache = readCache.get();
            if (curReadCache != null) {
                curReadCache.incrementReferenceCount();
            }
        }

        if (log.isInfoEnabled())
            log.info("nbuffers=" + nwriteBuffers + ", dirtyListThreshold="
                    + m_dirtyListThreshold + ", compactionThreshold="
                    + compactionThreshold + ", compactionEnabled="
                    + compactionEnabled + ", prefixWrites=" + prefixWrites
                    + ", hotListSize=" + hotListSize
                    + ", useChecksum=" + useChecksum + ", quorum=" + quorum);

        // save the current file extent.
        this.fileExtent.set(fileExtent);

        // Add [current] WriteCache.
        current.set(writeBuffers[0] = newWriteCache(null/* buf */,
                useChecksum, false/* bufferHasData */, opener, fileExtent));
//        if (nbuffers > 1) {
//            readCache.set(buffers[1] = newWriteCache(null/* buf */,
//                useChecksum, false/* bufferHasData */, opener, fileExtent));
//            
//            buffers[1].incrementReferenceCount(); // for readCache
//            buffers[1].closeForWrites();
//        }

        // add remaining buffers (all of them start out clean).
        for (int i = 1; i < nwriteBuffers; i++) {

            final WriteCache tmp = newWriteCache(null/* buf */, useChecksum,
                    false/* bufferHasData */, opener, fileExtent);

            writeBuffers[i] = tmp;

            cleanList.add(tmp);

        }

        // Set the same counters object on each of the write cache instances.
        final WriteCacheServiceCounters counters = new WriteCacheServiceCounters(
                nwriteBuffers, m_dirtyListThreshold, compactionThreshold);

        for (int i = 0; i < writeBuffers.length; i++) {

            writeBuffers[i].setCounters(counters);

        }

        this.counters = new AtomicReference<WriteCacheServiceCounters>(counters);

        // assume capacity is the same for each buffer instance.
        capacity = current.get().capacity();

        // set initial capacity based on an assumption of 1k buffers.
        serviceMap = new ConcurrentHashMap<Long, WriteCache>(nwriteBuffers
                * (capacity / 1024));

        /*
         * Memoizer used to install reads into the cache on a cache miss.
         */
        memo = new ReadMemoizer(loadChild);

        // start service to write on the backing channel.
        localWriteService = Executors
                .newSingleThreadExecutor(new DaemonThreadFactory(getClass()
                        .getName()));

        // run the write task
        localWriteFuture = localWriteService.submit(newWriteTask());

    }
    
    /**
     * Reports whether this service is allowed to compact its
     * {@link WriteCache} buffers. The constructor consults this method when
     * deciding whether compaction is enabled.
     * <p>
     * This base implementation describes the WORM case, which can never
     * compact, and therefore always answers <code>false</code>. The method is
     * <code>protected</code> so that a subclass can answer differently.
     * 
     * @return <code>false</code>, always, for this implementation.
     */
    protected boolean canCompact() {

        final boolean compactionSupported = false;

        return compactionSupported;

    }
    
    /**
     * Atomically zeroes the {@link WriteCache} block sequence counter. Per the
     * original notes this is invoked from {@link IBufferStrategy#commit()} and
     * {@link #reset()} for HA synchronization, and the returned value ends up
     * in {@link IRootBlockView#getBlockSequence()}.
     * 
     * @return The value which the counter held immediately before it was set
     *         back to ZERO (0).
     */
    public long resetSequence() {

        final long priorSequence = cacheSequence.getAndSet(0L);

        return priorSequence;

    }

    /**
     * The write cache block sequence counter read by {@link #getSequence()}
     * and cleared by {@link #resetSequence()}.
     */
    private final AtomicLong cacheSequence = new AtomicLong(0);

    /**
     * Reads the write cache block sequence counter without modifying it.
     * 
     * @return The value of the counter at the moment of the call.
     */
    public long getSequence() {

        final long currentSequence = cacheSequence.get();

        return currentSequence;

    }
    
    /**
     * Determines how long the dirty list should grow until the
     * {@link WriteCache} buffers are coalesced and/or written to disk.
     * <p>
     * Note: For the WORM there is no advantage to any buffering, but the
     * RWStore may recycle storage, so: 1) Writes can be avoided if delayed 2)
     * Buffers could potentially be compacted, further delaying writes.
     * <p>
     * Note: This MUST BE GTE ONE (1) since WriteTask.call() will otherwise drop
     * through without actually taking anything off of the dirtyList.
     */
    private final int m_dirtyListThreshold;
    
    /**
     * The readCache is managed separately from the writeCache.
     * <p>
     * If active then the readCache may optionally be managed together
     * with a hotList, to which frequently read buffers are transferred.
     * <p>
     * Data is added to the readCache:
     * <li>after an evicted WriteCache is written to disk/HA
     * <li>on a cache miss, disk reads are added to the cache
     * <p>
     * Data is added to the hotList when a readCache is evicted from the
     * readList.  resetWith uses the hitCount associated with live data
     * records to determine which data is transferred to the hotList.
     * <p>
     * When a readCache is evicted from the hotList, the entire cache
     * is moved to the readList.
     */
    private final int readListSize;
    /**
     * The readList - maximum of readListSize
     */
    final private BlockingQueue<ReadCache> readList;

    /**
     * Determines the size of the HIRS cache (will be zero if disabled)
     * <p>
     * Where HIRS captures High inter-reference vs Low inter-reference of
     * LIRS.
     * <p>
     * The HIRS cache is used in conjunction with the readCache which a naive
     * copying strategy would be a kind of LIRS cache.  Instead, cache hits
     * from "older" read cache records are copied to the HIRS cache which
     * should be recycled more slowly.
     * <p>
     * Once the HIRS cache is full (maximum number of buffers in use) then
     * the per record hit count is used to determine which records are
     * transferred to be maintained.
     */
    private final int hotListSize;
    
     /**
     * The hotList - maximum of {@link #hotListSize} entries - pre-populated from {@link #readBuffers} by the constructor
     */
    final private BlockingQueue<ReadCache> hotList;

    /**
     * The current hotCache.
     * <p>
     * Note: Guarded by the {@link #readCache} reference.
     */
    private ReadCache hotCache = null;

    /**
     * Current hotCacheThreshold above which readCache records are 
     * transferred to the hotCache.
     */
	final private int hotCacheThreshold;

    /**
     * The current hotReserve.
     * <p>
     * Note: Guarded by the {@link #readCache} reference.
     */
    private ReadCache hotReserve = null;

//    /**
//     * Computes modular distance of a circular number list.
//     * 
//     * eg start: 1, end:5, mod: 20 = 5-1 = ((5+20)-1)%20 = 4
//     * or start:15, end:3, mod: 20 = (3+20)-15 = 8
//     * 
//     * Used to determine the position of a cache from front of
//     * the clean list
//     */
//    private int modDistance(final int start, final int end, final int mod) {
//     	return ((end + mod) - start) % mod;
//    }
        
    /**
     * When <code>true</code>, dirty buffers are immediately drained, compacted,
     * and then written out to the backing media and (in HA mode) to the
     * followers.
     * <p>
     * While this flag is set, the {@link WriteTask} takes the next dirty buffer
     * as soon as the dirtyList is non-empty rather than waiting for the
     * dirtyList to grow to its threshold, and a compacting cache that fills up
     * is written out directly instead of being added to the dirtyList.
     */
    private volatile boolean flush = false;

    /**
     * When <code>true</code> any dirty buffers are written directly and never
     * compacted. This is only used in flush() when adding any compactingCache
     * to the dirty list.
     * <p>
     * The {@link WriteTask} tests this flag before it attempts compaction: when
     * set, the compaction path is skipped and the buffer is written as-is.
     */
    private volatile boolean directWrite = false;

    /**
     * Factory for the task which writes dirty buffers onto the backing
     * channel. The method is <code>protected</code> and non-final, so a
     * subclass may substitute its own task.
     * 
     * @return A new {@link WriteTask} instance.
     */
    protected Callable<Void> newWriteTask() {

        // Each invocation hands back a fresh task instance.
        final WriteTask task = new WriteTask();

        return task;

    }

    /**
     * The task responsible for writing dirty buffers onto the backing channel
     * and onto the downstream {@link Quorum} member if the service is highly
     * available.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     */
    class WriteTask implements Callable<Void> {

        private ByteBuffer checksumBuffer;
        
        /**
         * Note: If there is an error in this thread then it needs to be
         * propagated to the threads write()ing on the cache or awaiting flush()
         * and from there back to the caller and an abort(). We do not need to
         * bother the readers since the read() methods all allow for concurrent
         * close() and will return null rather than bad data. The reprovisioning
         * of the write cache service (e.g., by reset()) must hold the writeLock
         * so as to occur when there are no outstanding reads executing against
         * the write cache service.
         * 
         * @todo If resynchronization rolls back the lastCommitTime for a store,
         *       then we need to interrupt or otherwise invalidate any readers
         *       with access to historical data which is no longer part of the
         *       quorum.
         */
        public Void call() throws Exception {
            try {
                /*
                 * A heap byte buffer for the whole buffer checksum is only
                 * required when there is a quorum. It is sized from the first
                 * write buffer.
                 */
                checksumBuffer = (quorum == null) ? null : ByteBuffer
                        .allocate(writeBuffers[0].peek().capacity());
                doRun();
            } catch (InterruptedException interrupted) {
                /*
                 * This task can only be interrupted by a thread with its
                 * Future (or by shutting down the thread pool on which it is
                 * running), so this interrupt is a clear signal that the write
                 * cache service is closing down. Nothing to do.
                 */
            } catch (Throwable t) {
                /*
                 * An AsynchronousCloseException (anywhere in the cause chain)
                 * means that the service was shutdown. That is normal
                 * shutdown, so it is not logged as an error; close() will
                 * handle all of the Condition notifies.
                 */
                final boolean normalShutdown = InnerCause.isInnerCause(t,
                        AsynchronousCloseException.class);
                if (!normalShutdown) {
                    haltAndSignal(t);
                }
            } finally {
                /*
                 * Clear compactingCache reference now that the WriteTask is
                 * known to be terminated.
                 */
                compactingCacheRef.set(null); // clear reference.
                checksumBuffer = null;
            }
            // Every outcome (normal, interrupted, shutdown, error) reports null.
            return null;
        } // call()

        /**
         * Error path for {@link #call()}: halts processing and wakes up anyone
         * blocked on the dirtyList or cleanList Conditions so they can notice
         * the change in [halt] and wrap and rethrow [firstCause].
         * <p>
         * Error processing MUST do a high-level abort() and MUST do a reset()
         * if this WriteCacheService instance will be reused. The WriteTask
         * must be restarted by reset.
         * <p>
         * Note: If a WriteCache was taken from the dirtyList then it will have
         * been dropped. However, all of the WriteCache instances owned by the
         * WriteCacheService are in [buffers] and reset() is written in terms
         * of [buffers] precisely so we do not lose buffers here.
         * 
         * @param cause
         *            The error which terminated the task.
         */
        private void haltAndSignal(final Throwable cause) {
            // Only the first recorded cause raises [halt].
            if (firstCause.compareAndSet(null/* expect */, cause/* update */)) {
                halt = true;
            }
            // Wake up waiters on the dirtyList Conditions.
            dirtyListLock.lock();
            try {
                dirtyListEmpty.signalAll();
                dirtyListChange.signalAll();
            } finally {
                dirtyListLock.unlock();
            }
            // Wake up waiters on the cleanList Condition.
            cleanListLock.lock();
            try {
                cleanListNotEmpty.signalAll();
            } finally {
                cleanListLock.unlock();
            }
            log.error(cause, cause);
        }

        /**
         * Main loop of the {@link WriteTask}. Each pass awaits the dirty buffer
         * at the head of the dirtyList, either compacts it or writes it out
         * (an empty buffer is neither compacted nor written), removes it from
         * the dirtyList, signals [dirtyListEmpty] if the list became empty and
         * finally hands the buffer to the clean list.
         * <p>
         * The loop only terminates by throwing, e.g., an
         * {@link InterruptedException} or a {@link RuntimeException} wrapping
         * [firstCause] once [halt] is set.
         * 
         * @throws Exception
         *             when halted, interrupted, or when compaction or eviction
         *             of a buffer fails.
         */
        private void doRun() throws Exception {

            while (true) {

                /*
                 * This replaces [assert !halt] since [halt] is also set in
                 * WriteCacheService.close().
                 */
                if (halt) {
                    throw new RuntimeException(firstCause.get());
                }

                // Await dirty cache buffer (peek only: it stays on the dirtyList).
                final WriteCache cache = awaitDirtyBuffer();

                // Reported in the log message below.
                boolean didCompact = false;
                boolean didWrite = false;

                /*
                 * Note: When using a large number of write cache buffers and a
                 * bulk data load, it is not uncommon for all records to be
                 * recycled by the time we take something from the dirtyList, in
                 * which case the cache will be (logically) empty.
                 * 
                 * Note: This test (WriteCache.isEmpty()) is not decisive
                 * because we are not holding any locks across it and the
                 * subsequent actions. Therefore, it is possible that the cache
                 * will become empty after it has been tested through concurrent
                 * clearWrite() invocations. That should not be a problem. We
                 * want to leave the cache open (versus closing it against
                 * writes) in case we decide to compact the cache rather than
                 * evicting it. The cache MUST NOT be closed for writes when we
                 * compact it or we will lose the ability to clear recycled
                 * records out of that WriteCache.
                 */

                final boolean wasEmpty = cache.isEmpty();

                if (!wasEmpty) {

                    final int percentEmpty = cache.potentialCompaction();

                    if (compactionEnabled && !directWrite 
                            && percentEmpty >= compactionThreshold) {

                        if (log.isDebugEnabled())
                            log.debug("percentEmpty=" + percentEmpty + "%");

                        // Attempt to compact cache block.
                        if (compactCache(cache)) {

                            // [cache] is clean and empty.
                            assert cache.isEmpty();

                            /*
                             * Only report a compaction when one actually took
                             * place. compactCache() returns false when it
                             * could not reserve a buffer, in which case the
                             * block is written instead.
                             */
                            didCompact = true;

                        } else {

                            // Write cache block if did not compact.
                            writeCacheBlock(cache);
                            
                            didWrite = true;
                            
                        }

                    } else {

                        // Write cache block.
                        writeCacheBlock(cache);

                        didWrite = true;

                    }

                }

                // Now written/compacted, remove from dirtyList.
                if (dirtyList.take() != cache)
                    throw new AssertionError();
                counters.get().ndirty--;

                dirtyListLock.lockInterruptibly();
                try {
                    if (dirtyList.isEmpty()) {
                        /*
                         * Signal Condition when we release the
                         * dirtyListLock.
                         */
                        dirtyListEmpty.signalAll();
                    }
                } finally {
                    dirtyListLock.unlock();
                }

                addClean(cache, false/* addFirst */);

                if (!wasEmpty && log.isInfoEnabled()) {
                    // One snapshot of the counters is used for the whole message.
                    final WriteCacheServiceCounters c = counters.get();
                    final long nhit = c.nhit.get();
                    final long ntests = nhit + c.nmiss.get();
                    final int hitRate = (int) (100 * ((ntests == 0L ? 0d
                            : (double) nhit / ntests)));
                    log.info("WriteCacheService: bufferCapacity="
                            + writeBuffers[0].capacity() + ",nbuffers="
                            + c.nbuffers + ",nclean=" + c.nclean
                            + ",ndirty=" + c.ndirty + ",maxDirty="
                            + c.maxdirty + ",hitRate=" + hitRate + ",empty="
                            + wasEmpty + ",didCompact=" + didCompact
                            + ",didWrite=" + didWrite + ",ncompact="
                            + c.ncompact + ",nbufferEvictedToChannel="
                            + c.nbufferEvictedToChannel);
                }

            } // while(true)
            
        } // doRun()
        
        /**
         * We choose here whether to compact the cache.
         * 
         * <ol>
         * <li>Reserve an extra clean buffer; if none is available, do NOT
         * attempt compaction.</li>
         * <li>Compact into the "current" compacting buffer, avoiding
         * contention with writing threads.</li>
         * <li>If required, replace the current compacting buffer with the
         * reserve, adding the compacting buffer to the dirty list.</li>
         * <li>Release the compacted buffer.</li>
         * </ol>
         * 
         * @return <code>true</code> iff we compacted the cache.
         * 
         * @throws InterruptedException
         */
        private boolean compactCache(final WriteCache cache)
                throws InterruptedException, Exception {

            /*
             * The cache should not be closed against writes. If it were closed
             * for writes, then we would no longer be able to capture cleared
             * writes in the RecordMap. However, if we compact the cache, we
             * want any cleared writes to be propagated into the compacted
             * cache.
             */
            assert !cache.isClosedForWrites();

            /*
             * Step 1: Make sure that there is a reserve buffer. This is done
             * before the dirtyListLock is acquired, so the early return below
             * does not pass through the finally clause at the bottom of this
             * method (and hence does not touch [compactingCacheRef] or the
             * [ncompact] counter).
             */
            if (compactingReserveRef.get() == null) {

                final WriteCache tmp = getDirectCleanCache();

                if (tmp == null)
                    return false; // cannot guarantee compaction

                tmp.resetWith(serviceMap); // should be NOP!

                compactingReserveRef.set(tmp);

            }
                
            /*
             * We can be certain to be able to compact.
             */
            
            /*
             * Grab the [compactingCache] (if any).
             * 
             * Note: The dirtyListLock is held from here until the finally
             * clause at the end of the method, i.e., across the transfer(s) and
             * across any writeCacheBlock() / sendAddressMetadata() calls made
             * below.
             */
            WriteCache curCompactingCache = null;
            dirtyListLock.lockInterruptibly();
            try {
                // Might be null.
                curCompactingCache = compactingCacheRef.getAndSet(null);
//            } finally {
//                dirtyListLock.unlock();
//            }
//            try {
                boolean done = false;
                if (curCompactingCache != null) {
                    if (log.isTraceEnabled())
                        log.trace("Transferring to curCompactingCache");
                    
                    // Step 2: try to fit everything into the current compacting cache.
                    done = WriteCache.transferTo(cache/* src */,
                            curCompactingCache/* dst */, serviceMap, 0/*threshold*/);
                    if (done) {
                        // Everything was compacted.  Send just the address metadata (empty cache block).
                        sendAddressMetadata(cache);
                        
                        if (log.isDebugEnabled())
                            log.debug("RETURNING RESERVE: curCompactingCache.bytesWritten="
                                    + curCompactingCache.bytesWritten());

                        // The finally clause restores [compactingCacheRef].
                        return true;
                    }
                    /*
                     * The [curCompactingCache] is full.
                     */
                    if (flush) {
                        /*
                         * Send out the full cache block.
                         */
                        writeCacheBlock(curCompactingCache);
                        addClean(curCompactingCache, true/* addFirst */);
                        if (log.isTraceEnabled())
                            log.trace("Flushed curCompactingCache");
                    } else {
                        /*
                         * Add current compacting cache to dirty list.
                         */
                        dirtyList.add(curCompactingCache);
                        if (log.isTraceEnabled())
                            log.trace("Added curCompactingCache to dirtyList");
                    }
                    // fall through. fill in the reserve cache next.
                    curCompactingCache = null;
                }

                /*
                 * Step 3: The reserve buffer becomes the new compacting cache.
                 * 
                 * Clear the state on the reserve buffer and remove from
                 * cacheService map.
                 */
                if (log.isTraceEnabled())
                    log.trace("Setting curCompactingCache to reserve");

                curCompactingCache = compactingReserveRef.getAndSet(null);
                {
                    /*
                     * Try to line up the next reserve right away. If no clean
                     * buffer is available the reserve stays null and the next
                     * invocation of this method will try again (and will
                     * decline to compact if it still cannot obtain one).
                     */
                    final WriteCache tmp = getDirectCleanCache();
                    if (tmp != null) {
                        tmp.resetWith(serviceMap); // should be NOP!
                        compactingReserveRef.set(tmp);
                    }
                }
                
                if (log.isTraceEnabled())
                    log.trace("Transferring to curCompactingCache");
                // Transfer the remaining records into the new compacting cache.
                done = WriteCache.transferTo(cache/* src */,
                        curCompactingCache/* dst */, serviceMap, 0/*threshold*/);

                if (!done) {
                    throw new AssertionError(
                            "We must be able to compact the cache");
                }
                if (log.isDebugEnabled())
                    log.debug("USING RESERVE: curCompactingCache.bytesWritten="
                            + curCompactingCache.bytesWritten());
                sendAddressMetadata(cache);
                // Buffer was compacted.
                return true;
            } finally {
                /*
                 * Note: This clause runs on every exit path taken after the
                 * dirtyListLock was acquired, including when an exception is
                 * thrown out of the try block. Hence [ncompact] is incremented
                 * on those paths as well.
                 */
//                dirtyListLock.lock();
                try {
                    // Now reset compactingCache with dirtyListLock held
                    compactingCacheRef.set(curCompactingCache);
                    counters.get().ncompact++;
                } finally {
                    dirtyListLock.unlock();
                }
            }

        } // compactCache()

        /**
         * In HA, we need to notify a downstream RWS of the addresses that have
         * been allocated on the leader in the same order in which the leader
         * made those allocations. This information is used to infer the order
         * in which the allocators for the different allocation slot sizes are
         * created. This method will synchronously send those address notices
         * and also makes sure that the followers see the recycled address
         * records so they can keep both their allocators and the actual
         * allocations synchronized with the leader.
         * 
         * @param cache
         *            A {@link WriteCache} whose data has been transferred into
         *            another {@link WriteCache} through a "compact" operation.
         * 
         * @throws IllegalStateException
         * @throws InterruptedException
         * @throws ExecutionException
         * @throws IOException
         * 
         * @see <a href="http://trac.blazegraph.com/ticket/721"> HA1 </a>
         */
        private void sendAddressMetadata(final WriteCache cache)
                throws IllegalStateException, InterruptedException,
                ExecutionException, IOException {

            /*
             * Only the presence of a quorum is tested here. The former
             * quorum.isHighlyAvailable() and
             * quorum.getClient().isLeader(quorumToken) tests are disabled.
             */
            final boolean haveQuorum = quorum != null;

            /*
             * The cache block is only written out when the cache reports that
             * it has prepared address metadata for HA. Without a quorum the
             * cache is not asked at all.
             */
            if (haveQuorum && cache.prepareAddressMetadataForHA()) {

                writeCacheBlock(cache);

            }

        }

         /**
         * Get a dirty cache buffer. Unless we are flushing out the buffered
         * writes, we will allow the dirtyList to grow to the desired threshold
         * before we attempt to compact anything.
         * <p>
         * Note: This DOES NOT remove the {@link WriteCache} from the
         * {@link #dirtyList}. It uses a peek(). The {@link WriteCache} will
         * remain on the {@link #dirtyList} until it has been handled by
         * {@link #doRun()}.
         * 
         * @return A dirty {@link WriteCache}.
         */
        private WriteCache awaitDirtyBuffer() throws InterruptedException {

            dirtyListLock.lockInterruptibly();
            try {
                assert m_dirtyListThreshold >= 1
                        && m_dirtyListThreshold <= writeBuffers.length : "dirtyListThreshold="
                        + m_dirtyListThreshold
                        + ", #buffers="
                        + writeBuffers.length;
                /*
                 * Wait for a dirty buffer.
                 * 
                 * Note: [flush] and [m_dirtyListThreshold] can change during
                 * this loop, so both are re-read on every pass.
                 */
                for (;;) {
                    /*
                     * When flushing, any dirty buffer will do. Otherwise the
                     * dirtyList is allowed to grow up to the threshold first.
                     */
                    final boolean keepWaiting = flush ? dirtyList.isEmpty()
                            : dirtyList.size() < m_dirtyListThreshold;
                    if (halt || !keepWaiting) {
                        break;
                    }
                    dirtyListChange.await();
                }

                // A halt is reported to the caller rather than returning a buffer.
                if (halt) {
                    throw new RuntimeException(firstCause.get());
                }

                // update counters.
                final WriteCacheServiceCounters stats = counters.get();
                stats.ndirty = dirtyList.size();
                if (stats.maxdirty < stats.ndirty) {
                    stats.maxdirty = stats.ndirty;
                }

                // Guaranteed available. Note: peek() leaves it on the dirtyList.
                final WriteCache head = dirtyList.peek();
                if (head == null) {
                    throw new AssertionError();
                }

                return head;

            } finally {

                dirtyListLock.unlock();

            }

        }

        /**
         * Write the {@link WriteCache} onto the disk and the HA pipeline.
         * 
         * @param cache
         *            The {@link WriteCache}.
         * 
         * @throws InterruptedException
         * @throws ExecutionException
         * @throws IOException
         */
        private void writeCacheBlock(final WriteCache cache)
                throws InterruptedException, ExecutionException, IOException {

            /**
             * IFF HA and this is the quorum leader.
             * 
             * Note: This is true for HA1 as well. The code path enabled by this
             * is responsible for writing the HALog files.
             * 
             * @see <a href="http://trac.blazegraph.com/ticket/721"> HA1 </a>
             */
            final boolean isHALeader = quorum != null
                    && quorum.getClient().isLeader(quorumToken);

            /*
             * Ensure nothing will modify this buffer before written to disk or
             * HA pipeline.
             * 
             * Note: Do NOT increment the cacheSequence here. We need to decide
             * whether or not the buffer is empty first, and it needs to be
             * closed for writes before we can make that decision.
             */

            // Must be closed for writes.
            cache.closeForWrites();
            
            /*
             * Test for an empty cache.
             * 
             * Note: We can not do this until the cache has been closed for
             * writes.
             */
            {
                final ByteBuffer b = cache.peek();
                if (b.position() == 0) {
                    // Empty cache: nothing to log, replicate or write.
                    return;
                }
            }

            // Increment WriteCache sequence.
            final long thisSequence = cacheSequence.getAndIncrement();
//            cache.setSequence(thisSequence);

            // Set the current file extent on the WriteCache.
            cache.setFileExtent(fileExtent.get());

            if (isHALeader) {//quorum != null && quorum.isHighlyAvailable()) {

                // Verify quorum still valid and we are the leader.
                quorum.assertLeader(quorumToken);

                /*
                 * Replicate from the leader to the first follower. Each
                 * non-final follower will receiveAndReplicate the write cache
                 * buffer. The last follower will receive the buffer.
                 */

                // send to 1st follower.
                @SuppressWarnings("unchecked")
                final QuorumPipeline<HAPipelineGlue> quorumMember = (QuorumPipeline<HAPipelineGlue>) quorum
                        .getMember();

                assert quorumMember != null : "Not quorum member?";

                final WriteCache.HAPackage pkg = cache.newHAPackage(//
                        quorumMember.getStoreUUID(),//
                        quorumToken,//
                        quorumMember.getLastCommitCounter(),//
                        quorumMember.getLastCommitTime(),//
                        thisSequence,//
                        replicationFactor,//
                        checksumBuffer
                        );

                assert pkg.getData().remaining() > 0 : "Empty cache: " + cache;

                /*
                 * Start the remote asynchronous IO before the local synchronous
                 * IO.
                 * 
                 * Note: In HA with replicationFactor=1, this should still
                 * attempt to replicate the write cache block in case there is
                 * someone else in the write pipeline (for example, off-site
                 * replication).
                 */
                /*
                 * FIXME There may be a problem with doing the async IO first.
                 * Track this down and document the nature of the problem,
                 * then clean up the documentation here (see the commented
                 * out version of this line below).
                 */
                quorumMember.logWriteCacheBlock(pkg.getMessage(), pkg.getData().duplicate());

                /*
                 * TODO Do we want to always support the replication code path
                 * when a quorum exists (that is, also for HA1) in case there
                 * are pipeline listeners that are not HAJournalServer
                 * instances? E.g., for offsite replication?
                 */
                if (quorum.replicationFactor() > 1) {

                    // ASYNC MSG RMI + NIO XFER.
                    remoteWriteFuture = quorumMember.replicate(null/* req */,
                            pkg.getMessage(), pkg.getData().duplicate());

                    counters.get().nsend++;

                }

                /*
                 * The quorum leader logs the write cache block here. For the
                 * followers, the write cache blocks are currently logged by
                 * HAJournalServer.
                 */
//                quorumMember.logWriteCacheBlock(msg, b.duplicate());

            }

            /*
             * Do the local IOs (concurrent w/ remote replication).
             * 
             * Note: This will not throw out an InterruptedException unless this
             * thread is actually interrupted. The local storage managers all
             * trap asynchronous close exceptions arising from the interrupt of
             * a concurrent IO operation and retry until they succeed.
             */
            {

                if (log.isDebugEnabled())
                    log.debug("Writing to file: " + cache.toString());

                final long begin = System.nanoTime();
                final long nrecs = cache.recordMap.size(); // #of records in the write cache block.

                try {
                
                    // Flush WriteCache buffer to channel (write on disk)
                    cache.flush(false/* force */);
                    
                } finally {

                    /*
                     * See BLZG-1589 (new latency-oriented counters)
                     * 
                     * Note: The counters are updated whether or not the flush
                     * succeeded.
                     */
                    final long elapsed = System.nanoTime() - begin;
                    
                    final WriteCacheServiceCounters c = counters.get();
                    
                    c.nbufferEvictedToChannel++;
                    c.nrecordsEvictedToChannel += nrecs;
                    c.elapsedBufferEvictedToChannelNanos += elapsed;
    
                }
                
            }

            /*
             * Wait for the downstream IOs to finish.
             * 
             * Note: Only the leader is doing replication of the WriteCache
             * blocks from this thread and only the leader will have a non-null
             * value for the [remoteWriteFuture]. The followers are replicating
             * to the downstream nodes in QuorumPipelineImpl. Since the WCS
             * absorbs a lot of latency, replication from QuorumPipelineImpl
             * should be fine.
             * 
             * Note: [remoteWriteFuture] is a field which is shared with other
             * threads (e.g., reset() reads it in order to cancel the remote
             * Future). It is therefore read exactly once into a local so the
             * null test and the get() are guaranteed to see the same reference.
             */
            final Future<?> rwf = remoteWriteFuture;

            if (rwf != null) {

                // Wait for the downstream IOs to finish.
                rwf.get();
                
            }

        } // writeCacheBlock()

    } // class WriteTask

    /**
     * Factory for {@link WriteCache} implementations. The method is abstract:
     * each concrete {@link WriteCacheService} supplies the {@link WriteCache}
     * flavor it manages.
     * 
     * @param buf
     *            The backing buffer (optional).
     * @param useChecksum
     *            <code>true</code> iff record level checksums are enabled.
     * @param bufferHasData
     *            <code>true</code> iff the buffer has data to be written onto
     *            the local persistence store (from a replicated write).
     * @param opener
     *            The object which knows how to re-open the backing channel
     *            (required).
     * @param fileExtent
     *            The then current extent of the backing file.
     * 
     * @return A {@link WriteCache} wrapping that buffer and able to write on
     *         that channel.
     * 
     * @throws InterruptedException
     *             if the calling thread is interrupted. NOTE(review):
     *             presumably while obtaining a backing buffer when
     *             <i>buf</i> is not given - confirm against the
     *             implementations.
     */
    abstract public WriteCache newWriteCache(IBufferAccess buf,
            boolean useChecksum, boolean bufferHasData,
            IReopenChannel<? extends Channel> opener, final long fileExtent)
            throws InterruptedException;

    /**
     * {@inheritDoc}
     * <p>
     * All dirty buffers are reset and transferred to the head of the clean
     * list. The buffers on the clean list are NOT reset since they may contain
     * valid cached reads (data which is known to be on the disk). We do not
     * want to discard the read cache on reset().
     * <p>
     * Note: This approach deliberately does not cause any buffers belonging to
     * the caller of {@link #writeChk(long, ByteBuffer, int)} to become part of
     * the {@link #cleanList}.
     * <p>
     * Note: <strong>You MUST set the {@link #setExtent(long) file extent}
     * </strong> after {@link #reset() resetting} the {@link WriteCacheService}.
     * This is necessary in order to ensure that the correct file extent is
     * communicated along the write replication pipeline when high availability
     * is enabled.
     * <p>
     * Note: {@link #reset()} MUST NOT interrupt readers. It should only reset
     * those aspects of the write cache state that are associated with writes.
     * On the other hand, {@link #close()} must close all buffers and must not
     * permit readers to read from closed buffers.
     */
    public void reset() throws InterruptedException {
        final WriteLock writeLock = lock.writeLock();
        writeLock.lockInterruptibly();
        try {
            if (!open.get()) {
                // Reset can not recover from close().
                throw new IllegalStateException(firstCause.get());
            }

            /*
             * Note: The WriteTask must use lockInterruptably() so it will
             * notice when it is interrupted by cancel().
             */

            // cancel the current WriteTask.
            localWriteFuture.cancel(true/* mayInterruptIfRunning */);
            final Future<?> rwf = remoteWriteFuture;
            if (rwf != null) {
                // Note: Cancel of remote Future is RMI!
                try {
                    rwf.cancel(true/* mayInterruptIfRunning */);
                } catch (Throwable t) {
                    log.warn(t, t);
                }
            }

            /*
             * Drain and reset the dirty cache buffers, dropping them onto the
             * cleanList.
             */
            drainAndResetDirtyList();

            /*
             * Now that we have sent all the signal()s we know how to send, go
             * ahead and wait for the WriteTask to notice and terminate.
             */
            try {
                // wait for it
                localWriteFuture.get();
            } catch (Throwable t) {
                // ignored.
            } finally {

                /*
                 * Once more, drain and reset the dirty cache buffers, dropping
                 * them onto the cleanList.
                 * 
                 * Note: This is intended to handle the case where there might
                 * be concurrency in WriteTask.call() such that we did not get
                 * all of the dirty buffers the first time we called this method
                 * above.
                 * 
                 * Note: This will ignore the [compactingReserve]. That
                 * WriteCache is always clean and can stay where it is.
                 */
                drainAndResetDirtyList();

                /*
                 * Verify some post-conditions once the WriteTask is terminated.
                 */
 
                dirtyListLock.lockInterruptibly();
                try {
                    if (!dirtyList.isEmpty())
                        throw new AssertionError();
                } finally {
                    dirtyListLock.unlock();
                }
                if (compactingCacheRef.get() != null)
                    throw new AssertionError();

                // ensure cleanList is not empty after WriteTask terminates, handling single buffer case
                cleanListLock.lockInterruptibly();
                try {
                    if (writeBuffers.length > 1 && cleanList.isEmpty())
                        throw new AssertionError();
                } finally {
                    cleanListLock.unlock();
                }

            }

            /*
             * Note: DO NOT clear the service record map. This still has valid
             * cache entries (the read cache).
             */
//            // clear the service record map.
//            recordMap.clear();
//
//            // reset each buffer.
//            for (WriteCache t : buffers) {
//                t.reset();
//            }

            /*
             * Make sure the [current] is reset and non-null.
             */
            {

                final WriteCache x = current.get();

                if (x != null) {

                    // reset if found.
                    x.resetWith(serviceMap);

                    // addClean(x, true/* addFirst */);

                } else {

                    // Non-blocking take.
                    final WriteCache t = cleanList.poll();

                    if (t == null)
                        throw new AssertionError();

                    if (!current.compareAndSet(null/* expect */, t/* update */)) {

                        // Concurrently set.
                        throw new AssertionError();

                    }

                }

            }
            
//            // set readCache
//            if (buffers.length > 1) {
//                readCache.set(buffers[1]);
//                buffers[1].closeForWrites();
//            }
//
//            // re-populate the clean list with remaining buffers
//            for (int i = 2; i < buffers.length; i++) {
//                cleanList.put(buffers[i]);
//            }

            // reset the counters.
            {
                final WriteCacheServiceCounters c = counters.get();
                c.ndirty = 0;
                c.nclean = writeBuffers.length-1;
                c.nreset++;
            }
            
            // reset cacheSequence for HA
            resetSequence();

            /*
             * Restart the WriteTask
             * 
             * Note: don't do Future#get() for the remote Future. The task was
             * cancelled above and we don't want to wait on RMI (for the remote
             * Future). The remote service will have to handle any problems on
             * its end when resynchronizing if it was disconnected and did not
             * see our cancel() message.
             */
            // if (rwf != null) {
            // try {
            // rwf.get();
            // } catch (Throwable t) {
            // // ignored.
            // }
            // }
            this.localWriteFuture = localWriteService.submit(newWriteTask());
            this.remoteWriteFuture = null;

            // clear the file extent to an illegal value.
            fileExtent.set(-1L);

            counters.get().nreset++;

            flush = false;
            
        } finally {
            writeLock.unlock();
        }
    }

    /**
     * Performs a {@link #reset()} and then, while the write lock is still
     * held, discards the contents of the service record map and resets every
     * write cache buffer.
     * <p>
     * Note: Unlike {@link #reset()}, which deliberately leaves the service
     * record map alone, this method DOES clear it.
     * 
     * @throws InterruptedException
     *             if the calling thread is interrupted while acquiring the
     *             write lock.
     */
    public void resetAndClear() throws InterruptedException {
        final WriteLock exclusive = lock.writeLock();
        exclusive.lockInterruptibly();
        try {
            // Step 1: the normal reset protocol.
            reset();

            // Step 2: drop every entry in the service record map.
            serviceMap.clear();

            // Step 3: reset each of the write cache buffers.
            for (WriteCache buffer : writeBuffers) {
                buffer.reset();
            }
        } finally {
            exclusive.unlock();
        }

    }

    /**
     * Empties the dirtyList and recycles whatever was on it. Each buffer that
     * was drained is reset against the service record map and then pushed onto
     * the head of the cleanList. Waiters on the dirtyList and cleanList
     * conditions are signaled and the <code>nclean</code> counter is set to
     * the size of the cleanList.
     * <p>
     * Note: The dirtyListLock and the cleanListLock are taken one after the
     * other; they are never held at the same time by this method.
     * 
     * @throws InterruptedException
     *             if interrupted while acquiring either lock.
     */
    private void drainAndResetDirtyList() throws InterruptedException {

        // Buffers removed from the dirtyList (in dirtyList order).
        final List<WriteCache> drained = new LinkedList<WriteCache>();

        dirtyListLock.lockInterruptibly();
        try {
            dirtyList.drainTo(drained);
            // The dirtyList is now empty.
            dirtyListEmpty.signalAll();
            // Note: anyone awakened here must re-check their condition!
            dirtyListChange.signalAll();
        } finally {
            dirtyListLock.unlock();
        }

        cleanListLock.lockInterruptibly();
        try {
            final Iterator<WriteCache> itr = drained.iterator();
            while (itr.hasNext()) {
                final WriteCache buffer = itr.next();
                // Discard the buffer's writes, then make it available again.
                buffer.resetWith(serviceMap);
                cleanList.addFirst(buffer);
            }

            assert !cleanList.isEmpty();

            cleanListNotEmpty.signalAll();
            counters.get().nclean = cleanList.size();
        } finally {
            cleanListLock.unlock();
        }

    }
    
    /**
     * Close the service. Only the first invocation does any work; subsequent
     * invocations are a NOP.
     * <p>
     * The steps are, in order: (1) record an
     * {@link AsynchronousCloseException} as the first cause (if none has been
     * recorded yet) and set {@link #halt}; (2) cancel the local and (if any)
     * remote write futures and shut down the local write service; (3) drain
     * the dirtyList and the cleanList; and (4) while holding the
     * {@link WriteLock}, close every write and read buffer, clear the
     * references to the current, compacting and read caches, clear the service
     * record map and set the file extent to an illegal value.
     * <p>
     * Note: This method does not throw {@link InterruptedException}. All locks
     * are acquired non-interruptibly. If closing a buffer is interrupted, the
     * remaining buffers are still closed and the interrupt status of the
     * calling thread is set before the method returns.
     */
    public void close() { //throws InterruptedException {

        // Only the thread which flips [open] to false gets to do the work.
        final boolean wasOpen = open.compareAndSet(true/* expect */, false/* update */);

        if (!wasOpen) {
            return;
        }

        /*
         * Make sure that other threads will report an error. When there is no
         * firstCause yet, install a stack trace showing that the service was
         * closed asynchronously (by some other thread) and raise [halt].
         */
        final boolean installedCause = firstCause.compareAndSet(
                null/* expect */, new AsynchronousCloseException()/* update */);

        if (installedCause) {
            halt = true;
        }

        // Cancel (with interrupt) the local write task.
        localWriteFuture.cancel(true/* mayInterruptIfRunning */);

        // Cancel the remote write task, if any. Note: this is RMI!
        final Future<?> remoteFuture = remoteWriteFuture;
        if (remoteFuture != null) {
            try {
                remoteFuture.cancel(true/* mayInterruptIfRunning */);
            } catch (Throwable cause) {
                log.warn(cause, cause);
            }
        }

        // Immediate shutdown of the write service.
        localWriteService.shutdownNow();

        /*
         * Note: This method does not wait on the Futures. It proceeds directly
         * to tearing down the buffers so they are close()d in a timely manner.
         */

        // Set iff the close() of some buffer was interrupted.
        boolean sawInterrupt = false;

        // Empty the dirtyList and wake up anyone waiting on it.
        dirtyListLock.lock/*Interruptibly*/();
        try {
            dirtyList.drainTo(new LinkedList<WriteCache>());
            dirtyListEmpty.signalAll();
            dirtyListChange.signalAll();
        } finally {
            dirtyListLock.unlock();
        }

        // Empty the cleanList.
        cleanListLock.lock/*Interruptibly*/();
        try {
            cleanList.drainTo(new LinkedList<WriteCache>());
        } finally {
            cleanListLock.unlock();
        }

        /*
         * Note: The lock protects the [current] reference.
         */
        final WriteLock exclusive = lock.writeLock();
        exclusive.lock/*Interruptibly*/();
        try {

            // Close every write buffer, even if one of them is interrupted.
            for (WriteCache buffer : writeBuffers) {
                try {
                    buffer.close();
                } catch (InterruptedException ex) {
                    sawInterrupt = true;
                }
            }

            // Likewise for the read buffers.
            for (ReadCache buffer : readBuffers) {
                try {
                    buffer.close();
                } catch (InterruptedException ex) {
                    sawInterrupt = true;
                }
            }

            // Drop the references to the current and compacting buffers.
            current.getAndSet(null);
            compactingCacheRef.getAndSet(null);

            // Drop the references to the read cache buffers.
            readCache.getAndSet(null);
            synchronized (readCache) {
                hotCache = null;
                hotReserve = null;
            }

            // Discard the service record map.
            serviceMap.clear();

            // Set the file extent to an illegal value.
            fileExtent.set(-1L);

            // Propagate any interrupt which was noticed above.
            if (sawInterrupt) {
                Thread.currentThread().interrupt();
            }

        } finally {
            exclusive.unlock();
        }

        if (log.isInfoEnabled()) {
            log.info(counters.get().toString());
        }

    }

    /**
     * Invokes {@link #close()}. The intent is that {@link #close()} is
     * eventually invoked so the buffers can be returned to the
     * {@link DirectBufferPool}.
     * <p>
     * NOTE(review): This method is named <code>finalized()</code>, not
     * <code>finalize()</code>. As written it does not override
     * {@link Object#finalize()}, so the garbage collector will not call it;
     * {@link #close()} only runs through this path if some caller invokes
     * <code>finalized()</code> explicitly. Confirm whether the name is
     * deliberate (e.g., to disable finalization) before relying on it.
     * 
     * @throws Throwable
     */
    protected void finalized() throws Throwable {

        // Simply delegates; close() is a NOP if the service is already closed.
        close();

    }

    /**
     * Pre-condition check used ONLY by write threads. It verifies, in order,
     * that the service is {@link #open}, that the {@link WriteTask} has not
     * been {@link #halt halted}, and that the {@link WriteTask} has not
     * terminated (which covers uncaught errors thrown out of
     * {@link WriteTask#call()}).
     * <p>
     * Note: {@link #read(long)} DOES NOT throw an exception if the service is
     * closed, asynchronously closed, or even just plain dead. It just returns
     * <code>null</code> to indicate that the desired record is not available
     * from the cache.
     * 
     * @throws IllegalStateException
     *             if the service is closed (the cause is the first cause
     *             recorded for the service, if any).
     * @throws RuntimeException
     *             if the service was halted or if the {@link WriteTask} is
     *             done and its {@link Future} reports an exception.
     */
    private void assertOpenForWriter() {

        if (!open.get()) {
            // The service was closed.
            throw new IllegalStateException(firstCause.get());
        }

        if (halt) {
            // The service was halted.
            throw new RuntimeException(firstCause.get());
        }

        if (!localWriteFuture.isDone()) {
            // The write task is still running, so there is nothing to report.
            return;
        }

        /*
         * The write task is done. If it terminated abnormally then throw the
         * exception out here.
         */
        try {
            // @todo don't do get() all the time...?
            localWriteFuture.get();
        } catch (Throwable cause) {
            throw new RuntimeException(cause);
        }

    }

    /**
     * Hand the current buffer to a write thread. On success the
     * {@link ReadLock} is STILL HELD when this method returns, so the caller
     * MUST call {@link #release()} once it is done with the buffer. On failure
     * the {@link ReadLock} is released before the exception is thrown.
     * 
     * @return The current buffer (never <code>null</code>).
     * 
     * @throws InterruptedException
     * @throws IllegalStateException
     *             if the {@link WriteCacheService} is closed.
     * @throws RuntimeException
     *             if the service has been {@link #halt halted}, if the write
     *             task has failed, or if there is no current buffer.
     */
    private WriteCache acquireForWriter() throws InterruptedException, IllegalStateException {

        final ReadLock shared = lock.readLock();

        shared.lockInterruptibly();

        try {

            /*
             * Errors from the WriteTask are only thrown out of write() and
             * flush(). read() never comes through here; it uses a different,
             * non-blocking protocol to access a record in a cache buffer.
             */
            assertOpenForWriter();

            /*
             * Note: This does not block since we hold the ReadLock. Methods
             * which change [current] MUST hold the WriteLock across that
             * operation so that [current] is always non-null for a thread
             * which has acquired the ReadLock.
             */
            final WriteCache buffer = current.get();

            if (buffer == null) {
                throw new RuntimeException();
            }

            // Success. Note: The ReadLock is deliberately NOT released!
            return buffer;

        } catch (Throwable cause) {

            // Error path only: give the lock back before rethrowing.
            shared.unlock();

            if (cause instanceof InterruptedException) {
                throw (InterruptedException) cause;
            }

            if (cause instanceof IllegalStateException) {
                throw (IllegalStateException) cause;
            }

            // Anything else is wrapped.
            throw new RuntimeException(cause);

        }

    }

    /**
     * Counterpart of {@link #acquireForWriter()}: gives back the
     * {@link ReadLock} which that method left held for the caller.
     */
    private void release() {

        // This is the lock which acquireForWriter() did not release.
        final ReadLock shared = lock.readLock();

        shared.unlock();

    }

    /**
     * Flush the current write set through to the backing channel, waiting as
     * long as necessary. This delegates to
     * {@link #flush(boolean, long, TimeUnit)} with an effectively unbounded
     * timeout.
     * 
     * @param force
     *            passed through to {@link #flush(boolean, long, TimeUnit)}.
     * 
     * @throws InterruptedException
     * @throws RuntimeException
     *             if the delegate reports <code>false</code> or times out (the
     *             {@link TimeoutException} is wrapped).
     */
    public void flush(final boolean force) throws InterruptedException {

        final boolean flushed;

        try {

            flushed = flush(force, Long.MAX_VALUE, TimeUnit.NANOSECONDS);

        } catch (TimeoutException e) {

            // Not expected given the timeout, but must be handled.
            throw new RuntimeException(e);

        }

        if (!flushed) {

            throw new RuntimeException();

        }

    }

    /**
     * {@inheritDoc}
     * <p>
     * flush() is a blocking method. At most one flush() operation may run at a
     * time. The {@link #current} buffer is moved to the {@link #dirtyList}
     * while holding the {@link WriteLock} and flush() then waits until the
     * dirtyList becomes empty, at which point all dirty records have been
     * written through to the backing file. The compacting cache (if any) is
     * then placed on the dirtyList and flush() waits a second time. Finally, a
     * buffer is taken from the {@link #cleanList} and installed as the new
     * {@link #current} buffer.
     * <p>
     * Note: Any exception thrown from this method MUST trigger error handling
     * resulting in a high-level abort() and {@link #reset()} of the
     * {@link WriteCacheService}. In particular, [current] is left
     * <code>null</code> when this method exits by an exception.
     * 
     * TODO flush() is currently designed to block concurrent writes() in
     * order to give us clean decision boundaries for the HA write pipeline and
     * also to simplify the internal locking design. Once we get HA worked out
     * cleanly we should explore whether or not we can relax this constraint
     * such that writes can run concurrently with flush(). That would have
     * somewhat higher throughput since mutable B+Tree evictions would no longer
     * cause concurrent tasks to block during the commit protocol or the file
     * extent protocol. [Perhaps by associating each write set with a distinct
     * sequence counter (that is incremented by both commit and abort)?]
     * 
     * TODO Flush should order ALL {@link WriteCache}'s on the dirtyList by
     * their fileOffset and then evict them in that order. This reordering will
     * maximize the opportunity for locality during the IOs. With a large write
     * cache (multiple GBs) this reordering could substantially reduce the
     * IOWait associated with flush() for a large update. Note: The reordering
     * should only be performed by the leader in HA mode - the followers will
     * receive the {@link WriteCache} blocks in the desired order and can just
     * drop them onto the dirtyList.
     * 
     * @param force
     *            Not consulted by this implementation.
     * @param timeout
     *            The maximum time to wait. The same budget covers every lock
     *            acquisition and every condition wait made by this method.
     * @param units
     *            The units for <i>timeout</i>.
     * 
     * @return <code>true</code> (this implementation either returns
     *         <code>true</code> or throws an exception).
     * 
     * @throws TimeoutException
     *             if the timeout expires while acquiring a lock or awaiting a
     *             condition.
     * @throws InterruptedException
     * @throws RuntimeException
     *             if the service is {@link #halt halted} while waiting for the
     *             dirtyList to become empty or for a clean buffer.
     * 
     * @see WriteTask
     * @see #dirtyList
     * @see #dirtyListEmpty
     */
    public boolean flush(final boolean force, final long timeout,
            final TimeUnit units) throws TimeoutException, InterruptedException {

        if (haLog.isInfoEnabled()) {
            /*
             * Note: This is an important event for HA. The write cache is
             * flushed to ensure that the entire write set is replicated on the
             * followers. Once that has been done, HA will do a 2-phase commit
             * to verify that there is a quorum that agrees to write the root
             * block. Writing the root block is the only thing that the nodes in
             * the quorum need to do once the write cache has been flushed.
             */
            haLog.info("Flushing the write cache: seq=" + cacheSequence);
        }

        final long begin = System.nanoTime();
        final long nanos = units.toNanos(timeout);
        long remaining = nanos;

        final WriteLock writeLock = lock.writeLock();
        if (!writeLock.tryLock(remaining, TimeUnit.NANOSECONDS))
            throw new TimeoutException();
        try {
            // Take the current buffer. [current] stays null until the end.
            final WriteCache tmp = current.getAndSet(null);
            // remaining := (total - elapsed).
            remaining = nanos - (System.nanoTime() - begin);
            if (!dirtyListLock.tryLock(remaining, TimeUnit.NANOSECONDS))
                throw new TimeoutException();
            
            try {
                /*
                 * Force WriteTask.call() to evict anything in the cache.
                 * 
                 * Note: We need to wait until the dirtyList has been evicted
                 * before writing out the compacting cache (if any) and then
                 * finally drop the compactingCache onto the cleanList. Or have
                 * a 2-stage flush.
                 */
                flush = true;
                
                /*
                 * Wait until the dirtyList has been emptied.
                 * 
                 * Note: [tmp] may be empty, but there is basically zero cost in
                 * WriteTask to process an empty buffer and, done this way, the
                 * code is much less complex here.
                 */
                dirtyList.add(tmp);
                counters.get().ndirty++;
                dirtyListChange.signalAll();
                while (!dirtyList.isEmpty() && !halt) {
                    // remaining := (total - elapsed).
                    remaining = nanos - (System.nanoTime() - begin);
                    if (!dirtyListEmpty.await(remaining, TimeUnit.NANOSECONDS)) {
                        throw new TimeoutException();
                    }
                }
                /*
                 * Add the [compactingCache] (if any) to dirty list and spin it
                 * down again.
                 * 
                 * Note: We can not drop the compactingCache onto the dirtyList
                 * until the dirtyList has been spun down to empty.
                 * 
                 * Note: We have introduced the directWrite state variable to indicate
                 * that the compactingCache must not be compacted or it may not be
                 * written.
                 */
                final WriteCache tmp2 = compactingCacheRef.getAndSet(null/* newValue */);
                if (tmp2 != null) {
                    directWrite = true;
                    try {
                        if (log.isInfoEnabled()) {
                            log.info("Adding compacting cache");
                        }
                        dirtyList.add(tmp2);
                        counters.get().ndirty++;
                        dirtyListChange.signalAll();
                        while (!dirtyList.isEmpty() && !halt) {
                            // remaining := (total - elapsed).
                            remaining = nanos - (System.nanoTime() - begin);
                            if (!dirtyListEmpty.await(remaining, TimeUnit.NANOSECONDS)) {
                                throw new TimeoutException();
                            }
                        }
                    } finally {
                        directWrite = false;
                    }
                }
                if (halt)
                    throw new RuntimeException(firstCause.get());
            } finally {
                flush = false;
                try {
                    if(!halt) {
                        /*
                         * Check assertions for clean WCS after flush().
                         * 
                         * Note: Can not check assertion if there is an existing
                         * exception.
                         */
                        assert dirtyList.size() == 0;
                        assert compactingCacheRef.get() == null;
                        assert current.get() == null;
                    }
                } finally {
                    dirtyListLock.unlock();
                }
            }
            /*
             * Replace [current] with a clean cache buffer.
             */
            // remaining := (total - elapsed).
            remaining = nanos - (System.nanoTime() - begin);
            if (!cleanListLock.tryLock(remaining, TimeUnit.NANOSECONDS))
                throw new TimeoutException();
            try {
                // Note: use of Condition let's us notice [halt].
                while (cleanList.isEmpty() && !halt) {
                    // remaining := (total - elapsed).
                    remaining = nanos - (System.nanoTime() - begin);
                    if (!cleanListNotEmpty.await(remaining, TimeUnit.NANOSECONDS)) {
                        throw new TimeoutException();
                    }
                    if (halt)
                        throw new RuntimeException(firstCause.get());
                }
                /*
                 * Note: The loop above is skipped entirely when [halt] was
                 * already set on entry (e.g., by a concurrent close(), which
                 * also drains the cleanList). In that case take() would block
                 * without any timeout while we hold both the cleanListLock and
                 * the WriteLock. Report the halt instead.
                 */
                if (halt && cleanList.isEmpty())
                    throw new RuntimeException(firstCause.get());
                // The cleanList is non-empty, hence this does not block.
                final WriteCache nxt = cleanList.take();
                counters.get().nclean--;
                
                // Note: should already be pristine
                nxt.resetWith(serviceMap);//, fileExtent.get());
                current.set(nxt);
                if (haLog.isInfoEnabled())
                    haLog.info("Flushed the write cache: seq=" + cacheSequence);
                return true;
            } finally {
                cleanListLock.unlock();
            }
        } finally {
            writeLock.unlock();
        }
    }
    
    /**
     * Record the extent of the backing file on the {@link WriteCacheService}.
     * The value is only noted on the service itself; this method does not
     * touch the current {@link WriteCache}.
     * <p>
     * Note: Changes in the file extent for persistence store implementations
     * MUST (a) be mutually exclusive with reads and writes on the backing file
     * (due to a JVM bug); and (b) force the file data and the file metadata to
     * the disk. Thus any change in the {@link #fileExtent} MUST be followed by
     * a {@link #flush(boolean, long, TimeUnit)}.
     * <p>
     * Note: You MUST set the file extent each time you invoke {@link #reset()}
     * (which sets the extent to an illegal value) so the
     * {@link WriteCacheService} is always aware of the correct file extent.
     * 
     * @param fileExtent
     *            The new file extent (non-negative).
     * 
     * @throws IllegalArgumentException
     *             if the file extent is negative.
     * @throws InterruptedException
     * @throws IllegalStateException
     */
    public void setExtent(final long fileExtent) throws IllegalStateException,
            InterruptedException {

        final boolean valid = fileExtent >= 0L;

        if (!valid) {
            throw new IllegalArgumentException();
        }

        if (log.isDebugEnabled()) {
            log.debug("Set fileExtent: " + fileExtent);
        }

        // Remember the extent. Note: the parameter shadows the field.
        this.fileExtent.set(fileExtent);

    }

    @Override
    public boolean write(final long offset, final ByteBuffer data, final int chk)
            throws InterruptedException, IllegalStateException {

        // This entry point does not carry a latched address.
        final int noLatchedAddr = 0;

        // Delegate using the checksum policy configured on the service.
        return write(offset, data, chk, useChecksum, noLatchedAddr);

    }
    
    /**
     * Write the record onto the cache and account for the call in the service
     * counters. The work itself is delegated to <code>write_timed()</code>;
     * this method adds one to <code>ncacheWrites</code> and adds the elapsed
     * nanoseconds to <code>elapsedCacheWriteNanos</code>, whether or not the
     * delegate throws an exception.
     * <p>
     * If the record is too large for the cache buffers, then it is written
     * synchronously onto the backing channel. Otherwise it is written onto a
     * cache buffer which is lazily flushed onto the backing channel. Cache
     * buffers are written in order once they are full. This method does not
     * impose synchronization on writes which fit the capacity of a cache
     * buffer.
     * <p>
     * When integrating with the {@link RWStrategy} or the {@link WORMStrategy}
     * there needs to be a read/write lock such that file extension is mutually
     * exclusive with file read/write operations (due to a Sun bug). The caller
     * can override {@link #newWriteCache(ByteBuffer, IReopenChannel)} to
     * acquire the necessary lock (the read lock of a {@link ReadWriteLock}).
     * This is even true when the record is too large for the cache since we
     * delegate the write to a temporary {@link WriteCache} wrapping the
     * caller's buffer.
     * <p>
     * Note: Any exception thrown from this method MUST trigger error handling
     * resulting in a high-level abort() and {@link #reset()} of the
     * {@link WriteCacheService}.
     * 
     * @param latchedAddr The latched address (RWStore only).
     * 
     * @return <code>true</code> since the record is always accepted by the
     *         {@link WriteCacheService} (unless an exception is thrown).
     * 
     * @see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6371642
     * 
     * @todo The WORM serializes invocations on this method because it must put
     *       each record at a specific offset into the user extent of the file.
     *       However, the RW store does not do this. Therefore, for the RW store
     *       only, we could use a queue with low cost access and scan for best
     *       fit packing into the write cache buffer. When a new buffer is set
     *       as [current], we could pack the larger records in the queue onto
     *       that buffer first. This might provide better throughput for the RW
     *       store but would require an override of this method specific to that
     *       implementation.
     *       
     * See BLZG-1589 (new latency-oriented counters)
     */
    public boolean write(final long offset, final ByteBuffer data, final int chk, final boolean useChecksum,final int latchedAddr)
            throws InterruptedException, IllegalStateException {

        final long startNanos = System.nanoTime();

        try {

            // Do the actual write.
            return write_timed(offset, data, chk, useChecksum, latchedAddr);

        } finally {

            // Runs for both the normal and the exceptional outcome.
            final long elapsedNanos = System.nanoTime() - startNanos;

            final WriteCacheServiceCounters stats = counters.get();

            stats.ncacheWrites++; // maintain nwrites
            stats.elapsedCacheWriteNanos += elapsedNanos;

        }

    }
    
    private boolean write_timed(final long offset, final ByteBuffer data, final int chk, final boolean useChecksum,final int latchedAddr)
                throws InterruptedException, IllegalStateException {

      	if (log.isTraceEnabled()) {
            log.trace("offset: " + offset + ", length: " + data.limit()
                    + ", chk=" + chk + ", useChecksum=" + useChecksum);
        }
        
        if (!open.get())
            throw new IllegalStateException(firstCause.get());

        if (offset < 0)
            throw new IllegalArgumentException();

        if (data == null)
            throw new IllegalArgumentException(
                    AbstractBufferStrategy.ERR_BUFFER_NULL);

        // #of bytes in the record.
        final int remaining = data.remaining();

        // #of bytes to be written.
        final int nwrite = remaining + (useChecksum ? 4 : 0);

        if (remaining == 0)
            throw new IllegalArgumentException(
                    AbstractBufferStrategy.ERR_BUFFER_EMPTY);

        if (nwrite > capacity) {

            /*
             * Handle large records.
             */
            return writeLargeRecord(offset, data, chk, useChecksum);

        }

        /*
         * The record can fit into a cache instance, so try and acquire one and
         * write the record onto it.
         * 
         * @todo this could be refactored to use moveBufferToDirtyList()
         */
        {

            final WriteCache cache = acquireForWriter();

            try {
                debugAddrs(offset, data.remaining(), 'A');

                // write on the cache.
                if (cache.write(offset, data, chk, useChecksum, latchedAddr)) {
                	
                 	final WriteCache old = serviceMap.put(offset, cache);
                    // There should be no duplicate address in the record
                    //  map since these entries should be removed, although
                    //  write data may still exist in an old WriteCache.
                    // A duplicate may also be indicative of an allocation
                    //  error, which we need to be pretty strict about!
                    if (old == cache) {
                        throw new AssertionError("Record already in cache: offset=" + offset + " " + addrDebugInfo(offset));
                    }

                    return true;

                }

            } finally {

                release();

            }

        }

        /*
         * The record did not fit into the current buffer but it is small enough
         * to fit into an empty buffer. Grab the write lock and then try again.
         * If it still does not fit, then put the current buffer onto the dirty
         * list and take a buffer from the clean list and then write the record
         * onto that buffer while we are holding the lock. This last step must
         * succeed since the buffer will be empty and the record can fit into an
         * empty buffer.
         */
        {

            final Lock writeLock = lock.writeLock();

            writeLock.lockInterruptibly();

            try {

                /*
                 * While holding the write lock, see if the record can fit into
                 * the current buffer. Note that the buffer we acquire here MAY
                 * be a different buffer since a concurrent write could have
                 * already switched us to a new buffer. In that case, the record
                 * might fit into the new buffer.
                 */

                // Acquire a buffer. Maybe the same one, maybe different.
                WriteCache cache = acquireForWriter();

                try {

                    // While holding the write lock, see if the record fits.
                    if (cache.write(offset, data, chk, useChecksum, latchedAddr)) {

                        /*
                         * It fits: someone already changed to a new cache,
                         * which is fine.
                         */
                        if (serviceMap.put(offset, cache) != null) {
                            // The record should not already be in the cache.
                            throw new AssertionError("Record already in cache: offset=" + offset + " "  + addrDebugInfo(offset));
                        }
                        
                        return true;

                    }

                    /*
                     * There is not enough room in the current buffer for this
                     * record, so put the buffer onto the dirty list. Then take
                     * a new buffer from the clean list (block), reset the
                     * buffer to clear the old writes, and set it as current. At
                     * that point, the record should always fit.
                     * 
                     * Note: When we take a cache instances from the cleanList
                     * we need to remove any entries in our recordMap which are
                     * in its record map.
                     * 
                     * Note: We move the current buffer to the dirty list before
                     * we take a buffer from the clean list. This is absolutely
                     * necessary since the code will otherwise deadlock if there
                     * is only one buffer.
                     * 
                     * Note: Do NOT yield the WriteLock here. That would make it
                     * possible for another thread to acquire() the current
                     * buffer, which has already been placed onto the dirtyList
                     * by this thread!!!
                     */

                    /*
                     * Move the current buffer to the dirty list.
                     * 
                     * Note: The lock here is not required to give flush() atomic
                     * semantics with regard to the set of dirty write buffers
                     * when flush() gained the writeLock [in fact, we only need
                     * the dirtyListLock for the dirtyListEmpty Condition].
                     */
                    if (!current
                            .compareAndSet(cache/* expect */, null/* update */)) {
                        throw new AssertionError();
                    }
                    dirtyListLock.lockInterruptibly();
                    try {
                        dirtyList.add(cache);
                        dirtyListChange.signalAll();
                    } finally {
                        dirtyListLock.unlock();
                    }

                    /*
                     * Take the buffer from the cleanList and set it has the
                     * [current] buffer.
                     */
                    
                    // Grab buffer from clean list.
                    final WriteCache newBuffer = takeFromClean();
                    
                    counters.get().nclean--;
                    // Clear the state on the new buffer and remove from
                    // cacheService map
                    newBuffer.resetWith(serviceMap);//, fileExtent.get());

                    // Set it as the new buffer.
                    current.set(cache = newBuffer);

                    // Try to write on the new buffer.
                    if (cache.write(offset, data, chk, useChecksum, latchedAddr)) {

                        // This must be the only occurrence of this record.
                        if (serviceMap.put(offset, cache) != null) {
                            throw new AssertionError("Record already in cache: offset=" + offset + " " + addrDebugInfo(offset));
                        }

                        
                        return true;

                    }

                    /*
                     * Should never happen.
                     */
                    throw new AssertionError("Unable to write into current WriteCache " + offset + " " + addrDebugInfo(offset));

                } finally {

                    release();

                }

            } finally {

                writeLock.unlock();

            }

        }

    }
    
	/**
	 * Blocks until a buffer can be removed from the {@link #cleanList} and
	 * returns that buffer.
	 * <p>
	 * The wait happens on the <code>cleanListNotEmpty</code> condition while
	 * holding the <code>cleanListLock</code>, which lets this method notice a
	 * [halt]. The list itself is drained with a non-blocking poll() and the
	 * wait is simply repeated if that poll comes back empty.
	 * <p>
	 * Note: This method neither adjusts the clean buffer counter nor resets
	 * the buffer which it returns.
	 * 
	 * @return The buffer taken from the clean list (never <code>null</code>).
	 * 
	 * @throws InterruptedException
	 *             if interrupted while acquiring the lock or while awaiting
	 *             the condition.
	 * @throws RuntimeException
	 *             wrapping the first cause if [halt] is set.
	 */
	private WriteCache takeFromClean() throws InterruptedException {

		cleanListLock.lockInterruptibly();

		try {

			for (;;) {

				if (log.isInfoEnabled() && cleanList.isEmpty()) {
					log.info("Waiting for clean buffer");
				}

				// Park on the condition until a buffer shows up or we halt.
				while (cleanList.isEmpty() && !halt) {
					cleanListNotEmpty.await();
				}

				if (halt) {
					throw new RuntimeException(firstCause.get());
				}

				// Non-blocking removal: other methods poll() this list
				// without holding the lock, so the list may be empty again.
				final WriteCache buffer = cleanList.poll();

				if (buffer == null) {
					// Nothing there after all; go around and wait again.
					continue;
				}

				return buffer;

			}

		} finally {
			cleanListLock.unlock();
		}
	}
    

//    /**
//     * Caches data read from disk (or even read from "older" cache).
//     * The assumption is that we do not need a "reserve" buffer.
//     * 
//     * @param addr
//     * @param bb
//     * @throws InterruptedException
//     */
//    public void cache(final long addr, final ByteBuffer bb)
//			throws InterruptedException {
//		// I think this is fine!
//		synchronized (readCache) {
//			final WriteCache cache = readCache.get();
//			if (cache != null && !cache.cache(addr, bb)) {
//				// add existing non-null cache to clean list
//				if (cache != null)
//					addClean(cache, false /* add first */);
//
//				// fetch new readCache from clean list
//				final WriteCache ncache = getDirectCleanCache();
//
//				// should not be null
//				assert ncache != null;
//
//				// if we decide it CAN be null then we simply do not cache the
//				// read
//				if (ncache == null)
//					return;
//
//				// remove any global references to existing data
//				ncache.resetWith(recordMap);
//				// only closed caches can cache reads
//				ncache.closeForWrites();
//
//				readCache.set(ncache);
//				ncache.closeForWrites();
//				ncache.cache(addr, bb);
//
//				if (recordMap.put(addr, ncache) != null) {
//					throw new AssertionError("Record already in cache: offset="
//							+ addr + " " + addrDebugInfo(addr));
//				}
//			} else if (cache != null) {
//				if (recordMap.put(addr, cache) != null) {
//					throw new AssertionError("Record already in cache: offset="
//							+ addr + " " + addrDebugInfo(addr));
//				}
//			}
//
//			// we've written the byte buffer, so flip it!
//			bb.flip();
//		}
//	}
    
    /**
     * Records one address event in the debug history, if that history is
     * enabled (i.e. the <code>addrsUsed</code> array is non-null). The three
     * parallel arrays are written at the current cursor and the cursor then
     * advances, wrapping to zero once it reaches the end of the array.
     * 
     * @param offset
     *            The address being recorded.
     * @param length
     *            The length recorded alongside that address.
     * @param c
     *            The character recorded as the action for this entry.
     */
    public void debugAddrs(long offset, int length, char c) {

        if (addrsUsed == null) {
            // No history buffer: nothing to record.
            return;
        }

        addrsUsed[addrsUsedCurs] = offset;
        addrActions[addrsUsedCurs] = c;
        addrLens[addrsUsedCurs] = length;

        // Step the cursor and wrap around at the end of the array.
        if (++addrsUsedCurs >= addrsUsed.length) {
            addrsUsedCurs = 0;
        }

    }

    /**
     * Write a record whose size (when combined with the optional checksum) is
     * larger than the capacity of an individual {@link WriteCache} buffer. This
     * operation is synchronous (to protect the ByteBuffer from concurrent
     * modification by the caller). It will block until the record has been
     * written.
     * <p>
     * This implementation will write the record onto a sequence of
     * {@link WriteCache} objects and wait until all of those objects have been
     * written through to the backing file and the optional HA write pipeline. A
     * checksum will be appended after the last chunk of the record. This
     * strategy works for the WORM since the bytes will be laid out in a
     * contiguous region on the disk.
     * <p>
     * Note: For the WORM, this code MUST NOT allow the writes to proceed out of
     * order or the data will not be laid out correctly on the disk !!!
     * <p>
     * Note: The RW store MUST NOT permit individual allocations whose size on
     * the disk is greater than the capacity of an individual {@link WriteCache}
     * buffer (@todo Or is this Ok? Perhaps it is if the RW store holds a lock
     * across the write for a large record? Maybe if we also add a low-level
     * method for inserting an entry into the record map?)
     * <p>
     * Note: This method DOES NOT register the record with the shared
     * {@link #serviceMap}. Since the record spans multiple {@link WriteCache}
     * objects it can not be directly recovered without reading it from the
     * backing file.
     * 
     * <h2>Dialog on large records</h2>
     * 
     * It seems to me that the RW store is designed to break up large records
     * into multiple allocations. If we constrain the size of the largest
     * allocation slot on the RW store to be the capacity of a WriteCache buffer
     * (including the bytes for the checksum and other record level metadata)
     * then we do not have a problem with breaking up large records for it in
     * the WriteCacheService and it will automatically benefit from HA using the
     * write replication logic.
     * <p>
     * The WORM does not have these limits on the allocation size, so it seems
     * likely that breaking it up across multiple WriteCache buffer instances
     * would have to be done inside of the WriteCacheService in order to prevent
     * checksums from being interleaved with each WriteCache worth of data it
     * emits for a large record. We can't raise this out of the
     * WriteCacheService because the large record would not be replicated for
     * HA.
     */
    protected boolean writeLargeRecord(final long offset, final ByteBuffer data, final int chk, final boolean useChecksum)
            throws InterruptedException, IllegalStateException {

        // Validate the arguments BEFORE the trace message, which
        // dereferences [data].
        if (offset < 0)
            throw new IllegalArgumentException();

        if (data == null)
            throw new IllegalArgumentException(AbstractBufferStrategy.ERR_BUFFER_NULL);

        if (log.isTraceEnabled()) {
            log.trace("offset: " + offset + ", length: " + data.limit() + ", chk=" + chk + ", useChecksum="
                    + useChecksum);
        }

        // Index within [data] of the first byte of the record. The record is
        // the region [position, limit) of the caller's buffer.
        final int base = data.position();

        // #of bytes in the record.
        final int remaining = data.remaining();

        if (remaining == 0)
            throw new IllegalArgumentException(AbstractBufferStrategy.ERR_BUFFER_EMPTY);

        // Small records should not take this code path.
        if (remaining < capacity)
            throw new AssertionError();

        /*
         * Put as much into each WriteCache instance as will fit, then transfer
         * the WriteCache onto the dirtyList, take a new WriteCache from the
         * cleanList, and continue until all data has been transferred. If
         * checksums are enabled, add a 4 byte checksum afterwards.
         * 
         * Note: We hold the WriteLock across this operation since we will be
         * changing out [current] each time it fills up. This has the
         * side-effect of guaranteeing that the writes are emitted without
         * intervening writes of other records.
         * 
         * while(r > 0) {
         * 
         * cache = acquire();
         * 
         * copy up to [r] bytes into the buffer.
         * 
         * if the buffer is full, then transfer it to the dirty list.
         * 
         * release()
         * 
         * }
         * 
         * write checksum on buffer
         */

        final Lock writeLock = lock.writeLock();
        writeLock.lockInterruptibly();
        try {
            // the offset (relative to the start of the record) of the next
            // byte to transfer to a cache buffer.
            int p = 0;
            // #of bytes remaining in the large record (w/o the checksum).
            int r = remaining;
            while (r > 0) {
                // Acquire a buffer.
                final WriteCache cache = acquireForWriter();
                try {
                    // #of bytes to copy onto the write cache.
                    final int ncpy = Math.min(r, cache.remaining());
                    if (ncpy > 0) {
                        /*
                         * Create a view of the data to be copied. The view is
                         * an independent duplicate, so the caller's position
                         * and limit are not disturbed.
                         * 
                         * Note: [p] is relative to the start of the record, so
                         * the bounds are biased by the caller's initial
                         * position. Without that bias the wrong bytes would be
                         * copied whenever data.position() != 0.
                         */
                        final ByteBuffer tmp = data.duplicate();
                        tmp.limit(base + p + ncpy);
                        tmp.position(base + p);
                        // Note: For WORM, this MUST NOT add the checksum except
                        // for the last chunk!
                        if (!cache.write(offset + p, tmp, chk, false/* writeChecksum */,0/*latchedAddr*/))
                            throw new AssertionError();
                        r -= ncpy;
                        p += ncpy;
                    }
                    if (cache.remaining() == 0) {
                        // Buffer is full: rotate in a clean one. The next
                        // pass through the loop acquires the new [current].
                        moveBufferToDirtyList();
                    }
                } finally {
                    release();
                }
            } // while( r > 0 )
            /*
             * Now we need to write out the optional checksum. We do not have to
             * flush this write through. The buffer can remain partly full.
             * 
             * NOTE(review): if the current buffer has fewer than 4 bytes free
             * at this point the write below presumably returns false and trips
             * the assertion - confirm against WriteCache#write.
             */
            if (useChecksum) {
                // Acquire a buffer.
                final WriteCache cache = acquireForWriter();
                try {
                    // Allocate a small buffer
                    final ByteBuffer t = ByteBuffer.allocate(4);
                    // Add in the record checksum.
                    t.putInt(chk);
                    // Prepare for reading.
                    t.flip();
                    // Note: [t] _is_ the checksum.
                    if (!cache.write(offset + p, t, chk, false/* writeChecksum */,0/*latchedAddr*/))
                        throw new AssertionError();
                } finally {
                    release();
                }
            }
            /*
             * If the current cache buffer is dirty then we need to move it to
             * the dirty list since the caller MUST be able to read the record
             * back from the file by the time this method returns.
             */
            final WriteCache cache = acquireForWriter();
            try {
                if (!cache.isEmpty()) {
                    moveBufferToDirtyList();
                }
            } finally {
                release();
            }
            /*
             * In order to guarantee that the caller can read the record back
             * from the file we now flush the dirty list to the backing store.
             * When this method returns, the record will be on the disk and can
             * be read back safely from the disk.
             */
            if (log.isTraceEnabled())
                log.trace("FLUSHING LARGE RECORD");
            
            flush(false/* force */);
            // done.
            return true;
        } finally {
            writeLock.unlock();
        }

    }

    /**
     * Move the {@link #current} buffer to the dirty list and await a clean
     * buffer. The clean buffer is set as the {@link #current} buffer and
     * returned to the caller.
     * <p>
     * Note: If there is a buffer available on the {@link #cleanList} then
     * this method can return immediately. Otherwise, this method will block
     * until a clean buffer becomes available.
     * 
     * @return A clean buffer.
     * 
     * @throws InterruptedException
     * @throws IllegalMonitorStateException
     *             unless the current thread is holding the {@link WriteLock}
     *             for {@link #lock}.
     */
    private WriteCache moveBufferToDirtyList() throws InterruptedException {

        // The caller must own the write lock: [current] is swapped out below.
        if (!lock.isWriteLockedByCurrentThread())
            throw new IllegalMonitorStateException();

        // Detach the current buffer. [current] stays null until a clean
        // buffer has been installed at the bottom of this method.
        final WriteCache cache = current.getAndSet(null);
        assert cache != null;
        
        /*
         * Note: The lock here is required to give flush() atomic semantics with
         * regard to the set of dirty write buffers when flush() gained the
         * writeLock [in fact, we only need the dirtyListLock for the
         * dirtyListEmpty Condition].
         */
        dirtyListLock.lockInterruptibly();
        try {
            dirtyList.add(cache);
            dirtyListChange.signalAll();
        } finally {
            dirtyListLock.unlock();
        }

        /*
         * Take a buffer from the cleanList and set it as the [current]
         * buffer.
         * 
         * Note: This delegates to takeFromClean(), which awaits the
         * [cleanListNotEmpty] Condition (so a [halt] is noticed) and removes
         * the buffer with a non-blocking poll(). A blocking take() must not be
         * used while holding the cleanListLock: the cleanList is also polled
         * without that lock, so the list can be emptied between the isEmpty()
         * test and the take(), and a take() which blocks while holding the
         * lock would prevent addClean() from ever supplying a buffer.
         */
        final WriteCache newBuffer = takeFromClean();

        counters.get().nclean--;

        // Clear state on new buffer and remove from cacheService map
        newBuffer.resetWith(serviceMap);//, fileExtent.get());

        // Set it as the new buffer.
        current.set(newBuffer);

        return newBuffer;

    }
   
    /**
     * Add to the cleanList.
     * <p>
     * Since moving to an explicit readCache, we now call resetWith before
     * adding to the cleanList, potentially removing latency on acquiring
     * a new cache from the clean list.
     * <p>
     * If a readCache is in operation then we will transfer to the read cache
     */
    private void addClean(final WriteCache cache, final boolean addFirst)
            throws InterruptedException {

        if (cache == null) {
            throw new IllegalArgumentException();
        }

        if (this.readListSize <= 0) {
            // No read cache: just reset the buffer against the service map.
            cache.resetWith(serviceMap);
        } else {
            // A read cache is in operation: hand the buffer to it first.
            installReads(cache);
        }

        cleanListLock.lockInterruptibly();
        try {

            assert cache.isEmpty() || cache.isClosedForWrites();

            // Caller picks which end of the clean list receives the buffer.
            if (addFirst)
                cleanList.addFirst(cache);
            else
                cleanList.addLast(cache);

            // Wake up anyone waiting for a clean buffer, then publish the
            // new list size to the counters.
            cleanListNotEmpty.signalAll();
            counters.get().nclean = cleanList.size();

        } finally {
            cleanListLock.unlock();
        }

    }
    
    /**
     * Transfers the given buffer into the read cache.
     * <p>
     * The transfer is first attempted against the active {@link ReadCache}.
     * If that transfer does not complete, the active read cache is retired
     * (its reference count is decremented and it is added to the readList
     * when the count reaches zero), a replacement is obtained from
     * {@link #getDirectReadCache()}, the transfer is completed against the
     * replacement, and the replacement becomes the active read cache. All of
     * this happens while synchronized on <code>readCache</code>.
     * 
     * @param cache
     *            The buffer to be transferred.
     * 
     * @return <code>false</code> iff no read cache is configured
     *         (<code>readListSize == 0</code>), otherwise <code>true</code>.
     * 
     * @throws InterruptedException
     * @throws AssertionError
     *             if no replacement read cache is available, if the
     *             replacement is not at full capacity, or if the transfer
     *             can not be completed against the replacement.
     */
    public boolean installReads(final WriteCache cache) throws InterruptedException {

        if (readListSize == 0) {
            return false;
        }

        synchronized (readCache) {

            final ReadCache active = readCache.get();

            if (WriteCache.transferTo(cache, active, serviceMap, 0)) {
                // The active read cache absorbed the transfer.
                return true;
            }

            // The active read cache is full: retire it.
            readCache.set(null);
            if (active.decrementReferenceCount() == 0) {
                readList.add(active);
            }

            final ReadCache replacement = getDirectReadCache();
            if (replacement == null) {
                throw new AssertionError();
            }

            // remaining must be >= the announced capacity after
            // getDirectReadCache
            if (replacement.remaining() < replacement.capacity()) {
                throw new AssertionError("New Cache, remaining() < capacity(): " + replacement.remaining() + " < " + replacement.capacity());
            }

            // Finish the transfer against the replacement.
            if (!WriteCache.transferTo(cache, replacement, serviceMap, 0)) {
                throw new AssertionError("Unable to complete transfer to new cache with remaining: " + replacement.remaining());
            }

            // Publish the replacement as the active read cache.
            replacement.incrementReferenceCount();
            readCache.set(replacement);

        }

        return true;
    }

    /**
     * Poll the {@link #cleanList} and return the {@link WriteCache} from the
     * head of the {@link #cleanList} IFF one is available and otherwise
     * <code>null</code>.
     * 
     * @return The {@link WriteCache} iff one was available.
     * 
     * @throws InterruptedException
     */
    private WriteCache getDirectCleanCache() throws InterruptedException {

        // Non-blocking removal from the clean list (no lock is taken here).
        final WriteCache polled = cleanList.poll();

        if (polled == null) {
            // Nothing available.
            return null;
        }

        // One less clean buffer.
        counters.get().nclean--;

        return polled;

    }

    /**
     * Non-blocking take of a {@link ReadCache}. If successful, the returned
     * {@link ReadCache} will be clean. Otherwise return <code>null</code>.
     */
    private ReadCache getDirectReadCache() throws InterruptedException {

        // Non-blocking take.
        ReadCache tmp = readList.poll();

        if (tmp == null)
            return null;

        try {

            /*
             * Attempt to reset the record.
             */
            synchronized (readCache) {
                if (hotCache == null) {
                    // No hot cache: simply reset the buffer against the
                    // service map and hand it to the caller.
                    tmp.resetWith(serviceMap);
                    return tmp;
                }
                // #of times the hot cache was rotated (debug logging only).
                int cycles = 0;
                while (tmp != null) {
                    if (log.isDebugEnabled() && !tmp.isEmpty()) {
                        /*
                         * Just debug stuff: tally the records in the buffer
                         * being recycled by hit count (any hit versus more
                         * hits than [hotCacheThreshold]).
                         */
                        int hitRecords = 0;
                        int hotRecords = 0;
                        int totalRecords = 0;
                        final Iterator<RecordMetadata> values = tmp.recordMap
                                .values().iterator();
                        while (values.hasNext()) {
                            final RecordMetadata md = values.next();
                            totalRecords++;
                            if (md.getHitCount() > 0) {
                                hitRecords++;
                                if (md.getHitCount() > hotCacheThreshold)
                                    hotRecords++;
                            }
                        }
                        log.debug("Recycled ReadCache, hot(>" + hotCacheThreshold + "): " + hotRecords + ", hit: " + hitRecords + " of " + totalRecords);
                    }

                    /*
                     * Try to transfer [tmp] into the current hot cache
                     * (presumably only records whose hit count exceeds
                     * [hotCacheThreshold] are retained - confirm against
                     * WriteCache#transferTo). If the transfer completes then
                     * [tmp] must be empty; it is reset and returned to the
                     * caller.
                     */
                    if (WriteCache.transferTo(tmp, hotCache, serviceMap,
                            hotCacheThreshold)) {
                        if (!tmp.isEmpty())
                            throw new AssertionError();

                        tmp.reset();
                        break;
                    }

                    if (log.isDebugEnabled())
                        log.debug("Cycle HOTCACHE: " + ++cycles);

                    // transfer not completed, so:
                    // move current hotCache to end of HotList
                    // move head of HotList to end of ReadList
                    // make hotReserve new hotCache
                    // complete transfer to new hotCache
                    // make now empty tmp new hotReserve
                    hotList.add(hotCache);
                    readList.add(hotList.poll().resetHitCounts());
                    if (!hotReserve.isEmpty())
                        throw new AssertionError();

                    hotCache = hotReserve;
                    // NOTE(review): [hotReserve] stays null until [tmp] is
                    // installed below; if the transfer in between can be
                    // interrupted, the catch clause does not restore it -
                    // confirm whether that can happen.
                    hotReserve = null;
                    if (!WriteCache.transferTo(tmp, hotCache, serviceMap,
                            hotCacheThreshold)) {
                        throw new AssertionError();
                    }
                    tmp.reset();
                    hotReserve = tmp;

                    // [tmp] was consumed as the new hotReserve, so try the
                    // next buffer on the readList. If there is none then the
                    // loop ends and [null] is returned to the caller.
                    tmp = readList.poll();
                } // while (tmp != null)
            } // synchronized(readCache)
        } catch (InterruptedException ex) {
            /*
             * If interrupted, then return the ReadCache to the list and
             * propagate the interrupt to the caller. This makes the operation
             * safe with respect to an interrupt. Either the operation succeeds
             * fully, or we return [null] to the caller and restore the
             * interrupt status on the current Thread.
             */
            readList.put(tmp);
            // Propagate the interrupt status.
            Thread.currentThread().interrupt();
            // ReadCache is not available.
            return null;
        }

        return tmp;

    }

    /**
     * This is a non-blocking query of all write cache buffers (current, clean
     * and dirty).
     * <p>
     * This implementation DOES NOT throw an {@link IllegalStateException} if
     * the service is already closed NOR if there is an asynchronous close of
     * the service. Instead it just returns <code>null</code> to indicate a
     * cache miss.
     */
    public ByteBuffer read(final long offset, final int nbytes)
            throws InterruptedException, ChecksumError {

        // First stop: the cache.
        final ByteBuffer cached = _readFromCache(offset, nbytes);

        if (cached != null) {

            // A hit must never be an empty buffer.
            if (cached.remaining() == 0) {
                throw new AssertionError();
            }

            return cached;

        }

        // Not in the cache: count the miss.
        counters.get().nmiss.increment();

        if (reader == null) {

            /*
             * There is no reader, so the caller has to read through to the
             * disk itself.
             */
            return null;

        }

        /*
         * Read through to the disk and install the record into the cache.
         */
        final ByteBuffer loaded = loadRecord(offset, nbytes);

        if (loaded != null && loaded.remaining() == 0) {
            throw new AssertionError();
        }

        return loaded;

    }
    
    /**
     * Attempt to read record from cache (either write cache or read cache
     * depending on the service map state).
     */
    public ByteBuffer _readFromCache(final long offset, final int nbytes)
            throws ChecksumError, InterruptedException {
    
        if (nbytes > capacity) {
            /*
             * Note: Writes larger than a single write cache buffer are NOT
             * cached.
             */
            return null;
        }

        // Box the offset once; it is the key into the service map.
        final Long off = Long.valueOf(offset);

        /*
         * Loop until the record is read from whichever cache buffer the
         * service map currently names for the offset, the service map has no
         * entry for the offset, or the service is found to be closed.
         */
        while (true) {

            if (!open.get()) {

                /*
                 * Not open. Return [null] rather than throwing an exception per
                 * the contract for this implementation.
                 */

                return null;

            }

            final WriteCache cache = serviceMap.get(off);

            if (cache == null) {
             
                // Cache miss.
                break;
                
            }

            /*
             * Ask the cache buffer if it has the record still. It will not
             * if the cache buffer has been concurrently reset.
             */
            try {

                final ByteBuffer ret = cache.read(off.longValue(), nbytes);

                /*
                 * The buffer does not have the record, yet the service map
                 * still names that same buffer for the offset.
                 * 
                 * NOTE(review): this IllegalStateException is thrown inside
                 * the try block whose catch clause (below) handles
                 * IllegalStateException, so it never reaches the caller. With
                 * assertions disabled the method returns [null]; with
                 * assertions enabled the [assert !open.get()] in the handler
                 * fails if the service is still open. Confirm that this is
                 * intended.
                 */
                if (ret == null && serviceMap.get(off) == cache) {

                    throw new IllegalStateException(
                            "Inconsistent cache for offset: " + off);
                    
                }

                if (ret == null && log.isDebugEnabled()) {

                    log.debug("WriteCache out of sync with WriteCacheService");

                }

                if (ret != null)
                    return ret;

                // May have been transferred to another Cache!
                //
                // Retry: look the offset up in the service map again.
                continue;
                
            } catch (IllegalStateException ex) {
                /*
                 * The write cache was closed. Per the API for this method,
                 * return [null] so that the caller will read through to the
                 * backing store.
                 */
                assert !open.get();
                return null;

            }

        }
        
        // Cache miss.
        return null;

    }
    
    /**
     * Helper class models a request to load a record from the backing store.
     * <p>
     * Note: This class must implement equals() and hashCode() since it is used
     * within the {@link Memoizer} pattern.
     */
    private static class LoadRecordRequest {

        /** The service against which the record will be read. */
        final WriteCacheService service;

        /** The offset of the record. */
        final long offset;

        /** The #of bytes to be read. */
        final int nbytes;

        /**
         * @param service
         *            The service against which the record will be read.
         * @param offset
         *            The offset of the record.
         * @param nbytes
         *            The #of bytes to be read.
         */
        public LoadRecordRequest(final WriteCacheService service,
                final long offset, final int nbytes) {

            this.service = service;

            this.offset = offset;

            this.nbytes = nbytes;

        }

        /**
         * Equals returns true iff the request has the same parameters (the
         * same service instance, the same offset, and the same byte count).
         */
        @Override
        public boolean equals(final Object o) {

            // Fast path for the same request object.
            if (this == o)
                return true;

            if (!(o instanceof LoadRecordRequest))
                return false;

            final LoadRecordRequest r = (LoadRecordRequest) o;

            return service == r.service && offset == r.offset
                    && nbytes == r.nbytes;

        }

        /**
         * The hashCode() implementation assumes that the <code>offset</code>'s
         * hashCode() is well distributed. Only the <code>offset</code>
         * contributes to the hash, which is consistent with
         * {@link #equals(Object)} since equal requests have equal offsets.
         */
        @Override
        public int hashCode() {
            
            // Fold the high 32 bits of the offset into the low 32 bits.
            return (int) (offset ^ (offset >>> 32));
            
        }
        
    }

    /**
     * Helper loads a record from the specified offset by delegating to
     * {@link WriteCacheService#_getRecord(long, int)}.
     */
    private static final Computable<LoadRecordRequest, ByteBuffer> loadChild = new Computable<LoadRecordRequest, ByteBuffer>() {

        /**
         * Reads the record described by the request by calling
         * <code>_getRecord(offset, nbytes)</code> on the request's service.
         * Whether or not that read succeeds, the request is removed from the
         * service's memoizer cache before this method returns.
         * 
         * @param req
         *            The request (service, offset, and byte count).
         * 
         * @return Whatever <code>_getRecord</code> returned for that request.
         * 
         * @throws AssertionError
         *             if a non-null buffer with nothing remaining comes back.
         */
        public ByteBuffer compute(final LoadRecordRequest req)
                throws InterruptedException {

            try {

                final ByteBuffer loaded = req.service._getRecord(req.offset, req.nbytes);

                // A materialized record is never allowed to be empty.
                if (loaded != null && loaded.remaining() == 0) {
                    throw new AssertionError();
                }

                return loaded;

            } finally {

                /*
                 * Evict the future task for this request from the memoizer
                 * cache so that the cache does not keep a hard reference to
                 * every record that was ever materialized.
                 * 
                 * Note: No extra synchronization is needed. Per the Memoizer
                 * pattern only one thread actually runs the task, and hence
                 * only that thread runs this code.
                 */
                req.service.memo.removeFromCache(req);

            }

        }

    };

    /**
     * A {@link Memoizer} subclass which exposes an additional method to remove
     * a {@link FutureTask} from the internal cache.
     */
    private static class ReadMemoizer extends
            Memoizer<LoadRecordRequest/* request */, ByteBuffer/* child */> {

        /**
         * @param c
         *            The computation handed to the {@link Memoizer}
         *            superclass.
         */
        public ReadMemoizer(final Computable<LoadRecordRequest, ByteBuffer> c) {
            super(c);
        }

        /**
         * Reports the approximate #of entries in the internal cache. This is
         * only used for debugging (to detect cache leaks).
         */
        int size() {
            return cache.size();
        }

        /**
         * Removes the entry for the request from the internal cache. This is
         * invoked by the thread which atomically installs the record into the
         * cache and updates the service record map. At that point the record
         * is available from the service record map.
         * 
         * @param req
         *            The request.
         * 
         * @throws AssertionError
         *             if there was no entry for that request.
         */
        void removeFromCache(final LoadRecordRequest req) {

            final Object evicted = cache.remove(req);

            if (evicted == null) {
                throw new AssertionError();
            }

        }

//        /**
//         * Called from {@link AbstractBTree#close()}.
//         * 
//         * @todo should we do this?  There should not be any reads against the
//         * the B+Tree when it is close()d.  Therefore I do not believe there 
//         * is any reason to clear the FutureTask cache.
//         */
//        void clear() {
//            
//            cache.clear();
//            
//        }

    }

    /**
     * Used to materialize records with at most one thread reading the
     * record from disk for a given address. Other threads desiring the
     * same record will wait on the {@link Future} for the thread doing
     * the work.
     * <p>
     * Note: {@link #loadRecord(long, int)} is where requests are submitted to
     * this memoizer. Entries are evicted again through
     * {@link ReadMemoizer#removeFromCache(LoadRecordRequest)} so that the
     * memoizer does not retain completed reads.
     */
    private final ReadMemoizer memo;

    /**
     * Enter the memoizer pattern for the record at the given offset.
     * <p>
     * The current size of the memoizer cache is published to the performance
     * counters before the request is submitted.
     * 
     * @param offset
     *            The offset of the record (handed to the
     *            {@link LoadRecordRequest}).
     * @param nbytes
     *            The #of bytes in the record (handed to the
     *            {@link LoadRecordRequest}).
     * 
     * @return A duplicate of the {@link ByteBuffer} produced by the memoizer.
     * 
     * @throws RuntimeException
     *             wrapping the {@link InterruptedException} iff this thread
     *             was interrupted while awaiting the memoizer.
     */
    private ByteBuffer loadRecord(final long offset, final int nbytes) {

        final ByteBuffer shared;

        try {

            // Track the memoizer cache size (makes cache leaks visible).
            counters.get().memoCacheSize.set(memo.size());

            final LoadRecordRequest request = new LoadRecordRequest(this,
                    offset, nbytes);

            shared = memo.compute(request);

        } catch (InterruptedException e) {

            /*
             * Note: Only raised when the thread is interrupted while it is
             * waiting on the FutureTask inside of the Memoizer.
             */
            throw new RuntimeException(e);

        }

        /*
         * The memoizer may hand the same ByteBuffer instance to several
         * callers. Each caller therefore gets its own duplicate (independent
         * position and limit) so that concurrent readers do not interfere with
         * one another.
         */
        return shared.duplicate();

    }

    /**
     * Method invoked from within the memoizer pattern to read the record from
     * the backing store and install it into the cache. The method must first
     * verify that the record is not in the cache.
     * <p>
     * There are three ways the record can be produced:
     * <ol>
     * <li>It is already in the read cache and is returned from there.</li>
     * <li>It is read directly from the backing store without being installed
     * (record larger than <code>capacity</code>, <code>readListSize</code> is
     * zero, or no cache block could be obtained).</li>
     * <li>It is read into an allocation on a read cache block, its checksum is
     * verified, the record maps are updated, and a heap copy (without the
     * checksum) is returned.</li>
     * </ol>
     * <p>
     * Note: Anything thrown once the attempt to obtain a cache allocation has
     * begun (the <code>try</code> block) is printed on stderr and rethrown
     * wrapped in a {@link RuntimeException}. Exceptions raised by the two
     * earlier exits (cache hit, direct read) propagate unwrapped.
     * 
     * @param offset
     *            The offset of the record on the backing store.
     * @param nbytes
     *            The #of bytes in the record, including the trailing 4 byte
     *            checksum.
     * @return A heap byte buffer containing the read record.
     * @throws IllegalStateException
     * @throws InterruptedException
     */
    private ByteBuffer _getRecord(final long offset, final int nbytes)
            throws IllegalStateException, InterruptedException {

		/*
		 * On entry, this thread will either install the read into the cache or
		 * the record will already be in the cache. We are protected by the
		 * memoizer pattern here. No other thread will be attempting to install
		 * the same record (the record for that offset) into the cache.
		 */

		ByteBuffer tmp = _readFromCache(offset, nbytes);

		if (tmp != null) {

		    // Already in the read cache. An empty record is never expected.
			if (tmp.remaining() == 0)
				throw new AssertionError();
			
			return tmp;

		}
		
		// A record larger than [capacity] is never installed into a cache block.
		final boolean largeRecord = nbytes > capacity;
		// Also bypass the read cache entirely when [readListSize] is zero.
		final boolean directRead = largeRecord || this.readListSize == 0;

		if (directRead) {
        
            // No free buffer to install the read (OR largeRecord)
            final ByteBuffer ret = _readFromLocalDiskIntoNewHeapByteBuffer(offset, nbytes);
            
            if (ret != null && ret.remaining() == 0)
            	throw new AssertionError();
      
            return ret;
        }

        /*
         * The reader threads co-operatively manage the readCache on behalf of
         * the WCS. The allocation attempt for a cache buffer is serialized and
         * when an allocation fails a new readCache is initialized and the
         * previous cache reference is decremented (no longer referenced as the
         * current read cache).
         * 
         * When a cache is selected to buffer a read, the reference is
         * incremented while the read is active.
         * 
         * When the reference is finally decremented to zero (either at the end
         * of a read or after a failed allocation) the cache can be returned to
         * the clean list.
         */

        // The cache block into which we will install the record.
		ReadCache theCache = null;
		// The buffer slice into which we will install the record.
		ByteBuffer bb = null;
        /*
         * Set true iff we will install a record and have incremented the
         * reference count for the cache. If true, then this Thread MUST
         * decrement the reference count by any code path that leaves this
         * method. (If we obtain an allocation but do not set this flag, then
         * we will not actually perform the installation and the cache block
         * will not be pinned.)
         */
        boolean willInstall = false;
        try {
            // Serializes allocation against (and replacement of) the current read cache.
            synchronized (readCache) {
                theCache = readCache.get();
                if (theCache != null) {
                    /*
                     * Attempt to allocate record on current read cache.
                     */
                    assert theCache.getReferenceCount() > 0;
                    bb = theCache.allocate(nbytes); // intr iff can't lock().
                    if (bb != null) {
                        // increment while readCache synchronized
                        theCache.incrementReferenceCount();
                        willInstall = true;
                    } else {
                        /*
                         * At this point, the current [readCache] does not have
                         * enough room to install the record. We will clear the
                         * [readCache] reference and transfer it to the
                         * [readList].
                         * 
                         * *** CRITICAL SECTION ***
                         * 
                         * We MUST transfer cache once reference is cleared or
                         * the buffer will be lost!
                         * 
                         * Note: Anything on the readList MUST have
                         * referenceCount==0 since we do not transfer to the
                         * readList until that condition is met.
                         * 
                         * Note: If the count does not reach zero here, then the
                         * thread which performs the final decrement (see the
                         * finally clause below) is the one which transfers the
                         * block to the readList.
                         */
                        readCache.set(null);
                        if (theCache.decrementReferenceCount() == 0) {
                            readList.add(theCache);
                        }
                    }
                }
                if (bb == null) {
                    /*
                     * Either no [readCache] on entry or no room in current
                     * [readCache] and [readCache] was set to [null].
                     */
                    assert readCache.get() == null; // pre-condition.
                    final ReadCache newCache = getDirectReadCache(); // non-blocking take
                    if (newCache != null) {
                        assert newCache.getReferenceCount() == 0;
                        { // CRITICAL SECTION
                            // Pre-increment the new [readCache]. This count
                            // is held for being the current read cache.
                            newCache.incrementReferenceCount();
                            // Set read cache reference.
                            readCache.set(newCache/* newValue */);
                        }
                        // guaranteed to succeed unless interrupted
                        bb = newCache.allocate(nbytes);
                        theCache = newCache;
                        { // CRITICAL SECTION.
                            // increment while readCache synchronized. This
                            // second count is held for the in-flight read and
                            // is released in the finally clause below.
                            theCache.incrementReferenceCount();
                            willInstall = true;
                        }
                    }
				}
			} // synchronized(readCache)
    
            if (bb == null) {
                /*
                 * No free buffer to install the read. Read directly into a heap
                 * ByteBuffer and return that to the caller.
                 * 
                 * Note: Unlike the direct read above, this call is inside of
                 * the try block, so anything it throws is wrapped by the catch
                 * clause below.
                 */
                assert willInstall == false;
                return _readFromLocalDiskIntoNewHeapByteBuffer(offset, nbytes);
    		}

            /*
             * [bb] is a view onto an allocation on [theCache] into which we can
             * install the read.
             */
            
            // The offset into [bb] of the allocation.
		    final int pos = bb.position();
		    
		    // Read the record from the disk into NIO buffer.
		    final ByteBuffer ret = reader.readRaw(offset, bb);
		
		    // must copy to heap buffer from cache, allowing for checksum
		    // ([b] receives the record without its trailing 4 byte checksum).
            final byte[] b = new byte[nbytes - 4];
            ret.get(b);
            
            // calculate checksum from readRaw before adding to readCache!
            {
            	final int datalen = nbytes - 4;
            	// Absolute read of the stored checksum, which follows the data.
            	// NOTE(review): assumes [ret] is indexed like [bb] - confirm
            	// against the contract of readRaw().
	            final int chk = ret.getInt(pos + datalen);
	
	            if (chk != ChecksumUtility.threadChk.get().checksum(b, 0/* offset */, datalen)) {
	
	                // The record maps are NOT updated for a bad record.
	                throw new ChecksumError();
	
	            }
            }
          
		    // update record maps
		    theCache.commitToMap(offset, pos, nbytes);
		    serviceMap.put(offset, theCache);

            // Return the heap copy rather than a view onto the cache block.
            return ByteBuffer.wrap(b);

        } catch (Throwable t) {
        	/*
        	 * Note: Everything thrown from the try block (including a
        	 * ChecksumError or an InterruptedException) is reported on stderr
        	 * and rethrown wrapped in a RuntimeException.
        	 */
        	t.printStackTrace(System.err);
        	
        	throw new RuntimeException(t);
        } finally {
            /*
             * CRITICAL SECTION. If [willInstall] then we are responsible for
             * this ReadCache and MUST decrement the counter.
             * 
             * Note: An AssertionError thrown from here replaces any exception
             * which is already propagating out of the try/catch.
             */
            if (willInstall && theCache.decrementReferenceCount() == 0) {
                readList.add(theCache);
                // END CRITICAL SECTION.
                // A block with a zero count must not still be the current read cache.
                if (theCache == readCache.get())
                    throw new AssertionError();
            }
        }
 
	}

    /**
     * Read through to the backing file, bypassing the read cache.
     * <p>
     * The record is read into a newly allocated heap {@link ByteBuffer}. The
     * trailing 4 byte checksum is verified against the bytes which precede it
     * and the limit of the buffer is then reduced so that the checksum is not
     * visible to the caller.
     * 
     * @param offset
     *            The byte offset of the record on the backing file.
     * @param nbytes
     *            The #of bytes to be read (this includes the trailing 4 byte
     *            checksum).
     * 
     * @return The record in a newly allocated heap {@link ByteBuffer}. The
     *         record is NOT installed into the read cache.
     * 
     * @throws ChecksumError
     *             if the checksum stored with the record does not agree with
     *             the checksum computed from the data read.
     * @throws AssertionError
     *             if nothing remains in the buffer once the checksum has been
     *             excluded.
     */
    private final ByteBuffer _readFromLocalDiskIntoNewHeapByteBuffer(
            final long offset, final int nbytes) {

        if (log.isDebugEnabled()) {
            log.debug("Allocating direct, nbytes: " + nbytes);
        }

        // #of data bytes, i.e. everything except the trailing checksum.
        final int datalen = nbytes - 4;

        final ByteBuffer heapBuf = ByteBuffer.allocate(nbytes);

        final ByteBuffer record = reader.readRaw(offset, heapBuf);

        // Checksum computed from the data which was actually read.
        final int actual = ChecksumUtility.getCHK().checksum(record.array(),
                0/* offset */, datalen/* len */);

        // Checksum which was stored with the record.
        final int expected = record.getInt(datalen);

        if (actual != expected) {
            throw new ChecksumError("offset=" + offset + ",nbytes=" + nbytes
                    + ",expected=" + expected + ",actual=" + actual);
        }

        // Hide the checksum from the caller.
        record.limit(datalen);

        if (record.remaining() == 0) {
            throw new AssertionError();
        }

        // This read was not installed into the read cache.
        counters.get().nreadNotInstalled.increment();

        return record;

    }
    
    /**
     * Read the data from the backing file.
     * 
     * We need to know the size of the data so we can allocate the buffer.
     * 
     * @param offset
     * @return
     * @throws InterruptedException
     * @throws IllegalStateException
     */
//    private ByteBuffer readBacking(final long offset, final int nbytes)
//            throws IllegalStateException, InterruptedException {
//        if (reader == null)
//            return null;
//
//        if (nbytes > readCache.get().capacity()) // not possible to cache
//            return null;
//
//        // allocate space in readCache and retrieve buffer into which we'll
//        // read the data
//
//        ByteBuffer bb = null;
//        WriteCache installCache;
//        synchronized (readCache) {
//            final WriteCache cache = readCache.get();
//            bb = cache.allocate(offset, nbytes);
//            if (bb == null) { // return readCache to clean list
//                addClean(cache, false/* add to front */);
//                installCache = getDirectCleanCache();
//                readCache.set(installCache);
//                installCache.closeForWrites();
//
//                bb = installCache.allocate(offset, nbytes);
//
//                assert bb != null;
//            } else {
//                installCache = cache;
//            }
//        }
//
//        // must return new byte[] since original ByteBuffer will be updated
//        final byte[] ret = new byte[nbytes - 4];
//
//        // DEBUG readRaw into non-direct byte buffer
//        // final ByteBuffer trans = ByteBuffer.wrap(ret);
//        // reader.readRaw(offset, trans);
//
//        reader.readRaw(offset, bb);
//
//        recordMap.put(offset, installCache);
//
//        // copy WriteCache data into return buffer
//        bb.get(ret);
//
//        return ByteBuffer.wrap(ret);
//    }

    /**
     * Called to check if a write has already been flushed. This is only made if
     * a write has been made to previously committed data (in the current RW
     * session).
     * <p>
     * If dirty {@link WriteCache}s are flushed in order then it does not
     * matter, however, if we want to be able to combine {@link WriteCache}s
     * then it makes sense that there are no duplicate writes.
     * <p>
     * On reflection this is more likely needed since for the {@link RWStore},
     * depending on session parameters, the same cached area could be
     * overwritten. We could still maintain multiple writes but we need a
     * guarantee of order when retrieving data from the write cache (newest
     * first).
     * <p>
     * So the question is, whether it is better to keep cache consistent or to
     * constrain with read order?
     * 
     * @param offset
     *            the address to check
     * @param latchedAddr
     *            handed through to the {@link WriteCache} when its address map
     *            is cleared (and reported if the consistency check fails).
     * 
     * @return <code>true</code> iff the record was found in a
     *         {@link WriteCache} and that cache reported it as cleared;
     *         <code>false</code> once no {@link WriteCache} is registered for
     *         the offset.
     * 
     * @throws RuntimeException
     *             wrapping an {@link InterruptedException} if the operation is
     *             interrupted.
     */
    public boolean clearWrite(final long offset, final int latchedAddr) {
        try {
            counters.get().nclearAddrRequests++;
            for (;;) {
                final WriteCache owner = serviceMap.get(offset);
                if (owner == null) {
                    // No cache is registered for that offset.
                    return false;
                }
                owner.transferLock.lock();
                try {
                    /*
                     * Look again now that we hold the transferLock. Holding
                     * the lock guards against a concurrent transfer of the
                     * record out of [owner].
                     */
                    final WriteCache current = serviceMap.get(offset);
                    if (current != owner) {
                        /*
                         * The record was moved or removed before we obtained
                         * the lock, so it is no longer in this WriteCache.
                         * Retry: WriteCache.transferTo() may just have
                         * migrated it to a different WriteCache.
                         */
                        continue;
                    }

                    // Drop the entry from the service record map.
                    final WriteCache removed = serviceMap.remove(offset);
                    if (removed == null) {
                        /**
                         * Note: The [WriteCache.transferLock] protects the
                         * WriteCache against a concurrent transfer of a record
                         * in WriteCache.transferTo(). WriteCache.resetWith(),
                         * however, does NOT take the transferLock, so a
                         * concurrent resetWith() may (validly) have cleared
                         * the entry for this record in the meantime. Retry.
                         * 
                         * @see <a href=
                         *      "https://sourceforge.net/apps/trac/bigdata/ticket/654"
                         *      Rare AssertionError in WriteCache.clearAddrMap()
                         *      </a>
                         */
                        continue;
                    }
                    if (removed != owner) {
                        // Concurrent modification!
                        throw new AssertionError("oldValue=" + removed
                                + ", cache=" + owner + ", offset=" + offset
                                + ", latchedAddr=" + latchedAddr);
                    }

                    /*
                     * Note: clearAddrMap() is basically a NOP if the WriteCache
                     * has been closedForWrites().
                     */
                    if (!owner.clearAddrMap(offset, latchedAddr)) {
                        // Not cleared by this cache: go around again.
                        continue;
                    }

                    // Found and cleared.
                    counters.get().nclearAddrCleared++;
                    debugAddrs(offset, 0, 'F');
                    return true;
                } finally {
                    // Released on every path out of the guarded region.
                    owner.transferLock.unlock();
                }
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }
    
//    /**
//     * Debug method to verify that the {@link WriteCacheService} has flushed all
//     * {@link WriteCache} buffers.
//     * 
//     * @return whether there are no outstanding writes buffered
//     */
//    public boolean isFlushed() {
//        
//        final boolean clear = 
//    			dirtyList.size() == 0
//    			&& compactingCacheRef.get() == null
//    			&& (current.get() == null || current.get().isEmpty());
//    	
//        return clear;
//        
//    }
    
    /**
     * Return a summary of the write cache actions which have been recorded for
     * an address.
     * <p>
     * An array of writeCache actions is maintained that can be used to provide
     * a breadcrumb of how that address has been written, saved, freed or
     * removed.
     * <p>
     * Write errors often show up as a checksum error, so the length of data
     * written to the address can be crucial information in determining the
     * root of any problem. The recorded length is therefore appended (in
     * square brackets) after each <code>'A'</code> action.
     * <p>
     * The summary is followed by a <code>':'</code> and the performance
     * counters of the service.
     * 
     * @param paddr
     *            The address for which the information is requested.
     * 
     * @return The summary of the writeCache actions for that address -or- a
     *         fixed message if no debug information is being recorded.
     */
    public String addrDebugInfo(final long paddr) {
        if (addrsUsed == null) {
            // Address tracking is not enabled.
            return "No WriteCache debug info";
        }

        // Method-local, so the unsynchronized builder is sufficient.
        final StringBuilder ret = new StringBuilder();
        for (int i = 0; i < addrsUsed.length; i++) {
            if (i == addrsUsedCurs) {
                // Mark the position of the cursor within the action history.
                ret.append("|...|");
            }
            if (addrsUsed[i] == paddr) {
                ret.append(addrActions[i]);
                if (addrActions[i] == 'A') {
                    // Include the recorded length for this action.
                    ret.append('[').append(addrLens[i]).append(']');
                }
            }
        }
        /*
         * Note: I've added in the write cache service counters here for
         * information about the maximum #of buffers from the pool which have
         * been in use, #of flushes, etc.
         */
        ret.append(":");
        ret.append(getCounters().toString());
        return ret.toString();
    }

    /**
     * Return <code>true</code> iff the service record map has an entry for the
     * address at the moment when it is consulted.
     * <p>
     * Note: The answer is a point-in-time observation. Unless the caller holds
     * an appropriate lock across this operation, the result is NOT guaranteed
     * to remain correct at any time other than the moment when the map was
     * tested.
     * 
     * @param addr
     *            The address to test.
     */
    public boolean isPresent(final long addr) {

        final WriteCache owner = serviceMap.get(addr);

        return owner != null;

    }
    
    /**
     * Note: Atomic reference is used so the counters may be imposed from
     * outside.
     * <p>
     * The {@link WriteCacheServiceCounters} instance is obtained with
     * <code>counters.get()</code> at the point of use.
     */
    private final AtomicReference<WriteCacheServiceCounters> counters;

    /**
     * Return the performance counters for the {@link WriteCacheService}, as
     * reported by the {@link WriteCacheServiceCounters} instance which is
     * current at the time of the call.
     */
    public CounterSet getCounters() {

        final WriteCacheServiceCounters current = counters.get();

        return current.getCounters();

    }
    
    /**
     * Return the #of {@link WriteCache} blocks sent by the quorum leader to
     * the first downstream follower. The value is read from the
     * {@link WriteCacheServiceCounters} instance which is current at the time
     * of the call.
     */
    public long getSendCount() {

        final WriteCacheServiceCounters current = counters.get();

        return current.nsend;

    }

    /**
     * An instance of this exception is thrown if a thread notices that the
     * {@link WriteCacheService} was closed by a concurrent process.
     * <p>
     * This is an {@link IllegalStateException}, so it is unchecked. The class
     * declares no constructors of its own and therefore carries neither a
     * message nor a cause.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     */
    public static class AsynchronousCloseException extends IllegalStateException {

        // Explicit version identifier for serialization.
        private static final long serialVersionUID = 1L;
        
    }

}