/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Feb 10, 2010 */ package com.bigdata.io.writecache; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.channels.Channel; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.FutureTask; import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantLock; import java.util.concurrent.locks.ReentrantReadWriteLock; 
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.log4j.Logger; import com.bigdata.counters.CounterSet; import com.bigdata.ha.HAPipelineGlue; import com.bigdata.ha.QuorumPipeline; import com.bigdata.io.ChecksumUtility; import com.bigdata.io.DirectBufferPool; import com.bigdata.io.IBufferAccess; import com.bigdata.io.IReopenChannel; import com.bigdata.io.writecache.WriteCache.ReadCache; import com.bigdata.io.writecache.WriteCache.RecordMetadata; import com.bigdata.journal.AbstractBufferStrategy; import com.bigdata.journal.IBufferStrategy; import com.bigdata.journal.IRootBlockView; import com.bigdata.journal.RWStrategy; import com.bigdata.journal.WORMStrategy; import com.bigdata.quorum.Quorum; import com.bigdata.quorum.QuorumMember; import com.bigdata.rawstore.IAddressManager; import com.bigdata.rawstore.IRawStore; import com.bigdata.rwstore.RWStore; import com.bigdata.util.ChecksumError; import com.bigdata.util.DaemonThreadFactory; import com.bigdata.util.InnerCause; import com.bigdata.util.concurrent.Computable; import com.bigdata.util.concurrent.Memoizer; /** * A {@link WriteCacheService} is provisioned with some number of * {@link WriteCache} buffers and a writer thread. Caller's populate * {@link WriteCache} instances. When they are full, they are transferred to a * queue which is drained by the thread writing on the local disk. Hooks are * provided to wait until the current write set has been written (e.g., at a * commit point when the cached writes must be written through to the backing * channel). This implementation supports high availability using a write * replication pipeline. * <p> * A pool of {@link WriteCache} instances is used. Readers test all of the * {@link WriteCache} using a shared {@link ConcurrentMap} and will return * immediately the desired record or <code>null</code> if the record is not in * any of the {@link WriteCache} instances. 
Write caches remain available to
 * readers until they need to be recycled as the current write cache (the one
 * servicing new writes).
 * <p>
 * The {@link WriteCacheService} maintains a dirty list of {@link WriteCache}
 * instances. A single thread handles writes onto the disk and onto the write
 * replication pipeline (for HA). When the caller invokes flush() on the write
 * cache service, the current write cache is transferred to the dirty
 * list and the caller then waits until the write cache instances now on the
 * dirty list have been serviced. In order to simplify the design and to
 * provide boundary conditions for HA decision making, writers block during
 * {@link #flush(boolean, long, TimeUnit)}.
 * <p>
 * Instances of this class are used by both the {@link RWStrategy} and the
 * {@link WORMStrategy}. These classes differ in how they allocate space on the
 * backing file and in the concurrency which they permit for writers.
 * <dl>
 * <dt>{@link WORMStrategy}</dt>
 * <dd>The {@link WORMStrategy} serializes all calls to
 * {@link #writeChk(long, ByteBuffer, int)} since it must guarantee the precise
 * offset at which each record is written onto the backing file. As a
 * consequence of its design, each {@link WriteCache} is a single contiguous
 * chunk of data and is transferred directly to a known offset on the disk. This
 * append only strategy makes for excellent transfer rates to the disk.</dd>
 * <dt>{@link RWStrategy}</dt>
 * <dd>The {@link RWStrategy} only needs to serialize the decision making about
 * the offset at which the records are allocated. Since the records may be
 * allocated at any location in the backing file, each {@link WriteCache}
 * results in a scattered write on the disk.</dd>
 * </dl>
 * Both the {@link WORMStrategy} and the {@link RWStrategy} implementations need
 * to also establish a read-write lock to prevent changes in the file extent
 * from causing corrupt data for concurrent read or write operations on the
 * file.
See {@link #writeChk(long, ByteBuffer, int)} for more information on * this issue (it is a workaround for a JVM bug). * * <h2>Checksums</h2> * * The WORM and RW buffer strategy implementations, the WriteCacheService, and * the WriteCache all know whether or not checksums are in use. When they are, * the buffer strategy computes the checksum and passes it down (otherwise it * passes down a 0, which will be ignored since checksums are not enabled). The * WriteCache adjusts its capacity by -4 when checksums are enabled and adds the * checksum when transferring the caller's data into the WriteCache. On read, * the WriteCache will verify the checksum if it exists and returns a new * allocation backed by a byte[] showing only the caller's record. * <p> * {@link IAddressManager#getByteCount(long)} must be the actual on the disk * record length, not the size of the record when it reaches the application * layer. This on the disk length is the adjusted size after optional * compression and with the optional checksum. Applications which assume that * lengthOf(addr) == byte[].length will break, but that's life. * * <h2>ReadCache</h2> * * Without a hotList the readCache is managed naively by clearing any new * readCache. This potentially results in frequently accessed records being lost * to the cache. * * <h2>HotCache</h2> * * With the HotCache evicted readCaches hot records get transferred to hotList * and 'old' hotCaches get added to end of readCache. Pattern is needed to pluck * reserve hotCache from readList so that it is always possible to transfer hot * records from the readList. * <p> * Start with hotCache AND hotReserve. * * If new reserve needed, because existing one is now used, try and compress new * readCache into current hotCache - if won't fit, then call resetWith and lose * those writes, cycle again, moving front hotCache to readList and compress * that one. 
 * <p>
 * LIMIT: If we begin with full caches with above threshold hitCounts then the
 * whole list will cycle around until we hit the original cache which will
 * contain records with zero hitCounts - for practical purposes ignoring any
 * concurrent reads.
 *
 * @see WriteCache
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 *
 * @todo There needs to be a unit test which verifies overwrite of a record in
 *       the {@link WriteCache} (a write at the same offset in the backing file,
 *       but at a different position in the {@link WriteCache} buffer). It is
 *       possible for this to occur with the {@link RWStore} if a record is
 *       written, deleted, and then immediately reallocated. Whether or not this
 *       is a likely event depends on how aggressively the {@link RWStore}
 *       reallocates addresses which were allocated and then deleted within the
 *       same native transaction.
 *
 * @todo When compression is enabled, it is applied above the level of the
 *       {@link WriteCache} and {@link WriteCacheService} (which after all
 *       require the caller to pass in the checksum of the compressed record).
 *       It is an open question as to whether the caller or the store handles
 *       record compression. Note that the B+Tree leaf and node records may
 *       require an uncompressed header to allow fixup of the priorAddr and
 *       nextAddr fields.
 */
abstract public class WriteCacheService implements IWriteCache {

    protected static final Logger log = Logger.getLogger(WriteCacheService.class);

    /**
     * Logger for HA events.
     */
    private static final Logger haLog = Logger.getLogger("com.bigdata.ha");

    /**
     * <code>true</code> until the service is {@link #close() closed}.
     */
    // private volatile boolean open = true;
    private final AtomicBoolean open = new AtomicBoolean(true);

    /**
     * <code>true</code> iff record level checksums are enabled.
     */
    final private boolean useChecksum;

    /**
     * A single threaded service which writes dirty {@link WriteCache}s onto the
     * backing store.
     */
    final private ExecutorService localWriteService;

    /**
     * The {@link Future} of the task running on the {@link #localWriteService}.
     *
     * @see WriteTask
     * @see #reset()
     */
    private Future<Void> localWriteFuture;

    /**
     * The {@link Future} of the task running on the {@link #remoteWriteService}.
     * <p>
     * Note: Since this is <em>volatile</em> you MUST guard against concurrent
     * clear to <code>null</code> by {@link #reset()}.
     *
     * @see WriteTask
     * @see #reset()
     */
    private volatile Future<?> remoteWriteFuture = null;

    /**
     * A list of clean buffers. By clean, we mean not needing to be written.
     * Once a dirty write cache has been flushed, it is placed onto the
     * {@link #cleanList}. Clean buffers can be taken at any time for use as the
     * current buffer.
     */
    final private LinkedBlockingDeque<WriteCache> cleanList;

    /**
     * Lock for the {@link #cleanList} allows us to notice when it becomes empty
     * and not-empty.
     */
    final private ReentrantLock cleanListLock = new ReentrantLock();

    /**
     * Condition <code>!cleanList.isEmpty()</code>
     * <p>
     * Note: If you wake up from this condition you MUST also test {@link #halt}.
     */
    final private Condition cleanListNotEmpty = cleanListLock.newCondition();

    /**
     * The read lock allows concurrent {@link #acquireForWriter()}s while the
     * write lock prevents {@link #acquireForWriter()} when we must either reset
     * the {@link #current} cache buffer or change the {@link #current}
     * reference. E.g., {@link #flush(boolean, long, TimeUnit)}.
     * <p>
     * Note: {@link #read(long)} is non-blocking. It does NOT use this lock!!!
     */
    final private ReentrantReadWriteLock lock = new ReentrantReadWriteLock();

    /**
     * A list of dirty buffers. Writes from these may be combined, but not
     * across {@link #flush(boolean)}.
     */
    final private BlockingQueue<WriteCache> dirtyList;

    /**
     * Lock for the {@link #dirtyList} allows us to notice when it becomes empty
     * and not-empty.
     */
    final private ReentrantLock dirtyListLock = new ReentrantLock();

    /**
     * Condition awaited until the {@link #dirtyList} becomes empty. The
     * associated {@link #dirtyListLock} is used to put cache buffers onto the
     * {@link #dirtyList} and is required in order for
     * {@link #flush(boolean, long, TimeUnit)} to have atomic semantics,
     * otherwise new cache buffers could be added to the dirty list. That lock
     * is distinct from the {@link #lock} because we do not want to yield that
     * lock when awaiting the {@link #dirtyListEmpty} condition.
     * <p>
     * Note: If you wake up from this condition you MUST also test {@link #halt}.
     *
     * @see #dirtyListLock.
     */
    final private Condition dirtyListEmpty = dirtyListLock.newCondition();

    /**
     * Condition signaled whenever content is added to the dirty list.
     * <p>
     * Note: If you wake up from this condition you MUST also test {@link #halt}.
     */
    final private Condition dirtyListChange = dirtyListLock.newCondition();

    /**
     * Used to compact sparsely utilized {@link WriteCache}.
     */
    private final AtomicReference<WriteCache> compactingCacheRef = new AtomicReference<WriteCache>();

    /**
     * Maintained to guarantee that compaction is possible. This is always a
     * clean cache.
     */
    private final AtomicReference<WriteCache> compactingReserveRef = new AtomicReference<WriteCache>();

    /**
     * Disable {@link WriteCache} compaction when <code>false</code>.
     * <p>
     * Note: This is set to <code>false</code> when
     * {@link #compactionThreshold} is 100.
     */
    private final boolean compactionEnabled;

    /**
     * The minimum percentage of empty space that could be recovered before we
     * will attempt to compact a {@link WriteCache} buffer (in [0:100]).
     */
    private final int compactionThreshold = 20;

    /**
     * The current buffer. Modification of this value and reset of the current
     * {@link WriteCache} are protected by the write lock of {@link #lock()}.
     */
    final private AtomicReference<WriteCache> current = new AtomicReference<WriteCache>();

    /**
     * The current read cache.
     */
    final private AtomicReference<ReadCache> readCache = new AtomicReference<ReadCache>();

    /**
     * Flag set if {@link WriteTask} encounters an error. The cause is set
     * on {@link #firstCause} as well.
     * <p>
     * Note: Error handling MUST cause the write cache service buffers to be
     * {@link #reset()} and make sure the HA write pipeline is correctly
     * configured. This is handled by a high-level abort() on the journal. It is
     * NOT Ok to simply re-try writes of partly filled buffers since they may
     * already have been partly written to the disk. A high-level abort() is
     * necessary to ensure that we discard any bad writes. The abort() will need
     * to propagate to all members of the {@link Quorum} so they are all reset
     * to the last commit point and have reconfigured write cache services and
     * write pipelines.
     */
    private volatile boolean halt = false;

    /**
     * The first cause of an error within the asynchronous {@link WriteTask}.
     */
    private final AtomicReference<Throwable> firstCause = new AtomicReference<Throwable>();

    /**
     * The capacity of the cache buffers. This is assumed to be the same for
     * each buffer.
     */
    final private int capacity;

    // /**
    // * Object knows how to (re-)open the backing channel.
    // */
    // final private IReopenChannel<? extends Channel> opener;

    /**
     * A map from the offset of the record on the backing file to the cache
     * buffer on which that record was written.
     */
    final private ConcurrentMap<Long/* offset */, WriteCache> serviceMap;

    /**
     * An immutable array of the {@link WriteCache} buffer objects owned by the
     * {@link WriteCacheService} (in contrast to those owned by the caller but
     * placed onto the {@link #dirtyList} by
     * {@link #writeChk(long, ByteBuffer, int)}).
     */
    final private WriteCache[] writeBuffers;

    /**
     * An immutable array of the {@link WriteCache} buffer objects owned by the
     * {@link WriteCacheService}. These buffers are used for the readCache.
     */
    final private ReadCache[] readBuffers;

    /**
     * Debug arrays to chase down write/removal errors.
     *
     * Toggle comment appropriately to activate/deactivate
     */
    // final long[] addrsUsed = new long[4024 * 1024];
    // private int addrsUsedCurs = 0;
    // final char[] addrActions = new char[addrsUsed.length];
    // final int[] addrLens = new int[addrsUsed.length];
    private final long[] addrsUsed = null;
    private int addrsUsedCurs = 0;
    private final char[] addrActions = null;
    private final int[] addrLens = null;

    /**
     * The backing reader that can be used when a cache read misses.
     */
    final private IBackingReader reader;

    /**
     * The current file extent.
     */
    final private AtomicLong fileExtent = new AtomicLong(-1L);

    // /**
    // * The environment in which this object participates
    // */
    // protected final Environment environment;

    /**
     * The object which manages {@link Quorum} state changes on the behalf of
     * this service.
     */
    final private Quorum<HAPipelineGlue, QuorumMember<HAPipelineGlue>> quorum;

    // /**
    // * The {@link UUID} of the highly available service.
    // */
    // final private UUID serviceId;

    /**
     * The {@link Quorum} token under which this {@link WriteCacheService}
     * instance is valid. This is fixed for the life cycle of the
     * {@link WriteCacheService}. This ensures that all writes are buffered
     * under a consistent quorum meet.
     */
    final private long quorumToken;

    // The quorum replication factor; ONE (1) when the service is not HA.
    final private int replicationFactor;

    /**
     * The object which manages {@link Quorum} state changes on the behalf of
     * this service.
     */
    protected Quorum<HAPipelineGlue, QuorumMember<HAPipelineGlue>> getQuorum() {

        return quorum;

    }

    /**
     * Allocates N buffers from the {@link DirectBufferPool}.
     *
     * @param nwriteBuffers
     *            The #of {@link WriteCache} buffers.
     * @param minCleanListSize
     *            The maximum #of {@link WriteCache} buffers on the
     *            {@link #dirtyList} before we start to evict {@link WriteCache}
     *            buffers to the disk -or- ZERO (0) to use a default value.
 *            <br>
 *            Note: As a rule of thumb, you should set
 *            <code>maxDirtyListSize LTE nbuffers-4</code> such that we have
 *            at least: (1) for [current], (1) for [compactingCache], (1)
 *            for reserve and (1) buffer left available on the
 *            {@link #cleanList}.
 * @param nreadBuffers
 *            The #of {@link ReadCache} buffers used for the read cache.
 * @param prefixWrites
 *            When <code>true</code>, the {@link WriteCacheService} is
 *            supporting an RWS mode store and each {@link WriteCache}
 *            buffer will directly encode the fileOffset of each record
 *            written onto the {@link WriteCache}. When <code>false</code>,
 *            the {@link WriteCacheService} is supporting a WORM mode store
 *            and the {@link WriteCache} buffers contain the exact data to
 *            be written onto the backing store.
 * @param compactionThreshold
 *            The minimum percentage of space that could be reclaimed before
 *            we will attempt to coalesce the records in a
 *            {@link WriteCache} buffer. When <code>100</code>, compaction
 *            is explicitly disabled.
 *            <p>
 *            Note: This is ignored for WORM mode backing stores since we
 *            can not compact the buffer in that mode.
 * @param hotCacheSize
 *            The requested #of hot list buffers. The hot list is disabled
 *            (size ZERO) unless this is GT 2 and LT 80% of the #of read
 *            cache buffers.
 * @param hotCacheThreshold
 *            The hit count threshold above which readCache records are
 *            transferred to the hotCache.
 * @param useChecksum
 *            <code>true</code> iff record level checksums are enabled.
 * @param fileExtent
 *            The current extent of the backing file.
 * @param opener
 *            The object which knows how to (re-)open the channel to which
 *            cached writes are flushed.
 * @param quorum
 *            The object which manages {@link Quorum} state changes on the
 *            behalf of this service (may be <code>null</code> when not HA).
 * @param reader
 *            The backing reader used when a cache read misses.
 *
 * @throws InterruptedException
 */
public WriteCacheService(final int nwriteBuffers, int minCleanListSize,
        final int nreadBuffers, final boolean prefixWrites,
        final int compactionThreshold, final int hotCacheSize,
        final int hotCacheThreshold, final boolean useChecksum,
        final long fileExtent,
        final IReopenChannel<? extends Channel> opener,
        final Quorum quorum, final IBackingReader reader)
        throws InterruptedException {

    if (nwriteBuffers <= 0)
        throw new IllegalArgumentException();

    if (minCleanListSize == 0) { // default

        /*
         * Setup a reasonable default if no value was specified.
         *
         * Just need to make sure we have a few spare buffers to
         * prevent latency on acquiring a clean buffer for writing.
         *
         * NOTE(review): the prose below says the default is 5% of the
         * write cache buffers, but the expression actually computes
         * 0.3% (nwriteBuffers * .003) with a floor of 4 buffers -
         * confirm which ratio is intended.
         *
         * The default here is 5% of the write cache buffers. This
         * is based on historical experience that we do better with
         * 50MB of dirty list when there are 2000 write cache buffers,
         * which is 2.5%. It seems a reasonable thing to give over
         * 5%. If you want more write elision, then just increase
         * the number of write cache buffers. 95% of them will be
         * used to defer writes and elide writes. 5% of them will
         * be available to drive the disk with random write IOs.
         *
         * See BLZG-1589 (Modify the default behavior for setting the
         * clear/dirty list threshold)
         */
        minCleanListSize = Math.max(4, (int) (nwriteBuffers*.003));

    }

    if (minCleanListSize > nwriteBuffers)
        minCleanListSize = nwriteBuffers;

    if (minCleanListSize < 0)
        throw new IllegalArgumentException();

    if (compactionThreshold <= 0)
        throw new IllegalArgumentException();

    if (compactionThreshold > 100)
        throw new IllegalArgumentException();

    if (fileExtent < 0L)
        throw new IllegalArgumentException();

    if (opener == null)
        throw new IllegalArgumentException();

    // if (quorum == null)
    // throw new IllegalArgumentException();

    this.useChecksum = useChecksum;

    /**
     * FIXME WCS compaction fails!
     *
     * CORRECTION, it is NOT clearly established that WCS compaction fails
     * although some failures appear to correlate with it being enabled.
     * It may be that with compaction enabled other errors are more likely
     * that are not directly associated with the compaction; for example
     * as a result of denser data content.
     *
     * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/674" >
     *      WCS write cache compaction causes errors in RWS postHACommit()
     *      </a>
     */
    this.compactionEnabled = canCompact() && compactionThreshold < 100;

    if (log.isInfoEnabled())
        log.info("Compaction Enabled: " + compactionEnabled
                + " @ threshold=" + compactionThreshold);

    // this.opener = opener;

    // the token under which the write cache service was established.
    if ((this.quorum = quorum) != null) {
        this.quorumToken = quorum.token();
        this.replicationFactor = quorum.replicationFactor();
    } else {
        // Not HA.
        this.quorumToken = Quorum.NO_QUORUM;
        this.replicationFactor = 1;
    }

    this.reader = reader;

    dirtyList = new LinkedBlockingQueue<WriteCache>();

    cleanList = new LinkedBlockingDeque<WriteCache>();

    writeBuffers = new WriteCache[nwriteBuffers];

    /*
     * Configure the desired dirtyListThreshold.
     */
    if (compactionEnabled) {
        /*
         * Setup the RWS dirtyListThreshold.
         *
         * allow for compacting cache and reserve
         */
        m_dirtyListThreshold = Math.max(1, nwriteBuffers - minCleanListSize - 2);
    } else {
        /*
         * Note: We always want a threshold of ONE (1) for the WORM since:
         * 1) We can not compact cache buffers for that store mode.
         * 2) We still want to write data to the file even if it will
         * never be read (as in the case of "deleted" data in same transaction
         * as it was allocated).
         */
        m_dirtyListThreshold = 1;
    }

    assert m_dirtyListThreshold >= 1;
    assert m_dirtyListThreshold <= writeBuffers.length;

    // Setup ReadCache
    this.readListSize = nreadBuffers;
    this.readList = new LinkedBlockingDeque<ReadCache>();
    readBuffers = new ReadCache[nreadBuffers];
    // pre-allocate all ReadCache
    for (int i = 0; i < readBuffers.length; i++) {
        readBuffers[i] = new ReadCache(null);
    }

    /*
     * Hot cache setup
     *
     * Let's aim for a 1/10 of the readCache, but hotListSize must be at
     * least 3 to function
     */
    {
        if (hotCacheSize < (readListSize * 0.8) && hotCacheSize > 2) {
            hotListSize = hotCacheSize;
        } else {
            // Requested size out of range: disable the hot list.
            hotListSize = 0;
        }
    }
    hotList = new LinkedBlockingDeque<ReadCache>();
    this.hotCacheThreshold = hotCacheThreshold;

    // pre-populate hotList and readList
    for (int i = 0; i < hotListSize; i++) {
        hotList.add(readBuffers[i]);
    }
    for (int i = hotListSize; i < readListSize; i++) {
        readList.add(readBuffers[i]);
    }

    // set initial read cache
    hotCache = hotList.poll();
    hotReserve = hotList.poll();
    readCache.set(readList.poll());
    {
        // Pin the initial read cache (if any) so it is not recycled.
        final ReadCache curReadCache = readCache.get();

        if (curReadCache != null) {
            curReadCache.incrementReferenceCount();
        }
    }

    if (log.isInfoEnabled())
        log.info("nbuffers=" + nwriteBuffers + ", dirtyListThreshold="
                + m_dirtyListThreshold + ", compactionThreshold="
                + compactionThreshold + ", compactionEnabled="
                + compactionEnabled + ", prefixWrites=" + prefixWrites
                + ", hotListSize=" + hotListSize + ", useChecksum="
                + useChecksum + ", quorum=" + quorum);

    // save the current file extent.
    this.fileExtent.set(fileExtent);

    // Add [current] WriteCache.
    current.set(writeBuffers[0] = newWriteCache(null/* buf */,
            useChecksum, false/* bufferHasData */, opener, fileExtent));

    // if (nbuffers > 1) {
    // readCache.set(buffers[1] = newWriteCache(null/* buf */,
    // useChecksum, false/* bufferHasData */, opener, fileExtent));
    //
    // buffers[1].incrementReferenceCount(); // for readCache
    // buffers[1].closeForWrites();
    // }

    // add remaining buffers.
    for (int i = 1; i < nwriteBuffers; i++) {

        final WriteCache tmp = newWriteCache(null/* buf */, useChecksum,
                false/* bufferHasData */, opener, fileExtent);

        writeBuffers[i] = tmp;

        cleanList.add(tmp);

    }

    // Set the same counters object on each of the write cache instances.
    final WriteCacheServiceCounters counters = new WriteCacheServiceCounters(
            nwriteBuffers, m_dirtyListThreshold, compactionThreshold);

    for (int i = 0; i < writeBuffers.length; i++) {
        writeBuffers[i].setCounters(counters);
    }

    this.counters = new AtomicReference<WriteCacheServiceCounters>(counters);

    // assume capacity is the same for each buffer instance.
    capacity = current.get().capacity();

    // set initial capacity based on an assumption of 1k buffers.
    serviceMap = new ConcurrentHashMap<Long, WriteCache>(nwriteBuffers
            * (capacity / 1024));

    /*
     * Memoizer used to install reads into the cache on a cache miss.
     */
    memo = new ReadMemoizer(loadChild);

    // start service to write on the backing channel.
    localWriteService = Executors
            .newSingleThreadExecutor(new DaemonThreadFactory(getClass()
                    .getName()));

    // run the write task
    localWriteFuture = localWriteService.submit(newWriteTask());

}

/**
 * Return <code>true</code> iff we are allowed to compact buffers. The
 * default implementation of the {@link WriteCache} is for a Worm and can
 * never compact.
 * <p>
 * Note: This method is package private for access by
 * {@link WriteCacheService}.
 */
protected boolean canCompact() {

    return false;

}

/**
 * Called from {@link IBufferStrategy#commit()} and {@link #reset()} to
 * reset WriteCache sequence for HA synchronization. The return value winds
 * up propagated to the {@link IRootBlockView#getBlockSequence()} field in
 * the {@link IRootBlockView}s.
 *
 * @return The value of the counter before this method was called.
 */
public long resetSequence() {

    return cacheSequence.getAndSet(0L);

}

// Write cache block sequence counter (see resetSequence() / getSequence()).
private final AtomicLong cacheSequence = new AtomicLong(0);

/**
 * Return the then current write cache block sequence number.
 */
public long getSequence() {

    return cacheSequence.get();

}

/**
 * Determines how long the dirty list should grow until the
 * {@link WriteCache} buffers are coalesced and/or written to disk.
 * <p>
 * Note: For the WORM there is no advantage to any buffering, but the
 * RWStore may recycle storage, so: 1) Writes can be avoided if delayed 2)
 * Buffers could potentially be compacted, further delaying writes.
 * <p>
 * Note: This MUST BE GTE ONE (1) since WriteTask.call() will otherwise drop
 * through without actually taking anything off of the dirtyList.
 */
private final int m_dirtyListThreshold;

/**
 * The readCache is managed separately from the writeCache.
 * <p>
 * If active then the readCache may optionally be managed together
 * with a hotList, to which frequently read buffers are transferred.
 * <p>
 * Data is added to the readCache:
 * <li>after an evicted WriteCache is written to disk/HA
 * <li>on a cache miss, disk reads are added to the cache
 * <p>
 * Data is added to the hotList when a readCache is evicted from the
 * readList. resetWith uses the hitCount associated with live data
 * records to determine which data is transferred to the hotList.
 * <p>
 * When a readCache is evicted from the hotList, the entire cache
 * is moved to the readList.
 */
private final int readListSize;

/**
 * The readList - maximum of {@link #readListSize}
 */
final private BlockingQueue<ReadCache> readList;

/**
 * Determines the size of the HIRS cache (will be zero if disabled)
 * <p>
 * Where HIRS captures High inter-reference vs Low inter-reference of
 * LIRS.
 * <p>
 * The HIRS cache is used in conjunction with the readCache which a naive
 * copying strategy would be a kind of LIRS cache. Instead, cache hits
 * from "older" read cache records are copied to the HIRS cache which
 * should be recycled more slowly.
 * <p>
 * Once the HIRS cache is full (maximum number of buffers in use) then
 * the per-record hit count is used to determine which records are
 * transferred to be maintained.
 */
private final int hotListSize;

/**
 * The hotList - maximum of {@link #hotListSize} - populated lazily from
 * cleanList
 */
final private BlockingQueue<ReadCache> hotList;

/**
 * The current hotCache.
 * <p>
 * Note: Guarded by the {@link #readCache} reference.
 */
private ReadCache hotCache = null;

/**
 * Current hotCacheThreshold above which readCache records are
 * transferred to the hotCache.
 */
final private int hotCacheThreshold;

/**
 * The current hotReserve.
 * <p>
 * Note: Guarded by the {@link #readCache} reference.
 */
private ReadCache hotReserve = null;

// /**
// * Computes modular distance of a circular number list.
// *
// * eg start: 1, end:5, mod: 20 = 5-1 = ((5+20)-1)%20 = 4
// * or start:15, end:3, mod: 20 = (3+20)-15 = 8
// *
// * Used to determine the position of a cache from front of
// * the clean list
// */
// private int modDistance(final int start, final int end, final int mod) {
// return ((end + mod) - start) % mod;
// }

/**
 * When <code>true</code>, dirty buffers are immediately drained, compacted,
 * and then written out to the backing media and (in HA mode) to the
 * followers.
 */
private volatile boolean flush = false;

/**
 * When <code>true</code> any dirty buffers are written directly and never
 * compacted. This is only used in flush() when adding any compactingCache
 * to the dirty list.
 */
private volatile boolean directWrite = false;

/**
 * Factory for the {@link WriteTask} which drains dirty buffers to the
 * backing channel.
 */
protected Callable<Void> newWriteTask() {

    return new WriteTask();

}

/**
 * The task responsible for writing dirty buffers onto the backing channel
 * and onto the downstream {@link Quorum} member if the service is highly
 * available.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
 *         Thompson</a>
 */
class WriteTask implements Callable<Void> {

    // Heap buffer used to checksum whole cache blocks; allocated in call()
    // iff [quorum] is non-null (HA), otherwise left null.
    private ByteBuffer checksumBuffer;

    /**
     * Note: If there is an error in this thread then it needs to be
     * propagated to the threads write()ing on the cache or awaiting flush()
     * and from there back to the caller and an abort(). We do not need to
     * bother the readers since the read() methods all allow for concurrent
     * close() and will return null rather than bad data. The reprovisioning
     * of the write cache service (e.g., by reset()) must hold the writeLock
     * so as to occur when there are no outstanding reads executing against
     * the write cache service.
     *
     * @todo If resynchronization rolls back the lastCommitTime for a store,
     *       then we need to interrupt or otherwise invalidate any readers
     *       with access to historical data which is no longer part of the
     *       quorum.
         */
        public Void call() throws Exception {
            try {
                if (quorum != null) {
                    // allocate heap byte buffer for whole buffer checksum.
                    checksumBuffer = ByteBuffer.allocate(writeBuffers[0].peek()
                            .capacity());
                } else {
                    checksumBuffer = null;
                }
                doRun();
                return null;
            } catch (InterruptedException t) {
                /*
                 * This task can only be interrupted by a thread with its
                 * Future (or by shutting down the thread pool on which it
                 * is running), so this interrupt is a clear signal that the
                 * write cache service is closing down.
                 */
                return null;
            } catch (Throwable t) {
                if (InnerCause.isInnerCause(t, AsynchronousCloseException.class)) {
                    /*
                     * The service was shutdown. We do not want to log an
                     * error here since this is normal shutdown. close()
                     * will handle all of the Condition notifies.
                     */
                    return null;
                }
                /*
                 * Anything else is an error and halts processing. Error
                 * processing MUST do a high-level abort() and MUST do a
                 * reset() if this WriteCacheService instance will be
                 * reused.
                 * 
                 * Note: If a WriteCache was taken from the dirtyList above
                 * then it will have been dropped. However, all of the
                 * WriteCache instances owned by the WriteCacheService are
                 * in [buffers] and reset() is written in terms of [buffers]
                 * precisely so we do not lose buffers here.
                 */
                if (firstCause.compareAndSet(null/* expect */, t/* update */)) {
                    halt = true;
                }
                /*
                 * Signal anyone blocked on the dirtyList or cleanList
                 * Conditions. They need to notice the change in [halt] and
                 * wrap and rethrow [firstCause].
                 */
                dirtyListLock.lock();
                try {
                    dirtyListEmpty.signalAll();
                    dirtyListChange.signalAll();
                } finally {
                    dirtyListLock.unlock();
                }
                cleanListLock.lock();
                try {
                    cleanListNotEmpty.signalAll();
                } finally {
                    cleanListLock.unlock();
                }
                log.error(t, t);
                /*
                 * Halt processing. The WriteTask must be restarted by
                 * reset.
                 */
                return null;
            } finally {
                /*
                 * Clear compactingCache reference now that the WriteTask is
                 * known to be terminated.
                 */
                compactingCacheRef.set(null); // clear reference.
                checksumBuffer = null;
            }
        } // call()

        /**
         * Main WriteTask loop: await a dirty cache buffer, then either
         * compact it or evict it onto the backing channel (and HA pipeline),
         * looping until interrupted or {@link #halt halted}.
         */
        private void doRun() throws Exception {

            while (true) {

                /*
                 * Replace assert !halt; since it is set in WriteCacheService.close()
                 */
                if (halt) {
                    throw new RuntimeException(firstCause.get());
                }

                // Await dirty cache buffer.
                final WriteCache cache = awaitDirtyBuffer();

                boolean didCompact = false;
                boolean didWrite = false;

                /*
                 * Note: When using a large number of write cache buffers and a
                 * bulk data load, it is not uncommon for all records to be
                 * recycled by the time we take something from the dirtyList, in
                 * which case the cache will be (logically) empty.
                 * 
                 * Note: This test (WriteCache.isEmpty()) is not decisive
                 * because we are not holding any locks across it and the
                 * subsequent actions. Therefore, it is possible that the cache
                 * will become empty after it has been tested through concurrent
                 * clearWrite() invocations. That should not be a problem. We
                 * want to leave the cache open (versus closing it against
                 * writes) in case we decide to compact the cache rather than
                 * evicting it. The cache MUST NOT be closed for writes when we
                 * compact it or we will lose the ability to clear recycled
                 * records out of that WriteCache.
                 */
                final boolean wasEmpty = cache.isEmpty();

                if (!wasEmpty) {

                    final int percentEmpty = cache.potentialCompaction();

                    if (compactionEnabled && !directWrite
                            && percentEmpty >= compactionThreshold) {

                        if (log.isDebugEnabled())
                            log.debug("percentEmpty=" + percentEmpty + "%");

                        // Attempt to compact cache block.
                        if (compactCache(cache)) {

                            // [cache] is clean and empty.
                            assert cache.isEmpty();

                        } else {

                            // Write cache block if did not compact.
                            writeCacheBlock(cache);

                            didWrite = true;

                        }

                        // Note: set even when compaction failed and the block
                        // was written; this flag means "compaction attempted"
                        // and is used only for logging below.
                        didCompact = true;

                    } else {

                        // Write cache block.
                        writeCacheBlock(cache);

                        didWrite = true;

                    }

                }

                // Now written/compacted, remove from dirtyList.
if (dirtyList.take() != cache) throw new AssertionError(); counters.get().ndirty--; dirtyListLock.lockInterruptibly(); try { if (dirtyList.isEmpty()) { /* * Signal Condition when we release the * dirtyListLock. */ dirtyListEmpty.signalAll(); } } finally { dirtyListLock.unlock(); } addClean(cache, false/* addFirst */); if (!wasEmpty && log.isInfoEnabled()) { final WriteCacheServiceCounters tmp = counters.get(); final long nhit = tmp.nhit.get(); final long ntests = nhit + tmp.nmiss.get(); final int hitRate = (int) (100 * ((ntests == 0L ? 0d : (double) nhit / ntests))); final WriteCacheServiceCounters c = counters.get(); log.info("WriteCacheService: bufferCapacity=" + writeBuffers[0].capacity() + ",nbuffers=" + tmp.nbuffers + ",nclean=" + tmp.nclean + ",ndirty=" + tmp.ndirty + ",maxDirty=" + tmp.maxdirty + ",hitRate=" + hitRate + ",empty=" + wasEmpty + ",didCompact=" + didCompact + ",didWrite=" + didWrite + ",ncompact=" + c.ncompact + ",nbufferEvictedToChannel=" + c.nbufferEvictedToChannel); } } // while(true) } // doRun() /** * We choose here whether to compact the cache. * * 1) Reserve extra clean buffer, if none available do NOT attempt * compaction 2) Compact to "current" compacting buffer avoiding * contention with writing threads 3) If required replace current * compacting buffer with reserved, adding compacting buffer to dirty * list 4) Release compacted * * @return <code>true</code> iff we compacted the cache. * * @throws InterruptedException */ private boolean compactCache(final WriteCache cache) throws InterruptedException, Exception { /* * The cache should not be closed against writes. If it were closed * for writes, then we would no longer be able to capture cleared * writes in the RecordMap. However, if we compact the cache, we * want any cleared writes to be propagated into the compacted * cache. 
             */
            assert !cache.isClosedForWrites();

            if (compactingReserveRef.get() == null) {
                final WriteCache tmp = getDirectCleanCache();
                if (tmp == null)
                    return false; // cannot guarantee compaction
                tmp.resetWith(serviceMap); // should be NOP!
                compactingReserveRef.set(tmp);
            }

            /*
             * We can be certain to be able to compact.
             */

            /*
             * Grab the [compactingCache] (if any).
             */
            WriteCache curCompactingCache = null;
            // Note: dirtyListLock is held across the entire transfer and is
            // released in the finally block below (the commented-out
            // unlock/relock shows the rejected alternative).
            dirtyListLock.lockInterruptibly();
            try {
                // Might be null.
                curCompactingCache = compactingCacheRef.getAndSet(null);
//            } finally {
//                dirtyListLock.unlock();
//            }
//            try {
                boolean done = false;
                if (curCompactingCache != null) {
                    if (log.isTraceEnabled())
                        log.trace("Transferring to curCompactingCache");
                    done = WriteCache.transferTo(cache/* src */,
                            curCompactingCache/* dst */, serviceMap, 0/*threshold*/);
                    if (done) {
                        // Everything was compacted. Send just the address metadata (empty cache block).
                        sendAddressMetadata(cache);
                        
                        if (log.isDebugEnabled())
                            log.debug("RETURNING RESERVE: curCompactingCache.bytesWritten="
                                    + curCompactingCache.bytesWritten());

                        return true;
                    }
                    /*
                     * The [curCompactingCache] is full.
                     */
                    if (flush) {
                        /*
                         * Send out the full cache block.
                         */
                        writeCacheBlock(curCompactingCache);
                        addClean(curCompactingCache, true/* addFirst */);
                        if (log.isTraceEnabled())
                            log.trace("Flushed curCompactingCache");
                    } else {
                        /*
                         * Add current compacting cache to dirty list.
                         */
                        dirtyList.add(curCompactingCache);
                        if (log.isTraceEnabled())
                            log.trace("Added curCompactingCache to dirtyList");
                    }
                    // fall through. fill in the reserve cache next.
                    curCompactingCache = null;
                }

                /*
                 * Clear the state on the reserve buffer and remove from
                 * cacheService map.
                 */
                if (log.isTraceEnabled())
                    log.trace("Setting curCompactingCache to reserve");

                curCompactingCache = compactingReserveRef.getAndSet(null);
                {
                    final WriteCache tmp = getDirectCleanCache();
                    if (tmp != null) {
                        tmp.resetWith(serviceMap); // should be NOP!
                        compactingReserveRef.set(tmp);
                    }
                }
                if (log.isTraceEnabled())
                    log.trace("Transferring to curCompactingCache");
                done = WriteCache.transferTo(cache/* src */,
                        curCompactingCache/* dst */, serviceMap, 0/*threshold*/);

                if (!done) {
                    throw new AssertionError(
                            "We must be able to compact the cache");
                }
                if (log.isDebugEnabled())
                    log.debug("USING RESERVE: curCompactingCache.bytesWritten="
                            + curCompactingCache.bytesWritten());
                sendAddressMetadata(cache);
                // Buffer was compacted.
                return true;
            } finally {
//                dirtyListLock.lock();
                try {
                    // Now reset compactingCache with dirtyListLock held
                    compactingCacheRef.set(curCompactingCache);
                    counters.get().ncompact++;
                } finally {
                    dirtyListLock.unlock();
                }
            }

        } // compactCache()

        /**
         * In HA, we need to notify a downstream RWS of the addresses that have
         * been allocated on the leader in the same order in which the leader
         * made those allocations. This information is used to infer the order
         * in which the allocators for the different allocation slot sizes are
         * created. This method will synchronously send those address notices and
         * and also makes sure that the followers see the recycled addresses
         * records so they can keep both their allocators and the actual
         * allocations synchronized with the leader.
         * 
         * @param cache
         *            A {@link WriteCache} whose data has been transfered into
         *            another {@link WriteCache} through a "compact" operation.
         * 
         * @throws IllegalStateException
         * @throws InterruptedException
         * @throws ExecutionException
         * @throws IOException
         * 
         * @see <a href="http://trac.blazegraph.com/ticket/721"> HA1 </a>
         */
        private void sendAddressMetadata(final WriteCache cache)
                throws IllegalStateException, InterruptedException,
                ExecutionException, IOException {

            if (quorum == null) { //|| !quorum.isHighlyAvailable()
//                    || !quorum.getClient().isLeader(quorumToken)) {
                return;
            }

            if (cache.prepareAddressMetadataForHA()) {

                writeCacheBlock(cache);

            }

        }

        /**
         * Get a dirty cache buffer.
Unless we are flushing out the buffered * writes, we will allow the dirtyList to grow to the desired threshold * before we attempt to compact anything. * <p> * Note: This DOES NOT remove the {@link WriteCache} from the * {@link #dirtyList}. It uses a peek(). The {@link WriteCache} will * remain on the {@link #dirtyList} until it has been handled by * {@link #doRun()}. * * @return A dirty {@link WriteCache}. */ private WriteCache awaitDirtyBuffer() throws InterruptedException { dirtyListLock.lockInterruptibly(); try { assert m_dirtyListThreshold >= 1 && m_dirtyListThreshold <= writeBuffers.length : "dirtyListThreshold=" + m_dirtyListThreshold + ", #buffers=" + writeBuffers.length; /* * Wait for a dirty buffer. * * Note: [flush] and [m_dirtyListThreshold] can change * during this loop! */ while (true) { if (!flush) { // Let dirtyList grow up to threshold. if (dirtyList.size() < m_dirtyListThreshold && !halt) { dirtyListChange.await(); } else break; } else { // We need to flush things out. if (dirtyList.isEmpty() && !halt) { dirtyListChange.await(); } else break; } } if (halt) throw new RuntimeException(firstCause.get()); // update counters. final WriteCacheServiceCounters c = counters.get(); c.ndirty = dirtyList.size(); if (c.maxdirty < c.ndirty) c.maxdirty = c.ndirty; // Guaranteed available. final WriteCache cache = dirtyList.peek(); if (cache == null) throw new AssertionError(); // System.err.println(cache.toString()); return cache; } finally { dirtyListLock.unlock(); } } /** * Write the {@link WriteCache} onto the disk and the HA pipeline. * * @param cache * The {@link WriteCache}. * * @throws InterruptedException * @throws ExecutionException * @throws IOException */ private void writeCacheBlock(final WriteCache cache) throws InterruptedException, ExecutionException, IOException { /** * IFF HA and this is the quorum leader. * * Note: This is true for HA1 as well. The code path enabled by this * is responsible for writing the HALog files. 
             * 
             * @see <a href="http://trac.blazegraph.com/ticket/721"> HA1 </a>
             */
            final boolean isHALeader = quorum != null
                    && quorum.getClient().isLeader(quorumToken);

            /*
             * Ensure nothing will modify this buffer before written to disk or
             * HA pipeline.
             * 
             * Note: Do NOT increment the cacheSequence here. We need to decide
             * whether or not the buffer is empty first, and it needs to be
             * closed for writes before we can make that decision.
             */

            // Must be closed for writes.
            cache.closeForWrites();

            /*
             * Test for an empty cache.
             * 
             * Note: We can not do this until the cache has been closed for
             * writes.
             */
            {
                final ByteBuffer b = cache.peek();
                if (b.position() == 0) {
                    // Empty cache.
                    return;
                }
            }

            // Increment WriteCache sequence.
            final long thisSequence = cacheSequence.getAndIncrement();
//            cache.setSequence(thisSequence);

            // Set the current file extent on the WriteCache.
            cache.setFileExtent(fileExtent.get());

            if (isHALeader) {//quorum != null && quorum.isHighlyAvailable()) {

                // Verify quorum still valid and we are the leader.
                quorum.assertLeader(quorumToken);

                /*
                 * Replicate from the leader to the first follower. Each
                 * non-final follower will receiveAndReplicate the write cache
                 * buffer. The last follower will receive the buffer.
                 */

                // send to 1st follower.
                @SuppressWarnings("unchecked")
                final QuorumPipeline<HAPipelineGlue> quorumMember = (QuorumPipeline<HAPipelineGlue>) quorum
                        .getMember();

                assert quorumMember != null : "Not quorum member?";

                final WriteCache.HAPackage pkg = cache.newHAPackage(//
                        quorumMember.getStoreUUID(),//
                        quorumToken,//
                        quorumMember.getLastCommitCounter(),//
                        quorumMember.getLastCommitTime(),//
                        thisSequence,//
                        replicationFactor,//
                        checksumBuffer
                        );

                assert pkg.getData().remaining() > 0 : "Empty cache: " + cache;

                /*
                 * Start the remote asynchronous IO before the local synchronous
                 * IO.
                 * 
                 * Note: In HA with replicationFactor=1, this should still
                 * attempt to replicate the write cache block in case there is
                 * someone else in the write pipeline (for example, off-site
                 * replication).
                 */

                /*
                 * FIXME There may be a problem with doing the async IO first.
                 * Track this down and document the nature of the problem,
                 * then clean up the documentation here (see the commented
                 * out version of this line below).
                 */
                quorumMember.logWriteCacheBlock(pkg.getMessage(), pkg.getData().duplicate());

                /*
                 * TODO Do we want to always support the replication code path
                 * when a quorum exists (that is, also for HA1) in case there
                 * are pipeline listeners that are not HAJournalServer
                 * instances? E.g., for offsite replication?
                 */
                if (quorum.replicationFactor() > 1) {

                    // ASYNC MSG RMI + NIO XFER.
                    remoteWriteFuture = quorumMember.replicate(null/* req */,
                            pkg.getMessage(), pkg.getData().duplicate());

                    counters.get().nsend++;

                }

                /*
                 * The quorum leader logs the write cache block here. For the
                 * followers, the write cache blocks are currently logged by
                 * HAJournalServer.
                 */
//                quorumMember.logWriteCacheBlock(msg, b.duplicate());

            }

            /*
             * Do the local IOs (concurrent w/ remote replication).
             * 
             * Note: This will not throw out an InterruptedException unless this
             * thread is actually interrupted. The local storage managers all
             * trap asynchronous close exceptions arising from the interrupt of
             * a concurrent IO operation and retry until they succeed.
             */
            {

                if (log.isDebugEnabled())
                    log.debug("Writing to file: " + cache.toString());

                final long begin = System.nanoTime();

                final long nrecs = cache.recordMap.size(); // #of records in the write cache block.
try { // Flush WriteCache buffer to channel (write on disk) cache.flush(false/* force */); } finally { // See BLZG-1589 (new latency-oriented counters) final long elapsed = System.nanoTime() - begin; final WriteCacheServiceCounters c = counters.get(); c.nbufferEvictedToChannel++; c.nrecordsEvictedToChannel += nrecs; c.elapsedBufferEvictedToChannelNanos += elapsed; } } /* * Wait for the downstream IOs to finish. * * Note: Only the leader is doing replication of the WriteCache * blocks from this thread and only the leader will have a non-null * value for the [remoteWriteFuture]. The followers are replicating * to the downstream nodes in QuorumPipelineImpl. Since the WCS * absorbs a lot of latency, replication from QuorumPipelineImpl * should be fine. */ if (remoteWriteFuture != null) { // Wait for the downstream IOs to finish. remoteWriteFuture.get(); } } // writeCacheBlock() } // class WriteTask /** * Factory for {@link WriteCache} implementations. * * @param buf * The backing buffer (optional). * @param useChecksum * <code>true</code> iff record level checksums are enabled. * @param bufferHasData * <code>true</code> iff the buffer has data to be written onto * the local persistence store (from a replicated write). * @param opener * The object which knows how to re-open the backing channel * (required). * @param fileExtent * The then current extent of the backing file. * * @return A {@link WriteCache} wrapping that buffer and able to write on * that channel. * * @throws InterruptedException */ abstract public WriteCache newWriteCache(IBufferAccess buf, boolean useChecksum, boolean bufferHasData, IReopenChannel<? extends Channel> opener, final long fileExtent) throws InterruptedException; /** * {@inheritDoc} * <p> * All dirty buffers are reset and transferred to the head of the clean * list. The buffers on the clean list are NOT reset since they may contain * valid cached reads (data which is known to be on the disk). 
 We do not
     * want to discard the read cache on reset().
     * <p>
     * Note: This approach deliberately does not cause any buffers belonging to
     * the caller of {@link #writeChk(long, ByteBuffer, int)} to become part of
     * the {@link #cleanList}.
     * <p>
     * Note: <strong>You MUST set the {@link #setExtent(long) file extent}
     * </strong> after {@link #reset() resetting} the {@link WriteCacheService}.
     * This is necessary in order to ensure that the correct file extent is
     * communicated along the write replication pipeline when high availability
     * is enabled.
     * <p>
     * Note: {@link #reset()} MUST NOT interrupt readers. It should only reset
     * those aspects of the write cache state that are associated with writes.
     * On the other hand, {@link #close()} must close all buffers and must not
     * permit readers to read from closed buffers.
     */
    public void reset() throws InterruptedException {

        final WriteLock writeLock = lock.writeLock();
        writeLock.lockInterruptibly();
        try {

            if (!open.get()) {
                // Reset can not recover from close().
                throw new IllegalStateException(firstCause.get());
            }

            /*
             * Note: The WriteTask must use lockInterruptably() so it will
             * notice when it is interrupted by cancel().
             */

            // cancel the current WriteTask.
            localWriteFuture.cancel(true/* mayInterruptIfRunning */);
            final Future<?> rwf = remoteWriteFuture;
            if (rwf != null) {
                // Note: Cancel of remote Future is RMI!
                try {
                    rwf.cancel(true/* mayInterruptIfRunning */);
                } catch (Throwable t) {
                    log.warn(t, t);
                }
            }

            /*
             * Drain and reset the dirty cache buffers, dropping them onto the
             * cleanList.
             */
            drainAndResetDirtyList();

            /*
             * Now that we have sent all the signal()s we know how to send, go
             * ahead and wait for the WriteTask to notice and terminate.
             */
            try {

                // wait for it
                localWriteFuture.get();

            } catch (Throwable t) {

                // ignored.

            } finally {

                /*
                 * Once more, drain and reset the dirty cache buffers, dropping
                 * them onto the cleanList.
                 * 
                 * Note: This is intended to handle the case where there might
                 * be concurrency in WriteTask.call() such that we did not get
                 * all of the dirty buffers the first time we called this method
                 * above.
                 * 
                 * Note: This will ignore the [compactingReserve]. That
                 * WriteCache is always clean and can stay where it is.
                 */
                drainAndResetDirtyList();

                /*
                 * Verify some post-conditions once the WriteTask is terminated.
                 */
                dirtyListLock.lockInterruptibly();
                try {
                    if (!dirtyList.isEmpty())
                        throw new AssertionError();
                } finally {
                    dirtyListLock.unlock();
                }

                if (compactingCacheRef.get() != null)
                    throw new AssertionError();

                // ensure cleanList is not empty after WriteTask terminates, handling single buffer case
                cleanListLock.lockInterruptibly();
                try {
                    if (writeBuffers.length > 1 && cleanList.isEmpty())
                        throw new AssertionError();
                } finally {
                    cleanListLock.unlock();
                }

            }

            /*
             * Note: DO NOT clear the service record map. This still has valid
             * cache entries (the read cache).
             */
//            // clear the service record map.
//            recordMap.clear();
//            
//            // reset each buffer.
//            for (WriteCache t : buffers) {
//                t.reset();
//            }

            /*
             * Make sure the [current] is reset and non-null.
             */
            {
                final WriteCache x = current.get();
                if (x != null) {
                    // reset if found.
                    x.resetWith(serviceMap);
                    // addClean(x, true/* addFirst */);
                } else {
                    // Non-blocking take.
                    final WriteCache t = cleanList.poll();
                    if (t == null)
                        throw new AssertionError();
                    if (!current.compareAndSet(null/* expect */, t/* update */)) {
                        // Concurrently set.
                        throw new AssertionError();
                    }
                }
            }

//            // set readCache
//            if (buffers.length > 1) {
//                readCache.set(buffers[1]);
//                buffers[1].closeForWrites();
//            }
//
//            // re-populate the clean list with remaining buffers
//            for (int i = 2; i < buffers.length; i++) {
//                cleanList.put(buffers[i]);
//            }

            // reset the counters.
{ final WriteCacheServiceCounters c = counters.get(); c.ndirty = 0; c.nclean = writeBuffers.length-1; c.nreset++; } // reset cacheSequence for HA resetSequence(); /* * Restart the WriteTask * * Note: don't do Future#get() for the remote Future. The task was * cancelled above and we don't want to wait on RMI (for the remote * Future). The remote service will have to handle any problems on * its end when resynchronizing if it was disconnected and did not * see our cancel() message. */ // if (rwf != null) { // try { // rwf.get(); // } catch (Throwable t) { // // ignored. // } // } this.localWriteFuture = localWriteService.submit(newWriteTask()); this.remoteWriteFuture = null; // clear the file extent to an illegal value. fileExtent.set(-1L); counters.get().nreset++; flush = false; } finally { writeLock.unlock(); } } public void resetAndClear() throws InterruptedException { final WriteLock writeLock = lock.writeLock(); writeLock.lockInterruptibly(); try { reset(); /* * Note: DO NOT clear the service record map. This still has valid * cache entries (the read cache). */ // clear the service record map. serviceMap.clear(); // reset each buffer. for (WriteCache t : writeBuffers) { t.reset(); } } finally { writeLock.unlock(); } } /** * Drain the dirty list; reset each dirty cache buffer, and then add the * reset buffers to the front of the cleanList (since they are known to be * empty). * * @throws InterruptedException */ private void drainAndResetDirtyList() throws InterruptedException { final List<WriteCache> c = new LinkedList<WriteCache>(); // drain the dirty list. dirtyListLock.lockInterruptibly(); try { dirtyList.drainTo(c); dirtyListEmpty.signalAll(); dirtyListChange.signalAll(); // NB: you must verify // Condition once signaled! } finally { dirtyListLock.unlock(); } // Reset dirty cache buffers and add to cleanList. 
cleanListLock.lockInterruptibly(); try { for (WriteCache x : c) { x.resetWith(serviceMap); cleanList.addFirst(x); } assert !cleanList.isEmpty(); cleanListNotEmpty.signalAll(); counters.get().nclean = cleanList.size(); } finally { cleanListLock.unlock(); } } public void close() { //throws InterruptedException { if (!open.compareAndSet(true/* expect */, false/* update */)) { // Already closed, so this is a NOP. return; } /* * Set [firstCause] and [halt] to ensure that other threads report * errors. * * Note: If the firstCause has not yet been set, then we set it now to a * stack trace which will indicate that the WriteCacheService was * asynchronously closed (that is, it was closed by another thread). */ if (firstCause.compareAndSet(null/* expect */, new AsynchronousCloseException()/* update */)) { halt = true; } // Interrupt the write task. localWriteFuture.cancel(true/* mayInterruptIfRunning */); final Future<?> rwf = remoteWriteFuture; if (rwf != null) { // Note: Cancel of remote Future is RMI! try { rwf.cancel(true/* mayInterruptIfRunning */); } catch (Throwable t) { log.warn(t, t); } } // Immediate shutdown of the write service. localWriteService.shutdownNow(); // // Immediate shutdown of the remote write service (if running). // if (remoteWriteService != null) { // remoteWriteService.shutdownNow(); // } boolean interrupted = false; // Note: Possible code to ensure Futures are terminated.... // // Wait for the Futures. // try { // localWriteFuture.get(); // } catch (Throwable t) { // if (InnerCause.isInnerCause(t, InterruptedException.class)) { // interrupted = true; // } // } // if (rwf != null) { // try { // rwf.get(); // } catch (Throwable t) { // if (InnerCause.isInnerCause(t, InterruptedException.class)) { // interrupted = true; // } // } // } /* * Ensure that the WriteCache buffers are close()d in a timely * manner. */ // reset buffers on the dirtyList. 
dirtyListLock.lock/*Interruptibly*/(); try { dirtyList.drainTo(new LinkedList<WriteCache>()); dirtyListEmpty.signalAll(); dirtyListChange.signalAll(); } finally { dirtyListLock.unlock(); } // close() buffers on the cleanList. cleanListLock.lock/*Interruptibly*/(); try { cleanList.drainTo(new LinkedList<WriteCache>()); } finally { cleanListLock.unlock(); } /* * Note: The lock protects the [current] reference. */ final WriteLock writeLock = lock.writeLock(); writeLock.lock/*Interruptibly*/(); try { // close all buffers. for (WriteCache t : writeBuffers) { try { t.close(); } catch (InterruptedException ex) { interrupted = true; continue; } } // and any ReadCache buffers for (ReadCache t : readBuffers) { try { t.close(); } catch (InterruptedException ex) { interrupted = true; continue; } } // clear reference to the current buffer. current.getAndSet(null); // clear reference to the compactingCache buffer. compactingCacheRef.getAndSet(null); // clear reference to the readCache buffer. readCache.getAndSet(null); synchronized (readCache) { hotCache = null; hotReserve = null; } // clear the service record map. serviceMap.clear(); // clear the file extent to an illegal value. fileExtent.set(-1L); if(interrupted) Thread.currentThread().interrupt(); } finally { writeLock.unlock(); } if (log.isInfoEnabled()) log.info(counters.get().toString()); } /** * Ensures that {@link #close()} is eventually invoked so the buffers can be * returned to the {@link DirectBufferPool}. * * @throws Throwable */ protected void finalized() throws Throwable { close(); } /** * This method is called ONLY by write threads and verifies that the service * is {@link #open}, that the {@link WriteTask} has not been * {@link #halt halted}, and that the {@link WriteTask} is still * executing (in case any uncaught errors are thrown out of * {@link WriteTask#call()}. * <p> * Note: {@link #read(long)} DOES NOT throw an exception if the service is * closed, asynchronously closed, or even just plain dead. 
 It just returns
     * <code>null</code> to indicate that the desired record is not available
     * from the cache.
     * 
     * @throws IllegalStateException
     *             if the service is closed.
     * @throws RuntimeException
     *             if the {@link WriteTask} has failed.
     */
    private void assertOpenForWriter() {

        if (!open.get())
            throw new IllegalStateException(firstCause.get());

        if (halt)
            throw new RuntimeException(firstCause.get());

        if (localWriteFuture.isDone()) {

            /*
             * If the write task terminates abnormally then throw the exception
             * out here.
             */
            try {

                // @todo don't do get() all the time...?
                localWriteFuture.get();

            } catch (Throwable t) {

                throw new RuntimeException(t);

            }

        }

    }

    /**
     * Return the current buffer to a write thread. Once they are done, the
     * caller MUST call {@link #release()}.
     * 
     * @return The buffer.
     * 
     * @throws InterruptedException
     * @throws IllegalStateException
     *             if the {@link WriteCacheService} is closed.
     * @throws RuntimeException
     *             if the service has been {@link #halt halted}
     */
    private WriteCache acquireForWriter() throws InterruptedException,
            IllegalStateException {

        final ReadLock readLock = lock.readLock();

        readLock.lockInterruptibly();

        try {

            /*
             * We only want to throw errors from the WriteTask out of write()
             * and flush(). However, this method is NOT invoked by read() which
             * uses a different non-blocking protocol to access the record if it
             * is in a cache buffer.
             */
            assertOpenForWriter();

            /*
             * Note: acquire() does not block since it holds the ReadLock.
             * Methods which change [current] MUST hold the WriteLock across
             * that operation to ensure that [current] is always non-null since
             * acquire() will not block once it acquires the ReadLock.
             */

            final WriteCache tmp = current.get();

            if (tmp == null) {

                throw new RuntimeException();

            }

            // Note: The ReadLock is still held!
            return tmp;

        } catch (Throwable t) {

            /*
             * Note: release the lock only on the error path.
             */
            readLock.unlock();

            if (t instanceof InterruptedException)
                throw (InterruptedException) t;

            if (t instanceof IllegalStateException)
                throw (IllegalStateException) t;

            throw new RuntimeException(t);

        }

    }

    /**
     * Release the latch on an acquired buffer.
     */
    private void release() {

        /*
         * Note: This is releasing the ReadLock which was left open by
         * acquire().
         */
        lock.readLock().unlock();

    }

    /**
     * Flush the current write set through to the backing channel.
     * 
     * @throws InterruptedException
     */
    public void flush(final boolean force) throws InterruptedException {

        try {

            if (!flush(force, Long.MAX_VALUE, TimeUnit.NANOSECONDS)) {

                throw new RuntimeException();

            }

        } catch (TimeoutException e) {

            throw new RuntimeException(e);

        }

    }

    /**
     * {@inheritDoc}
     * <p>
     * flush() is a blocking method. At most one flush() operation may run at a
     * time. The {@link #current} buffer is moved to the {@link #dirtyList}
     * while holding the {@link WriteLock} and flush() then waits until the
     * dirtyList becomes empty, at which point all dirty records have been
     * written through to the backing file.
     * <p>
     * Note: Any exception thrown from this method MUST trigger error handling
     * resulting in a high-level abort() and {@link #reset()} of the
     * {@link WriteCacheService}.
     * 
     * TODO flush() is currently designed to block concurrent writes() in
     * order to give us clean decision boundaries for the HA write pipeline and
     * also to simplify the internal locking design. Once we get HA worked out
     * cleanly we should explore whether or not we can relax this constraint
     * such that writes can run concurrently with flush(). That would have
     * somewhat higher throughput since mutable B+Tree evictions would no longer
     * cause concurrent tasks to block during the commit protocol or the file
     * extent protocol. [Perhaps by associating each write set with a distinct
     * sequence counter (that is incremented by both commit and abort)?]
     *
     * TODO Flush should order ALL {@link WriteCache}'s on the dirtyList by
     * their fileOffset and then evict them in that order. This reordering will
     * maximize the opportunity for locality during the IOs. With a large write
     * cache (multiple GBs) this reordering could substantially reduce the
     * IOWait associated with flush() for a large update. Note: The reordering
     * should only be performed by the leader in HA mode - the followers will
     * receive the {@link WriteCache} blocks in the desired order and can just
     * drop them onto the dirtyList.
     *
     * @see WriteTask
     * @see #dirtyList
     * @see #dirtyListEmpty
     */
    public boolean flush(final boolean force, final long timeout,
            final TimeUnit units) throws TimeoutException, InterruptedException {

        if (haLog.isInfoEnabled()) {
            /*
             * Note: This is an important event for HA. The write cache is
             * flushed to ensure that the entire write set is replicated on the
             * followers. Once that has been done, HA will do a 2-phase commit
             * to verify that there is a quorum that agrees to write the root
             * block. Writing the root block is the only thing that the nodes in
             * the quorum need to do once the write cache has been flushed.
             */
            haLog.info("Flushing the write cache: seq=" + cacheSequence);
        }

        final long begin = System.nanoTime();
        final long nanos = units.toNanos(timeout);
        long remaining = nanos;

        // Exclude concurrent writers for the duration of the flush.
        final WriteLock writeLock = lock.writeLock();
        if (!writeLock.tryLock(remaining, TimeUnit.NANOSECONDS))
            throw new TimeoutException();
        try {

            // Detach the current buffer; it is handed to the dirtyList below.
            final WriteCache tmp = current.getAndSet(null);

            /*
             * Note: An earlier (commented out) version of this code
             * special-cased an empty [tmp] by simply spinning down the
             * dirtyList. The current code handles an empty buffer uniformly:
             * WriteTask processes an empty buffer at essentially zero cost,
             * which keeps the logic here much simpler.
             */

            // remaining := (total - elapsed).
            remaining = nanos - (System.nanoTime() - begin);

            if (!dirtyListLock.tryLock(remaining, TimeUnit.NANOSECONDS))
                throw new TimeoutException();
            try {
                /*
                 * Force WriteTask.call() to evict anything in the cache.
                 *
                 * Note: We need to wait until the dirtyList has been evicted
                 * before writing out the compacting cache (if any) and then
                 * finally drop the compactingCache onto the cleanList. Or have
                 * a 2-stage flush.
                 */
                flush = true;

                /*
                 * Wait until the dirtyList has been emptied.
                 *
                 * Note: [tmp] may be empty, but there is basically zero cost in
                 * WriteTask to process an empty buffer and, done this way, the
                 * code is much less complex here.
                 */
                dirtyList.add(tmp);
                counters.get().ndirty++;
                dirtyListChange.signalAll();
                while (!dirtyList.isEmpty() && !halt) {
                    // remaining := (total - elapsed).
                    remaining = nanos - (System.nanoTime() - begin);
                    if (!dirtyListEmpty.await(remaining, TimeUnit.NANOSECONDS)) {
                        throw new TimeoutException();
                    }
                }

                /*
                 * Add the [compactingCache] (if any) to the dirty list and spin
                 * it down again.
                 *
                 * Note: We can not drop the compactingCache onto the dirtyList
                 * until the dirtyList has been spun down to empty.
                 *
                 * Note: The directWrite state variable indicates that the
                 * compactingCache must not be compacted or it may not be
                 * written.
                 */
                final WriteCache tmp2 = compactingCacheRef.getAndSet(null/* newValue */);
                if (tmp2 != null) {
                    directWrite = true;
                    try {
                        if (log.isInfoEnabled()) {
                            log.info("Adding compacting cache");
                        }
                        dirtyList.add(tmp2);
                        counters.get().ndirty++;
                        dirtyListChange.signalAll();
                        while (!dirtyList.isEmpty() && !halt) {
                            // remaining := (total - elapsed).
                            remaining = nanos - (System.nanoTime() - begin);
                            if (!dirtyListEmpty.await(remaining,
                                    TimeUnit.NANOSECONDS)) {
                                throw new TimeoutException();
                            }
                        }
                    } finally {
                        directWrite = false;
                    }
                }

                if (halt)
                    throw new RuntimeException(firstCause.get());

            } finally {
                flush = false;
                try {
                    if (!halt) {
                        /*
                         * Check assertions for clean WCS after flush().
                         *
                         * Note: Can not check assertion if there is an existing
                         * exception.
                         */
                        assert dirtyList.size() == 0;
                        assert compactingCacheRef.get() == null;
                        assert current.get() == null;
                    }
                } finally {
                    dirtyListLock.unlock();
                }
            }

            /*
             * Replace [current] with a clean cache buffer.
             */
            // remaining := (total - elapsed).
            remaining = nanos - (System.nanoTime() - begin);
            if (!cleanListLock.tryLock(remaining, TimeUnit.NANOSECONDS))
                throw new TimeoutException();
            try {
                // Note: use of Condition let's us notice [halt].
                while (cleanList.isEmpty() && !halt) {
                    // remaining := (total - elapsed).
                    remaining = nanos - (System.nanoTime() - begin);
                    if (!cleanListNotEmpty.await(remaining, TimeUnit.NANOSECONDS)) {
                        throw new TimeoutException();
                    }
                    if (halt)
                        throw new RuntimeException(firstCause.get());
                }
                // Guaranteed available hence non-blocking.
                // NOTE(review): take() is only guaranteed non-blocking if no
                // other thread can drain the cleanList without holding
                // cleanListLock, but getDirectCleanCache() poll()s the list
                // unprotected (and takeFromClean() uses poll() in a loop for
                // exactly that reason) — confirm take() cannot block here.
                final WriteCache nxt = cleanList.take();
                counters.get().nclean--;
                // Note: should already be pristine
                nxt.resetWith(serviceMap);//, fileExtent.get());
                current.set(nxt);
                if (haLog.isInfoEnabled())
                    haLog.info("Flushed the write cache: seq=" + cacheSequence);
                return true;
            } finally {
                cleanListLock.unlock();
            }

        } finally {
            writeLock.unlock();
        }

    }

    /**
     * Set the extent of the file on the current {@link WriteCache}.
     * The then
     * current value of the extent will be communicated together with the rest
     * of the {@link WriteCache} state if it is written onto another service
     * using the write replication pipeline (HA only). The receiver will use the
     * value read from the {@link WriteCache} message to adjust the extent of
     * its backing file.
     * <p>
     * Note: Changes in the file extent for persistence store implementations
     * MUST (a) be mutually exclusive with reads and writes on the backing file
     * (due to a JVM bug); and (b) force the file data and the file metadata to
     * the disk. Thus any change in the {@link #fileExtent} MUST be followed by
     * a {@link #flush(boolean, long, TimeUnit)}.
     * <p>
     * Note: You MUST set the file extent each time you invoke {@link #reset()}
     * so the {@link WriteCacheService} is always aware of the correct file
     * extent.
     *
     * @throws InterruptedException
     * @throws IllegalStateException
     */
    public void setExtent(final long fileExtent)
            throws IllegalStateException, InterruptedException {

        if (fileExtent < 0L)
            throw new IllegalArgumentException();

        // Dead code from an earlier design that latched the current
        // WriteCache while setting the extent:
        // final WriteCache cache = acquireForWriter();
        // try {

        if (log.isDebugEnabled())
            log.debug("Set fileExtent: " + fileExtent);

        // make a note of the current file extent.
        this.fileExtent.set(fileExtent);

        // // set the current file extent on the WriteCache.
        // cache.setFileExtent(fileExtent);
        // } finally {
        // release();
        // }

    }

    @Override
    public boolean write(final long offset, final ByteBuffer data, final int chk)
            throws InterruptedException, IllegalStateException {

        // Delegate with the service-level [useChecksum] policy; no latched
        // address (RWStore-only concept) for this entry point.
        return write(offset, data, chk, useChecksum, 0/* latchedAddr */);

    }

    /**
     * Write the record onto the cache. If the record is too large for the cache
     * buffers, then it is written synchronously onto the backing channel.
     * Otherwise it is written onto a cache buffer which is lazily flushed onto
     * the backing channel. Cache buffers are written in order once they are
     * full. This method does not impose synchronization on writes which fit the
     * capacity of a cache buffer.
     * <p>
     * When integrating with the {@link RWStrategy} or the {@link WORMStrategy}
     * there needs to be a read/write lock such that file extension is mutually
     * exclusive with file read/write operations (due to a Sun bug). The caller
     * can override {@link #newWriteCache(ByteBuffer, IReopenChannel)} to
     * acquire the necessary lock (the read lock of a {@link ReadWriteLock}).
     * This is even true when the record is too large for the cache since we
     * delegate the write to a temporary {@link WriteCache} wrapping the
     * caller's buffer.
     * <p>
     * Note: Any exception thrown from this method MUST trigger error handling
     * resulting in a high-level abort() and {@link #reset()} of the
     * {@link WriteCacheService}.
     *
     * @param latchedAddr The latched address (RWStore only).
     *
     * @return <code>true</code> since the record is always accepted by the
     *         {@link WriteCacheService} (unless an exception is thrown).
     *
     * @see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6371642
     *
     * @todo The WORM serializes invocations on this method because it must put
     *       each record at a specific offset into the user extent of the file.
     *       However, the RW store does not do this. Therefore, for the RW store
     *       only, we could use a queue with lost cost access and scan for best
     *       fit packing into the write cache buffer. When a new buffer is set
     *       as [current], we could pack the larger records in the queue onto
     *       that buffer first. This might provide better throughput for the RW
     *       store but would require an override of this method specific to that
     *       implementation.
     *
     * See BLZG-1589 (new latency-oriented counters)
     */
    public boolean write(final long offset, final ByteBuffer data,
            final int chk, final boolean useChecksum, final int latchedAddr)
            throws InterruptedException, IllegalStateException {

        final long begin = System.nanoTime();
        try {
            return write_timed(offset, data, chk, useChecksum, latchedAddr);
        } finally {
            // Maintain the latency-oriented counters (BLZG-1589): #of cache
            // writes and total elapsed nanos spent in write().
            final long elapsed = System.nanoTime() - begin;
            final WriteCacheServiceCounters c = counters.get();
            c.ncacheWrites++; // maintain nwrites
            c.elapsedCacheWriteNanos += elapsed;
        }

    }

    /**
     * Timed inner implementation of write(): validates the request, routes
     * over-capacity records to {@link #writeLargeRecord(long, ByteBuffer, int, boolean)},
     * and otherwise installs the record on a cache buffer (rotating in a clean
     * buffer under the write lock if the current one is full).
     */
    private boolean write_timed(final long offset, final ByteBuffer data,
            final int chk, final boolean useChecksum, final int latchedAddr)
            throws InterruptedException, IllegalStateException {

        if (log.isTraceEnabled()) {
            log.trace("offset: " + offset + ", length: " + data.limit()
                    + ", chk=" + chk + ", useChecksum=" + useChecksum);
        }

        if (!open.get())
            throw new IllegalStateException(firstCause.get());

        if (offset < 0)
            throw new IllegalArgumentException();

        if (data == null)
            throw new IllegalArgumentException(
                    AbstractBufferStrategy.ERR_BUFFER_NULL);

        // #of bytes in the record.
        final int remaining = data.remaining();

        // #of bytes to be written (record plus optional 4-byte checksum).
        final int nwrite = remaining + (useChecksum ? 4 : 0);

        if (remaining == 0)
            throw new IllegalArgumentException(
                    AbstractBufferStrategy.ERR_BUFFER_EMPTY);

        if (nwrite > capacity) {
            /*
             * Handle large records.
             */
            return writeLargeRecord(offset, data, chk, useChecksum);
        }

        /*
         * The record can fit into a cache instance, so try and acquire one and
         * write the record onto it.
         *
         * @todo this could be refactored to use moveBufferToDirtyList()
         */
        {

            final WriteCache cache = acquireForWriter();

            try {

                debugAddrs(offset, data.remaining(), 'A');

                // write on the cache.
                if (cache.write(offset, data, chk, useChecksum, latchedAddr)) {

                    final WriteCache old = serviceMap.put(offset, cache);

                    // There should be no duplicate address in the record
                    // map since these entries should be removed, although
                    // write data may still exist in an old WriteCache.
                    // A duplicate may also be indicative of an allocation
                    // error, which we need to be pretty strict about!
                    // NOTE(review): this path only rejects [old == cache]
                    // while the paths below reject any non-null [old] —
                    // confirm whether the asymmetry is intentional.
                    if (old == cache) {
                        throw new AssertionError(
                                "Record already in cache: offset=" + offset
                                        + " " + addrDebugInfo(offset));
                    }

                    return true;

                }

            } finally {

                release();

            }

        }

        /*
         * The record did not fit into the current buffer but it is small enough
         * to fit into an empty buffer. Grab the write lock and then try again.
         * If it still does not fit, then put the current buffer onto the dirty
         * list and take a buffer from the clean list and then write the record
         * onto that buffer while we are holding the lock. This last step must
         * succeed since the buffer will be empty and the record can fit into an
         * empty buffer.
         */
        {

            final Lock writeLock = lock.writeLock();

            writeLock.lockInterruptibly();

            try {

                /*
                 * While holding the write lock, see if the record can fit into
                 * the current buffer. Note that the buffer we acquire here MAY
                 * be a different buffer since a concurrent write could have
                 * already switched us to a new buffer. In that case, the record
                 * might fit into the new buffer.
                 */

                // Acquire a buffer. Maybe the same one, maybe different.
                WriteCache cache = acquireForWriter();

                try {

                    // While holding the write lock, see if the record fits.
                    if (cache.write(offset, data, chk, useChecksum, latchedAddr)) {

                        /*
                         * It fits: someone already changed to a new cache,
                         * which is fine.
                         */
                        if (serviceMap.put(offset, cache) != null) {
                            // The record should not already be in the cache.
                            throw new AssertionError(
                                    "Record already in cache: offset=" + offset
                                            + " " + addrDebugInfo(offset));
                        }

                        return true;

                    }

                    /*
                     * There is not enough room in the current buffer for this
                     * record, so put the buffer onto the dirty list. Then take
                     * a new buffer from the clean list (block), reset the
                     * buffer to clear the old writes, and set it as current. At
                     * that point, the record should always fit.
                     *
                     * Note: When we take a cache instances from the cleanList
                     * we need to remove any entries in our recordMap which are
                     * in its record map.
                     *
                     * Note: We move the current buffer to the dirty list before
                     * we take a buffer from the clean list. This is absolutely
                     * necessary since the code will otherwise deadlock if there
                     * is only one buffer.
                     *
                     * Note: Do NOT yield the WriteLock here. That would make it
                     * possible for another thread to acquire() the current
                     * buffer, which has already been placed onto the dirtyList
                     * by this thread!!!
                     */

                    /*
                     * Move the current buffer to the dirty list.
                     *
                     * Note: The lock here is not required to give flush()
                     * atomic semantics with regard to the set of dirty write
                     * buffers when flush() gained the writeLock [in fact, we
                     * only need the dirtyListLock for the dirtyListEmpty
                     * Condition].
                     */

                    if (!current
                            .compareAndSet(cache/* expect */, null/* update */)) {
                        throw new AssertionError();
                    }

                    dirtyListLock.lockInterruptibly();
                    try {
                        dirtyList.add(cache);
                        dirtyListChange.signalAll();
                    } finally {
                        dirtyListLock.unlock();
                    }

                    /*
                     * Take the buffer from the cleanList and set it has the
                     * [current] buffer.
                     */

                    // Grab buffer from clean list.
                    final WriteCache newBuffer = takeFromClean();

                    counters.get().nclean--;

                    // Clear the state on the new buffer and remove from
                    // cacheService map
                    newBuffer.resetWith(serviceMap);//, fileExtent.get());

                    // Set it as the new buffer.
                    current.set(cache = newBuffer);

                    // Try to write on the new buffer.
                    if (cache.write(offset, data, chk, useChecksum, latchedAddr)) {

                        // This must be the only occurrence of this record.
                        if (serviceMap.put(offset, cache) != null) {
                            throw new AssertionError(
                                    "Record already in cache: offset=" + offset
                                            + " " + addrDebugInfo(offset));
                        }

                        return true;

                    }

                    /*
                     * Should never happen.
                     */
                    throw new AssertionError(
                            "Unable to write into current WriteCache " + offset
                                    + " " + addrDebugInfo(offset));

                } finally {

                    release();

                }

            } finally {
                writeLock.unlock();
            }

        }

    }

    /**
     * Blocking take of a clean buffer. Holds [cleanListLock] and uses the
     * [cleanListNotEmpty] Condition so it can notice [halt] while waiting.
     */
    private WriteCache takeFromClean() throws InterruptedException {

        cleanListLock.lockInterruptibly();
        try {

            while (true) {

                if (log.isInfoEnabled() && cleanList.isEmpty())
                    log.info("Waiting for clean buffer");

                /*
                 * Note: We use the [cleanListNotEmpty] Condition so we can
                 * notice a [halt].
                 */
                while (cleanList.isEmpty() && !halt) {
                    cleanListNotEmpty.await();
                }

                if (halt)
                    throw new RuntimeException(firstCause.get());

                // Poll() rather than take() since other methods poll() the list
                // unprotected.
                final WriteCache ret = cleanList.poll();

                if (ret != null) {

                    return ret;

                }

            }

        } finally {

            cleanListLock.unlock();

        }

    }

    // /**
    // * Caches data read from disk (or even read from "older" cache).
    // * The assumption is that we do not need a "reserve" buffer.
    // *
    // * @param addr
    // * @param bb
    // * @throws InterruptedException
    // */
    // public void cache(final long addr, final ByteBuffer bb)
    // throws InterruptedException {
    // // I think this is fine!
    // synchronized (readCache) {
    // final WriteCache cache = readCache.get();
    // if (cache != null && !cache.cache(addr, bb)) {
    // // add existing non-null cache to clean list
    // if (cache != null)
    // addClean(cache, false /* add first */);
    //
    // // fetch new readCache from clean list
    // final WriteCache ncache = getDirectCleanCache();
    //
    // // should not be null
    // assert ncache != null;
    //
    // // if we decide it CAN be null then we simply do not cache the
    // // read
    // if (ncache == null)
    // return;
    //
    // // remove any global references to existing data
    // ncache.resetWith(recordMap);
    // // only closed caches can cache reads
    // ncache.closeForWrites();
    //
    // readCache.set(ncache);
    // ncache.closeForWrites();
    // ncache.cache(addr, bb);
    //
    // if (recordMap.put(addr, ncache) != null) {
    // throw new AssertionError("Record already in cache: offset="
    // + addr + " " + addrDebugInfo(addr));
    // }
    // } else if (cache != null) {
    // if (recordMap.put(addr, cache) != null) {
    // throw new AssertionError("Record already in cache: offset="
    // + addr + " " + addrDebugInfo(addr));
    // }
    // }
    //
    // // we've written the byte buffer, so flip it!
    // bb.flip();
    // }
    // }

    /**
     * Record an address action ('A' = added here, other codes at call sites)
     * into the debug trace arrays, if enabled. The arrays form a fixed-size
     * ring buffer: the cursor wraps back to zero when it reaches the end.
     * No-op when [addrsUsed] is null (tracing disabled).
     */
    public void debugAddrs(long offset, int length, char c) {
        if (addrsUsed != null) {
            addrsUsed[addrsUsedCurs] = offset;
            addrActions[addrsUsedCurs] = c;
            addrLens[addrsUsedCurs] = length;
            addrsUsedCurs++;
            // Wrap the cursor: this is a circular trace buffer.
            if (addrsUsedCurs >= addrsUsed.length) {
                addrsUsedCurs = 0;
            }
        }
    }

    /**
     * Write a record whose size (when combined with the optional checksum) is
     * larger than the capacity of an individual {@link WriteCache} buffer. This
     * operation is synchronous (to protect the ByteBuffer from concurrent
     * modification by the caller). It will block until the record has been
     * written.
     * <p>
     * This implementation will write the record onto a sequence of
     * {@link WriteCache} objects and wait until all of those objects have been
     * written through to the backing file and the optional HA write pipeline. A
     * checksum will be appended after the last chunk of the record. This
     * strategy works for the WORM since the bytes will be laid out in a
     * contiguous region on the disk.
     * <p>
     * Note: For the WORM, this code MUST NOT allow the writes to proceed out of
     * order or the data will not be laid out correctly on the disk !!!
     * <p>
     * Note: The RW store MUST NOT permit individual allocations whose size on
     * the disk is greater than the capacity of an individual {@link WriteCache}
     * buffer (@todo Or is this Ok? Perhaps it is if the RW store holds a lock
     * across the write for a large record? Maybe if we also add a low-level
     * method for inserting an entry into the record map?)
     * <p>
     * Note: This method DOES NOT register the record with the shared
     * {@link #serviceMap}. Since the record spans multiple {@link WriteCache}
     * objects it can not be directly recovered without reading it from the
     * backing file.
     *
     * <h2>Dialog on large records</h2>
     *
     * It seems to me that the RW store is designed to break up large records
     * into multiple allocations. If we constrain the size of the largest
     * allocation slot on the RW store to be the capacity of a WriteCache buffer
     * (including the bytes for the checksum and other record level metadata)
     * then we do not have a problem with breaking up large records for it in
     * the WriteCacheService and it will automatically benefit from HA using the
     * write replication logic.
     * <p>
     * The WORM does not have these limits on the allocation size, so it seems
     * likely that breaking it up across multiple WriteCache buffer instances
     * would have to be done inside of the WriteCacheService in order to prevent
     * checksums from being interleaved with each WriteCache worth of data it
     * emits for a large record. We can't raise this out of the
     * WriteCacheService because the large record would not be replicated for
     * HA.
     */
    protected boolean writeLargeRecord(final long offset, final ByteBuffer data,
            final int chk, final boolean useChecksum)
            throws InterruptedException, IllegalStateException {

        if (log.isTraceEnabled()) {
            log.trace("offset: " + offset + ", length: " + data.limit()
                    + ", chk=" + chk + ", useChecksum=" + useChecksum);
        }

        if (offset < 0)
            throw new IllegalArgumentException();

        if (data == null)
            throw new IllegalArgumentException(
                    AbstractBufferStrategy.ERR_BUFFER_NULL);

        // #of bytes in the record.
        final int remaining = data.remaining();

        if (remaining == 0)
            throw new IllegalArgumentException(
                    AbstractBufferStrategy.ERR_BUFFER_EMPTY);

        // Small records should not take this code path.
        if (remaining < capacity)
            throw new AssertionError();

        /*
         * Put as much into each WriteCache instance as well fit, then transfer
         * the WriteCache onto the dirtyList, take a new WriteCache from the
         * cleanList, and continue until all data as been transferred. If
         * checksums are enabled, add a 4 byte checksum afterwards.
         *
         * Note: We hold the WriteLock across this operation since we will be
         * changing out [current] each time it fills up. This has the
         * side-effect of guaranteeing that the writes are emitted without
         * intervening writes of other record.
         *
         * while(r > 0) {
         *
         * cache = acquire();
         *
         * copy up to [r] bytes into the buffer.
         *
         * if the buffer is full, then transfer it to the dirty list.
         *
         * release()
         *
         * }
         *
         * write checksum on buffer
         */
        final Lock writeLock = lock.writeLock();
        writeLock.lockInterruptibly();
        try {

            // the offset of the next byte to transfer to a cache buffer.
            int p = 0;

            // #of bytes remaining in the large record (w/o the checksum).
            int r = remaining;

            while (r > 0) {

                // Acquire a buffer.
                final WriteCache cache = acquireForWriter();

                try {

                    // #of bytes to copy onto the write cache.
                    final int ncpy = Math.min(r, cache.remaining());

                    if (ncpy > 0) {

                        // create view of the data to be copied.
                        final ByteBuffer tmp = data.duplicate();
                        tmp.limit(p + ncpy);
                        tmp.position(p);

                        // Note: For WORM, this MUST NOT add the checksum except
                        // for the last chunk!
                        if (!cache.write(offset + p, tmp, chk,
                                false/* writeChecksum */, 0/* latchedAddr */))
                            throw new AssertionError();

                        r -= ncpy;
                        p += ncpy;

                    }

                    if (cache.remaining() == 0) {

                        moveBufferToDirtyList();

                    }

                } finally {

                    release();

                }

            } // while( remaining > 0 )

            /*
             * Now we need to write out the optional checksum. We do not have to
             * flush this write through. The buffer can remain partly full.
             */
            if (useChecksum) {

                // Acquire a buffer.
                final WriteCache cache = acquireForWriter();

                try {

                    // Allocate a small buffer
                    final ByteBuffer t = ByteBuffer.allocate(4);

                    // Add in the record checksum.
                    t.putInt(chk);

                    // Prepare for reading.
                    t.flip();

                    // Note: [t] _is_ the checksum.
                    if (!cache.write(offset + p, t, chk,
                            false/* writeChecksum */, 0/* latchedAddr */))
                        throw new AssertionError();

                } finally {

                    release();

                }

            }

            /*
             * If the current cache buffer is dirty then we need to move it to
             * the dirty list since the caller MUST be able to read the record
             * back from the file by the time this method returns.
             */
            final WriteCache cache = acquireForWriter();

            try {

                if (!cache.isEmpty()) {

                    moveBufferToDirtyList();

                }

            } finally {

                release();

            }

            /*
             * In order to guarantee that the caller can read the record back
             * from the file we now flush the dirty list to the backing store.
             * When this method returns, the record will be on the disk and can
             * be read back safely from the disk.
             */
            if (log.isTraceEnabled())
                log.trace("FLUSHING LARGE RECORD");

            flush(false/* force */);

            // done.
            return true;

        } finally {

            writeLock.unlock();

        }

    }

    /**
     * Move the {@link #current} buffer to the dirty list and await a clean
     * buffer. The clean buffer is set as the {@link #current} buffer and
     * returned to the caller.
     * <p>
     * Note: If there is buffer available on the {@link #cleanList} then this
     * method can return immediately. Otherwise, this method will block until a
     * clean buffer becomes available.
     *
     * @return A clean buffer.
     *
     * @throws InterruptedException
     * @throws IllegalMonitorStateException
     *             unless the current thread is holding the {@link WriteLock}
     *             for {@link #lock}.
     */
    private WriteCache moveBufferToDirtyList() throws InterruptedException {

        if (!lock.isWriteLockedByCurrentThread())
            throw new IllegalMonitorStateException();

        final WriteCache cache = current.getAndSet(null);

        assert cache != null;

        /*
         * Note: The lock here is required to give flush() atomic semantics with
         * regard to the set of dirty write buffers when flush() gained the
         * writeLock [in fact, we only need the dirtyListLock for the
         * dirtyListEmpty Condition].
         */
        dirtyListLock.lockInterruptibly();
        try {
            dirtyList.add(cache);
            dirtyListChange.signalAll();
        } finally {
            dirtyListLock.unlock();
        }

        /*
         * Take the buffer from the cleanList and set it as the [current]
         * buffer.
         *
         * Note: We use the [cleanListNotEmpty] Condition so we can notice a
         * [halt].
         */
        cleanListLock.lockInterruptibly();
        try {

            while (cleanList.isEmpty() && !halt) {
                cleanListNotEmpty.await();
            }

            if (halt)
                throw new RuntimeException(firstCause.get());

            // Take a buffer from the cleanList (guaranteed avail).
            // NOTE(review): take() can block if another thread poll()s the
            // list without holding cleanListLock (getDirectCleanCache does);
            // takeFromClean() uses poll() in a loop for exactly that reason —
            // confirm whether the same is needed here.
            final WriteCache newBuffer = cleanList.take();
            counters.get().nclean--;

            // Clear state on new buffer and remove from cacheService map
            newBuffer.resetWith(serviceMap);//, fileExtent.get());

            // Set it as the new buffer.
            current.set(newBuffer);

            return newBuffer;

        } finally {

            cleanListLock.unlock();

        }

    }

    /**
     * Add to the cleanList.
     * <p>
     * Since moving to an explicit readCache, we now call resetWith before
     * adding to the cleanList. Potentially removing latency on acquiring
     * a new cache from the clean list.
     * <p>
     * If a readCache is in operation then we will transfer to the read cache
     */
    private void addClean(final WriteCache cache, final boolean addFirst)
            throws InterruptedException {
        if (cache == null)
            throw new IllegalArgumentException();
        if (this.readListSize > 0) { // if there is a readCache
            // Transfer surviving records to the read cache first.
            installReads(cache);
        } else {
            // No read cache: just clear this buffer's entries from the map.
            cache.resetWith(serviceMap);
        }
        cleanListLock.lockInterruptibly();
        try {
            assert cache.isEmpty() || cache.isClosedForWrites();
            if (addFirst) {
                cleanList.addFirst(cache);
            } else {
                cleanList.addLast(cache);
            }
            cleanListNotEmpty.signalAll();
            // Refresh the clean-list gauge (absolute size, not a delta).
            counters.get().nclean = cleanList.size();
        } finally {
            cleanListLock.unlock();
        }
    }

    /**
     * Transfer records from the given buffer into the current readCache,
     * rotating in a fresh ReadCache when the current one fills up. Returns
     * false (no-op) when the read cache is disabled (readListSize == 0).
     */
    public boolean installReads(final WriteCache cache)
            throws InterruptedException {

        if (readListSize == 0)
            return false;

        synchronized (readCache) {
            final ReadCache rcache = readCache.get();
            // NOTE(review): rcache may be null here (readCache is set to null
            // below on a full cache) — confirm WriteCache.transferTo handles a
            // null destination.
            if (!WriteCache.transferTo(cache, rcache, serviceMap, 0)) {
                // full readCache
                readCache.set(null);
                if (rcache.decrementReferenceCount() == 0) {
                    // No longer pinned by any reader: recycle it.
                    readList.add(rcache);
                }

                final ReadCache ncache = getDirectReadCache();
                if (ncache == null) {
                    throw new AssertionError();
                }

                // remaining must be >= to announced capacity after getDirectReadCache
                if (ncache.remaining() < ncache.capacity())
                    throw new AssertionError("New Cache, remaining() < capacity(): "
                            + ncache.remaining() + " < " + ncache.capacity());

                // Now transfer remaining to new readCache
                if (!WriteCache.transferTo(cache, ncache, serviceMap, 0)) {
                    throw new AssertionError(
                            "Unable to complete transfer to new cache with remaining: "
                                    + ncache.remaining());
                }
                ncache.incrementReferenceCount();
                readCache.set(ncache);
            }
        }

        return true;
    }

    /**
     * Poll the {@link #cleanList} and return the {@link WriteCache} from the
     * head of the {@link #cleanList} IFF one is available and otherwise
     * <code>null</code>.
     *
     * @return The {@link WriteCache} iff one was available.
     *
     * @throws InterruptedException
     */
    private WriteCache getDirectCleanCache() throws InterruptedException {

        // Note: poll() is non-blocking and is done WITHOUT holding
        // cleanListLock (see the comment in takeFromClean()).
        final WriteCache tmp = cleanList.poll();

        if (tmp != null) {
            counters.get().nclean--;
        }

        return tmp;

    }

    /**
     * Non-blocking take of a {@link ReadCache}. If successful, the returned
     * {@link ReadCache} will be clean. Otherwise return <code>null</code>.
     */
    private ReadCache getDirectReadCache() throws InterruptedException {

        // Non-blocking take.
        ReadCache tmp = readList.poll();

        if (tmp == null)
            return null;

        try {
            /*
             * Attempt to reset the record.
             */
            synchronized (readCache) {

                if (hotCache == null) {
                    // No hot-cache tier: just clear the recycled buffer.
                    tmp.resetWith(serviceMap);
                    return tmp;
                }

                int cycles = 0;
                while (tmp != null) {

                    if (log.isDebugEnabled() && !tmp.isEmpty()) {
                        /*
                         * Just debug stuff.
                         */
                        int hitRecords = 0;
                        int hotRecords = 0;
                        int totalRecords = 0;
                        final Iterator<RecordMetadata> values = tmp.recordMap
                                .values().iterator();
                        while (values.hasNext()) {
                            final RecordMetadata md = values.next();
                            totalRecords++;
                            if (md.getHitCount() > 0) {
                                hitRecords++;
                                if (md.getHitCount() > hotCacheThreshold)
                                    hotRecords++;
                            }
                        }
                        log.debug("Recycled ReadCache, hot(>" + hotCacheThreshold
                                + "): " + hotRecords + ", hit: " + hitRecords
                                + " of " + totalRecords);
                    }

                    // Promote frequently-hit records into the hotCache before
                    // recycling this buffer.
                    if (WriteCache.transferTo(tmp, hotCache, serviceMap,
                            hotCacheThreshold)) {
                        if (!tmp.isEmpty())
                            throw new AssertionError();
                        tmp.reset();
                        break;
                    }

                    if (log.isDebugEnabled())
                        log.debug("Cycle HOTCACHE: " + ++cycles);

                    // transfer not completed, so:
                    // move current hotCache to end of HotList
                    // move head of HotList to end of ReadList
                    // make hotReserve new hotCache
                    // complete transfer to new hotCache
                    // make now empty tmp new hotReserve
                    hotList.add(hotCache);
                    readList.add(hotList.poll().resetHitCounts());
                    if (!hotReserve.isEmpty())
                        throw new AssertionError();
                    hotCache = hotReserve;
                    hotReserve = null;
                    if (!WriteCache.transferTo(tmp, hotCache, serviceMap,
                            hotCacheThreshold)) {
                        throw new AssertionError();
                    }
                    tmp.reset();
                    hotReserve = tmp;

                    tmp = readList.poll();

                } // while (tmp != null)

            } // synchronized(readCache)

        } catch (InterruptedException ex) {
            /*
             * If interrupted, then return the ReadCache to the list, restore
             * the interrupt status on the current Thread, and report the cache
             * as unavailable. Either the operation succeeds fully, or we return
             * [null] to the caller with the interrupt status set.
             */
            readList.put(tmp);
            // Propagate the interrupt status.
            Thread.currentThread().interrupt();
            // ReadCache is not available.
            return null;
        }

        return tmp;
    }

    /**
     * This is a non-blocking query of all write cache buffers (current, clean
     * and dirty).
     * <p>
     * This implementation DOES NOT throw an {@link IllegalStateException} if
     * the service is already closed NOR if there is an asynchronous close of
     * the service. Instead it just returns <code>null</code> to indicate a
     * cache miss.
     */
    public ByteBuffer read(final long offset, final int nbytes)
            throws InterruptedException, ChecksumError {

        // Check the cache.
        final ByteBuffer tmp = _readFromCache(offset, nbytes);

        if (tmp != null) {

            if (tmp.remaining() == 0)
                throw new AssertionError();

            // Cache hit.
            return tmp;

        }

        // Cache miss.
        counters.get().nmiss.increment();

        if (reader != null) {

            /*
             * Read through to the disk and install the record into cache.
             */
            final ByteBuffer ret = loadRecord(offset, nbytes);

            if (ret != null && ret.remaining() == 0)
                throw new AssertionError();

            return ret;

        } else {

            /*
             * No reader. Return null. Caller is responsible for reading through
             * to the disk.
             */
            return null;

        }

    }

    /**
     * Attempt to read record from cache (either write cache or read cache
     * depending on the service map state).
     */
    public ByteBuffer _readFromCache(final long offset, final int nbytes)
            throws ChecksumError, InterruptedException {

        if (nbytes > capacity) {

            /*
             * Note: Writes larger than a single write cache buffer are NOT
             * cached.
             */
            return null;

        }

        final Long off = Long.valueOf(offset);

        while (true) {

            if (!open.get()) {
                /*
                 * Not open. Return [null] rather than throwing an exception per
                 * the contract for this implementation.
                 */
                return null;
            }

            final WriteCache cache = serviceMap.get(off);

            if (cache == null) {
                // Cache miss.
                break;
            }

            /*
             * Ask the cache buffer if it has the record still. It will not
             * if the cache buffer has been concurrently reset.
             */
            try {
                final ByteBuffer ret = cache.read(off.longValue(), nbytes);
                if (ret == null && serviceMap.get(off) == cache) {
                    // NOTE(review): this IllegalStateException is thrown
                    // inside the same try and is therefore caught by the
                    // catch clause below (which asserts !open.get() and
                    // returns null) — confirm this is the intended handling.
                    throw new IllegalStateException(
                            "Inconsistent cache for offset: " + off);
                }
                if (ret == null && log.isDebugEnabled()) {
                    log.debug("WriteCache out of sync with WriteCacheService");
                }
                if (ret != null)
                    return ret;
                // May have been transferred to another Cache!
                //
                // Fall through.
                continue;
            } catch (IllegalStateException ex) {
                /*
                 * The write cache was closed. Per the API for this method,
                 * return [null] so that the caller will read through to the
                 * backing store.
                 */
                assert !open.get();
                return null;
            }

        }

        // Cache miss.
        return null;

    }

    /**
     * Helper class models a request to load a record from the backing store.
     * <p>
     * Note: This class must implement equals() and hashCode() since it is used
     * within the {@link Memoizer} pattern.
     */
    private static class LoadRecordRequest {

        final WriteCacheService service;
        final long offset;
        final int nbytes;

        public LoadRecordRequest(final WriteCacheService service,
                final long offset, final int nbytes) {
            this.service = service;
            this.offset = offset;
            this.nbytes = nbytes;
        }

        /**
         * Equals returns true iff the request has the same parameters.
         */
        public boolean equals(final Object o) {
            if (!(o instanceof LoadRecordRequest))
                return false;
            final LoadRecordRequest r = (LoadRecordRequest) o;
            return service == r.service && offset == r.offset
                    && nbytes == r.nbytes;
        }

        /**
         * The hashCode() implementation assumes that the <code>offset</code>'s
         * hashCode() is well distributed.
         */
        public int hashCode() {
            // Fold the 64-bit offset into 32 bits (same as Long.hashCode).
            return (int) (offset ^ (offset >>> 32));
        }

    }

    /**
     * Helper loads a child node from the specified address by delegating
     * {@link WriteCacheService#_getRecord(long, int)}.
     */
    final private static Computable<LoadRecordRequest, ByteBuffer> loadChild = new Computable<LoadRecordRequest, ByteBuffer>() {

        /**
         * Loads a record from the specified address.
         *
         * @return A heap {@link ByteBuffer} containing the data for that
         *         record.
         *
         * @throws IllegalArgumentException
         *             if addr is {@link IRawStore#NULL}.
         */
        public ByteBuffer compute(final LoadRecordRequest req)
                throws InterruptedException {

            try {

                final ByteBuffer ret = req.service._getRecord(req.offset,
                        req.nbytes);

                if (ret != null && ret.remaining() == 0)
                    throw new AssertionError();

                return ret;

            } finally {

                /*
                 * Clear the future task from the memoizer cache.
                 *
                 * Note: This is necessary in order to prevent the cache from
                 * retaining a hard reference to each child materialized for the
                 * B+Tree.
                 *
                 * Note: This does not depend on any additional synchronization.
                 * The Memoizer pattern guarantees that only one thread actually
                 * call ft.run() and hence runs this code.
                 */
                req.service.memo.removeFromCache(req);

            }

        }

    };

    /**
     * A {@link Memoizer} subclass which exposes an additional method to remove
     * a {@link FutureTask} from the internal cache.
     */
    private static class ReadMemoizer extends
            Memoizer<LoadRecordRequest/* request */, ByteBuffer/* child */> {

        /**
         * @param c
         */
        public ReadMemoizer(final Computable<LoadRecordRequest, ByteBuffer> c) {
            super(c);
        }

        /**
         * The approximate size of the cache (used solely for debugging to
         * detect cache leaks).
         */
        int size() {
            return cache.size();
        }

        /**
         * Called by the thread which atomically installs the record into the
         * cache and updates the service record map. At that point the record is
         * available from the service record map.
         *
         * @param req
         *            The request.
*/ void removeFromCache(final LoadRecordRequest req) { if (cache.remove(req) == null) { throw new AssertionError(); } } // /** // * Called from {@link AbstractBTree#close()}. // * // * @todo should we do this? There should not be any reads against the // * the B+Tree when it is close()d. Therefore I do not believe there // * is any reason to clear the FutureTask cache. // */ // void clear() { // // cache.clear(); // // } }; /** * Used to materialize records with at most one thread reading the * record from disk for a given address. Other threads desiring the * same record will wait on the {@link Future} for the thread doing * the work. */ private final ReadMemoizer memo; /** * Enter the memoizer pattern. */ private ByteBuffer loadRecord(final long offset, final int nbytes) { try { counters.get().memoCacheSize.set(memo.size()); final ByteBuffer ret = memo.compute(new LoadRecordRequest(this, offset, nbytes)); // Duplicate buffer since memoizer may return same ByteBuffer to multiple callers // resulting in problems of concurrent read return ret.duplicate(); } catch (InterruptedException e) { /* * Note: This exception will be thrown iff interrupted while * awaiting the FutureTask inside of the Memoizer. */ throw new RuntimeException(e); } } /** * Method invoked from within the memoizer pattern to read the record from * the backing store and install it into the cache. The method must first * verify that the record is not in the cache. * * @param offset * @param nbytes * @return A heap byte buffer containing the read record. * @throws IllegalStateException * @throws InterruptedException */ private ByteBuffer _getRecord(final long offset, final int nbytes) throws IllegalStateException, InterruptedException { /* * On entry, this thread will either install the read into the cache or * the record will already be in the cache. We are protected by the * memoizer pattern here. 
No other thread will be attempting to install * the same record (the record for that offset) into the cache. */ ByteBuffer tmp = _readFromCache(offset, nbytes); if (tmp != null) { // Already in the read cache. if (tmp.remaining() == 0) throw new AssertionError(); return tmp; } final boolean largeRecord = nbytes > capacity; final boolean directRead = largeRecord || this.readListSize == 0; if (directRead) { // No free buffer to install the read (OR largeRecord) final ByteBuffer ret = _readFromLocalDiskIntoNewHeapByteBuffer(offset, nbytes); if (ret != null && ret.remaining() == 0) throw new AssertionError(); return ret; } /* * The reader threads co-operatively manage the readCache on behalf of * the WCS. The allocation attempt for a cache buffer is serialized and * when an allocation fails a new readCache is initialized and the * previous cache reference is decremented (no longer referenced as the * current read cache). * * When a cache is selected to buffer a read, the reference is * incremented while the read is active. * * When the reference is finally decremented to zero (either at the end * of a read or after a failed allocation) the cache can be returned to * the clean list. */ // The cache block into which we will install the record. ReadCache theCache = null; // The buffer slice into which we will install the record. ByteBuffer bb = null; /* * Set true iff we will install a record and have incremented the * reference count for the cache. if true, then this Thread MUST * decrement the reference count by any code path that leaves this * method. (If a obtain an allocation but do not set this flag, then we * will not actually perform the installation and the cache block will * not be pinned.) */ boolean willInstall = false; try { synchronized (readCache) { theCache = readCache.get(); if (theCache != null) { /* * Attempt to allocate record on current read cache. */ assert theCache.getReferenceCount() > 0; bb = theCache.allocate(nbytes); // intr iff can't lock(). 
if (bb != null) { // increment while readCache synchronized theCache.incrementReferenceCount(); willInstall = true; } else { /* * At this point, the current [readCache] does not have * enough room to install the record. We will clear the * [readCache] reference and transfer it to the * [readList]. * * *** CRITICAL SECTION *** * * We MUST transfer cache once reference is cleared or * the buffer will be lost! * * Note: Anything on the readList MUST have * referenceCount==0 since we do not transfer to the * readList until that condition is met. */ readCache.set(null); if (theCache.decrementReferenceCount() == 0) { readList.add(theCache); } } } if (bb == null) { /* * Either no [readCache] on entry or no room in current * [readCache] and [readCache] was set to [null]. */ assert readCache.get() == null; // pre-condition. final ReadCache newCache = getDirectReadCache(); // non-blocking take if (newCache != null) { assert newCache.getReferenceCount() == 0; { // CRITICAL SECTION // Pre-increment the new [readCache]. newCache.incrementReferenceCount(); // Set read cache reference. readCache.set(newCache/* newValue */); } // guaranteed to succeed unless interrupted bb = newCache.allocate(nbytes); theCache = newCache; { // CRITICAL SECTION. // increment while readCache synchronized theCache.incrementReferenceCount(); willInstall = true; } } } } // synchronized(readCache) if (bb == null) { /* * No free buffer to install the read. Read directly into a heap * ByteBuffer and return that to the caller. */ assert willInstall == false; return _readFromLocalDiskIntoNewHeapByteBuffer(offset, nbytes); } /* * [bb] is a view onto an allocation on [theCache] into which we can * install the read. */ // The offset into [bb] of the allocation. final int pos = bb.position(); // Read the record from the disk into NIO buffer. 
final ByteBuffer ret = reader.readRaw(offset, bb); // must copy to heap buffer from cache, allowing for checksum final byte[] b = new byte[nbytes - 4]; ret.get(b); // calculate checksum from readRaw before adding to readCache! { final int datalen = nbytes - 4; final int chk = ret.getInt(pos + datalen); if (chk != ChecksumUtility.threadChk.get().checksum(b, 0/* offset */, datalen)) { throw new ChecksumError(); } } // update record maps theCache.commitToMap(offset, pos, nbytes); serviceMap.put(offset, theCache); return ByteBuffer.wrap(b); } catch (Throwable t) { t.printStackTrace(System.err); throw new RuntimeException(t); } finally { /* * CRITICAL SECTION. If [willInstall] then we are responsible for * this ReadCache and MUST decrement the counter. */ if (willInstall && theCache.decrementReferenceCount() == 0) { readList.add(theCache); // END CRITICAL SECTION. if (theCache == readCache.get()) throw new AssertionError(); } } } /** * Read through to the backing file. * * @param offset * The byte offset of the record on the backing file. * @param nbytes * The #of bytes to be read. * * @return The installed record in a newly allocated heap {@link ByteBuffer} * . */ private final ByteBuffer _readFromLocalDiskIntoNewHeapByteBuffer( final long offset, final int nbytes) { if (log.isDebugEnabled()) log.debug("Allocating direct, nbytes: " + nbytes); final ByteBuffer ret = reader.readRaw(offset, ByteBuffer.allocate(nbytes)); final int chk = ChecksumUtility.getCHK().checksum(ret.array(), 0/* offset */, nbytes - 4/* len */); // read checksum final int tstchk = ret.getInt(nbytes - 4); if (chk != tstchk) throw new ChecksumError("offset=" + offset + ",nbytes=" + nbytes + ",expected=" + tstchk + ",actual=" + chk); ret.limit(nbytes - 4); if (ret.remaining() == 0) throw new AssertionError(); // This read was not installed into the read cache. counters.get().nreadNotInstalled.increment(); return ret; } /** * Read the data from the backing file. 
* * We need to know the size of the data so we can allocate the buffer. * * @param offset * @return * @throws InterruptedException * @throws IllegalStateException */ // private ByteBuffer readBacking(final long offset, final int nbytes) // throws IllegalStateException, InterruptedException { // if (reader == null) // return null; // // if (nbytes > readCache.get().capacity()) // not possible to cache // return null; // // // allocate space in readCache and retrieve buffer into which we'll // // read the data // // ByteBuffer bb = null; // WriteCache installCache; // synchronized (readCache) { // final WriteCache cache = readCache.get(); // bb = cache.allocate(offset, nbytes); // if (bb == null) { // return readCache to clean list // addClean(cache, false/* add to front */); // installCache = getDirectCleanCache(); // readCache.set(installCache); // installCache.closeForWrites(); // // bb = installCache.allocate(offset, nbytes); // // assert bb != null; // } else { // installCache = cache; // } // } // // // must return new byte[] since original ByteBuffer will be updated // final byte[] ret = new byte[nbytes - 4]; // // // DEBUG readRaw into non-direct byte buffer // // final ByteBuffer trans = ByteBuffer.wrap(ret); // // reader.readRaw(offset, trans); // // reader.readRaw(offset, bb); // // recordMap.put(offset, installCache); // // // copy WriteCache data into return buffer // bb.get(ret); // // return ByteBuffer.wrap(ret); // } /** * Called to check if a write has already been flushed. This is only made if * a write has been made to previously committed data (in the current RW * session). * <p> * If dirty {@link WriteCache}s are flushed in order then it does not * matter, however, if we want to be able to combine {@link WriteCache}s * then it makes sense that there are no duplicate writes. * <p> * On reflection this is more likely needed since for the {@link RWStore}, * depending on session parameters, the same cached area could be * overwritten. 
We could still maintain multiple writes but we need a * guarantee of order when retrieving data from the write cache (newest * first). * <p> * So the question is, whether it is better to keep cache consistent or to * constrain with read order? * * @param offset * the address to check */ public boolean clearWrite(final long offset, final int latchedAddr) { try { counters.get().nclearAddrRequests++; while (true) { final WriteCache cache = serviceMap.get(offset); if (cache == null) { // Not found. return false; } cache.transferLock.lock(); try { // /** // * Note: The tests below require us to take the read lock on // * the WriteCache before we test the serviceMap again in // * order to guard against a concurrent reset() of the // * WriteCache. // * // * @see <a href= // * "https://sourceforge.net/apps/trac/bigdata/ticket/654" // * Rare AssertionError in WriteCache.clearAddrMap() // * </a> // */ // cache.acquire(); // try { final WriteCache cache2 = serviceMap.get(offset); if (cache2 != cache) { /* * Not found in this WriteCache. * * Record was (re-)moved before we got the lock. * * Note: We need to retry. WriteCache.transferTo() could * have just migrated the record to another WriteCache. */ continue; } // Remove entry from the recordMap. final WriteCache oldValue = serviceMap.remove(offset); if (oldValue == null) { /** * Note: The [WriteCache.transferLock] protects the * WriteCache against a concurrent transfer of a record * in WriteCache.transferTo(). However, * WriteCache.resetWith() does NOT take the * transferLock. Therefore, it is possible (and valid) * for the [recordMap] entry to be cleared to [null] for * this record by a concurrent resetWith() call. * * @see <a href= * "https://sourceforge.net/apps/trac/bigdata/ticket/654" * Rare AssertionError in WriteCache.clearAddrMap() * </a> */ continue; } if (oldValue != cache) { /* * Concurrent modification! 
*/ throw new AssertionError("oldValue=" + oldValue + ", cache=" + cache + ", offset=" + offset + ", latchedAddr=" + latchedAddr); } /* * Note: clearAddrMap() is basically a NOP if the WriteCache * has been closedForWrites(). */ if (cache.clearAddrMap(offset, latchedAddr)) { // Found and cleared. counters.get().nclearAddrCleared++; debugAddrs(offset, 0, 'F'); return true; } // } finally { // cache.release(); // } } finally { cache.transferLock.unlock(); } } } catch (InterruptedException e) { throw new RuntimeException(e); } } // /** // * Debug method to verify that the {@link WriteCacheService} has flushed all // * {@link WriteCache} buffers. // * // * @return whether there are no outstanding writes buffered // */ // public boolean isFlushed() { // // final boolean clear = // dirtyList.size() == 0 // && compactingCacheRef.get() == null // && (current.get() == null || current.get().isEmpty()); // // return clear; // // } /** * An array of writeCache actions is maintained that can be used * to provide a breadcrumb of how that address has been written, saved, * freed or removed. * <p> * Write errors often show up as a checksum error, so the length of * data written to the address cab be crucial information in determining the * root of any problem. * * @param address for which info requested * @return summary of writeCache actions */ public String addrDebugInfo(final long paddr) { if (addrsUsed == null) { return "No WriteCache debug info"; } final StringBuffer ret = new StringBuffer(); // // first see if address was ever written // boolean written = false; for (int i = 0; i < addrsUsed.length; i++) { if (i == addrsUsedCurs) { ret.append("|...|"); } if (addrsUsed[i] == paddr) { ret.append(addrActions[i]); if (addrActions[i]=='A') { ret.append("[" + addrLens[i] + "]"); } } } /* * Note: I've added in the write cache service counters here for * information about the maximum #of buffers from the pool which have * been in use, #of flushes, etc. 
*/ ret.append(":"); ret.append(getCounters().toString()); return ret.toString(); } /** * Return <code>true</code> iff the address is in the write * cache at the moment which the write cache is checked. * <p> * Note: Unless the caller is holding an appropriate lock * across this operation, the result is NOT guaranteed to * be correct at any time other than the moment when the * cache was tested. */ public boolean isPresent(final long addr) { // System.out.println("Checking address: " + addr); return serviceMap.get(addr) != null; } /** * Note: Atomic reference is used so the counters may be imposed from * outside. */ private final AtomicReference<WriteCacheServiceCounters> counters; /** * Return the performance counters for the {@link WriteCacheService}. */ public CounterSet getCounters() { return counters.get().getCounters(); } /** * Return the #of {@link WriteCache} blocks sent by the quorum leader to * the first downstream follower. */ public long getSendCount() { return counters.get().nsend; } /** * An instance of this exception is thrown if a thread notices that the * {@link WriteCacheService} was closed by a concurrent process. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> */ public static class AsynchronousCloseException extends IllegalStateException { private static final long serialVersionUID = 1L; } }