/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ package com.bigdata.rwstore; import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.RandomAccessFile; import java.lang.ref.WeakReference; import java.nio.ByteBuffer; import java.nio.channels.AsynchronousFileChannel; import java.nio.channels.Channel; import java.nio.channels.ClosedByInterruptException; import java.nio.channels.FileChannel; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.StandardOpenOption; import java.security.DigestException; import java.security.MessageDigest; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedHashSet; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock; import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock; import org.apache.log4j.Logger; import org.apache.system.SystemUtil; import com.bigdata.btree.BTree.Counter; import com.bigdata.btree.IIndex; import com.bigdata.btree.ITuple; import com.bigdata.btree.ITupleIterator; import com.bigdata.btree.IndexMetadata; import com.bigdata.cache.ConcurrentWeakValueCache; import com.bigdata.counters.CounterSet; import com.bigdata.counters.Instrument; import com.bigdata.counters.striped.StripedCounters; import com.bigdata.ha.HAGlue; import com.bigdata.ha.HAPipelineGlue; import com.bigdata.ha.QuorumPipeline; import com.bigdata.ha.QuorumService; import com.bigdata.ha.msg.HAWriteMessage; import com.bigdata.ha.msg.IHALogRequest; import com.bigdata.ha.msg.IHARebuildRequest; import com.bigdata.ha.msg.IHAWriteMessage; import com.bigdata.io.ChecksumUtility; import com.bigdata.io.DirectBufferPool; import com.bigdata.io.FileChannelUtility; import com.bigdata.io.FileChannelUtility.AsyncTransfer; import com.bigdata.io.IBufferAccess; import com.bigdata.io.IReopenChannel; import com.bigdata.io.MergeStreamWithSnapshotData; import com.bigdata.io.compression.CompressorRegistry; import com.bigdata.io.compression.IRecordCompressor; import com.bigdata.io.writecache.BufferedWrite; import com.bigdata.io.writecache.IBackingReader; import com.bigdata.io.writecache.IBufferedWriter; import com.bigdata.io.writecache.WriteCache; import 
com.bigdata.io.writecache.WriteCacheService;
import com.bigdata.journal.AbstractBufferStrategy;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.AbstractJournal.ISnapshotData;
import com.bigdata.journal.CommitRecordIndex;
import com.bigdata.journal.CommitRecordSerializer;
import com.bigdata.journal.FileMetadata;
import com.bigdata.journal.ForceEnum;
import com.bigdata.journal.ICommitRecord;
import com.bigdata.journal.ICommitter;
import com.bigdata.journal.IHABufferStrategy;
import com.bigdata.journal.IRootBlockView;
import com.bigdata.journal.RootBlockView;
import com.bigdata.journal.StoreState;
import com.bigdata.journal.StoreTypeEnum;
import com.bigdata.quorum.Quorum;
import com.bigdata.quorum.QuorumException;
import com.bigdata.rawstore.IAllocationContext;
import com.bigdata.rawstore.IPSOutputStream;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rwstore.StorageStats.Bucket;
import com.bigdata.service.AbstractTransactionService;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.ChecksumError;

/**
 * Storage class.
 * <p>
 * Provides an interface for allocating storage within a disk file.
 * <p>
 * Essentially provides a DiskMalloc interface.
 * <p>
 * In addition to the DiskMalloc/ReAlloc mechanism, a single root address can be
 * associated. This can be used when opening an existing storage file to
 * retrieve some management object - such as an object manager!
 * <p>
 * The allocator also supports atomic update via a simple transaction mechanism.
 * <p>
 * Updates are normally committed immediately, but by using startTransaction and
 * commitTransaction, the previous state of the store is retained until the
 * moment of commitment.
 * <p>
 * It would also be possible to add some journaling/version mechanism, where
 * snapshots of the allocation maps are retained for some time. For a store
 * which is only ever added to, this would not be an unreasonable overhead and
 * would support rolling the database back weekly or monthly if required.
 * <p>
 * The input/output mechanism uses ByteArray Input and Output Streams.
 * <p>
 * One difference between the disk realloc and the in-memory realloc is that the
 * disk realloc will always return a new address and mark the old address as
 * ready to be freed.
 * <p>
 * The method of storing the allocation headers has been changed from always
 * allocating at the end of the file (and moving them on file extend) to
 * allocation of fixed areas. The meta-allocation data, containing the bitmap
 * that controls these allocations, is itself stored in the heap, and is now
 * structured to include both the bit data and the list of meta-storage
 * addresses.
 * <p>
 * Sizing: 256 allocators would reference approximately 2M objects/allocations.
 * At 1K per allocator this would require 250K of store. The meta-allocation
 * data would therefore need a start address plus 32 bytes (or 8 ints) to
 * represent the meta-allocation bits. An array of such data referencing
 * sequentially allocated storage areas completes the meta-allocation
 * requirements.
 * <p>
 * A meta-allocation address can therefore be represented as a single bit offset
 * from which the block, providing the start address, and the bit offset can be
 * directly determined.
 * <p>
 * The m_metaBits int array used to be fully used as allocation bits, but now
 * stores both the start address plus the 8 ints used to manage that data block.
 * <p>
 * Allocation is reduced to sets of allocator objects which have a start address
 * and a bitmap of allocated storage maps.
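 * <p>
 * As a sketch of the decode (paraphrasing {@link #readAllocationBlocks()};
 * the loop variables here are illustrative only), each {int address; int[8]
 * bits} block in m_metaBits is walked in strides of 9 ints, and a set
 * meta-allocation bit maps onto an allocator address as follows:
 *
 * <pre>
 * // b advances in steps of 9 (cDefaultMetaBitsSize)
 * final long blockStart = convertAddr(m_metaBits[b]);
 * final int startBit = (b * 32) + 32; // skip the 32 bits of the address int
 * // a set bit i in [startBit, startBit + 8 * 32) identifies an allocator at:
 * final long addr = blockStart + ((i - startBit) * ALLOC_BLOCK_SIZE);
 * </pre>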
 * <p>
 * Searching thousands of allocation blocks to find storage is not efficient,
 * but by utilizing roving pointers and sorting blocks with free space available
 * this can be made very efficient.
 * <p>
 * In order to provide optimum use of bitmaps, this implementation will NOT use
 * the BitSet class.
 * <p>
 * Using the meta-allocation bits, it is straightforward to load ALL the
 * allocation headers. A total of (say) 100 allocation headers might provide up
 * to 4000 allocations each -> 400 000 objects, while 1000 headers -> 4m objects
 * and 2000 -> 8m objects.
 * <p>
 * The allocators are split into a set of FixedAllocators and then
 * BlobAllocation. The FixedAllocators will allocate from 128 to 32K objects,
 * with a minimum block allocation of 64K, and a minimum bit number per block of
 * 32.
 * <p>
 * Where possible lists and roving pointers will be used to minimize searching
 * of the potentially large structures.
 * <p>
 * Since the memory is allocated on (at least) a 128 byte boundary, there is
 * some leeway on storing the address. Added to the address is the shift
 * required to reach the "standard" 128 byte block, e.g. blocksize = 128 <<
 * (addr % 8)
 * <p>
 * NB Useful method on RandomAccessFile.setLength(newLength)
 * <p>
 * When session data is preserved two things must happen - the allocators must
 * not reallocate data that has been freed in this session, or more clearly can
 * only free data that has been allocated in this session. That should be it.
 * <p>
 * The ALLOC_SIZES table is the Fibonacci sequence. We multiply by 64 bytes to
 * get actual allocation block sizes. We then allocate bits based on 8K
 * allocation rounding and 32 bits at a time allocation. Note that 4181 * 64 =
 * 267,584 and 256K is 262,144
 * <p>
 * All data is checksummed, both allocated/saved data and the allocation blocks.
 * <p>
 * BLOB allocation is not handled using chained data buffers but with a blob
 * header record. This is indicated with a BlobAllocator that provides indexed
 * offsets to the header record (the address encodes the BlobAllocator and the
 * offset to the address). The header record stores the number of component
 * allocations and the address of each.
 * <p>
 * This approach makes for much more efficient freeing/re-allocation of Blob
 * storage, in particular avoiding the need to read in the component blocks to
 * determine chained blocks for freeing. This is particularly important for
 * larger stores where a disk cache could be flushed through simply freeing BLOB
 * allocations.
 * <h2>
 * Deferred Free List</h2>
 * <p>
 * The previous implementation has been amended to associate a single set of
 * deferredFree blocks with each CommitRecord. The CommitRecordIndex will then
 * provide access to the CommitRecords to support the deferred freeing of
 * allocations based on age/earliestTxReleaseTime.
 * <p>
 * The last release time processed is held with the MetaAllocation data.
 *
 * @author Martyn Cutcher
 *
 * FIXME Release checklist:
 *       <p>
 *       Add metabits header record checksum field and verify on read back.
 *       <p>
 *       Done. Checksum fixed allocators (needs to be tested on read back).
 *       <p>
 *       Done. Add version field to the fixed allocator.
 *       <p>
 *       Done. Checksum delete blocks / blob records.
 *       <p>
 *       PSOutputStream - remove caching logic. It is unused and makes this
 *       class much more complex. A separate per-RWStore caching class for
 *       recycling PSOutputStreams can be added later.
 *       <p>
 *       Modify FixedAllocator to use arrayCopy() rather than clone and
 *       declare more fields to be final. See notes on {@link AllocBlock}.
 *       <p>
 *       Done. Implement logic to "abort" a shadow allocation context.
 *       <p>
 *       Unit test to verify that we do not recycle allocations from the last
 *       commit point even when the retention time is zero such that it is
 *       always possible to re-open the store from the alternative root block
 *       even after you have allocated things against the current root block
 *       (but not yet committed).
 *       <p>
 *       Read-only mode.
 *       <p>
 *       Unit tests looking for persistent memory leaks (e.g., all allocated
 *       space can be reclaimed).
 */
public class RWStore implements IStore, IBufferedWriter, IBackingReader {

    private static final transient Logger log = Logger.getLogger(RWStore.class);

    /**
     * @see http://sourceforge.net/apps/trac/bigdata/ticket/443 (Logger for
     *      RWStore transaction service and recycler)
     */
    private static final Logger txLog = Logger.getLogger("com.bigdata.txLog");

    /**
     * Options understood by the {@link RWStore}.
     */
    public interface Options {

        /**
         * Option defines the Allocation block sizes for the RWStore. The values
         * defined are multiplied by 64 to provide the actual allocations. The
         * list of allocations should be ',' delimited and in increasing order.
         * This array is written into the store so changing the values does not
         * break older stores. For example,
         *
         * <pre>
         * "1,2,4,8,16,32,64"
         * </pre>
         *
         * defines allocations from 64 to 4K in size. It is a good idea to define
         * block sizes on 4K boundaries as soon as possible to optimize IO. This
         * is particularly relevant for SSDs. A 1K boundary is expressed as
         * <code>16</code> in the allocation sizes, so a 4K boundary is
         * expressed as <code>64</code> and an 8k boundary as <code>128</code>.
         * <p>
         * The default allocations are {@value #DEFAULT_ALLOCATION_SIZES}.
         *
         * @see #DEFAULT_ALLOCATION_SIZES
         */
        String ALLOCATION_SIZES = RWStore.class.getName() + ".allocationSizes";

        /**
         * Note: The default allocation sizes SHOULD NOT provide for allocation
         * slots larger than an 8k page. This can lead to large allocation slots
         * when a B+Tree index is sparsely populated (less efficient prefix
         * compression) followed by a gradual reduction in the average page size
         * with the net effect that large allocators become unused and turn into
         * wasted and unrecoverable space on the backing file. Keeping to an 8k
         * maximum allocation slot size means that we have to do a few more IOs
         * if the page exceeds the 8k boundary, but we never wind up with those
         * large and (mostly) unused allocators. The B+Tree branching factors
         * should be tuned to target perhaps 80% of an 8k page in order to have
         * only a small number of pages that spill over into blobs.
         *
         * TODO: We should consider a more adaptable BLOB approach where we
         * specify the maximum "slop" in an allocation as the means to determine
         * a blob boundary. So, for example, a 5.5K allocation, with maximum slop of
         * 1K, would be allocated as a blob of 4K + 2K and not an 8K slot.
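         * <p>
         * A small sketch of that idea (hypothetical names; this policy is not
         * implemented here):
         *
         * <pre>
         * final int maxSlop = 1024;   // 1K maximum acceptable slop (hypothetical)
         * final int request = 5632;   // 5.5K allocation request
         * final int slot = 8192;      // smallest slot that would fit
         * final boolean asBlob = (slot - request) > maxSlop; // 2560 > 1024 -> blob
         * </pre>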
         *
         * @see #ALLOCATION_SIZES
         */
        String DEFAULT_ALLOCATION_SIZES = "1, 2, 3, 5, 8, 12, 16, 32, 48, 64, 128";
        // String DEFAULT_ALLOCATION_SIZES = "1, 2, 3, 5, 8, 12, 16, 32, 48, 64, 128, 192, 320, 512, 832, 1344, 2176, 3520";
        // String DEFAULT_ALLOCATION_SIZES = "1, 2, 3, 5, 8, 12, 16, 32, 48, 64, 128, 192, 320, 512";
        // private static final int[] DEFAULT_ALLOC_SIZES = { 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181 };
        // private static final int[] ALLOC_SIZES = { 1, 2, 4, 8, 16, 32, 64, 128 };

        /**
         * Option defines the initial size of the meta bits region and affects
         * how rapidly this region will grow (default
         * {@value #DEFAULT_META_BITS_SIZE}).
         * <p>
         * Note: A value of <code>9</code> may be used to stress the logic which
         * is responsible for the growth in the meta bits region.
         * <p>
         * This has now been deprecated since it adds complexity with no
         * significant benefit.
         */
        @Deprecated
        String META_BITS_SIZE = RWStore.class.getName() + ".metaBitsSize";

        @Deprecated
        String DEFAULT_META_BITS_SIZE = "9";

        /**
         * Defines whether the metabits should be allocated an explicit
         * demispace or, if not, to use a standard Allocation (the default,
         * which limits the metabits size to the maximum FixedAllocator slot
         * size).
         * <p>
         * The value should be either "true" or "false".
         */
        String META_BITS_DEMI_SPACE = RWStore.class.getName() + ".metabitsDemispace";

        String DEFAULT_META_BITS_DEMI_SPACE = "false";

        /**
         * Defines whether blobs, which are stored in multiple slot locations,
         * are read concurrently using Async NIO. This was introduced
         * specifically to reduce commit latency in scenarios where large
         * transactions can lead to very large deferred free lists (>> 10
         * million addresses), stored as blobs.
         * <p>
         * BLZG-1884 indicated a possible problem with this approach. The root
         * causes of that problem (poor handling of exceptions) have been dealt
         * with. This option was also introduced so the async IO support can
         * now be disabled if a problem does materialize.
         */
        String READ_BLOBS_ASYNC = RWStore.class.getName() + ".readBlobsAsync";

        /**
         * Note: Windows does not handle async IO channel reopens in the same
         * fashion as Linux, leading to "overlapping file exceptions" and other
         * weirdness. Therefore this option is explicitly disabled by default
         * on Windows.
         *
         * @see https://jira.blazegraph.com/browse/BLZG-1911 (Blazegraph 2.1
         *      version does not work on Windows (async IO causes file lock
         *      errors))
         */
        String DEFAULT_READ_BLOBS_ASYNC = SystemUtil.isWindows() ? "false" : "true";

        /**
         * Defines the number of bits that must be free in a FixedAllocator for
         * it to be added to the free list. This is used to ensure a level
         * of locality when making large numbers of allocations within a single
         * commit.
         * <p>
         * The value should be >= 1 and <= 5000.
         */
        String FREE_BITS_THRESHOLD = RWStore.class.getName() + ".freeBitsThreshold";

        String DEFAULT_FREE_BITS_THRESHOLD = "300";

        /**
         * Defines the slot size at or below which a slot is considered a small
         * slot.
         * <p>
         * Any slot equal to or less than this is considered a small slot and
         * its availability for allocation is restricted to ensure a high
         * chance that contiguous allocations can be made.
         * <p>
         * This is arranged by only returning small slot allocators to the free list
         * if they have greater than 50% available slots, and then only allocating
         * slots from sparse regions with >= 50% free/committed bits.
         * <p>
         * Small slot processing can be disabled by setting the smallSlotType to
         * zero.
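         * <p>
         * For example, in a properties file (the key is the fully qualified
         * option name; the value below simply repeats the default for
         * illustration):
         *
         * <pre>
         * com.bigdata.rwstore.RWStore.smallSlotType=1024
         * </pre>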
         */
        String SMALL_SLOT_TYPE = RWStore.class.getName() + ".smallSlotType";

        /**
         * Enable the small slot optimization by default.
         *
         * @see BLZG-1596 (Enable small slot optimization by default)
         */
        String DEFAULT_SMALL_SLOT_TYPE = "1024"; // standard default
        // String DEFAULT_SMALL_SLOT_TYPE = "0"; // initial default to no special processing

        /**
         * The #of bits required to be free in a "small slot" allocator before
         * it is automatically returned to the free list. Once the small slot waste
         * threshold comes into play, the small slot allocator for a given slot size
         * having the maximum free bits will be automatically returned to the free
         * list if the percentage of waste in that slot size exceeds a threshold.
         *
         * @see BLZG-1278 (Implement maximum waste policy for small slot allocators)
         */
        String SMALL_SLOT_THRESHOLD = RWStore.class.getName() + ".smallSlotThreshold";

        String DEFAULT_SMALL_SLOT_THRESHOLD = "4096"; // 50% of available bits

        /**
         * We have introduced extra parameters to adjust allocator usage if we notice that
         * a significant amount of storage is wasted.
         * <p>
         * First we check how many allocators of a given slot size have been created. If
         * above {@value #SMALL_SLOT_WASTE_CHECK_ALLOCATORS} then we look a little closer.
         * <p>
         * We retrieve the allocation statistics and determine if the waste threshold is
         * exceeded, as determined by {@link #SMALL_SLOT_HIGH_WASTE}.
         * <p>
         * If so, then we attempt to find an available allocator with more free bits, as
         * determined by the high-waste sparsity threshold derived in the {@link RWStore}
         * constructor.
         *
         * @see BLZG-1278 (Implement maximum waste policy for small slot allocators)
         */
        String SMALL_SLOT_WASTE_CHECK_ALLOCATORS = RWStore.class.getName() + ".smallSlotWasteCheckAllocators";

        String DEFAULT_SMALL_SLOT_WASTE_CHECK_ALLOCATORS = "100"; // Check waste when more than 100 allocators

        /**
         * Once there are at least {@link #SMALL_SLOT_WASTE_CHECK_ALLOCATORS}
         * allocators for a given slot size, then the {@link #SMALL_SLOT_HIGH_WASTE}
         * option specifies the maximum percentage of waste that will be allowed for
         * that slot size. This prevents the amount of waste for small slot
         * allocators from growing significantly as the size of the backing
         * store increases.
         * <p>
         * The dynamic policy for small slots can be thought of as follows.
         * <dl>
         * <li>A normal allocator will be dropped onto the free list once it has
         * {@link #FREE_BITS_THRESHOLD} bits free (default 300 bits out of 8192
         * = 3.6%).</li>
         * <li>For a new store, a small slot allocator will be dropped onto the
         * free list once it has {@link #SMALL_SLOT_THRESHOLD} bits free
         * (default 4096 bits out of 8192 = 50%).</li>
         * <li>Once the #of small slot allocators for a given slot size
         * exceeds the {@link #DEFAULT_SMALL_SLOT_WASTE_CHECK_ALLOCATORS}, a
         * small slot allocator will be dropped onto the free list once it is
         * {@link #SMALL_SLOT_HIGH_WASTE} percent free (this amounts to 1638
         * bits out of 8192).</li>
         * </dl>
         * Thus, the small slot allocators initially are created freely because
         * they need to be highly sparse before they can be on the free list.
         * Once we have "enough" small slot allocators, we create them less
         * freely - this is achieved by changing the sparsity threshold to a
         * value that still requires the small slot allocator to be
         * significantly more sparse than a general purpose allocator.
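         * <p>
         * The bit threshold implied by the percentage is derived in the
         * {@link RWStore} constructor; the arithmetic amounts to:
         *
         * <pre>
         * // 8192 bits per allocator; a 20% high waste setting maps to
         * final int thresholdBits = (int) (20.0f * 8192 / 100); // = 1638
         * </pre>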
         *
         * @see BLZG-1278 (Implement maximum waste policy for small slot
         *      allocators)
         */
        String SMALL_SLOT_HIGH_WASTE = RWStore.class.getName() + ".smallSlotHighWaste";

        String DEFAULT_SMALL_SLOT_HIGH_WASTE = "20.0f"; // 1638 bits: 20% waste, less than 80% usage

        /**
         * When <code>true</code>, scattered writes which are strictly ascending
         * will be coalesced within a buffer and written out as a single IO
         * (default {@value #DEFAULT_DOUBLE_BUFFER_WRITES}). This improves write
         * performance for SATA, SAS, and even SSD.
         */
        String DOUBLE_BUFFER_WRITES = RWStore.class.getName() + ".doubleBuffer";

        String DEFAULT_DOUBLE_BUFFER_WRITES = "true";

        // /**
        //  * When <code>true</code> fills recycled storage with a recognizable
        //  * byte pattern.
        //  */
        // String OVERWRITE_DELETE = RWStore.class.getName() + ".overwriteDelete";
        //
        // String DEFAULT_OVERWRITE_DELETE = "false";
        //
        // /**
        //  * When <code>true</code> the RWStore will protect any address from
        //  * recycling, and generate an exception if the address is subsequently
        //  * accessed
        //  */
        // String MAINTAIN_BLACKLIST = RWStore.class.getName() + ".maintainBlacklist";
        //
        // String DEFAULT_MAINTAIN_BLACKLIST = "false";

    }

    /*
     * Error messages.
     */
    private static final String ERR_WRITE_CACHE_CREATE = "Unable to create write cache service";

    /**
     * The fixed size of any allocator on the disk in bytes. The #of allocations
     * managed by an allocator is this value times 8 because each slot uses one
     * bit in the allocator. When an allocator is allocated, the space on the
     * persistent heap is reserved for all slots managed by that allocator.
     * However, the {@link FixedAllocator} only incrementally allocates the
     * {@link AllocBlock}s.
     */
    static private final int ALLOC_BLOCK_SIZE = 1024;

    // // from 32 bits, need 13 to hold max offset of 8 * 1024, leaving 19 for number of blocks: 256K
    // static final int BLOCK_INDEX_BITS = 19;

    /**
     * The #of low bits in a latched address that encode the offset of the bit
     * in a {@link FixedAllocator}. The {@link FixedAllocator} will map the bit
     * onto an allocation slot.
     * <p>
     * The high bits of the latched address are the index of the
     * {@link FixedAllocator}. The index of the {@link FixedAllocator} is the
     * order in which it was created. This is used to index into
     * {@link #m_allocs}, which are the {@link FixedAllocator}s.
     */
    static final int OFFSET_BITS = 13;
    static final int OFFSET_BITS_MASK = 0x1FFF; // was 0xFFFF

    static final int ALLOCATION_SCALEUP = 16; // multiplier to convert allocations based on minimum allocation of 64k
    static private final int META_ALLOCATION = 8; // 8 * 32K is size of meta Allocation

    // If required, then allocate 1M direct buffers
    private static final int cDirectBufferCapacity = 1024 * 1024;

    private int cMaxDirectBuffers = 20; // 20M of direct buffers
    static final int cDirectAllocationOffset = 64 * 1024;

    // ///////////////////////////////////////////////////////////////////////////////////////
    // RWStore Data
    // ///////////////////////////////////////////////////////////////////////////////////////

    private final File m_fd;
    // private RandomAccessFile m_raf;
    // protected FileMetadata m_metadata;
    // protected int m_transactionCount;
    // private boolean m_committing;

    // /**
    //  * When <code>true</code> the allocations will not actually be recycled
    //  * until after a store restart. When <code>false</code>, the allocations are
    //  * recycled once they satisfy the history retention requirement.
    //  */
    // private boolean m_preserveSession = false;
    // private boolean m_readOnly;

    /**
     * The UUID of the backing store.
     *
     * @see #initfromRootBlock(IRootBlockView)
     * @see IRawStore#getUUID()
     */
    private UUID m_storeUUID;

    /**
     * Lists of total alloc blocks.
     *
     * @todo examine concurrency and lock usage for {@link #m_allocs} and the
     *       rest of these lists.
     */
    private final ArrayList<FixedAllocator> m_allocs;

    /**
     * A fixed length array of lists of free {@link FixedAllocator}s with one
     * entry in the array for each configured allocator size. An allocator is
     * put onto this free list when it is initially created. When the store is
     * opened, it will be added to this list if {@link Allocator#hasFree()}
     * returns true. It will be removed when it has no free space remaining. It
     * will be added back to the free list when its free slots exceed a
     * configured threshold.
     */
    private ArrayList<FixedAllocator> m_freeFixed[];

    // /** lists of free blob allocators. */
    // private final ArrayList<BlobAllocator> m_freeBlobs;

    /** lists of blocks requiring commitment. */
    // private final ArrayList<FixedAllocator> m_commitList;
    FixedAllocator m_commitHead;
    FixedAllocator m_commitTail;

    // private WriteBlock m_writes;

    private final Quorum<?,?> m_quorum;

    /**
     * The #of buffers that will be used by the {@link WriteCacheService}.
     *
     * @see com.bigdata.journal.Options#WRITE_CACHE_BUFFER_COUNT
     */
    private final int m_writeCacheBufferCount;

    /**
     * @see com.bigdata.journal.Options#WRITE_CACHE_MIN_CLEAN_LIST_SIZE
     */
    private final int m_minCleanListSize;

    /**
     * The #of read buffers that will be used by the {@link WriteCacheService}.
     *
     * @see com.bigdata.journal.Options#READ_CACHE_BUFFER_COUNT
     */
    private final int m_readCacheBufferCount;

    /**
     * @see com.bigdata.journal.Options#WRITE_CACHE_COMPACTION_THRESHOLD
     */
    private final int m_compactionThreshold;

    /**
     * @see com.bigdata.journal.Options#HOT_CACHE_THRESHOLD
     */
    private final int m_hotCacheThreshold;

    /**
     * @see com.bigdata.journal.Options#HOT_CACHE_SIZE
     */
    private final int m_hotCacheSize;

    /**
     * The key for the {@link CompressorRegistry} which identifies the
     * {@link IRecordCompressor} to be applied (optional).
     *
     * @see com.bigdata.journal.Options#HALOG_COMPRESSOR
     */
    private final String m_compressorKey;

    /**
     * Note: This is not final because we replace the {@link WriteCacheService}
     * during {@link #reset(long)} in order to propagate the then current quorum
     * token to the {@link WriteCacheService}.
     */
    private RWWriteCacheService m_writeCacheService;

    /**
     * Return the then current {@link WriteCacheService} object.
     *
     * @see IHABufferStrategy#getWriteCacheService()
     */
    public RWWriteCacheService getWriteCacheService() {
        m_allocationReadLock.lock();
        try {
            return m_writeCacheService;
        } finally {
            m_allocationReadLock.unlock();
        }
    }

    /**
     * The actual allocation sizes as read from the store.
     *
     * @see #DEFAULT_ALLOCATION_SIZES
     */
    private int[] m_allocSizes;

    /**
     * The maximum allocation size (bytes).
     */
    final int m_maxFixedAlloc;

    /**
     * The minimum allocation size (bytes).
     */
    final int m_minFixedAlloc;

    /**
     * We allow blob headers so the maximum blob size is Integer.MAX_VALUE.
     */
    final int m_maxBlobAllocSize = Integer.MAX_VALUE;

    /**
     * This lock is used to exclude readers/writers performing IOs against the
     * backing file when the extent of the backing file is about to be changed.
     * Readers and writers take the {@link ReadLock}. The {@link WriteLock} is
     * taken when the file extent must be changed. This is a workaround for an
     * old (and unresolved as of February 2010) Sun bug.
     * <p>
     * Note: Any public method that ONLY takes the extensionLock MUST NOT make
     * calls that could take the {@link #m_allocationLock}.
     * This would cause a lock ordering problem. If both locks must be taken,
     * then the {@link #m_allocationLock} MUST be taken first.
     *
     * @see http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=6371642
     * @see #m_allocationLock
     */
    final private ReentrantReadWriteLock m_extensionLock = new ReentrantReadWriteLock();

    /**
     * An explicit allocation lock supports exclusive access for allocator
     * mutation and shared access for readers.
     * <p>
     * Note: You must hold the {@link #m_allocationReadLock} to read the
     * allocators.
     * <p>
     * Note: You must hold the {@link #m_allocationWriteLock} while allocating
     * or clearing allocations.
     * <p>
     * Note: It is only when an allocation triggers a file extension that the
     * {@link WriteLock} of the {@link #m_extensionLock} needs to be taken.
     *
     * TODO: There is scope to take advantage of the different allocator sizes
     * and provide allocation locks on the fixed allocators. We will still need
     * a store-wide allocation lock when creating new allocation areas, but
     * significant contention may be avoided.
     */
    final private ReentrantReadWriteLock m_allocationLock = new ReentrantReadWriteLock();

    /**
     * Lock used for exclusive access to the allocators.
     * <p>
     * Note: Historically, this lock was only required for mutation and readers
     * did not contend for a lock.
     */
    final private WriteLock m_allocationWriteLock = m_allocationLock.writeLock();

    /**
     * Lock used for shared access to allocators.
     * <p>
     * Note: Historically the allocators were unprotected for shared access
     * (readers) and protected by a single lock for mutation (writes). Shared
     * access by readers was safe since (a) old allocators were never replaced;
     * and (b) readers had access only to committed data.
     * <p>
     * This situation was changed when the {@link #postHACommit(IRootBlockView)}
     * method was introduced since it could replace allocators in a manner that
     * was not safe for shared access by readers. Methods that were historically
     * using unprotected shared access now require protected shared access using
     * this lock.
     *
     * @see #postHACommit(IRootBlockView)
     * @see #getData(long, int)
     * @see #getData(long, byte[])
     * @see #getData(long, byte[], int, int)
     */
    final private ReadLock m_allocationReadLock = m_allocationLock.readLock();

    /**
     * The deferredFreeList is simply an array of releaseTime,freeListAddrs
     * stored at commit.
     * <p>
     * Note that when the deferredFreeList is saved, ONLY the freeListAddrs
     * are stored, NOT the releaseTime. This is because on any open of
     * the store, all deferredFrees can be released immediately. This
     * mechanism may be changed in the future to enable explicit history
     * retention, but if so a different header structure would be used since
     * it would not be appropriate to retain a simple header linked to
     * thousands if not millions of commit points.
     */
    // *
    // * If the current txn list exceeds the MAX_DEFERRED_FREE then it is
    // * incrementally saved and a new list begun. The master list itself
    // * serves as a BLOB header when there is more than a single entry with
    // * the same txReleaseTime.
    // private static final int MAX_DEFERRED_FREE = 4094; // fits in 16k block
    private final long m_minReleaseAge;

    /**
     * The #of open transactions (read-only or read-write).
     *
     * This is guarded by the {@link #m_allocationLock}.
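     * <p>
     * A sketch of the guarded update pattern (illustrative only; see the
     * actual transaction entry/exit methods for the real usage):
     *
     * <pre>
     * m_allocationWriteLock.lock();
     * try {
     *     m_activeTxCount++;
     * } finally {
     *     m_allocationWriteLock.unlock();
     * }
     * </pre>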
     */
    private int m_activeTxCount = 0;

    private volatile long m_lastDeferredReleaseTime = 0L;
    // private final ArrayList<Integer> m_currentTxnFreeList = new ArrayList<Integer>();
    private final PSOutputStream m_deferredFreeOut;

    /**
     * Used to transparently re-open the backing channel if it has been closed
     * by an interrupt during an IO.
     */
    private final ReopenFileChannel m_reopener;

    private volatile BufferedWrite m_bufferedWrite;

    /**
     * Our StorageStats objects.
     */
    private StorageStats m_storageStats;
    private long m_storageStatsAddr = 0;

    /**
     * <code>true</code> iff the backing store is open.
     */
    private volatile boolean m_open = true;

    // /**
    //  * If m_blacklist is non-null then a request to blacklist as address will
    //  * add the address to the blacklist.
    //  *
    //  * When a blacklisted address is freed and is re-allocated, the re-allocation
    //  * is intercepted (see alloc()), the address is locked and a new allocation is made.
    //  *
    //  * The purpose of the blacklist is to trap erroneous references to an
    //  * address that is retained (and used) after it should be.
    //  */
    // private ConcurrentHashMap<Integer, String> m_blacklist = null;
    private ConcurrentHashMap<Integer, Long> m_lockAddresses = null;

    class WriteCacheImpl extends WriteCache.FileChannelScatteredWriteCache {

        final private String compressorKey;

        public WriteCacheImpl(final IBufferAccess buf, final boolean useChecksum,
                final boolean bufferHasData,
                final IReopenChannel<FileChannel> opener, final long fileExtent,
                final String compressorKey) throws InterruptedException {

            super(buf, useChecksum, m_quorum != null /*&& m_quorum.isHighlyAvailable()*/,
                    bufferHasData, opener, fileExtent, m_bufferedWrite);

            this.compressorKey = compressorKey;
        }

        @Override
        public String getCompressorKey() {
            return compressorKey;
        }

        /**
         * {@inheritDoc}
         * <p>
         * Note: The performance counters for writes to the disk are reported by
         * the {@link WriteCacheService}. The {@link RWStore} never writes
         * directly onto the disk (other than the root blocks).
         */
        @Override
        protected boolean writeOnChannel(final ByteBuffer data,
                final long firstOffsetignored,
                final Map<Long, RecordMetadata> recordMap, final long nanos)
                throws InterruptedException, IOException {

            final Lock readLock = m_extensionLock.readLock();
            readLock.lock();
            try {
                final boolean ret = super.writeOnChannel(data, firstOffsetignored,
                        recordMap, nanos);
                return ret;
            } finally {
                readLock.unlock();
            }
        }

        // Added to enable debug of rare problem
        // FIXME: disable by removal once solved
        protected void registerWriteStatus(long offset, int length, char action) {
            m_writeCacheService.debugAddrs(offset, length, action);
        }

        @Override
        protected void addAddress(int latchedAddr, int size) {
            // No longer valid
            // RWStore.this.addAddress(latchedAddr, size);
        }

        @Override
        protected void removeAddress(int latchedAddr) {
            // No longer valid
            // RWStore.this.removeAddress(latchedAddr);
        }

    };

    /**
     * The ALLOC_SIZES must be initialized from either the file or the
     * properties associated with the fileMetadata.
     *
     * @param fileMetadata
     * @param quorum
     *
     * @todo support read-only open.
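     * <p>
     * A minimal construction sketch (assuming a FileMetadata prepared by the
     * owning journal; a <code>null</code> quorum means the store is not
     * highly available):
     *
     * <pre>
     * final RWStore store = new RWStore(fileMetadata, null); // null quorum
     * store.close();
     * </pre>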
*/ public RWStore(final FileMetadata fileMetadata, final Quorum<?, ?> quorum) { if (fileMetadata == null) throw new IllegalArgumentException(); this.m_minReleaseAge = Long.valueOf(fileMetadata.getProperty( AbstractTransactionService.Options.MIN_RELEASE_AGE, AbstractTransactionService.Options.DEFAULT_MIN_RELEASE_AGE)); if (log.isInfoEnabled()) log.info(AbstractTransactionService.Options.MIN_RELEASE_AGE + "=" + m_minReleaseAge); // Remove parameterisation, we want to use fixed Allocator block sizing // there is no significant advantage to parameterize this since file cache // locality is handled by size of the allocation - 256K is a reasonable // number as 32 * 8 * 1K size. // // Equally there is no benefit to increasing the size of the Allocators beyond // 1K. // cDefaultMetaBitsSize = Integer.valueOf(fileMetadata.getProperty( // Options.META_BITS_SIZE, // Options.DEFAULT_META_BITS_SIZE)); // cDefaultMetaBitsSize = 9; // if (cDefaultMetaBitsSize < 9) // throw new IllegalArgumentException(Options.META_BITS_SIZE // + " : Must be GTE 9"); m_metaBitsSize = cDefaultMetaBitsSize; m_useMetabitsDemispace = Boolean.valueOf(fileMetadata.getProperty( Options.META_BITS_DEMI_SPACE, Options.DEFAULT_META_BITS_DEMI_SPACE)); cDefaultFreeBitsThreshold = Integer.valueOf(fileMetadata.getProperty( Options.FREE_BITS_THRESHOLD, Options.DEFAULT_FREE_BITS_THRESHOLD)); if (cDefaultFreeBitsThreshold < 1 || cDefaultFreeBitsThreshold > 5000) { throw new IllegalArgumentException(Options.FREE_BITS_THRESHOLD + " : Must be between 1 and 5000"); } m_readBlobsAsync = Boolean.valueOf(fileMetadata.getProperty( Options.READ_BLOBS_ASYNC, Options.DEFAULT_READ_BLOBS_ASYNC)); cSmallSlot = Integer.valueOf(fileMetadata.getProperty( Options.SMALL_SLOT_TYPE, Options.DEFAULT_SMALL_SLOT_TYPE)); cSmallSlotThreshold = Integer.valueOf(fileMetadata.getProperty( Options.SMALL_SLOT_THRESHOLD, Options.DEFAULT_SMALL_SLOT_THRESHOLD)); cSmallSlotWasteCheckAllocators = Integer.valueOf(fileMetadata.getProperty( Options.SMALL_SLOT_WASTE_CHECK_ALLOCATORS, Options.DEFAULT_SMALL_SLOT_WASTE_CHECK_ALLOCATORS)); cSmallSlotHighWaste = Float.valueOf(fileMetadata.getProperty( Options.SMALL_SLOT_HIGH_WASTE, Options.DEFAULT_SMALL_SLOT_HIGH_WASTE)); // cSmallSlotThresholdHighWaste = Integer.valueOf(fileMetadata.getProperty( // Options.SMALL_SLOT_THRESHOLD_HIGH_WASTE, // Options.DEFAULT_SMALL_SLOT_THRESHOLD_HIGH_WASTE)); /* * The highWasteThreshold is more sensibly calculated from * the high waste value. */ cSmallSlotThresholdHighWaste = (int) (cSmallSlotHighWaste * 8192 / 100); if (cSmallSlot < 0 || cSmallSlot > 2048) { throw new IllegalArgumentException(Options.SMALL_SLOT_TYPE + " : Must be between 0 and 2048"); } m_metaBits = new int[m_metaBitsSize]; m_metaTransientBits = new int[m_metaBitsSize]; m_quorum = quorum; m_fd = fileMetadata.file; // initialize striped performance counters for this store. 
this.storeCounters.set(new StoreCounters(10/* batchSize */)); final IRootBlockView m_rb = fileMetadata.rootBlock; m_allocs = new ArrayList<FixedAllocator>(); // m_freeBlobs = new ArrayList<BlobAllocator>(); try { final RandomAccessFile m_raf = fileMetadata.getRandomAccessFile(); m_reopener = new ReopenFileChannel(m_fd, m_raf, fileMetadata.readOnly); } catch (IOException e1) { throw new RuntimeException(e1); } if (Boolean.valueOf(fileMetadata.getProperty( Options.DOUBLE_BUFFER_WRITES, Options.DEFAULT_DOUBLE_BUFFER_WRITES))) { try { m_bufferedWrite = new BufferedWrite(this); } catch (InterruptedException e1) { m_bufferedWrite = null; } } else { m_bufferedWrite = null; } m_writeCacheBufferCount = fileMetadata.writeCacheBufferCount; m_readCacheBufferCount = Integer.valueOf(fileMetadata.getProperty( com.bigdata.journal.Options.READ_CACHE_BUFFER_COUNT, com.bigdata.journal.Options.DEFAULT_READ_CACHE_BUFFER_COUNT)); if (log.isInfoEnabled()) log.info(com.bigdata.journal.Options.WRITE_CACHE_BUFFER_COUNT + "=" + m_writeCacheBufferCount); this.m_minCleanListSize = Integer.valueOf(fileMetadata.getProperty( com.bigdata.journal.Options.WRITE_CACHE_MIN_CLEAN_LIST_SIZE, com.bigdata.journal.Options.DEFAULT_WRITE_CACHE_MIN_CLEAN_LIST_SIZE)); if (log.isInfoEnabled()) log.info(com.bigdata.journal.Options.WRITE_CACHE_MIN_CLEAN_LIST_SIZE + "=" + m_minCleanListSize); this.m_compactionThreshold = Double.valueOf(fileMetadata.getProperty( com.bigdata.journal.Options.WRITE_CACHE_COMPACTION_THRESHOLD, com.bigdata.journal.Options.DEFAULT_WRITE_CACHE_COMPACTION_THRESHOLD)).intValue(); if (log.isInfoEnabled()) log.info(com.bigdata.journal.Options.WRITE_CACHE_COMPACTION_THRESHOLD + "=" + m_compactionThreshold); this.m_hotCacheThreshold = Double.valueOf(fileMetadata.getProperty( com.bigdata.journal.Options.HOT_CACHE_THRESHOLD, com.bigdata.journal.Options.DEFAULT_HOT_CACHE_THRESHOLD)).intValue(); if (log.isInfoEnabled()) log.info(com.bigdata.journal.Options.HOT_CACHE_THRESHOLD + "=" + m_hotCacheThreshold); this.m_hotCacheSize = Double.valueOf(fileMetadata.getProperty( com.bigdata.journal.Options.HOT_CACHE_SIZE, com.bigdata.journal.Options.DEFAULT_HOT_CACHE_SIZE)).intValue(); if (log.isInfoEnabled()) log.info(com.bigdata.journal.Options.HOT_CACHE_SIZE + "=" + m_hotCacheSize); this.m_compressorKey = fileMetadata.getProperty( com.bigdata.journal.Options.HALOG_COMPRESSOR, com.bigdata.journal.Options.DEFAULT_HALOG_COMPRESSOR); if (log.isInfoEnabled()) log.info(com.bigdata.journal.Options.HALOG_COMPRESSOR + "=" + m_compressorKey); // m_writeCache = newWriteCache(); try { if (m_rb.getNextOffset() == 0) { // if zero then new file setAllocations(fileMetadata); /* * FIXME Martyn, the code paths here are crazy complicated. * defaultInit() is also invoked from initFromRootBlock(). * Simplify this. 
BBT */ m_storeUUID = m_rb.getUUID(); defaultInit(); m_maxFixedAlloc = m_allocSizes[m_allocSizes.length-1]*64; m_minFixedAlloc = m_allocSizes[0]*64; m_storageStats = new StorageStats(m_allocSizes); // // Check for overwrite option and set overwrite buffer if // // required // if (Boolean.valueOf(fileMetadata.getProperty( // Options.OVERWRITE_DELETE, // Options.DEFAULT_OVERWRITE_DELETE))) { // m_writeCache.setOverwriteBuffer(m_maxFixedAlloc); // } } else { initfromRootBlock(m_rb); m_maxFixedAlloc = m_allocSizes[m_allocSizes.length-1]*64; m_minFixedAlloc = m_allocSizes[0]*64; if (m_storageStatsAddr != 0) { final long statsAddr = m_storageStatsAddr >> 16; final int statsLen = ((int) m_storageStatsAddr) & 0xFFFF; final byte[] stats = new byte[statsLen + 4]; // allow for checksum getData(statsAddr, stats); final DataInputStream instr = new DataInputStream(new ByteArrayInputStream(stats)); m_storageStats = new StorageStats(instr); for (FixedAllocator fa: m_allocs) { m_storageStats.register(fa); } } else { m_storageStats = new StorageStats(m_allocSizes); } if (log.isTraceEnabled()) { final StringBuilder str = new StringBuilder(); this.showAllocators(str); log.trace(str); } } // Maximum theoretically addressable file size is determined by the // maximum allocator slot size multiplied by Integer.MAX_VALUE // FIXME: do we want to constrain this as a system property? m_maxFileSize = ((long) Integer.MAX_VALUE) * m_maxFixedAlloc; // setup write cache AFTER init to ensure filesize is correct! m_writeCacheService = newWriteCacheService(); final int maxBlockLessChk = m_maxFixedAlloc-4; assert m_maxFixedAlloc > 0; m_deferredFreeOut = PSOutputStream.getNew(this, m_maxFixedAlloc, null); // if (Boolean.valueOf(fileMetadata.getProperty( // Options.MAINTAIN_BLACKLIST, // Options.DEFAULT_MAINTAIN_BLACKLIST))) { // m_blacklist = new ConcurrentHashMap<Integer, String>(); // m_lockAddresses = new ConcurrentHashMap<Integer, Long>(); // } } catch (IOException e) { throw new StorageTerminalError("Unable to initialize store", e); } } /** * Called from WriteCache.resetRecordMapFromBuffer * * If a FixedAllocator already exists for the address then just set the * address as active, otherwise, create a new allocator and try again, which * should work second time around if we are correctly in sync. * * @param latchedAddr * The latched address. * @param size * The size of the application data -or- <code>-size</code> if * this provides notice of the existence of an allocator for that * <i>latchedAddr</i> but the address itself should not yet be * allocated. */ void addAddress(final int latchedAddr, final int size) { // ignore zero address if (latchedAddr == 0) return; m_allocationWriteLock.lock(); try { FixedAllocator alloc = null; try { alloc = getBlock(latchedAddr); } catch (final PhysicalAddressResolutionException par) { // Must create new allocator } final int size2 = size < 0 ? -size : size; if (alloc == null) { final int i = fixedAllocatorIndex(size2); final int block = 64 * m_allocSizes[i]; final ArrayList<FixedAllocator> list = m_freeFixed[i]; if (log.isTraceEnabled()) log.trace("Creating new Allocator for address: " + latchedAddr); final FixedAllocator allocator = new FixedAllocator(this, block); allocator.setFreeList(list); allocator.setIndex(m_allocs.size()); m_allocs.add(allocator); // Check correctly synchronized creation assert allocator == getBlock(latchedAddr); alloc = allocator; } assert size2 <= alloc.getSlotSize(); if (size > 0) { /* * This is a real allocation. 
                 */
                alloc.setAddressExternal(latchedAddr);
            }
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * Called from WriteCache.resetRecordMapFromBuffer
     *
     * Must clear the bit in the allocator.
     *
     * @param latchedAddr
     */
    void removeAddress(final int latchedAddr) {
        // ignore zero address
        if (latchedAddr == 0)
            return;
        m_allocationWriteLock.lock();
        try {
            // assert m_commitList.size() == 0;
            final FixedAllocator alloc = getBlockByAddress(latchedAddr);
            // Validate the address BEFORE using it.
            if (alloc == null) {
                throw new IllegalArgumentException(
                        "Invalid address provided to removeAddress: " + latchedAddr);
            }
            final int addrOffset = getOffset(latchedAddr);
            final long pa = alloc.getPhysicalAddress(addrOffset);
            if (log.isTraceEnabled())
                log.trace("Freeing allocation at " + latchedAddr
                        + ", physical address: " + pa);
            alloc.free(latchedAddr, 0, false);
            // assert m_commitList.size() == 0;
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * Create and return a new {@link RWWriteCacheService} instance. The caller
     * is responsible for closing out the old one and must be holding the
     * appropriate locks when it switches in the new instance.
     */
    private RWWriteCacheService newWriteCacheService() {
        try {
            // final boolean highlyAvailable = m_quorum != null
            //         && m_quorum.isHighlyAvailable();

            final boolean prefixWrites = m_quorum != null; // highlyAvailable

            return new RWWriteCacheService(m_writeCacheBufferCount,
                    m_minCleanListSize, m_readCacheBufferCount, prefixWrites,
                    m_compactionThreshold, m_hotCacheSize, m_hotCacheThreshold,
                    convertAddr(m_fileSize), m_reopener, m_quorum, this) {

                @Override
                @SuppressWarnings("unchecked")
                public WriteCache newWriteCache(final IBufferAccess buf,
                        final boolean useChecksum, final boolean bufferHasData,
                        final IReopenChannel<? extends Channel> opener,
                        final long fileExtent) throws InterruptedException {
                    return new WriteCacheImpl(buf, useChecksum, bufferHasData,
                            (IReopenChannel<FileChannel>) opener, fileExtent,
                            m_compressorKey);
                }
            };
        } catch (InterruptedException e) {
            throw new IllegalStateException(ERR_WRITE_CACHE_CREATE, e);
        } catch (IOException e) {
            throw new IllegalStateException(ERR_WRITE_CACHE_CREATE, e);
        }
    }

    private void setAllocations(final FileMetadata fileMetadata) throws IOException {

        final String buckets = fileMetadata.getProperty(
                Options.ALLOCATION_SIZES, Options.DEFAULT_ALLOCATION_SIZES);
        final String[] specs = buckets.split("\\s*,\\s*");
        m_allocSizes = new int[specs.length];
        int prevSize = 0;
        for (int i = 0; i < specs.length; i++) {
            final int nxtSize = Integer.parseInt(specs[i]);
            if (nxtSize <= prevSize)
                throw new IllegalArgumentException("Invalid AllocSizes property");
            m_allocSizes[i] = nxtSize;
            prevSize = nxtSize;
        }
    }

    private void defaultInit() throws IOException {
        final int numFixed = m_allocSizes.length;

        m_freeFixed = new ArrayList[numFixed];

        for (int i = 0; i < numFixed; i++) {
            m_freeFixed[i] = new ArrayList<FixedAllocator>();
        }

        m_fileSize = convertFromAddr(m_fd.length());

        // make space for meta-allocators
        m_metaBits[0] = -1;
        m_metaTransientBits[0] = -1;
        m_nextAllocation = -(1 + META_ALLOCATION); // keep on a minimum 8K boundary
        m_committedNextAllocation = m_nextAllocation;

        if (m_fileSize > m_nextAllocation) {
            m_fileSize = m_nextAllocation;
        }

        if (log.isInfoEnabled())
            log.info("Set default file extent " + convertAddr(m_fileSize));

        m_reopener.raf.setLength(convertAddr(m_fileSize));
    }

    public boolean isOpen() {
        return m_open;
    }

    private void assertOpen() {
        if (!m_open)
            throw new IllegalStateException(AbstractBufferStrategy.ERR_NOT_OPEN);
    }

    synchronized public void close() {
        m_open = false;
try { if (m_bufferedWrite != null) { m_bufferedWrite.release(); m_bufferedWrite = null; } m_writeCacheService.close(); m_reopener.raf.close(); } catch (Throwable t) { throw new RuntimeException(t); } } /** * Basic check on key root block validity * * @param rbv */ private void checkRootBlock(final IRootBlockView rbv) { final long nxtOffset = rbv.getNextOffset(); final int nxtalloc = -(int) (nxtOffset >> 32); final int metaBitsAddr = -(int) nxtOffset; final long metaAddr = rbv.getMetaStartAddr(); final long rawMetaBitsAddr = rbv.getMetaBitsAddr(); if (metaAddr == 0 || rawMetaBitsAddr == 0) { /* * possible when rolling back to empty file. */ log.warn("No meta allocation data included in root block for RWStore"); } // CANNOT check physicalAddress if follower if (m_quorum == null && log.isTraceEnabled()) { final int commitRecordAddr = (int) (rbv.getCommitRecordAddr() >> 32); log.trace("CommitRecord " + rbv.getCommitRecordAddr() + " at physical address: " + physicalAddress(commitRecordAddr)); } final long commitCounter = rbv.getCommitCounter(); // final int metaStartAddr = (int) -(metaAddr >> 32); // void // final int fileSize = (int) -(metaAddr & 0xFFFFFFFF); if (log.isTraceEnabled()) log.trace("m_allocation: " + nxtalloc + ", m_metaBitsAddr: " + metaBitsAddr + ", m_commitCounter: " + commitCounter); } /** * Utility to encapsulate RootBlock interpretation. */ static private class RootBlockInfo { // int nextAllocation(final IRootBlockView rb) { // final long nxtOffset = rb.getNextOffset(); // // // next allocation to be made (in -32K units). // final int ret = -(int) (nxtOffset >> 32); // // /* // * Skip the first 32K in the file. The root blocks live here but // * nothing else. // */ // return ret == 0 ? -(1 + META_ALLOCATION) : ret; // } /** * Used to transparently re-open the backing channel if it has been closed * by an interrupt during an IO. */ private final ReopenFileChannel m_reopener; /** * Meta-Allocations stored as {int address; int[8] bits}, so each block * holds 8*32=256 allocation slots of 1K totaling 256K. * <p> * The returned int array is a flattened list of these int[9] blocks */ private final int[] m_metabits; private final long m_storageStatsAddr; private final long m_lastDeferredReleaseTime; RootBlockInfo(final IRootBlockView rb, final ReopenFileChannel reopener) throws IOException { this.m_reopener = reopener; final long rawmbaddr = rb.getMetaBitsAddr(); /* * The #of int32 values in the metabits region. * * We get this by taking bottom 16 bits of the metaBitsAddr. This * gives the #of int32 values in the metabits regions (up to 64k * int32 values). */ final int metaBitsStore = (int) (rawmbaddr & 0xFFFF); // The byte offset of the metabits region in the file. final long pmaddr = rawmbaddr >> 16; /* * Read the metabits block, including a header and the int32[] * that encodes both startAddrs and bit vectors. 
             */
            final byte[] buf = new byte[metaBitsStore * 4];

            FileChannelUtility.readAll(m_reopener, ByteBuffer.wrap(buf), pmaddr);

            final DataInputStream strBuf = new DataInputStream(new ByteArrayInputStream(buf));

            // Can handle minor store version incompatibility
            strBuf.readInt(); // STORE VERSION
            m_lastDeferredReleaseTime = strBuf.readLong(); // Last Deferred Release Time
            strBuf.readInt(); // cDefaultMetaBitsSize
            final int allocBlocks = strBuf.readInt();
            m_storageStatsAddr = strBuf.readLong(); // m_storageStatsAddr

            // step over those reserved ints
            for (int i = 0; i < cReservedMetaBits; i++) {
                strBuf.readInt();
            }

            // step over the allocSizes
            for (int i = 0; i < allocBlocks; i++) {
                strBuf.readInt();
            }

            final int metaBitsSize = metaBitsStore - allocBlocks - cMetaHdrFields; // allow for header fields

            // Must be multiple of 9
            assert metaBitsSize % 9 == 0;

            final int[] ret = new int[metaBitsSize];
            for (int i = 0; i < metaBitsSize; i++) {
                ret[i] = strBuf.readInt();
            }

            /*
             * Meta-Allocations stored as {int address; int[8] bits}, so each block
             * holds 8*32=256 allocation slots of 1K totaling 256K.
             */
            m_metabits = ret;
        }
    }

    /**
     * Should be called where previously initFileSpec was used.
     *
     * Rather than reading from the file, this instead reads from the current
     * root block.
     *
     * We use the rootBlock fields: nextOffset, metaStartAddr, metaBitsAddr.
     *
     * metaBitsAddr indicates where the meta allocation bits are.
     *
     * metaStartAddr is the offset in the file where the allocation blocks are
     * allocated; the long value also indicates the size of the allocation, such
     * that the address plus the size is the "filesize".
     *
     * Note that metaBitsAddr must be an absolute address, with the low order 16
     * bits used to indicate the size.
     *
     * @throws IOException
     */
    private void initfromRootBlock(final IRootBlockView rb) throws IOException {

        // m_rb = m_fmv.getRootBlock();
        assert rb != null;

        m_storeUUID = rb.getUUID();

        if (rb.getNextOffset() == 0) {

            defaultInit();

        } else {

            /*
             * The RWStore stores in IRootBlock.getNextOffset() two distinct
             * int32 words.
             *
             * The high int32 word is the next allocation that will be handed
             * out and is represented in units of -32K. This is used for things
             * like getting a new metabits region or a new region from which
             * fixed allocators will be recruited (through the metabits).
             *
             * The low int32 word is the latched address of the current metabits
             * region. It must be interpreted using the metaBits and the
             * FixedAllocators in order to turn it into a byte offset on the
             * file.
             */
            final long nxtOffset = rb.getNextOffset();

            // next allocation to be made (in -32K units).
            m_nextAllocation = -(int) (nxtOffset >> 32);

            if (m_nextAllocation == 0) {
                /*
                 * Skip the first 32K in the file. The root blocks live here but
                 * nothing else.
                 */
                m_nextAllocation = -(1 + META_ALLOCATION);
            }

            m_committedNextAllocation = m_nextAllocation;

            // latched offset of the metabits region.
            m_metaBitsAddr = -(int) nxtOffset;

            if (log.isInfoEnabled()) {
                log.info("MetaBitsAddr: " + m_metaBitsAddr);
            }

            /*
             * Get the fileSize in -32K units from the root block.
             */
            {
                final long metaAddr = rb.getMetaStartAddr();

                // in units of -32K.
                m_fileSize = (int) -(metaAddr & 0xFFFFFFFF);

                if (log.isInfoEnabled())
                    log.info("InitFromRootBlock m_fileSize: " + convertAddr(m_fileSize));
            }

            /*
             * This stores the byte offset and length of the metabits region in
             * the file. The bottom 16-bits are the length (see below). The top
             * 48-bits are the byte offset.
             */
            long rawmbaddr = rb.getMetaBitsAddr();

            /*
             * The #of int32 values in the metabits region.
             *
             * We get this by taking the bottom 16 bits of the metaBitsAddr. This
             * gives the #of int32 values in the metabits region (up to 64k
             * int32 values). Each int32 value in the metaBits[] gives us 32
             * allocators. So, 16-bits gives us up to 64k * 32 = 2M allocators.
             * Except that the total #of allocators is reduced by the presence
             * of a startAddr every N positions in the metaBits[].
             *
             * The theoretical maximum number is also reduced since the number
             * of "committed" bits could be half the total number of bits.
             *
             * The theoretical maximum is further limited by the maximum indexable
             * allocator, since only 19 bits are available to the index, which, once
             * the sign is removed, reduces the maximum number of addressable
             * allocators to 256K.
             */
            final int metaBitsStore = (int) (rawmbaddr & 0xFFFF);

            if (metaBitsStore > 0) {

                // The byte offset of the metabits region in the file.
                rawmbaddr >>= 16;

                /*
                 * Read the metabits block, including a header and the int32[]
                 * that encodes both startAddrs and bit vectors.
                 */
                final byte[] buf = new byte[metaBitsStore * 4];

                FileChannelUtility.readAll(m_reopener, ByteBuffer.wrap(buf), rawmbaddr);

                final DataInputStream strBuf = new DataInputStream(new ByteArrayInputStream(buf));

                // Can handle minor store version incompatibility
                final int storeVersion = strBuf.readInt();

                switch ((storeVersion & 0xFF00)) {
                case (cVersion & 0xFF00):
                case (cVersionDemispace & 0xFF00):
                    break;
                default:
                    throw new IllegalStateException(
                            "Incompatible RWStore header version: storeVersion="
                                    + storeVersion + ", cVersion=" + cVersion
                                    + ", demispace: " + isUsingDemiSpace());
                }

                m_lastDeferredReleaseTime = strBuf.readLong();
                if (strBuf.readInt() != cDefaultMetaBitsSize) {
                    throw new IllegalStateException("Store opened with unsupported metabits size");
                }
                final int allocBlocks = strBuf.readInt();
                m_storageStatsAddr = strBuf.readLong();

                // and let's read in those reserved ints
                for (int i = 0; i < cReservedMetaBits; i++) {
                    strBuf.readInt();
                }

                m_allocSizes = new int[allocBlocks];
                for (int i = 0; i < allocBlocks; i++) {
                    m_allocSizes[i] = strBuf.readInt();
                }
                m_metaBitsSize = metaBitsStore - allocBlocks - cMetaHdrFields; // allow for header fields
                m_metaBits = new int[m_metaBitsSize];
                if (log.isInfoEnabled()) {
                    log.info("Raw MetaBitsAddr: " + rawmbaddr);
                }
                for (int i = 0; i < m_metaBitsSize; i++) {
                    m_metaBits[i] = strBuf.readInt();
                }
                // m_metaTransientBits = (int[]) m_metaBits.clone();
                syncMetaTransients();

                final int numFixed = m_allocSizes.length;

                m_freeFixed = new ArrayList[numFixed];

                for (int i = 0; i < numFixed; i++) {
                    m_freeFixed[i] = new ArrayList<FixedAllocator>();
                }

                checkCoreAllocations();

                readAllocationBlocks();
            }

            if (log.isInfoEnabled())
                log.info("restored from RootBlock: " + m_nextAllocation + ", " + m_metaBitsAddr);
        }
    }

    /**
     * Uses System.arraycopy rather than clone() to duplicate the
     * metaBits to the metaTransientBits, which will be faster.
     */
    private void syncMetaTransients() {
        if (m_metaTransientBits == null || m_metaTransientBits.length != m_metaBits.length) {
            m_metaTransientBits = (int[]) m_metaBits.clone();
        } else {
            System.arraycopy(m_metaBits, 0, m_metaTransientBits, 0, m_metaTransientBits.length);
        }
    }

    // /*
    //  * Called when store is opened to make sure any deferred frees are
    //  * cleared.
    //  *
    //  * Stored persistently is only the list of addresses of blocks to be freed,
    //  * the knowledge of the txn release time does not need to be held persistently,
    //  * this is only relevant for transient state while the RWStore is open.
    //  *
    //  * The deferredCount is the number of entries - integer address and integer
    //  * count at each address
    //  */
    // private void clearOutstandingDeferrels(final int deferredAddr, final int deferredCount) {
    //     if (deferredAddr != 0) {
    //         assert deferredCount != 0;
    //         final int sze = deferredCount * 8 + 4; // include space for checksum
    //
    //         if (log.isDebugEnabled())
    //             log.debug("Clearing Outstanding Deferrals: " + deferredCount);
    //
    //         byte[] buf = new byte[sze];
    //         getData(deferredAddr, buf);
    //
    //         final byte[] blockBuf = new byte[8 * 1024]; // maximum size required
    //
    //         ByteBuffer in = ByteBuffer.wrap(buf);
    //         for (int i = 0; i < deferredCount; i++) {
    //             int blockAddr = in.getInt();
    //             int addrCount = in.getInt();
    //
    //             // now read in this block and free all addresses referenced
    //             getData(blockAddr, blockBuf, 0, addrCount*4 + 4);
    //             ByteBuffer inblock = ByteBuffer.wrap(blockBuf);
    //             for (int b = 0; b < addrCount; b++) {
    //                 final int defAddr = inblock.getInt();
    //                 Allocator alloc = getBlock(defAddr);
    //                 if (alloc instanceof BlobAllocator) {
    //                     b++;
    //                     assert b < addrCount;
    //                     alloc.free(defAddr, inblock.getInt());
    //                 } else {
    //                     alloc.free(defAddr, 0); // size ignored for FreeAllocators
    //                 }
    //             }
    //             // once read then free the block allocation
    //             free(blockAddr, 0);
    //         }
    //
    //         // lastly free the deferredAddr
    //         free(deferredAddr, 0);
    //     }
    // }

    /*********************************************************************
     * make sure resource is closed!
     **/
    protected void finalize() {
        close();
    }

    @SuppressWarnings("unchecked")
    protected void readAllocationBlocks() throws IOException {

        assert m_allocs.size() == 0;

        if (log.isInfoEnabled())
            log.info("readAllocationBlocks, m_metaBits.length: " + m_metaBits.length);

        /**
         * Allocators are sorted in StartAddress order (which MUST be the order
         * they were created and therefore will correspond to their index). The
         * comparator also checks for equality, which would indicate an error in
         * the metaAllocation if two allocation blocks were loaded for the same
         * address (which would be two versions of the same Allocator).
         *
         * Meta-Allocations stored as {int address; int[8] bits}, so each block
         * holds 8*32=256 allocation slots of 1K totaling 256K.
         */
        for (int b = 0; b < m_metaBits.length; b += cDefaultMetaBitsSize) {
            final long blockStart = convertAddr(m_metaBits[b]);
            final int startBit = (b * 32) + 32;
            final int endBit = startBit + ((cDefaultMetaBitsSize-1)*32);
            for (int i = startBit; i < endBit; i++) {
                if (tstBit(m_metaBits, i)) {
                    final long addr = blockStart + ((i-startBit) * ALLOC_BLOCK_SIZE);

                    final FixedAllocator allocator = readAllocator(addr);
                    allocator.setDiskAddr(i); // store bit, not physical address!
                m_allocs.add(allocator);
                if (m_storageStats != null) {
                    m_storageStats.register(allocator);
                }
            }
        }
    }

    // add sorted blocks into index array and set index number for address
    // encoding
    // m_allocs.addAll(blocks);
    Collections.sort(m_allocs);
    for (int index = 0; index < m_allocs.size(); index++) {
        ((Allocator) m_allocs.get(index)).setIndex(index);
    }
}

private FixedAllocator readAllocator(final long addr) throws IOException {
    final byte buf[] = new byte[ALLOC_BLOCK_SIZE];

    FileChannelUtility.readAll(m_reopener, ByteBuffer.wrap(buf), addr);

    final ByteArrayInputStream baBuf = new ByteArrayInputStream(buf);
    final DataInputStream strBuf = new DataInputStream(baBuf);

    final int allocSize = strBuf.readInt(); // if Blob < 0
    assert allocSize > 0;

    final int slotSizeIndex = slotSizeIndex(allocSize);

    if (slotSizeIndex == -1) {
        throw new IllegalStateException("Unexpected allocation size of: " + allocSize);
    }

    final FixedAllocator fa = new FixedAllocator(this, allocSize);//, m_writeCache);

    fa.read(strBuf);

    final int chk = ChecksumUtility.getCHK().checksum(buf,
            buf.length - baBuf.available());

    final int tstChk = strBuf.readInt();
    if (tstChk != chk) {
        throw new IllegalStateException("FixedAllocator checksum error");
    }

    final ArrayList<? extends Allocator> freeList = m_freeFixed[slotSizeIndex];

    fa.setFreeList(freeList);

    return fa;
}

/**
 * Computes the slot size index given the absolute slot size.
 *
 * If the slotSizes are [1,2,4] this corresponds to absolute sizes by
 * multiplying by 64 of [64, 128, 256], so slotSizeIndex(64) would return 0,
 * and any parameter other than 64, 128 or 256 would return -1.
 *
 * @param allocSize - absolute slot size
 * @return the index into m_allocSizes, or -1 if not found
 */
private int slotSizeIndex(final int allocSize) {
    if (allocSize % 64 != 0)
        return -1;

    final int slotSize = allocSize / 64;
    int slotSizeIndex = -1;
    for (int index = 0; index < m_allocSizes.length; index++) {
        if (m_allocSizes[index] == slotSize) {
            slotSizeIndex = index;
            break;
        }
    }

    return slotSizeIndex;
}

/**
 * Required for HA to support post commit message to synchronize allocators
 * with new state. By this time the new allocator state will have been flushed
 * to the disk, so should be 1) On disk, 2) Probably in OS cache and 3) Possibly
 * in the WriteCache.
 *
 * For efficiency we do not want to default to reading from disk.
 *
 * If there is an existing allocator, then we can compare the old with the new state
 * to determine which addresses have been freed and hence which addresses should be
 * removed from the external cache.
 *
 * @param index of the Allocator to be updated
 * @param addr on disk to be read
 * @throws InterruptedException
 * @throws ChecksumError
 * @throws IOException
 */
private void updateFixedAllocator(final int index, final long addr)
        throws ChecksumError, InterruptedException, IOException {
    final ByteBuffer buf = m_writeCacheService.read(addr, ALLOC_BLOCK_SIZE);

    final ByteArrayInputStream baBuf = new ByteArrayInputStream(buf.array());
    final DataInputStream strBuf = new DataInputStream(baBuf);

    final int allocSize = strBuf.readInt(); // if Blob < 0
    assert allocSize > 0;

    final int slotIndex = slotSizeIndex(allocSize);
    if (slotIndex == -1)
        throw new IllegalStateException("Invalid allocation size: " + allocSize);

    final FixedAllocator allocator = new FixedAllocator(this, allocSize);
    final ArrayList<? extends Allocator> freeList = m_freeFixed[slotIndex];
    if (index < m_allocs.size()) {
        final FixedAllocator old = m_allocs.get(index);
        freeList.remove(old);

        m_allocs.set(index, allocator);
        allocator.setFreeList(freeList);

        // Need to iterate over all allocated bits in "old" and see if they
        // are clear in "new". If so then clear from externalCache
    } else {
        assert index == m_allocs.size();
        m_allocs.add(allocator);
    }
}

/**
 * Called from ContextAllocation when no free FixedAllocator is immediately
 * available. First the free list will be checked to see if one is
 * available, otherwise it will be created. When the calling
 * ContextAllocation is released, its allocators will be added to the
 * global free lists.
 *
 * @param block - the index of the Fixed size allocation
 * @return the FixedAllocator
 */
private FixedAllocator establishFreeFixedAllocator(final int block) {
    final ArrayList<FixedAllocator> list = m_freeFixed[block];

    for (int i = 0; i < list.size(); i++) {
        FixedAllocator f = list.get(i);
        if (!isOnCommitList(f)) {
            list.remove(i);
            return f;
        }
    }

    // no valid free allocators, so create a new one
    final int allocSize = 64 * m_allocSizes[block];

    final FixedAllocator allocator = new FixedAllocator(this, allocSize);//, m_writeCache);
    allocator.setIndex(m_allocs.size());

    m_allocs.add(allocator);

    if (m_storageStats != null) {
        m_storageStats.register(allocator, true);
    }

    return allocator;
}

// // Root interface
// public long getRootAddr() {
//     return m_rootAddr;
// }
//
// // Root interface
// public PSInputStream getRoot() {
//     try {
//         return getData(m_rootAddr);
//     } catch (Exception e) {
//         throw new StorageTerminalError("Unable to read root data", e);
//     }
// }
//
// public void setRootAddr(long rootAddr) {
//     m_rootAddr = (int) rootAddr;
// }

// // Limits
// public void setMaxFileSize(final int maxFileSize) {
//     m_maxFileSize = maxFileSize;
// }

public long getMaxFileSize() {
    return m_maxFileSize;
}

// // Allocators
// public PSInputStream getData(final long addr) {
//     return getData((int) addr, addr2Size((int) addr));
// }
//
// // Allocators
// public PSInputStream getData(final int addr, final int size) {
//     final Lock readLock = m_extensionLock.readLock();
//
//     readLock.lock();
//
//     try {
//         try {
//             m_writeCache.flush(false);
//         } catch (InterruptedException e1) {
//             throw new RuntimeException(e1);
//         }
//
//         if (addr == 0) {
//             return null;
//         }
//
//         final PSInputStream instr = PSInputStream.getNew(this, size);
//
//         try {
////             m_raf.seek(physicalAddress(addr));
////             m_raf.readFully(instr.getBuffer(), 0, size);
////             m_raf.getChannel().read(ByteBuffer.wrap(instr.getBuffer(), 0, size), physicalAddress(addr));
//             FileChannelUtility.readAll(m_reopener, ByteBuffer.wrap(instr.getBuffer(), 0, size),
//                     physicalAddress(addr));
//         } catch (IOException e) {
//             throw new StorageTerminalError("Unable to read data", e);
//         }
//
//         return instr;
//     } finally {
//         readLock.unlock();
//     }
// }

volatile private long m_cacheReads = 0;
volatile private long m_diskReads = 0;
volatile private int m_allocations = 0;
volatile private int m_frees = 0;
volatile private long m_nativeAllocBytes = 0;

/**
 * Alternative method signature returning a ByteBuffer rather than receiving a
 * byte array.
 * <p>
 * If a blob then an extra byte array is required in which to build the data,
 * but otherwise extra buffering could be avoided by reading directly from
 * the WriteCacheService.
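 * <p>
 * Note that the backing read is always sze+4 bytes so that the trailing
 * int32 checksum is fetched together with the data; the returned buffer
 * is limited to sze. A caller sketch (hypothetical usage):
 *
 * <pre>
 * final ByteBuffer bb = rwstore.getData(addr, nbytes);
 * final byte[] data = new byte[bb.remaining()];
 * bb.get(data); // the record, without its trailing checksum
 * </pre>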
 *
 * @param rwaddr
 * @param sze
 * @return
 */
public ByteBuffer getData(final long rwaddr, final int sze) {
    /*
     * Note: Contend with postHACommit().
     */
    final Lock lock = m_allocationReadLock;

    lock.lock();

    try {

        // must allow for checksum
        if (sze > (m_maxFixedAlloc-4) || m_writeCacheService == null) {
            final byte buf[] = new byte[sze + 4]; // 4 bytes for checksum

            getData(rwaddr, buf, 0, sze+4);

            return ByteBuffer.wrap(buf, 0, sze);
        } else {
            final long paddr = physicalAddress((int) rwaddr);

            if (paddr == 0) {

                assertAllocators();

                throw new PhysicalAddressResolutionException(rwaddr);

            }

            assert paddr > 0;
            try {

                return m_writeCacheService.read(paddr, sze+4);

            } catch (Throwable e) {

                /*
                 * Note: ClosedByInterruptException can be thrown out of
                 * FileChannelUtility.readAll(), typically because the LIMIT on
                 * a query was satisfied, but we do not want to log that as an
                 * error.
                 */
                // log.error(e,e);
                throw new RuntimeException("addr=" + rwaddr + " : cause=" + e, e);

            }
        }
    } finally {
        lock.unlock();
    }
}

/**
 * If the buf[] size is greater than the maximum fixed allocation, then the
 * direct read will be the blob header record. In this case we should hand
 * over the streaming to a PSInputStream.
 *
 * FIXME: Javadoc update (was: For now we do not use the PSInputStream but instead process
 * directly...)
 *
 * If it is a BlobAllocation, then the BlobAllocation address points to the
 * address of the BlobHeader record.
 */
public void getData(final long addr, final byte buf[]) {
    getData(addr, buf, 0, buf.length);
}

/**
 * Set the option below to true to enable asynchronous reads of blob data.
 * The aim is to reduce latency when reading blobs from disk as it will
 * enable the disk controllers to re-order IO requests and where possible
 * process in parallel. This should benefit all Blob reads but specifically
 * helps large deferredFree data to reduce commit latency as described in
 * BLZG-1663.
 *
 * @see BLZG-1663
 * @see BLZG-1884 RWStore ASYNC IO fails to make progress (apparent deadlock)
 */
final private boolean m_readBlobsAsync;

public void getData(final long addr, final byte buf[], final int offset,
        final int length) {

    assertOpen();

    if (addr == 0) {
        return;
    }

    final long begin = System.nanoTime();

    /*
     * Note: Contend with postHACommit().
*/ final Lock lock = m_allocationReadLock; lock.lock(); try { assertOpen(); // check again after taking lock // assertNoRebuild(); // length includes space for the checksum if (length > m_maxFixedAlloc) { try { final int alloc = m_maxFixedAlloc-4; final int nblocks = (alloc - 1 + (length-4))/alloc; if (nblocks < 0) throw new IllegalStateException( "Allocation error, m_maxFixedAlloc: " + m_maxFixedAlloc); final byte[] hdrbuf = new byte[4 * (nblocks + 1) + 4]; // plus 4 bytes for checksum if (hdrbuf.length > m_maxFixedAlloc) { if (log.isInfoEnabled()) { log.info("LARGE BLOB - header is BLOB"); } } getData(addr, hdrbuf); // will work even if header is also a blob final DataInputStream hdrstr = new DataInputStream(new ByteArrayInputStream(hdrbuf)); final int rhdrs = hdrstr.readInt(); if (rhdrs != nblocks) { throw new IllegalStateException( "Incompatible BLOB header record, expected: " + nblocks + ", got: " + rhdrs); } final int[] blobHdr = new int[nblocks]; for (int i = 0; i < nblocks; i++) { blobHdr[i] = hdrstr.readInt(); } // Now we have the header addresses, we can read MAX_FIXED_ALLOCS until final buffer if (!m_readBlobsAsync) { // synchronous read of blob data int cursor = 0; int rdlen = m_maxFixedAlloc; for (int i = 0; i < nblocks; i++) { if (i == (nblocks - 1)) { rdlen = length - cursor; } getData(blobHdr[i], buf, cursor, rdlen); // include space for checksum cursor += rdlen-4; // but only increase cursor by data } // } else { // s_readBlobsAsync // final AsynchronousFileChannel channel = m_reopener.getAsyncChannel(); // final ArrayList<Future<Integer>> reads = new ArrayList<Future<Integer>>(); // try { // int cursor = 0; // int rdlen = m_maxFixedAlloc; // int cacheReads = 0; // for (int i = 0; i < nblocks; i++) { // if (i == (nblocks - 1)) { // rdlen = length - cursor; // } // final ByteBuffer bb = ByteBuffer.wrap(buf, // cursor, rdlen-4); // strip off checksum to avoid overlapping buffer reads! // final long paddr = physicalAddress(blobHdr[i]); // final ByteBuffer cache = m_writeCacheService._readFromCache(paddr, rdlen); // if (cache != null) { // bb.put(cache); // write cached data! // cacheReads++; // } else { // reads.add(channel.read(bb, // paddr)); // } // cursor += rdlen - 4; // but only increase cursor by data // } // for (Future<Integer> r : reads) { // r.get(); // } // } catch (Exception e) { // throw new IOException("Error from async IO", e); // } finally { // for (Future r : reads) { // r.cancel(true); // } // } } else { // read non-cached data with FileChannelUtility final ArrayList<AsyncTransfer> transfers = new ArrayList<AsyncTransfer>(); int cursor = 0; int rdlen = m_maxFixedAlloc; for (int i = 0; i < nblocks; i++) { if (i == (nblocks - 1)) { rdlen = length - cursor; } final ByteBuffer bb = ByteBuffer.wrap(buf, cursor, rdlen - 4); // strip off // checksum to avoid // overlapping // buffer reads! final long paddr = physicalAddress(blobHdr[i]); final ByteBuffer cache; try { cache = m_writeCacheService._readFromCache(paddr, rdlen); } catch (Exception e) { throw new IOException("Error from async IO", e); } if (cache != null) { bb.put(cache); // write cached data! 
} else { transfers.add(new AsyncTransfer(paddr, bb)); } cursor += rdlen - 4; // but only increase cursor // by data } FileChannelUtility.readAllAsync(m_reopener, transfers); } return; } catch (IOException e) { log.error(e,e); throw new IllegalStateException("Unable to restore Blob allocation", e); } } { final StoreCounters<?> storeCounters = (StoreCounters<?>) this.storeCounters .get().acquire(); try { final int nbytes = length; if (nbytes > storeCounters.maxReadSize) { storeCounters.maxReadSize = nbytes; } } finally { storeCounters.release(); } } try { final int slotSize = getBlock((int) addr).getBlockSize(); if (slotSize < length) { throw new IllegalStateException("Bad Address: length requested greater than allocated slot: " + slotSize + " < " + length); } final long paddr = physicalAddress((int) addr); if (paddr == 0) { assertAllocators(); throw new PhysicalAddressResolutionException(addr); } assert paddr > 0; /** * Check WriteCache first * * Note that the buffer passed in should include the checksum * value, so the cached data is 4 bytes less than the buffer * size. */ final ByteBuffer bbuf; try { bbuf = m_writeCacheService != null ? m_writeCacheService.read(paddr, length) : null; } catch (Throwable t) { throw new IllegalStateException( "Error reading from WriteCache addr: " + paddr + " length: " + (length - 4) + ", writeCacheDebug: " + m_writeCacheService.addrDebugInfo(paddr), t); } if (bbuf != null) { if (bbuf.limit() != length-4) { assertAllocators(); throw new IllegalStateException( "Incompatible buffer size for addr: " + paddr + ", " + bbuf.limit() + " != " + (length - 4) + " writeCacheDebug: " + m_writeCacheService.addrDebugInfo(paddr)); } final byte[] in = bbuf.array(); // reads in with checksum - no need to check if in cache for (int i = 0; i < length-4; i++) { buf[offset+i] = in[i]; } m_cacheReads++; /* * Hit on the write cache. * * Update the store counters. */ final StoreCounters<?> c = (StoreCounters<?>) storeCounters .get().acquire(); try { final int nbytes = length; c.nreads++; c.bytesRead += nbytes; c.elapsedReadNanos += (System.nanoTime() - begin); } finally { c.release(); } } else { // Read through to the disk. // With a non-null WCS, the actual read should be via a callback to readRaw, it should not get here // unless it is not possible to cache - but maybe even then the WCS should read into a temporary // buffer // If checksum is required then the buffer should be sized to include checksum in final 4 bytes final ByteBuffer bb = ByteBuffer.wrap(buf, offset, length); // Use ReadRaw - should be the same read all readRaw(paddr, bb); final int chk = ChecksumUtility.getCHK().checksum(buf, offset, length-4); // read checksum final int tstchk = bb.getInt(offset + length-4); if (chk != tstchk) { assertAllocators(); if (m_writeCacheService != null) { final String cacheDebugInfo = m_writeCacheService.addrDebugInfo(paddr); log.warn("Invalid data checksum for addr: " + paddr + ", chk: " + chk + ", tstchk: " + tstchk + ", length: " + length + ", first bytes: " + toHexString(buf, 32) + ", successful reads: " + m_diskReads + ", at last extend: " + m_readsAtExtend + ", cacheReads: " + m_cacheReads + ", writeCacheDebug: " + cacheDebugInfo); } throw new IllegalStateException( "Invalid data checksum from address: " + paddr + ", size: " + (length - 4)); } // do not explicitly cache the read, it will be cached by the WCS! // if (m_writeCache != null) { // cache the read! 
//            m_writeCache.cache(paddr, bb);
//        }
            }
        } catch (PhysicalAddressResolutionException e) {
            throw new IllegalArgumentException("Unable to read data: "+e, e);
        } catch (Throwable e) {
            /*
             * Note: ClosedByInterruptException can be thrown out of
             * FileChannelUtility.readAll(), typically because the LIMIT on
             * a query was satisfied, but we do not want to log that as an
             * error.
             */
            // log.error(e,e);
            throw new RuntimeException("addr=" + addr + " : cause=" + e, e);
        } finally {
            lock.unlock();
        }
    }

//    /**
//     * Convenience check for those public methods that must be restricted if a rebuild is in progress
//     */
//    private void assertNoRebuild() {
//        if (m_rebuildRequest != null)
//            throw new IllegalStateException("Invalid when rebuilding");
//    }

    private void assertAllocators() {
        final Lock lock = m_allocationReadLock;
        lock.lock();
        try {
            for (int i = 0; i < m_allocs.size(); i++) {
                if (m_allocs.get(i).getIndex() != i) {
                    throw new IllegalStateException("Allocator at invalid index: " + i + ", index stored as: "
                            + m_allocs.get(i).getIndex());
                }
            }
        } finally {
            lock.unlock();
        }
    }

//    static private final char[] HEX_CHAR_TABLE = {
//        '0', '1','2','3',
//        '4','5','6','7',
//        '8','9','a','b',
//        'c','d','e','f'
//    };

    // utility to display byte array of maximum n bytes as hexString
    static private String toHexString(final byte[] buf, int n) {
//        n = n < buf.length ? n : buf.length;
//        final StringBuffer out = new StringBuffer();
//        for (int i = 0; i < n; i++) {
//            final int v = buf[i] & 0xFF;
//            out.append(HEX_CHAR_TABLE[v >>> 4]);
//            out.append(HEX_CHAR_TABLE[v &0xF]);
//        }
//        return out.toString();
        return BytesUtil.toHexString(buf, n);
    }

    public void free(final long laddr, final int sze) {
        free(laddr, sze, null/* AllocationContext */);
    }

//    private long m_unsafeFrees = 0;
    /**
     * free
     * <p>
     * If the address is greater than zero then it is interpreted as a physical
     * address and the allocators are searched to find the allocations.
     * Otherwise the address directly encodes the allocator index and bit
     * offset, allowing direct access to clear the allocation.
     * <p>
     * A blob allocator contains the allocator index and offset, so an allocator
     * contains up to 245 blob references.
     *
     * @param laddr
     * @param sze
     * @param context
     */
    public void free(final long laddr, final int sze, final IAllocationContext context) {
        assertOpen();
//        assertNoRebuild();

        final int addr = (int) laddr;

        switch (addr) {
        case 0:
        case -1:
        case -2:
            return;
        }
        m_allocationWriteLock.lock();
        try {
            checkContext(context);

            if (m_lockAddresses != null && m_lockAddresses.containsKey((int)laddr))
                throw new IllegalStateException("address locked: " + laddr);

            if (sze > m_maxFixedAlloc-4) {
                freeBlob(addr, sze, context);
            } else {
                final FixedAllocator alloc = getBlockByAddress(addr);

                /*
                 * There are a few conditions here. If the context owns the
                 * allocator and the allocation was made by this context then it
                 * can be freed immediately. The problem comes when the context
                 * is null and the allocator is NOT owned, BUT there are active
                 * AllocationContexts, in this situation, the free must ALWAYS
                 * be deferred.
                 *
                 * If the MIN_RELEASE_AGE is ZERO then we can protect allocations
                 * and read-only transactions with Session protection, avoiding
                 * the need to manage deferred frees.
                 *
                 * FIXME We need unit tests when MIN_RELEASE_AGE is GT ZERO.
                 *
                 * FIXME We need unit tests when MIN_RELEASE_AGE is ZERO AND
                 * there are open read-only transactions.
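                 *
                 * In summary, a sketch of the decision paths below:
                 *
                 *   minReleaseAge == 0:
                 *     isolated context, owned bits  -> immediateFree (session override)
                 *     isolated context, not owned   -> defer to the context
                 *     session protected             -> immediateFree (no override)
                 *     otherwise                     -> immediateFree
                 *   minReleaseAge > 0:
                 *     isolated context, owned bits  -> immediateFree
                 *     active txns/open contexts, or
                 *     bits not immediately freeable -> deferFree
                 *     otherwise                     -> immediateFree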
                 */
                if (m_minReleaseAge == 0) {
                    /*
                     * The session protection is complicated by the mix of
                     * transaction protection and isolated AllocationContexts.
                     *
                     * If this is the first use of an IAllocationContext then
                     * isSessionProtected may return false, so check the
                     * context first.
                     */
                    if (context != null && context.isIsolated()) {
                        if (alloc.canImmediatelyFree(addr, sze, context)) {
                            immediateFree(addr, sze, true);
                        } else {
                            getContextAllocation(context).deferFree(encodeAddr(addr, sze));
                        }
                    } else if (this.isSessionProtected()) {
                        immediateFree(addr, sze, false);
                    } else {
                        immediateFree(addr, sze);
                    }
                } else if (context != null && (context.isIsolated())
                        && alloc.canImmediatelyFree(addr, sze, context)) {
                    immediateFree(addr, sze);
                } else {
                    // if a free request is made within a context not managed by
                    // the allocator then it is not safe to free
                    boolean alwaysDefer = m_activeTxCount > 0;

                    if (!alwaysDefer)
                        alwaysDefer = context == null && !m_contexts.isEmpty();

                    if (alwaysDefer)
                        if (log.isDebugEnabled())
                            log.debug("Should defer " + addr + " real: " + physicalAddress(addr));
                    if (alwaysDefer || !alloc.canImmediatelyFree(addr, sze, context)) {
                        // If the context is != null, then the deferral must be against that context!
                        if (context != null && context.isIsolated()) {
                            getContextAllocation(context).deferFree(encodeAddr(addr, sze));
                        } else {
                            deferFree(addr, sze);
                        }
                    } else {
                        immediateFree(addr, sze);
                    }
                }
            }
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    private void checkContext(final IAllocationContext context) {
        if (context != null) {
            context.checkActive();
        }
    }

    private long encodeAddr(long alloc, final int nbytes) {
        alloc <<= 32;
        alloc += nbytes;

        return alloc;
    }

    long getHistoryRetention() {
        return m_minReleaseAge;
    }

    /**
     * Session protection can only be used in preference to deferred frees when
     * the minReleaseAge is zero. If so then two protection states are checked:
     * either a positive activeTxCount incremented by the TransactionManager
     * or if there are active AllocationContexts.
     *
     * The activeTxCount essentially protects read-only transactions while the
     * AllocationContexts enable concurrent store allocations, whilst also
     * supporting immediate re-cycling of localized allocations (those made
     * and released within the same AllocationContext).
     *
     * Also check to see if there is an incomplete quorum being established, in
     * which case provide default session protection to avoid recycling.
     *
     * @return whether there is a logical active session
     */
    boolean isSessionProtected() {

        if (!m_allocationWriteLock.isHeldByCurrentThread()) {
            /*
             * In order for changes to m_activeTxCount to be visible the caller
             * MUST be holding the lock.
             */
            throw new IllegalMonitorStateException();
        }

        // backoff until synchronization is implemented
//        // protect recycling with unmet quorum
//        if (m_quorum != null && !m_quorum.isQuorumMet()) {
//            return true;
//        }

        return m_minReleaseAge == 0 && (m_activeTxCount > 0 || !m_contexts.isEmpty());
    }

    /**
     * Sessions will only be used to protect transactions and read-only views
     * when the m_minReleaseAge is zero, otherwise the deferredFree
     * approach will be used.
     *
     * When called, will call through to the Allocators to re-sync the
     * transient bits with the committed and live.
     *
     * The writeCache is passed into the allocator to enable any "now free"
     * allocations to be cleared from the cache. Until the session is released
     * the writeCache must be maintained to support readers of uncommitted and
     * unwritten allocations.
     */
    private void releaseSessions() {
        assert(m_activeTxCount == 0 && m_contexts.isEmpty());

        if (m_minReleaseAge == 0) {
            if (log.isDebugEnabled())
                log.debug("RELEASE SESSIONS");

            for (FixedAllocator fa : m_allocs) {
                fa.releaseSession(m_writeCacheService);
            }
        }
    }

    private boolean freeBlob(final int hdr_addr, final int sze, final IAllocationContext context) {
        if (sze <= (m_maxFixedAlloc-4))
            throw new IllegalArgumentException("Unexpected address size");

        if (m_storageStats != null) {
            m_storageStats.deleteBlob(sze);
        }

        final int alloc = m_maxFixedAlloc-4;
        final int blcks = (alloc - 1 + sze)/alloc;

        // read in header block, then free each reference
        final byte[] hdr = new byte[(blcks+1) * 4 + 4]; // add space for checksum
        getData(hdr_addr, hdr);

        final DataInputStream instr = new DataInputStream(
                new ByteArrayInputStream(hdr, 0, hdr.length-4) );
        try {
            final int allocs = instr.readInt();
            int rem = sze;
            for (int i = 0; i < allocs; i++) {
                final int nxt = instr.readInt();
                free(nxt, rem < alloc ? rem : alloc, context);
                rem -= alloc;
            }
            free(hdr_addr, hdr.length, context);

            return true;
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    private boolean freeImmediateBlob(final int hdr_addr, final int sze) {
        if (sze <= (m_maxFixedAlloc-4))
            throw new IllegalArgumentException("Unexpected address size");

        if (m_storageStats != null) {
            m_storageStats.deleteBlob(sze);
        }

        final int alloc = m_maxFixedAlloc-4;
        final int blcks = (alloc - 1 + sze)/alloc;

        // read in header block, then free each reference
        final byte[] hdr = new byte[(blcks+1) * 4 + 4]; // add space for checksum
        getData(hdr_addr, hdr);

        final DataInputStream instr = new DataInputStream(
                new ByteArrayInputStream(hdr, 0, hdr.length-4) );

        // retain lock for all frees
        m_allocationWriteLock.lock();
        try {
            final int allocs = instr.readInt();
            int rem = sze;
            for (int i = 0; i < allocs; i++) {
                final int nxt = instr.readInt();
                immediateFree(nxt, rem <= alloc ? rem : alloc);
                rem -= alloc;
            }
            immediateFree(hdr_addr, hdr.length);

            return true;
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

//    private long immediateFreeCount = 0;
    private void immediateFree(final int addr, final int sze) {
        immediateFree(addr, sze, false);
    }

    private void immediateFree(final int addr, final int sze, final boolean overrideSession) {

        switch (addr) {
        case 0:
        case -1:
        case -2:
            return;
        }

        if (sze > (this.m_maxFixedAlloc-4)) {
            freeImmediateBlob(addr, sze);

            return;
        }

        m_allocationWriteLock.lock();
        try {
            final FixedAllocator alloc = getBlockByAddress(addr);
            final int addrOffset = getOffset(addr);
            if (alloc == null) {
                throw new IllegalArgumentException("Invalid address provided to immediateFree: " + addr + ", size: " + sze);
            }
            final long pa = alloc.getPhysicalAddress(addrOffset);

            // In a tight loop, this log level test shows up as a hotspot
//            if (log.isTraceEnabled())
//                log.trace("Freeing allocation at " + addr + ", physical address: " + pa);

            alloc.free(addr, sze, overrideSession);
            // must clear after free in case it is a blobHdr that requires reading!
            // the allocation lock protects against a concurrent re-allocation
            // of the address before the cache has been cleared
            assert pa != 0;
            // only clear any existing write to cache if no active session
            if (overrideSession || !this.isSessionProtected()) {
                // Only overwrite if NOT committed
                if (!alloc.isCommitted(addrOffset)) {
                    m_writeCacheService.clearWrite(pa,addr);
//                    m_writeCache.overwrite(pa, sze);
                    /*
                     * Pass the size of the allocator, NOT the size of the
                     * allocation.
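                     *
                     * (The external cache retains checkpoint records keyed
                     * by physical address; as removeFromExternalCache()
                     * below shows, the slot size is compared against the
                     * checkpoint record size, so the allocator slot size is
                     * what is required here.)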
                     *
                     * @see <a
                     *      href="https://sourceforge.net/apps/trac/bigdata/ticket/586"
                     *      > RWStore immedateFree() not removing Checkpoint
                     *      addresses from the historical index cache. </a>
                     */
                    // removeFromExternalCache(pa, sze);
                    removeFromExternalCache(pa, alloc.m_size);
                }
            }
            m_frees++;
            if (alloc.isAllocated(addrOffset))
                throw new IllegalStateException("Reallocation problem with WriteCache");

            if (alloc.isUnlocked()) {
                addToCommit(alloc);
            }

            m_recentAlloc = true;
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * We need to remove entries from the historicalIndexCache for checkpoint
     * records when the allocations associated with those checkpoint records are
     * freed.
     *
     * @param clr
     *            The physical address that is being deleted.
     * @param slotSize
     *            The size of the allocator slot for that physical address.
     *
     * @see <a href="https://sourceforge.net/apps/trac/bigdata/ticket/586">
     *      RWStore immedateFree() not removing Checkpoint addresses from the
     *      historical index cache. </a>
     */
    void removeFromExternalCache(final long clr, final int slotSize) {

        assert m_allocationWriteLock.isHeldByCurrentThread();

        if (m_externalCache == null)
            return;

        if (slotSize == 0 || slotSize == m_cachedDatasize) {

            /*
             * Either known to be the same slot size as a checkpoint record -or-
             * the slot size is not known.
             */

            m_externalCache.remove(clr);

        }

    }

    /**
     * alloc
     *
     * Alloc always allocates from a FixedAllocation. Blob allocations are
     * implemented using largest Fixed blocks as specified in MAX_FIXED_ALLOC.
     *
     * The previous Stream method chained blocks together, but the new approach
     * uses a master block and a list of allocations. Since we now have a
     * MAX_FIXED_ALLOC of 256K this means that we would represent a 1MB
     * allocation as a 64 byte master and four 256K blocks. For BigData 1MB
     * bloom filters we would probably handle all in a single FixedAllocator of
     * 256K allocations since we would hold 4096 of these in a single allocator,
     * which with (say) 12 1MB bloom filters with 2-phase commit would only
     * require 2 * (4 * 12) = 48 bits plus 12 64 byte headers. The maximum BLOB
     * would be determined by a 256K header record with 64K * 256K allocations
     * or 16GB, which is larger than MAXINT (we use an int to store allocation
     * size in the address).
     *
     * The use of a IAllocationContext adds some complexity to the previous
     * simple freelist management. The problem is two-fold.
     *
     * Firstly it is okay for an Allocator on the free list to return a null
     * address, since it may be managing storage for a specific context.
     *
     * Secondly we must try and ensure that Allocators used by a specific
     * context can be found again. For example, if allocator#1 is assigned to
     * context#1 and allocator#2 to context#2, when context#1 is detached we
     * want context#2 to first find allocator#2. This is further complicated
     * by the finer granularity of the AllocBlocks within a FixedAllocator.
     */

//    private volatile long m_maxAllocation = 0;
    private volatile long m_spareAllocation = 0;

    /** Core allocation method.
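     * <p>
     * In outline: the request is mapped to a slot size bucket via
     * fixedAllocatorIndex(size); a FixedAllocator for that bucket is taken
     * from the free list, recovered by the small slot waste scan, or newly
     * created; and the allocator returns a latched address. For example
     * (illustrative sizes only), with allocation sizes [1,2,4] a request
     * for 100 bytes lands in the 128 byte bucket (64 * 2).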
     */
    public int alloc(final int size, final IAllocationContext context) {
        if (size > m_maxFixedAlloc) {
            throw new IllegalArgumentException("Allocation size too big: " + size + " > " + m_maxFixedAlloc);
        }

        m_allocationWriteLock.lock();
        try {
            checkContext(context);

            try {
                final FixedAllocator allocator;
                final int i = fixedAllocatorIndex(size);
                if (context != null && context.isIsolated()) {
                    allocator = getContextAllocation(context).getFreeFixed(i);
                    if (allocator.checkBlock0()) {
                        if (log.isInfoEnabled())
                            log.info("Adding new shadowed allocator, index: " + allocator.getIndex() + ", diskAddr: " + allocator.getDiskAddr());
                        addToCommit(allocator);
                    }
                } else {
                    final int block = 64 * m_allocSizes[i];
                    m_spareAllocation += (block - size); // Isn't adjusted by frees!

                    final ArrayList<FixedAllocator> list = m_freeFixed[i];
                    if (list.size() == 0) {
                        /*
                         * No allocator on the free list for that slot size.
                         */
                        final FixedAllocator candidate;
                        if (size < this.cSmallSlot) {
                            /*
                             * Check to see if we can locate a good enough
                             * Allocator
                             *
                             * @see BLZG-1278 (Small slot optimization to
                             * minimize waste).
                             */
                            candidate = findAllocator(block);
                        } else {
                            candidate = null;
                        }

                        if (candidate != null) {
                            candidate.addToFreeList();
                            allocator = candidate;
                        } else {
                            /*
                             * We need a new allocator.
                             */
                            allocator = new FixedAllocator(this, block);

                            allocator.setFreeList(list);
                            allocator.setIndex(m_allocs.size());

                            if (log.isTraceEnabled())
                                log.trace("New FixedAllocator for " + block);

                            m_allocs.add(allocator);

                            if (m_storageStats != null) {
                                m_storageStats.register(allocator, true);
                            }
                        }
                        if (allocator.checkBlock0()) {
                            addToCommit(allocator);
                        }
                    } else {
                        // Verify free list only has allocators with free bits
                        if (log.isDebugEnabled()){
                            int tsti = 0;
                            final Iterator<FixedAllocator> allocs = list.iterator();
                            while (allocs.hasNext()) {
                                final Allocator tstAlloc = allocs.next();
                                if (!tstAlloc.hasFree()) {
                                    throw new IllegalStateException("Free list contains full allocator, " + tsti + " of " + list.size());
                                }
                                tsti++;
                            }
                        }
                        allocator = list.get(0);
                    }

                }

                final int addr = allocator.alloc(this, size, context);
                if (addr == 0) {
                    throw new IllegalStateException("Free Allocator unable to allocate address: " + allocator.getSummaryStats());
                }
                if (allocator.isUnlocked()) {
                    addToCommit(allocator);
                }

                m_recentAlloc = true;

                final long pa = physicalAddress(addr);
                if (pa == 0L) {
                    throw new IllegalStateException(
                            "No physical address found for " + addr);
                }

                m_allocations++;
                m_nativeAllocBytes += size;

                return addr;
            } catch (Throwable t) {
                log.error(t,t);

                throw new RuntimeException(t);
            }
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * For a small slot size only, look for an existing allocator that has a
     * sufficient percentage of free bits and add it to the free list. If this
     * test fails then the caller must allocate a new allocator.
     *
     * @param block
     *
     * @return
     *
     * @see BLZG-1278 (Small slot optimization to minimize waste).
     */
    private FixedAllocator findAllocator(final int block) {
        // only look if small slot
        if (block > cSmallSlot) {
            return null;
        }

        // Look up the statistics for that slot size.
        final Bucket stats = m_storageStats.findBucket(block);

        if (stats == null) {
            // Can't do anything. This is not an expected code path.
            return null;
        }

        /*
         * Only check waste if number of allocators is greater than some
         * configurable amount.
         *
         * The thought here is that it is not necessary to focus on minimizing
         * waste for small stores and that by allowing that waste we permit
         * better locality (co-location on a page) for small slots.
         * Once we start to limit the small slot waste we essentially just
         * change the #of free bits before we are willing to allow a small
         * slot allocator onto the free list.
         */
        if (stats.m_allocators < cSmallSlotWasteCheckAllocators) {
            return null;
        }

        // only check small slots if total waste is larger than some configurable amount
        final float slotWaste = stats.slotsUnused();
        if (slotWaste < cSmallSlotHighWaste) {
            return null;
        }

        // Now find candidate allocator with maximum free slots above a minimum threshold
        FixedAllocator candidate = null;
        int candidateFreeBits = cSmallSlotThresholdHighWaste; // minimum threshold

        for (int i = 0; i < m_allocs.size(); i++) {
            final FixedAllocator tst = m_allocs.get(i);
            if (tst.getBlockSize() == block) { // right size
                if (tst.m_freeBits > candidateFreeBits) {
                    candidate = tst;
                    candidateFreeBits = candidate.m_freeBits;
                }
            }
        }

        if (candidate != null) {
            candidate.m_smallSlotHighWaste = true;

            if (log.isDebugEnabled()) {
                log.debug("Found candidate small slot allocator");
            }
        }

        return candidate;
    }

    private int fixedAllocatorIndex(final int size) {
        int i = 0;

        int cmp = m_minFixedAlloc;
        while (size > cmp) {
            i++;
            cmp = 64 * m_allocSizes[i];
        }

        return i;
    }

    /****************************************************************************
     * The base realloc method that returns a stream for writing to rather than
     * handling the reallocation immediately.
     **/
    public PSOutputStream realloc(final long oldAddr, final int size) {
        free(oldAddr, size);

        return PSOutputStream.getNew(this, m_maxFixedAlloc, null);
    }

    /****************************************************************************
     * Called by PSOutputStream to make the actual allocation or directly by
     * lower level API clients.
     * <p>
     * If the allocation is for greater than MAX_FIXED_ALLOC, then a
     * PSOutputStream is used to manage the chained buffers.
     *
     * TODO: Instead of using PSOutputStream, manage allocations written to the
     * WriteCacheService, building BlobHeader as you go.
     **/
    public long alloc(final byte buf[], final int size, final IAllocationContext context) {

        m_allocationWriteLock.lock();
        try {
            checkContext(context);

            final long begin = System.nanoTime();

            if (size > (m_maxFixedAlloc - 4)) {

                if (size > getMaxBlobSize())
                    throw new IllegalArgumentException(
                            "Allocation request beyond maximum BLOB of " + getMaxBlobSize());

                if (log.isTraceEnabled())
                    log.trace("BLOB ALLOC: " + size);

                if (m_storageStats != null) {
                    m_storageStats.allocateBlob(size);
                }

                final PSOutputStream psout = PSOutputStream.getNew(this,
                        m_maxFixedAlloc, context);
                try {

                    int i = 0;
                    final int blocks = size / 512;
                    for (int b = 0; b < blocks; b++) {
                        psout.write(buf, i, 512); // add 512 bytes at a time
                        i += 512;
                    }
                    psout.write(buf, i, size - i);

                    return psout.save();

                } catch (IOException e) {

                    throw new RuntimeException("Closed Store?", e);

                } finally {
                    try {
                        psout.close(); // return stream
                    } catch (IOException ioe) {
                        // should not happen, since this should only be
                        // recycling
                        log.warn("Unexpected error closing PSOutputStream", ioe);
                    }
                }

            }

            final int newAddr = alloc(size + 4, context); // allow size for
                                                          // checksum

            if (newAddr == 0)
                throw new IllegalStateException("NULL address allocated");

            final int chk = ChecksumUtility.getCHK().checksum(buf, size);

            final long pa = physicalAddress(newAddr);

            try {
                m_writeCacheService.write(pa, ByteBuffer.wrap(buf, 0, size), chk,
                        true/* writeChecksum */, newAddr/* latchedAddr */);
            } catch (InterruptedException e) {
                throw new RuntimeException("Closed Store?", e);
            }

            // Update counters.
final StoreCounters<?> c = (StoreCounters<?>) storeCounters.get() .acquire(); try { final int nwrite = size + 4;// size plus checksum. c.nwrites++; c.bytesWritten += nwrite; c.elapsedWriteNanos += (System.nanoTime() - begin); if (nwrite > c.maxWriteSize) { c.maxWriteSize = nwrite; } } finally { c.release(); } return newAddr; } finally { m_allocationWriteLock.unlock(); } } // /**************************************************************************** // * Fixed buffer size reallocation // **/ // public long realloc(final long oldAddr, final int oldSize, final byte buf[]) { // // free(oldAddr, oldSize); // // return alloc(buf, buf.length); // } // /** // * Must handle valid possibility that a request to start/commit transaction // * could be made within a commitCallback request // */ // synchronized public void startTransaction() { // if (m_committing) { // return; // } // // m_transactionCount++; // } // // synchronized public void commitTransaction() { // if (m_committing) { // return; // } // // if (log.isDebugEnabled()) // log.debug("Commit Transaction"); // // if (--m_transactionCount <= 0) { // commitChanges(); // // m_transactionCount = 0; // } // } // // public int getTransactionCount() { // return m_transactionCount; // } // // // -------------------------------------------------------------------------------------------- // // rollbackTransaction // // // // clear write cache // // read in last committed header // synchronized public void rollbackTransaction() { // if (m_transactionCount > 0 || m_readOnly) { // hack for resync // baseInit(); // // try { // m_writeCache.reset(); // dirty writes are discarded // // readAllocationBlocks(); // } catch (Exception e) { // throw new StorageTerminalError("Unable to rollback transaction", e); // } // } // } // /* // * Slug // */ // private int fibslug(int n) { // if (n < 2) // return 1; // else // return fibslug(n-1) + fibslug(n-2); // } /** * The semantics of reset are to revert unisolated writes to committed * state. * <p> * Unisolated writes must also be removed from the write cache. * <p> * The AllocBlocks of the FixedAllocators maintain the state to determine * the correct reset behavior. * <p> * If the store is using DirectFixedAllocators then an IllegalStateException * is thrown. * <p> * If there is an active {@link #m_commitStateRef}, then this indicates a * failure after the {@link RWStore#commit()} had "succeeded". */ public void reset() { if (log.isInfoEnabled()) { log.info("RWStore Reset"); } m_allocationWriteLock.lock(); try { // DEBUG // fibslug(40); // slug to improve odds of interruption of reset (if possible) assertOpen(); // assertNoRebuild(); final CommitState commitState = m_commitStateRef .getAndSet(null/* newValue */); if (commitState != null) { commitState.reset(); // restore state values on RWStore. } boolean isolatedWrites = false; /** * Clear all allocators, not just dirty allocators, since we also * need to reset the transient bits associated with session * protection. * * Need to know if there are any isolated modifications, in which case * we must remember so that we avoid clearing down the store. */ for (FixedAllocator fa : m_allocs) { isolatedWrites |= fa.reset(m_writeCacheService, m_committedNextAllocation); } /** * Now clone the transient metabits for protection if this service becomes leader */ syncMetaTransients(); if (!isolatedWrites) { /** * Now we should be able to unwind any unused allocators and unused * alloc blocks. An unused allocator is one with no diskAddr (never * committed). 
             * But it may be more difficult to determine if
             * an alloc block has never been used, for that we really need to
             * know what the nextAllocationOffset was at the previous commit.
             * This could be cached as lastCommittedOffset, in which case we can unwind any
             * allocBlocks with addresses >= to that.
             */
            int origAllocs = m_allocs.size();
            while (m_allocs.size() > 0) {
                final int last = m_allocs.size()-1;
                final FixedAllocator fa = m_allocs.get(last);
                if (fa.getDiskAddr() == 0) {
                    fa.setIndex(-1);
                    // must remove from free list!
                    m_freeFixed[fixedAllocatorIndex(fa.m_size)].remove(fa);
                    // ..and then from main allocation list
                    m_allocs.remove(last);
                } else {
                    break;
                }
            }
            m_nextAllocation = m_committedNextAllocation;
            if (log.isDebugEnabled())
                log.debug("Reset allocators, old: " + origAllocs + ", now: " + m_allocs.size());

            // Clear the dirty list.
            // FIXME: we should be able to clear the dirty list, but this currently causes
            // problems in HA.
            // If the allocators are torn down correctly, we should be good to clear the commitList
            clearCommitList();

            // Flag no allocations since last commit
            m_recentAlloc = false;
        } else {
            // there are isolated writes, so we must not clear the commit list since otherwise
            // the Allocation index will get out of sync as per Ticket #1136
        }

        if (m_quorum != null) {
            /**
             * When the RWStore is part of an HA quorum, we need to close
             * out and then reopen the WriteCacheService every time the
             * quorum token is changed. For convenience, this is handled by
             * extending the semantics of abort() on the Journal and reset()
             * on the RWStore.
             *
             * @see <a
             *      href="https://sourceforge.net/apps/trac/bigdata/ticket/530">
             *      HA Journal </a>
             */
            m_writeCacheService.close();
            m_writeCacheService = newWriteCacheService();
        } else if (m_writeCacheService != null) {
            /*
             * Note: We DO NOT need to reset() the WriteCacheService. If a
             * record was already flushed to the disk, then it is on the
             * disk and clearing the record from the cache will not change
             * that. If the record has not yet been flushed to the disk,
             * then we already cleared it from the WCS when we reset the
             * FixedAllocators (above).
             */
            // m_writeCacheService.reset();
            // m_writeCacheService.setExtent(convertAddr(m_fileSize));
        }

        /*
         * Discard any writes on the delete blocks. Those deletes MUST NOT
         * be applied after a reset() on the RWStore.
         *
         * @see https://sourceforge.net/apps/trac/bigdata/ticket/602
         * (RWStore does not discard deferred deletes on reset)
         */
        m_deferredFreeOut.reset();

        /*
         * Reset any storage stats
         */
        if (m_storageStatsAddr != 0) {
            m_storageStats.reset();
        } else {
            m_storageStats = new StorageStats(m_allocSizes);
        }

    } catch (Exception e) {
        throw new IllegalStateException("Unable to reset the store", e);
    } finally {
        m_allocationWriteLock.unlock();
    }
}

//    synchronized public boolean isActiveTransaction() {
//        return m_transactionCount > 0;
//    }

/**
 * writeMetaBits must be called after all allocations have been made, the
 * last one being the allocation for the metabits themselves (allowing for
 * an extension!).
 *
 * Ticket #936: The meta-bits allocation is currently made from the FixedAllocator
 * region. This works well providing the required allocation bits is less than
 * the maximum FixedAllocator slot size. While this is neat, there are problems at scale
 * for maximum slot sizes less than 64K.
 *
 * To address the 8K bits in a 1K allocator, 13 bits are required, this leaves 19 bits
 * to index an Allocator, or 18 bits without the sign => 256K maximum index.
 *
 * To be able to commit changes to all 256K allocators requires 512K metabits => 64K bytes.
 * We would like to associate the 64K allocations with the root block, so a single 128K
 * allocation would be split into 64K demi-spaces, one for each root block.
 *
 * While a negative address indicates a standard RW allocation, a positive address can be used
 * to indicate an explicitly allocated region. The trick is to ensure that the region is
 * allocated on a 128K boundary, then the lower bits can indicate which demi-space is used with
 * a simple XOR.
 *
 * Note that we must ensure that any previous demi-space write is removed from the WCS.
 *
 * @throws IOException
 */
private void writeMetaBits() throws IOException {

    final byte buf[] = genMetabitsData();

    /*
     * Note: this address is set by commit() prior to calling
     * writeMetaBits().
     */
    //final long addr = physicalAddress(m_metaBitsAddr);
    final long addr = m_metaBitsAddr < 0 ? physicalAddress(m_metaBitsAddr)
            : ((long) m_metaBitsAddr) << ALLOCATION_SCALEUP;
    if (addr == 0) {
        throw new IllegalStateException("Invalid metabits address: " + m_metaBitsAddr);
    }

    assert addr > 0;

    try {
        if (log.isDebugEnabled())
            log.debug("writing metabits at: " + addr);

        // When writing to the demi-space we are no longer writing to a
        // FixedAllocator managed region, so no latched address is provided
        m_writeCacheService.write(addr, ByteBuffer.wrap(buf), 0/*chk*/,
                false/*useChecksum*/,
                m_metaBitsAddr < 0 ? m_metaBitsAddr : 0 /*latchedAddr*/);
    } catch (InterruptedException e) {
        throw new RuntimeException(e);
    }
}

private byte[] genMetabitsData() throws IOException {
    // the metabits is now prefixed by a long specifying the lastTxReleaseTime
    // used to free the deferredFree allocations. This is used to determine
    // which commitRecord to access to process the next batch of deferred
    // frees.
    // the cDefaultMetaBitsSize is also written since this can now be
    // parameterized.
    final int len = 4 * (cMetaHdrFields + m_allocSizes.length + m_metaBits.length);
    final byte buf[] = new byte[len];

    final FixedOutputStream str = new FixedOutputStream(buf);
    try {
        str.writeInt(m_metaBitsAddr > 0 ? cVersionDemispace : cVersion);

        str.writeLong(m_lastDeferredReleaseTime);
        str.writeInt(cDefaultMetaBitsSize);
        str.writeInt(m_allocSizes.length);
        str.writeLong(m_storageStatsAddr);

        // Let's reserve ourselves some space
        for (int i = 0; i < cReservedMetaBits; i++) {
            str.writeInt(0);
        }

        /*
         * Write out the size of the allocation slots as defined by
         * Options.ALLOCATION_SIZES (this is where we store that
         * information).
         */
        for (int i = 0; i < m_allocSizes.length; i++) {
            str.writeInt(m_allocSizes[i]);
        }

        /*
         * Write out the metabits themselves (the startAddrs and the
         * allocation bit vectors).
         */
        for (int i = 0; i < m_metaBits.length; i++) {
            str.writeInt(m_metaBits[i]);
        }

        str.flush();
    } finally {
        str.close();
    }

    return buf;
}

/**
 *
 * @return <code>true</code> iff the store requires a commit.
 */
public boolean isDirty() {
    return requiresCommit();
}

/**
 * Object recording the undo state for the {@link RWStore#commit()} ...
 * {@link RWStore#postCommit()} sequence. The {@link CommitState} must
 * either {@link CommitState#postCommit()} or {@link CommitState#reset()}. Those
 * {@link CommitState} methods are invoked out of the corresponding
 * {@link RWStore} methods.
 *
 * @see <a href="http://trac.blazegraph.com/ticket/973" >RWStore commit is not
 *      robust to internal failure.</a>
 */
private class CommitState {

    /*
     * Critical pre-commit state that must be restored if a commit is
     * discarded.
     */
    private final int m_lastCommittedNextAllocation;
    private final long m_storageStatsAddr;
    private final int m_metaBitsAddr;

    CommitState() {
        // retain copy of critical pre-commit state
        if (!m_allocationWriteLock.isHeldByCurrentThread())
            throw new IllegalMonitorStateException();
        m_lastCommittedNextAllocation = RWStore.this.m_committedNextAllocation;
        m_storageStatsAddr = RWStore.this.m_storageStatsAddr;
        m_metaBitsAddr = RWStore.this.m_metaBitsAddr;
    }

    void postCommit() {
        // NOP
    }

    /** Reset pre-commit state to support reset/abort/rollback. */
    void reset() {
        if (!m_allocationWriteLock.isHeldByCurrentThread())
            throw new IllegalMonitorStateException();
        RWStore.this.m_storageStatsAddr = m_storageStatsAddr;
        RWStore.this.m_committedNextAllocation = m_lastCommittedNextAllocation;
        RWStore.this.m_metaBitsAddr = m_metaBitsAddr;
    }

}

/**
 * @see <a href="http://trac.blazegraph.com/ticket/973" >RWStore commit is not
 *      robust to internal failure.</a>
 */
private final AtomicReference<CommitState> m_commitStateRef = new AtomicReference<CommitState>();

/**
 * Package private method used by the test suite.
 */
void clearCommitStateRef() {
    m_commitStateRef.set(null/* newValue */);
}

@Override
public void commit() {

    assertOpen();
//    assertNoRebuild();

    checkCoreAllocations();

    // take allocation lock to prevent other threads allocating during commit
    m_allocationWriteLock.lock();

    try {

        /*
         * Create a transient object to retain values of previous
         * commitState to support abort/reset/rollback if requested after
         * this commit() is requested.
         */
        if (!m_commitStateRef.compareAndSet(null/* expect */, new CommitState())) {
            throw new IllegalStateException(
                    "RWStore commitState found, incomplete previous commit must be rolled back/aborted");
        }

//        final int totalFreed = checkDeferredFrees(true, journal); // free now if possible
//
//        if (totalFreed > 0 && log.isInfoEnabled()) {
//            log.info("Freed " + totalFreed + " deferralls on commit");
//        }

        // free old storageStatsAddr
        if (m_storageStatsAddr != 0) {
            final int len = (int) (m_storageStatsAddr & 0xFFFF);
            final int addr = (int) (m_storageStatsAddr >> 16);
            immediateFree(addr, len);
        }
        if (m_storageStats != null) {
            final byte[] buf = m_storageStats.getData();
            final long addr = alloc(buf, buf.length, null);
            m_storageStatsAddr = (addr << 16) + buf.length;
        }

        /*
         * Pre-allocate storage for metaBits from FixedAllocators (ensure
         * that we do not need to reallocate the metabits region when we are
         * writing out the updated versions of the FixedAllocators).
         */
        if (m_metaBitsAddr > 0) {
            // already using demi-space, remove from WCS
            m_writeCacheService.removeWriteToAddr(convertAddr(-m_metaBitsAddr), 0);
        } else {
            final int reqmbc = getRequiredMetaBitsStorage();

            int nmbaddr = 0;
            // if > max alloc or explicitly use the demi-space, then drop through for demi-space
            if ((!m_useMetabitsDemispace) && reqmbc < m_maxFixedAlloc) {
                nmbaddr = alloc(reqmbc, null);
            }

            // If existing allocation, then free it
            if (m_metaBitsAddr < 0) {
                final int oldMetaBitsSize = (m_metaBits.length + m_allocSizes.length + 1) * 4;

                // Call immediateFree - no need to defer free of metaBits;
                // this has to stop somewhere!
                // No more allocations must be made
                immediateFree((int) m_metaBitsAddr, oldMetaBitsSize);
            }

            m_metaBitsAddr = nmbaddr;
        }

        if (m_metaBitsAddr == 0) {
            // Allocate a special region to be able to store the maximum
            // metabits (128K as two 64K demi-spaces).
            // Must be aligned on a 128K boundary and allocations are made
            // in units of 64K.
            //
            // May need to extend the file for the demi-space!
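            //
            // Sketch of the addressing (assuming the 64K units of
            // m_nextAllocation): making m_nextAllocation even aligns the
            // region, m_metaBitsAddr = -m_nextAllocation is then positive
            // and even, and the two demi-spaces live at
            // convertAddr(-m_metaBitsAddr) and the following 64K. The low
            // bit selects the demi-space, so alternating commits simply
            // toggle it:
            //
            //     m_metaBitsAddr ^= 0x01; // flip between the demi-spaces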
            while (m_nextAllocation % 2 != 0) {
                m_nextAllocation--;
            }
            m_metaBitsAddr = -m_nextAllocation; // must be positive to differentiate from FixedAllocator address
            m_nextAllocation -= 2; // allocate 2 * 64K

            // Check for file extension
            while (m_nextAllocation <= m_fileSize) {
                extendFile();
            }

            if (log.isInfoEnabled())
                log.info("Using Demi-space metabits");
        }

        if (m_metaBitsAddr > 0) { // Demi-Space
            // Now "toggle" m_metaBitsAddr - 64K boundary
            m_metaBitsAddr ^= 0x01; // toggle zero or 64K offset
        }

        if (log.isDebugEnabled()) {
            final long mbaddr;
            if (m_metaBitsAddr < 0) {
                mbaddr = physicalAddress((int) m_metaBitsAddr);
            } else {
                mbaddr = convertAddr(-m_metaBitsAddr); // maximum 48 bit address range
            }

            log.debug("Writing metabits at " + mbaddr);
        }

        // There must be no buffered deferred frees
        // assert m_deferredFreeOut.getBytesWritten() == 0;

        // save allocation headers
        FixedAllocator fa = m_commitHead;
        while (fa != null) {
            final FixedAllocator allocator = fa;
            // the bit in metabits for the old allocator version.
            final int old = allocator.getDiskAddr();
            // mark old version - reclaimed after commit.
            metaFree(old);
            // the bit in metabits for the new allocator version.
            final int naddr = metaAlloc();
            // set that bit on the allocator.
            allocator.setDiskAddr(naddr);

            if (log.isTraceEnabled())
                log.trace("Update allocator " + allocator.getIndex()
                        + ", old addr: " + old + ", new addr: " + naddr);

            try {
                // do not use checksum
                m_writeCacheService.write(metaBit2Addr(naddr), ByteBuffer
                        .wrap(allocator.write()), 0/*chk*/, false/*useChecksum*/,
                        0/*latchedAddr*/);
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }

            fa = fa.m_nextCommit;
        }
        // DO NOT clear the commit list until the writes have been flushed
        // m_commitList.clear();

        writeMetaBits();

        try {
            m_writeCacheService.flush(true);
            lastBlockSequence = m_writeCacheService.resetSequence();
        } catch (InterruptedException e) {
            log.error(e, e);
            throw new RuntimeException(e);
        }

        // Should not write the rootBlock here; it is the responsibility of
        // the client to provide that control
        // writeFileSpec();

        syncMetaTransients();

        // Must be called from AbstractJournal commitNow after writeRootBlock
        // postCommit();

//        if (m_commitCallback != null) {
//            m_commitCallback.commitComplete();
//        }

        // The Journal handles the force in doubleSync
        // m_reopener.reopenChannel().force(false); // TODO, check if required!
    } catch (IOException e) {
        throw new StorageTerminalError("Unable to commit transaction", e);
    } finally {
        m_recentAlloc = false;
        m_allocationWriteLock.unlock();
    }

    checkCoreAllocations();

    if (log.isTraceEnabled())
        log.trace("commitChanges for: " + m_nextAllocation + ", " + m_metaBitsAddr
                + ", active contexts: " + m_contexts.size());

    if (log.isDebugEnabled() && m_quorum != null && m_quorum.isHighlyAvailable()) {
        log.debug(showAllocatorList());
    }
}

/**
 * {@inheritDoc}
 */
@Override
public Lock getCommitLock() {
    return m_allocationWriteLock;
}

/**
 * {@inheritDoc}
 * <p>
 * Commits the FixedAllocator bits
 */
@Override
public void postCommit() {

    if (!m_allocationWriteLock.isHeldByCurrentThread())
        throw new IllegalMonitorStateException();

    final CommitState commitState = m_commitStateRef.getAndSet(null/* newValue */);

    if (commitState == null) {
        throw new IllegalStateException(
                "No current CommitState found on postCommit");
    } else {
        commitState.postCommit();
    }

    {
        FixedAllocator fa = m_commitHead;
        while (fa != null) {
            fa.postCommit();
            fa = fa.m_nextCommit;
        }
    }

    if (m_storageStats != null) {
        m_storageStats.commit();
    }

    clearCommitList();
}

@Override
public int checkDeferredFrees(final AbstractJournal journal) {

    if (journal == null)
        return 0;

    /*
     * Note: since this is now called directly from the AbstractJournal
     * commit method (and is part of a public API) we must take the
     * allocation lock.
     *
     * This may have adverse effects wrt concurrency deadlock issues, but
     * none have been noticed so far.
     */
    m_allocationWriteLock.lock();

    try {

        /**
         * if session protected then do not free any deferrals!
         */
        if (isSessionProtected()) {
            return 0;
        }

        final AbstractTransactionService transactionService = (AbstractTransactionService) journal
                .getLocalTransactionManager().getTransactionService();

        // the previous commit point.
        final long lastCommitTime = journal.getLastCommitTime();

        if (lastCommitTime == 0L) {
            // Nothing committed.
            return 0;
        }

        /*
         * The timestamp for which we may release commit state.
         */
        final long latestReleasableTime = transactionService.getReleaseTime();

        if (lastCommitTime <= latestReleasableTime) {
            throw new AssertionError("lastCommitTime=" + lastCommitTime
                    + ", latestReleasableTime=" + latestReleasableTime
                    + ", lastDeferredReleaseTime=" + m_lastDeferredReleaseTime
                    + ", activeTxCount=" + m_activeTxCount);
        }

        // Note: This is no longer true. Delete blocks are attached to the
        // commit point in which the deletes were made.
//        /*
//         * add one because we want to read the delete blocks for all
//         * commit points up to and including the first commit point that
//         * we may not release.
//         */
//        latestReleasableTime++;

//        /*
//         * add one to give this inclusive upper bound semantics to the
//         * range scan.
//         */
//        latestReleasableTime++;

        if (txLog.isInfoEnabled())
            txLog.info("RECYCLER: lastCommitTime=" + lastCommitTime
                    + ", latestReleasableTime=" + latestReleasableTime
                    + ", lastDeferredReleaseTime=" + m_lastDeferredReleaseTime
                    + ", activeTxCount=" + m_activeTxCount);

        /*
         * Free deferrals.
         *
         * Note: Per ticket#480, we can not begin recycling from the first
         * commit point in the commit record index as there are some bigdata
         * versions (1.0.4) where we did not prune the commit record index.
         * Therefore, this relies on the (lastDeferredReleaseTime+1) for the
         * exclusive lower bound. This avoids triggering an exception
         * from an attempt to process deferred free blocks which have
         * already been released.
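         *
         * For example (illustrative values), with
         * lastDeferredReleaseTime=100 and latestReleasableTime=250 the
         * call below becomes freeDeferrals(journal, 101, 250): the
         * exclusive lower bound 100 is restated as the inclusive
         * fromTime 101.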
         *
         * @see https://sourceforge.net/apps/trac/bigdata/ticket/480
         */

        if (m_lastDeferredReleaseTime >= latestReleasableTime) {

            /**
             * Note: Added for HA. I have observed both values equal to
             * ZERO. Since we add ONE (1) to the lastDeferredReleaseTime it
             * MUST BE LT the latestReleasableTime or we will get a
             * "toKey LT fromKey" exception.
             *
             * @see <a href=
             *      "https://sourceforge.net/apps/trac/bigdata/ticket/530#comment:116">
             *      Journal HA </a>
             */

            return 0;

        }

        return freeDeferrals(journal, m_lastDeferredReleaseTime + 1,
                latestReleasableTime);

    } finally {
        m_allocationWriteLock.unlock();
    }

}

/**
 *
 * @return conservative requirement for metabits storage, mindful that the
 *         request to allocate the metabits may require an increase in the
 *         number of allocation blocks and therefore an extension to the
 *         number of metabits.
 */
private int getRequiredMetaBitsStorage() {
    int ints = cMetaHdrFields;
    ints += m_allocSizes.length + m_metaBits.length;

    // add the maximum number of new metaBits storage that may be
    // needed to save the current committed objects
    final int commitInts = ((32 + commitListSize()) / 32);
    final int allocBlocks = (cDefaultMetaBitsSize - 1 + commitInts)/(cDefaultMetaBitsSize-1);
    ints += cDefaultMetaBitsSize * allocBlocks;

    return ints*4; // return as bytes
}

// Header Data
// volatile private long m_curHdrAddr = 0;
// volatile private int m_rootAddr;

/**
 * {@link #m_fileSize} is in units of -32K.
 */
volatile private int m_fileSize;
volatile private int m_nextAllocation;
/**
 * The value of nextAllocation at commit is cached and used
 * in reset() to unwind new FixedAllocators and/or AllocBlocks
 */
volatile private int m_committedNextAllocation;
final private long m_maxFileSize;

// private int m_headerSize = 2048;

/*
 * Meta Allocator
 */

/**
 * MetaBits HEADER version must be changed when the header or allocator
 * serialization changes.
 *
 * Use BCD-style numbering so
 * 0x0200 == 2.00
 * 0x0320 == 3.20
 *
 * The minor byte values should maintain binary compatibility, with
 * changes to the major byte indicating incompatible versions.
 *
 * Versions
 * 0x0300 - extended header to include reserved ints
 * 0x0400 - removed explicit BlobAllocators
 * 0x0500 - using metaBits demi-space
 */
final private int cVersion = 0x0400;

/**
 * The {@link #cVersion} value corresponding to the use of the demi-space
 * for the metabits.
 *
 * @see <a href="http://trac.blazegraph.com/ticket/936"> Support larger metabit
 *      allocations</a>
 * @see <a href="http://wiki.blazegraph.com/wiki/index.php/DataMigration" >
 *      Data migration </a>
 */
final private int cVersionDemispace = 0x0500;

/**
 * cReservedMetaBits is the reserved space in the metaBits header
 * to allow for binary compatibility moving forward.
     * <p>
     * If we need to add int values to the header we can do so and reduce the
     * reservation by 1 each time.
     */
    final static int cReservedMetaBits = 20;

    /**
     * MetaBits Header
     * 0 int version
     * 1-2 int[2] long deferredFree
     * 3 int defaultMetaBitsSize
     * 4 int length of allocation sizes
     * 5-6 int[2] storage stats addr
     * + 20 reserved
     */
    final static private int cMetaHdrFields = 7 + cReservedMetaBits;

    /**
     * @see Options#META_BITS_SIZE
     */
    final private int cDefaultMetaBitsSize = 9;

    /**
     * @see Options#META_BITS_SIZE
     */
    volatile private int m_metaBitsSize;

    volatile private boolean m_useMetabitsDemispace = true;

    /**
     * Package private since it is used by FixedAllocators.
     *
     * @see Options#META_BITS_SIZE
     */
    final int cDefaultFreeBitsThreshold;

    /**
     * The smallSlotThreshold, when activated, is intended to improve the
     * opportunity for write elisions (to mechanical disks) whilst also reducing
     * the read-backs on current generation (2014-15) SSDs that can impact
     * write throughput.
     * Given that the objective is to statistically improve write elision,
     * the number of required free bits needs to be large - around 50%.
     * However, this can result in a large amount of store waste for certain
     * patterns of data - for example when small slots are used to store large
     * literals that will not be recycled. In this scenario it is possible
     * that allocators are not recycled.
     * Some further thoughts:
     * 1) The more efficient elision of small slots for the allocation of large literals
     * is probably the major throughput benefit
     * 2) OTOH, at a lower level, small sparse but localised writes (eg 16 64 byte writes to a 4k
     * sector) may only incur a single read-back with good firmware.
     * To address the concern for high waste, when a statistically large number of allocators have
     * been created, and the waste is beyond some threshold, then a lower small slot threshold
     * is used. The logic for this is implemented in {@link FixedAllocator#meetsSmallSlotThreshold()}
     */
    int cSmallSlot = 1024; // @see Options#SMALL_SLOT_TYPE
    int cSmallSlotThreshold = 4096; // @see Options#SMALL_SLOT_THRESHOLD

    /**
     * High Waste Criteria
     */
    int cSmallSlotThresholdHighWaste = 2048; // @see Options#SMALL_SLOT_THRESHOLD_HIGH_WASTE
    int cSmallSlotWasteCheckAllocators = 100; // @see Options#SMALL_SLOT_WASTE_CHECK_ALLOCATORS
    float cSmallSlotHighWaste = 0.2f; // @see Options#SMALL_SLOT_HIGH_WASTE

    /**
     * Each "metaBit" is a file region.
     */
    private int m_metaBits[];
    private int m_metaTransientBits[];
    // volatile private int m_metaStartAddr;
    private volatile int m_metaBitsAddr; // @todo javadoc please.

    volatile private boolean m_recentAlloc = false;

    /**
     * Return the address of a contiguous region on the persistent heap.
     *
     * @param size
     *            The size of that region (this is not bytes, but something a
     *            bit more complicated).
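     *            For illustration (an interpretation, not normative): the
     *            unit is the implicitly scaled int32 unit of
     *            {@link #convertAddr(int)}, so a request of
     *            <code>size == 8</code> reserves <code>convertAddr(-8)</code>
     *            bytes on the backing file, as reflected by the trace logging
     *            below.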
     */
    protected int allocBlock(final int size) {
        // minimum 1
        if (size <= 0) {
            throw new Error("allocBlock called with non-positive size request");
        }
        final int allocAddr = m_nextAllocation;
        m_nextAllocation -= size;
        while (convertAddr(m_nextAllocation) >= convertAddr(m_fileSize)) {
            extendFile();
        }
        checkCoreAllocations();
        if (log.isTraceEnabled())
            log.trace("allocation created at " + convertAddr(allocAddr) + " for " + convertAddr(-size));
        return allocAddr;
    }

    private void checkCoreAllocations() {
        final long lfileSize = convertAddr(m_fileSize);
        final long lnextAlloc = convertAddr(m_nextAllocation);
        if (lnextAlloc >= lfileSize) {
            throw new IllegalStateException("Core Allocation Error - file size: " + lfileSize + ", nextAlloc: " + lnextAlloc);
        }
    }

    /**
     * meta allocation/free
     *
     * Allocates persistent store for allocation blocks.
     *
     * Grows data from the top of the file, e.g. bit 0 is 1024 from end-of-file.
     *
     * If metaStart <= nextAllocation, then the file must be extended. All the
     * allocation blocks are moved to the new end of file area, and the
     * metaStartAddress is incremented by the same delta value.
     *
     * NB the metaStart calculation uses an address rounded to 8k, so on
     * extension the new metaStart may be up to 8K less than the true start
     * address.
     *
     * The updated approach to metaAllocation uses native allocation from
     * the heap (by simply incrementing from m_nextAllocation) to provide
     * space for the allocation blocks.
     *
     * This approach means that the file only needs to be extended when
     * m_nextAllocation passes the m_fileSize, since we no longer store
     * the allocation blocks at the end of the file.
     */
    int metaAlloc() {
        int bit = fndMetabit();
        if (bit < 0) {
            // reallocate metaBits and recalculate m_headerSize
            // extend m_metaBits by 8 ints of bits plus start address!
            final int nsize = m_metaBits.length + cDefaultMetaBitsSize;
            // arrays initialized to zero by default
            final int[] nbits = new int[nsize];
            final int[] ntransients = new int[nsize];
            // copy existing values
            for (int i = 0; i < m_metaBits.length; i++) {
                nbits[i] = m_metaBits[i];
                ntransients[i] = m_metaTransientBits[i];
            }
            m_metaBits = nbits;
            m_metaTransientBits = ntransients;
            m_metaBits[m_metaBitsSize] = m_nextAllocation;
            m_nextAllocation -= META_ALLOCATION; // 256K
            m_metaBitsSize = nsize;
            // now get new allocation!
            bit = fndMetabit();
            assert bit >= 0;
        }
        setBit(m_metaTransientBits, bit);
        setBit(m_metaBits, bit);
        if (m_nextAllocation <= m_fileSize) {
            if (log.isInfoEnabled())
                log.info("ExtendFile called from metaAlloc");
            extendFile();
        }
        // cat.info("meta allocation at " + addr);
        checkCoreAllocations();
        return bit;
    }

    /**
     * Search the metabits for a bit that is free for allocation of space that
     * an allocator could write on.
     *
     * @return The bit -or- <code>-1</code> if the meta bits region is currently
     *         full.
     */
    private int fndMetabit() {
        final int blocks = m_metaBits.length / cDefaultMetaBitsSize;
        for (int b = 0; b < blocks; b++) {
            final int ret = fndBit(m_metaTransientBits, (b * cDefaultMetaBitsSize) + 1, cDefaultMetaBitsSize - 1);
            if (ret != -1) {
                // The assumption is that this bit is also NOT set in m_metaBits
                assert !tstBit(m_metaBits, ret);
                return ret;
            }
        }
        return -1; // none found
    }

    void metaFree(final int bit) {
        if (!m_allocationWriteLock.isHeldByCurrentThread()) {
            /*
             * Must hold the allocation lock while allocating or clearing
             * allocations.
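             *
             * A minimal usage sketch (illustrative only, not a call site from
             * this class):
             *
             *   m_allocationWriteLock.lock();
             *   try {
             *       metaFree(bit);
             *   } finally {
             *       m_allocationWriteLock.unlock();
             *   }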
             */
            throw new IllegalMonitorStateException();
        }
        if (bit <= 0) {
            return;
        }
        if (tstBit(m_metaBits, bit)) {
            clrBit(m_metaBits, bit);
        } else {
            clrBit(m_metaTransientBits, bit);
        }
        m_writeCacheService.clearWrite(metaBit2Addr(bit), 0/* latchedAddr */);
    }

    /**
     * The metabits are encoded in {@link #cDefaultMetaBitsSize} int runs as
     * follows
     *
     * <pre>
     * [startAddr1][bits0][bits1]...[bitsN]
     * [startAddr2]...
     * ...
     * </pre>
     *
     * where <code>N</code> is {@link #cDefaultMetaBitsSize} MINUS TWO and
     * <code>[bits0]...[bitsN]</code> are interpreted as a bit map.
     * <p>
     * The bit parameter is processed to determine which run it is part of.
     * <p>
     * Note that the bit offsets are not contiguous since there are "holes"
     * where the meta allocation [startAddr] are stored.
     * <p>
     * When the metabits region is first created, and each time it is grown, a
     * region is reserved at the then current nextOffset on the file that is
     * used for {@link FixedAllocator}s associated with the bit vector in the
     * next run of the metabits block. Those {@link FixedAllocator}s will be
     * recruited and used as needed. Note that {@link FixedAllocator}s are
     * always written onto an unused "bit" at each commit, and the old "bit" is
     * then freed. Thus dirty {@link FixedAllocator}s move at each commit and
     * can move between runs in the metabits.
     */
    long metaBit2Addr(final int bit) {
        // final int bitsPerBlock = 9 * 32;
        /*
         * The integer index into the m_metaBits[].
         */
        final int intIndex = bit / 32; // divide 32;
        /*
         * Make sure that the [bit] is a bit that falls into one of the bit
         * regions (versus one of the startAddr int32 values).
         */
        assert intIndex % cDefaultMetaBitsSize != 0; // used by the start addrs!
        /*
         * The index into the metabits region corresponding to the int32 value
         * before the start of the bit vector in which this bit falls. This
         * offset is relative to the start of the m_metaBits[].
         */
        final int addrIndex = (intIndex / cDefaultMetaBitsSize) * cDefaultMetaBitsSize;
        /*
         * Pull out and convert the startAddr for the bit vector addressed by
         * that bit. This gives us the int64 byte offset of some region on the
         * backing file.
         */
        final long addr = convertAddr(m_metaBits[addrIndex]);
        /*
         * The bit index of this bit in the bit vector for this region in the
         * metaBits[].
         */
        final int intOffset = bit - ((addrIndex + 1) * 32);
        /*
         * The byte offset into the backing file of the FixedAllocator for that
         * bit. All FixedAllocators are the same size [ALLOC_BLOCK_SIZE]. The
         * FixedAllocator knows what size allocations it makes and manages the
         * regions on the backing store in which those allocation are made.
         */
        final long ret = addr + (ALLOC_BLOCK_SIZE * intOffset);
        return ret;
    }

    /**
     * Convert an implicitly scaled int32 offset into the backing file into an
     * int64 address into the backing file.
     *
     * @param addr
     *            An int32 offset into the backing file formed by
     *            {@link #convertFromAddr(long)}. The representation is a
     *            negative integer that has been left shifted by
     *            {@link #ALLOCATION_SCALEUP} to reduce its bit size.
     *
     * @return A byte offset in the backing file.
     *
     * @see #convertFromAddr(long)
     * @see #ALLOCATION_SCALEUP
     */
    public static long convertAddr(final int addr) {
        final long laddr = addr;
        if (laddr < 0) {
            final long ret = (-laddr) << ALLOCATION_SCALEUP;
            return ret;
        } else {
            return laddr & 0xFFFFFFF0;
        }
    }

    /**
     * Convert an int64 address into the backing file into an int32 offset that
     * is implicitly scaled by {@link #ALLOCATION_SCALEUP}.
     *
     * @param addr
     *            An int64 offset into the backing file.
     *
     * @return The implicitly scaled int32 offset.
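     *         <p>
     *         A round-trip example (illustrative; it assumes
     *         ALLOCATION_SCALEUP is 15, matching the -32K units documented
     *         for m_fileSize): convertAddr(-4) yields 4 left-shifted by 15,
     *         i.e. 131072 bytes (128K), and convertFromAddr(131072) recovers
     *         -4.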
     * @see #convertAddr(int)
     * @see #ALLOCATION_SCALEUP
     */
    public int convertFromAddr(final long addr) {
        return (int) -(addr >> ALLOCATION_SCALEUP);
    }

    private volatile boolean m_extendingFile = false;

    /**
     * extendFile will extend by 10% and round up to be a multiple of 16k.
     *
     * The allocation blocks must also be moved. Note that it would be a bad
     * idea if these were moved with an overlapping copy!
     *
     * After moving the physical memory the in-memory allocation blocks must
     * then be updated with their new position.
     *
     * Note that since version 3.0 the size of the metaBits is variable. This
     * must be taken into consideration when moving data - has the location
     * changed as a result of the "reallocation"? If this is incorrect then the
     * wrong commit blocks will be copied, resulting in a corrupt data file.
     *
     * There are two approaches to this problem. The first is only to copy the
     * known committed (written) allocation blocks - but this cannot be implied
     * by "zero'd" bits since this can indicate that memory has been cleared.
     *
     * Synchronization
     *
     * The writecache may contain allocation block writes that must be flushed
     * before the file can be extended. The extend file explicitly moves the
     * written allocation blocks to their new location at the new end of the
     * file and then updates the rootblocks to ensure they point to the new
     * allocation areas.
     *
     * Extend file is only triggered by either alloc or metaAlloc, which are
     * synchronized by the allocation lock. So extend file ends up being
     * synchronized by the same lock.
     *
     * If we knew that the write cache had no writes to the allocation areas,
     * we would not need to flush, but calling flush prior to the extend is
     * sufficient to guarantee, in conjunction with holding the allocation lock,
     * that no new writes to the allocation areas will be made.
     *
     * Once the flush is complete we take the extension writeLock to prevent
     * further reads or writes, extend the file, moving the allocation areas on
     * the disk, then force the new rootblocks to disk.
     */
    private void extendFile() {
        final int adjust = -1200 + (m_fileSize / 10);
        extendFile(adjust);
    }

    private volatile long m_readsAtExtend = 0;

    private void extendFile(final int adjust) {
        if (m_extendingFile) {
            throw new IllegalStateException("File concurrently extended");
        }
        /*
         * Note: Synchronous flush of the WriteCacheService should not be
         * required. It has been commented out in support of
         *
         * <a href="https://sourceforge.net/apps/trac/bigdata/ticket/621">
         * Coalesce records in write cache</a>
         */
        // try {
        // /*
        // * The call to flush the cache cannot be made while holding the
        // * extension writeLock, since the writeOnChannel takes the
        // * extension readLock.
        // */
        // m_writeCache.flush(true);
        // } catch (InterruptedException e) {
        // throw new RuntimeException("Flush interrupted in extend file");
        // }
        final Lock lock = this.m_extensionLock.writeLock();
        lock.lock();
        try {
            m_extendingFile = true;
            // final long curSize = convertAddr(m_fileSize);
            m_fileSize += adjust;
            final long toAddr = convertAddr(m_fileSize);
            if (getMaxFileSize() < toAddr) {
                // whoops!! How to exit more gracefully?
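                // m_fileSize is negative and adjust is itself negative, so the
                // addition above grows the store by roughly 10% (plus a fixed
                // 1200 scaled units). If the new extent exceeds the configured
                // maximum file size there is no way to satisfy the allocation,
                // so fail fast before touching the file.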
throw new Error("System greater than maximum size"); } if (log.isInfoEnabled()) log.info("Extending file to: " + toAddr); m_reopener.reopenChannel(); m_reopener.raf.setLength(toAddr); storeCounters.get().ntruncate++; // must ensure writeCache is in sync for HA m_writeCacheService.setExtent(toAddr); if (log.isInfoEnabled()) log.info("Extend file done"); } catch (Throwable t) { throw new RuntimeException("Force Reopen", t); } finally { m_extendingFile = false; m_readsAtExtend = this.m_diskReads; lock.unlock(); } } static void setBit(final int[] bits, final int bitnum) { final int index = bitnum / 32; final int bit = bitnum % 32; bits[(int) index] |= 1 << bit; } static boolean tstBit(final int[] bits, final int bitnum) { final int index = bitnum / 32; final int bit = bitnum % 32; if (index >= bits.length) throw new IllegalArgumentException("Accessing bit index: " + index + " of array length: " + bits.length); return (bits[(int) index] & 1 << bit) != 0; } static void clrBit(final int[] bits, final int bitnum) { final int index = bitnum / 32; final int bit = bitnum % 32; int val = bits[index]; val &= ~(1 << bit); bits[index] = val; } static int fndBit(final int[] bits, final int size) { return fndBit(bits, 0, size); } static int fndBit(final int[] bits, final int offset, final int size) { final int eob = size + offset; for (int i = offset; i < eob; i++) { final int b = fndBit(bits[i]); if (b != -1) { return (i * 32) + b; } } return -1; } static int fndBit(final int bits) { if (bits != 0xFFFFFFFF) { for (int k = 0; k < 32; k++) { if ((bits & (1 << k)) == 0) { return k; } } } return -1; } public static class AllocationStats { public AllocationStats(final int i) { m_blockSize = i; } long m_blockSize; long m_reservedSlots; long m_filledSlots; } /** * Utility debug outputing the allocator array, showing index, start * address and alloc type/size * * Collected statistics are against each Allocation Block size: * total number of slots | store size * number of filled slots | store used * <dl> * <dt>AllocatorSize</dt><dd>The #of bytes in the allocated slots issued by this allocator.</dd> * <dt>AllocatorCount</dt><dd>The #of fixed allocators for that slot size.</dd> * <dt>SlotsInUse</dt><dd>The difference between the two previous columns (net slots in use for this slot size).</dd> * <dt>SlotsReserved</dt><dd>The #of slots in this slot size which have had storage reserved for them.</dd> * <dt>SlotsAllocated</dt><dd>Cumulative allocation of slots to date in this slot size (regardless of the transaction outcome).</dd> * <dt>SlotsRecycled</dt><dd>Cumulative recycled slots to date in this slot size (regardless of the transaction outcome).</dd> * <dt>SlotsChurn</dt><dd>How frequently slots of this size are re-allocated (SlotsInUse/SlotsAllocated).</dd> * <dt>%SlotsUnused</dt><dd>The percentage of slots of this size which are not in use (1-(SlotsInUse/SlotsReserved)).</dd> * <dt>BytesReserved</dt><dd>The space reserved on the backing file for those allocation slots</dd> * <dt>BytesAppData</dt><dd>The #of bytes in the allocated slots which are used by application data (including the record checksum).</dd> * <dt>%SlotWaste</dt><dd>How well the application data fits in the slots (BytesAppData/(SlotsInUse*AllocatorSize)).</dd> * <dt>%AppData</dt><dd>How much of your data is stored by each allocator (BytesAppData/Sum(BytesAppData)).</dd> * <dt>%StoreFile</dt><dd>How much of the backing file is reserved for each allocator (BytesReserved/Sum(BytesReserved)).</dd> * <dt>%StoreWaste</dt><dd>How much of the total waste 
     * on the store is waste for this allocator size ((BytesReserved-BytesAppData)/(Sum(BytesReserved)-Sum(BytesAppData))).</dd>
     * </dl>
     * @see StorageStats#showStats(StringBuilder)
     */
    public void showAllocators(final StringBuilder str) {
        m_storageStats.showStats(str);
        str.append("\nChecking regions.....");
        // Now check all allocators to confirm that each file region maps to only one allocator
        final Lock lock = m_allocationLock.readLock();
        lock.lock();
        try {
            final HashMap<Integer, FixedAllocator> map = new HashMap<Integer, FixedAllocator>();
            for (FixedAllocator fa : m_allocs) {
                fa.addToRegionMap(map);
            }
            str.append("okay\n");
        } catch (IllegalStateException is) {
            str.append(is.getMessage() + "\n");
        } finally {
            lock.unlock();
        }
    }

    /**
     * Given a physical address (byte offset on the store), return true if that
     * address could be managed by an allocated block.
     *
     * @param laddr
     *            the storage address to be tested.
     */
    public boolean verify(final long laddr) {
        final int addr = (int) laddr;
        if (addr == 0) {
            return false;
        }
        return getBlockByAddress(addr) != null;
    }

    /*****************************************************************************
     * Address transformation: latched2Physical
     */

    /**
     * Return the byte offset in the file.
     *
     * @param addr
     *            The latched address.
     *
     * @return The byte offset in the file.
     */
    final private long physicalAddress(final int addr, final boolean nocheck) {
        /*
         * Guard against concurrent mutation.
         *
         * Note: Taking the lock here is necessary since physicalAddress/1 is
         * public.
         */
        final Lock lock = m_allocationReadLock;
        lock.lock();
        try {
            if (addr >= 0) {
                return addr & 0xFFFFFFE0;
            } else {
                // Find the allocator.
                final FixedAllocator allocator = getBlock(addr);
                // Get the bit index into the allocator.
                final int offset = getOffset(addr);
                // Translate the bit index into a byte offset on the file.
                final long laddr = allocator.getPhysicalAddress(offset, nocheck);
                return laddr;
            }
        } finally {
            lock.unlock();
        }
    }

    /**
     * Return the byte offset in the file.
     *
     * @param addr
     *            A latched address.
     *
     * @return The byte offset.
     */
    final public long physicalAddress(final int addr) {
        return physicalAddress(addr, false/* nocheck */);
    }

    /********************************************************************************
     * Handle dual address format: if addr is positive then it is the physical
     * address, so the Allocators must be searched.
     **/
    FixedAllocator getBlockByAddress(final int addr) {
        if (addr < 0) {
            return getBlock(addr);
        }
        final Iterator<FixedAllocator> allocs = m_allocs.iterator();
        FixedAllocator alloc = null;
        while (allocs.hasNext()) {
            alloc = allocs.next();
            if (alloc.addressInRange(addr)) {
                break;
            }
            alloc = null;
        }
        return alloc;
    }

    /**
     * Get the {@link FixedAllocator} for a latched address.
     *
     * @param addr
     *            The latched address.
     *
     * @return The {@link FixedAllocator} for that latched address.
     */
    private FixedAllocator getBlock(final int addr) {
        // index of the FixedAllocator for that latched address.
        final int index = (-addr) >>> OFFSET_BITS;
        if (index >= m_allocs.size()) {
            throw new PhysicalAddressResolutionException(addr);
        }
        // Return the FixedAllocator for that index.
        return m_allocs.get(index);
    }

    /**
     * Return the bit index into a {@link FixedAllocator}.
     * <p>
     * Note: This is directly encoded by the latched address. You do not need to
     * know which {@link FixedAllocator} is being addressed in order to figure
     * this out.
     *
     * @param addr
     *            A latched address.
     *
     * @return The bit index into the {@link FixedAllocator}.
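     *         <p>
     *         A worked decode (illustrative; it assumes OFFSET_BITS is 13,
     *         with OFFSET_BITS_MASK == 8191): for the latched address
     *         addr = -((5 * 8192) + 42) = -41002, getBlock(addr) resolves
     *         allocator index 41002 >>> 13 == 5, and this method recovers the
     *         bit index 41002 & 8191 == 42.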
*/ private int getOffset(final int addr) { return (-addr) & OFFSET_BITS_MASK; // OFFSET_BITS } /** * The {@link RWStore} always generates negative address values. * * @return whether the address given is a native IStore address */ public boolean isNativeAddress(final long addr) { return addr <= 0; } public File getStoreFile() { return m_fd; } public boolean requiresCommit() { return m_recentAlloc; } /** * Since we need to store the absolute address and the size can be * a maximum of 64K, the absolute address is limited to 48 bits, setting * the maximum address as 140T, which is sufficient. * * @return long representation of metaBitsAddr PLUS the size */ public long getMetaBitsAddr() { long ret = 0; if (m_metaBitsAddr < 0) { ret = physicalAddress((int) m_metaBitsAddr); } else { // long ret = physicalAddress((int) m_metaBitsAddr); ret = convertAddr(-m_metaBitsAddr); // maximum 48 bit address range } ret <<= 16; // include space for version, allocSizes and deferred free info AND // cDefaultMetaBitsSize final int metaBitsSize = cMetaHdrFields + m_metaBits.length + m_allocSizes.length; ret += metaBitsSize; if (log.isTraceEnabled()) log.trace("Returning metabitsAddr: " + ret + ", for " + m_metaBitsAddr + " - " + m_metaBits.length + ", " + metaBitsSize); return ret; } /** * * @return the address of the metaBits */ public long getMetaBitsStoreAddress() { if (m_metaBitsAddr < 0) { return physicalAddress((int) m_metaBitsAddr); } else { return convertAddr(-m_metaBitsAddr); // maximum 48 bit address range } } /** * @return long representation of metaStartAddr PLUS the size where addr + * size is fileSize (not necessarily physical size) */ public long getMetaStartAddr() { return -m_fileSize; } /** * * @return the nextAllocation from the file Heap to be provided to an * Allocation Block */ public long getNextOffset() { long ret = -m_nextAllocation; if (m_metaBitsAddr > 0) { // FIX for sign use in m_metaBitsAddr when packing into long ret++; } ret <<= 32; ret += -m_metaBitsAddr; if (log.isTraceEnabled()) log.trace("Returning nextOffset: " + ret + ", for " + m_metaBitsAddr); return ret; } public void flushWrites(final boolean metadata) throws IOException { assertOpen(); try { m_writeCacheService.flush(metadata); // sync the disk. m_reopener.reopenChannel().force(metadata); final StoreCounters<?> c = (StoreCounters<?>) storeCounters.get() .acquire(); try { c.nforce++; } finally { c.release(); } } catch (InterruptedException e) { throw new ClosedByInterruptException(); } } /** The # of allocation requests made. */ public long getTotalAllocations() { return m_allocations; } /** * The # of free requests made */ public long getTotalFrees() { return m_frees; } /** * The # of bytes requested - as opposed to the size of the slots allocated. */ public long getTotalAllocationsSize() { return m_nativeAllocBytes; } /** * A Blob Allocator maintains a list of Blob headers. The allocator stores * up to 255 blob headers plus a checksum. When a request is made to read the * blob data, the blob allocator retrieves the blob header and reads the * data from that into the passed byte array. 
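     * <p>
     * A sketch of the blob header record implied by the disabled code below
     * (illustrative layout, not a normative specification):
     * <pre>
     * int nblocks              // #of fixed-size parts making up the blob
     * int partAddr[nblocks]    // latched address of each part
     * int checksum             // record checksum (4 bytes)
     * </pre>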
     */
    // public int registerBlob(final int addr) {
    // m_allocationLock.lock();
    // try {
    // BlobAllocator ba = null;
    // if (m_freeBlobs.size() > 0) {
    // ba = (BlobAllocator) m_freeBlobs.get(0);
    // }
    // if (ba == null) {
    // final Allocator lalloc = (Allocator) m_allocs.get(m_allocs.size() - 1);
    // // previous block start address
    // final int psa = lalloc.getRawStartAddr();
    // assert (psa - 1) > m_nextAllocation;
    // ba = new BlobAllocator(this, psa - 1);
    // ba.setFreeList(m_freeBlobs); // will add itself to the free list
    // ba.setIndex(m_allocs.size());
    // m_allocs.add(ba);
    // }
    //
    // if (!m_commitList.contains(ba)) {
    // m_commitList.add(ba);
    // }
    //
    // return ba.register(addr);
    // } finally {
    // m_allocationLock.unlock();
    // }
    // }

    void addToCommit(final FixedAllocator allocator) {
        if (allocator.m_prevCommit == null && m_commitHead != allocator) {
            // not on list
            allocator.m_prevCommit = m_commitTail;
            if (allocator.m_prevCommit != null) {
                allocator.m_prevCommit.m_nextCommit = allocator;
                m_commitTail = allocator;
            } else {
                m_commitHead = m_commitTail = allocator;
            }
        }
    }

    final boolean isOnCommitList(final FixedAllocator allocator) {
        return allocator.m_prevCommit != null || allocator == m_commitHead;
    }

    final void clearCommitList() {
        FixedAllocator cur = m_commitHead;
        while (cur != null) {
            final FixedAllocator t = cur;
            cur = t.m_nextCommit;
            t.m_prevCommit = t.m_nextCommit = null;
        }
        m_commitHead = m_commitTail = null;
    }

    final int commitListSize() {
        int count = 0;
        FixedAllocator cur = m_commitHead;
        while (cur != null) {
            count++;
            cur = cur.m_nextCommit;
        }
        return count;
    }

    // void removeFromCommit(final Allocator allocator) {
    // m_commitList.remove(allocator);
    // }

    public Allocator getAllocator(final int i) {
        return (Allocator) m_allocs.get(i);
    }

    /**
     * Simple implementation for a {@link RandomAccessFile} to handle the direct
     * backing store.
     */
    private class ReopenFileChannel implements IReopenChannel<FileChannel>, FileChannelUtility.IAsyncOpener {

        final private File file;

        private final boolean readOnly;

        private final String mode;

        private volatile RandomAccessFile raf;

        private final Path path;

        private volatile AsynchronousFileChannel asyncChannel;

        private int asyncChannelOpenCount = 0;

        public ReopenFileChannel(final File file, final RandomAccessFile raf, final boolean readOnly) throws IOException {
            this.file = file;
            this.readOnly = readOnly;
            this.mode = readOnly ? "r" : "rw";
            this.raf = raf;
            this.path = Paths.get(file.getAbsolutePath());
            reopenChannel();
        }

        @Override
        public AsynchronousFileChannel getAsyncChannel() {
            if (asyncChannel != null) {
                if (asyncChannel.isOpen())
                    return asyncChannel;
            }
            synchronized (this) {
                if (asyncChannel != null) { // check again while synchronized
                    if (asyncChannel.isOpen())
                        return asyncChannel;
                }
                try {
                    if (readOnly) {
                        asyncChannel = AsynchronousFileChannel.open(path, StandardOpenOption.READ);
                    } else {
                        asyncChannel = AsynchronousFileChannel.open(path, StandardOpenOption.READ, StandardOpenOption.WRITE);
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                asyncChannelOpenCount++;
                return asyncChannel;
            }
        }

        public int getAsyncChannelOpenCount() {
            return asyncChannelOpenCount;
        }

        @Override
        public String toString() {
            return file.toString();
        }

        @Override
        public FileChannel reopenChannel() throws IOException {
            /*
             * Note: This is basically a double-checked locking pattern. It is
             * used to avoid synchronizing when the backing channel is already
             * open.
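             *
             * A condensed sketch of the pattern, as also used by
             * getAsyncChannel() above (illustrative only):
             *
             *   FileChannel ch = tryOpenChannel(raf);   // volatile read, no lock
             *   if (ch != null) return ch;
             *   synchronized (this) {
             *       ch = tryOpenChannel(raf);           // re-check under the lock
             *       if (ch != null) return ch;
             *       raf = new RandomAccessFile(file, mode);
             *       return raf.getChannel();
             *   }
             *
             * where tryOpenChannel is a hypothetical helper returning the
             * channel iff the RandomAccessFile is non-null and its channel is
             * still open.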
             */
            {
                final RandomAccessFile tmp = raf;
                if (tmp != null) {
                    final FileChannel channel = tmp.getChannel();
                    if (channel.isOpen()) {
                        // The channel is still open.
                        return channel;
                    }
                }
            }
            synchronized (this) {
                if (raf != null) {
                    final FileChannel channel = raf.getChannel();
                    if (channel.isOpen()) {
                        /*
                         * The channel is still open. If you are allowing
                         * concurrent reads on the channel, then this could
                         * indicate that two readers each found the channel
                         * closed and that one was able to re-open the channel
                         * before the other such that the channel was open again
                         * by the time the 2nd reader got here.
                         */
                        return channel;
                    }
                }
                // open the file.
                this.raf = new RandomAccessFile(file, mode);
                // Update counters.
                final StoreCounters<?> c = (StoreCounters<?>) storeCounters.get().acquire();
                try {
                    c.nreopen++;
                } finally {
                    c.release();
                }
                return raf.getChannel();
            }
        }
    }

    /**
     * If the current file extent is different from the required extent then the
     * call is made to {@link #extendFile(int)}.
     *
     * @param extent
     *            The new file extent.
     */
    public void establishExtent(final long extent) {
        assertOpen();
        final long currentExtent = convertAddr(m_fileSize);
        if (extent > currentExtent) {
            extendFile(convertFromAddr(extent - currentExtent));
        } else if (extent < currentExtent) {
            // See https://github.com/SYSTAP/db-enterprise/issues/12
            // TODO: Determine if there is a more graceful way to handle this.
            // throw new IllegalArgumentException(
            log.warn("Cannot shrink RWStore extent: currentExtent=" + currentExtent
                    + ", fileSize=" + m_fileSize + ", newValue=" + extent);
        }
    }

    /**
     * @return number of FixedAllocators
     */
    public int getFixedAllocatorCount() {
        final Lock lock = m_allocationReadLock;
        lock.lock();
        try {
            int fixed = 0;
            final Iterator<FixedAllocator> allocs = m_allocs.iterator();
            while (allocs.hasNext()) {
                if (allocs.next() instanceof FixedAllocator) {
                    fixed++;
                }
            }
            return fixed;
        } finally {
            lock.unlock();
        }
    }

    /**
     * @return the number of heap allocations made to the FixedAllocators.
     */
    public int getAllocatedBlocks() {
        final Lock lock = m_allocationReadLock;
        lock.lock();
        try {
            int allocated = 0;
            final Iterator<FixedAllocator> allocs = m_allocs.iterator();
            while (allocs.hasNext()) {
                final Allocator alloc = allocs.next();
                if (alloc instanceof FixedAllocator) {
                    allocated += ((FixedAllocator) alloc).getAllocatedBlocks();
                }
            }
            return allocated;
        } finally {
            lock.unlock();
        }
    }

    /**
     * @return the amount of heap storage assigned to the FixedAllocators.
     */
    public long getFileStorage() {
        final Lock lock = m_allocationReadLock;
        lock.lock();
        try {
            long allocated = 0;
            final Iterator<FixedAllocator> allocs = m_allocs.iterator();
            while (allocs.hasNext()) {
                final FixedAllocator alloc = allocs.next();
                allocated += ((FixedAllocator) alloc).getFileStorage();
            }
            return allocated;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Computes the amount of utilised storage.
     *
     * @return the amount of storage allotted to slots in the allocation blocks
     */
    public long getAllocatedSlots() {
        final Lock lock = m_allocationReadLock;
        lock.lock();
        try {
            long allocated = 0;
            final Iterator<FixedAllocator> allocs = m_allocs.iterator();
            while (allocs.hasNext()) {
                final Allocator alloc = allocs.next();
                if (alloc instanceof FixedAllocator) {
                    allocated += ((FixedAllocator) alloc).getAllocatedSlots();
                }
            }
            return allocated;
        } finally {
            lock.unlock();
        }
    }

    /**
     * Adds the address for later freeing to the deferred free list.
     * <p>
     * If the allocation is for a BLOB then the size is also stored.
     * <p>
     * The deferred list is checked on AllocBlock and prior to commit.
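     * <p>
     * The resulting on-stream encoding (as written by the method body below)
     * is: a BLOB is recorded as the int pair <code>(-rwaddr, sze)</code>, a
     * normal allocation as the single int <code>rwaddr</code>, and a ZERO (0)
     * int terminates the block (written by {@link #saveDeferrals()} and read
     * back by the <code>freeDeferrals</code> scan).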
* <p> * DeferredFrees are written to the deferred PSOutputStream */ public void deferFree(final int rwaddr, final int sze) { m_allocationWriteLock.lock(); try { if (sze > (this.m_maxFixedAlloc-4)) { m_deferredFreeOut.writeInt(-rwaddr); m_deferredFreeOut.writeInt(sze); /* * rather than write out blob address, instead flatten the blob addresses and * write all to remove the latency on commit caused by reading potentially many blob headers. * * This idea was propposed to support BLZG-641/BLZG-1663 to redcue commit latency. * * However, it appears that deferFree is not called with the raw blob size and is already * reduced to the blob part addrs. */ log.debug("Unexpected code path deferring free of direct blob address"); // final int alloc = m_maxFixedAlloc-4; // final int nblocks = (alloc - 1 + (sze-4))/alloc; // if (nblocks < 0) // throw new IllegalStateException( // "Allocation error, m_maxFixedAlloc: " // + m_maxFixedAlloc); // // final byte[] hdrbuf = new byte[4 * (nblocks + 1) + 4]; // plus 4 bytes for checksum // if (hdrbuf.length > m_maxFixedAlloc) { // if (log.isInfoEnabled()) { // log.info("LARGE BLOB - header is BLOB"); // } // } // // getData(rwaddr, hdrbuf); // will work even if header is also a blob // // // deferFree header // deferFree(rwaddr, hdrbuf.length); // // // Now read all blob part addresses // final DataInputStream hdrstr = new DataInputStream(new ByteArrayInputStream(hdrbuf)); // final int rhdrs = hdrstr.readInt(); // if (rhdrs != nblocks) { // throw new IllegalStateException( // "Incompatible BLOB header record, expected: " // + nblocks + ", got: " + rhdrs); // } // // int remaining = sze; // int partSize = alloc; // for (int i = 0; i < nblocks; i++) { // final int blobpartAddr = hdrstr.readInt(); // // deferFree(blobpartAddr, partSize); // m_deferredFreeOut.writeInt(blobpartAddr); // // remaining -= partSize; // // if (remaining < partSize) { // partSize = remaining; // } // } } else { m_deferredFreeOut.writeInt(rwaddr); } } catch (IOException e) { throw new RuntimeException("Could not free: rwaddr=" + rwaddr + ", size=" + sze, e); } finally { m_allocationWriteLock.unlock(); } } // private void checkFreeable(final JournalTransactionService transactionService) { // if (transactionService == null) { // return; // } // // try { // final Long freeTime = transactionService.tryCallWithLock(new Callable<Long>() { // // public Long call() throws Exception { // final long now = transactionService.nextTimestamp(); // final long earliest = transactionService.getEarliestTxStartTime(); // final long aged = now - transactionService.getMinReleaseAge(); // // if (transactionService.getActiveCount() == 0) { // return aged; // } else { // return aged < earliest ? aged : earliest; // } // } // // }, 5L, TimeUnit.MILLISECONDS); // } catch (RuntimeException e) { // // fine, will try again later // } catch (Exception e) { // throw new RuntimeException(e); // } // } public long saveDeferrals() { m_allocationWriteLock.lock(); try { if (m_deferredFreeOut.getBytesWritten() == 0) { return 0; } m_deferredFreeOut.writeInt(0); // terminate! 
            final int outlen = m_deferredFreeOut.getBytesWritten();
            long addr = m_deferredFreeOut.save();
            addr <<= 32;
            addr += outlen;
            // Ensure added to blob allocation stats: BLZG-1646
            if (outlen > this.m_maxFixedAlloc && m_storageStats != null) {
                m_storageStats.allocateBlob(outlen);
            }
            m_deferredFreeOut.reset();
            return addr;
        } catch (IOException e) {
            throw new RuntimeException("Cannot write to deferred free", e);
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * Provided with the address of a block of addresses to be freed.
     *
     * @param blockAddr
     * @return the total number of addresses freed
     */
    private int freeDeferrals(final long blockAddr, final long lastReleaseTime) {
        final int addr = (int) (blockAddr >> 32);
        final int sze = (int) (blockAddr & 0xFFFFFFFF);
        // Resolution for BLZG-1236 (recycler error)
        if (log.isTraceEnabled())
            log.trace("freeDeferrals at " + physicalAddress(addr) + ", size: " + sze + " releaseTime: " + lastReleaseTime);
        final byte[] buf = new byte[sze + 4]; // allow for checksum
        getData(addr, buf);
        final DataInputStream strBuf = new DataInputStream(new ByteArrayInputStream(buf));
        m_allocationWriteLock.lock();
        int totalFreed = 0;
        try {
            int nxtAddr = strBuf.readInt();
            int cnt = 0;
            while (nxtAddr != 0) { // while (false && addrs-- > 0) {
                if (nxtAddr > 0) { // Blob
                    final int bloblen = strBuf.readInt();
                    assert bloblen > 0; // a Blob address MUST have a size
                    immediateFree(-nxtAddr, bloblen);
                } else {
                    // The lack of size messes with the stats
                    immediateFree(nxtAddr, 1); // size ignored for FixedAllocators
                }
                totalFreed++;
                nxtAddr = strBuf.readInt();
            }
            // now free delete block
            immediateFree(addr, sze);
            m_lastDeferredReleaseTime = lastReleaseTime;
            if (log.isTraceEnabled())
                log.trace("Updated m_lastDeferredReleaseTime=" + m_lastDeferredReleaseTime);
        } catch (IOException e) {
            throw new RuntimeException("Problem freeing deferrals", e);
        } finally {
            m_allocationWriteLock.unlock();
        }
        return totalFreed;
    }

    /**
     * Provided with an iterator of CommitRecords, process each and free any
     * deferred deletes associated with each.
     *
     * @param journal
     * @param fromTime
     *            The inclusive lower bound.
     * @param toTime
     *            The exclusive upper bound.
     */
    private int freeDeferrals(final AbstractJournal journal, final long fromTime, final long toTime) {
        final ITupleIterator<CommitRecordIndex.Entry> commitRecords;
        /*
         * Commit can be called prior to Journal initialisation, in which
         * case the commitRecordIndex will not be set.
         */
        final IIndex commitRecordIndex = journal.getReadOnlyCommitRecordIndex();
        if (commitRecordIndex == null) { // TODO Why is this here?
            return 0;
        }
        final IndexMetadata metadata = commitRecordIndex.getIndexMetadata();
        final byte[] fromKey = metadata.getTupleSerializer().serializeKey(fromTime);
        final byte[] toKey = metadata.getTupleSerializer().serializeKey(toTime);
        commitRecords = commitRecordIndex.rangeIterator(fromKey, toKey);
        int totalFreed = 0;
        int commitPointsRecycled = 0;
        while (commitRecords.hasNext()) {
            final ITuple<CommitRecordIndex.Entry> tuple = commitRecords.next();
            final CommitRecordIndex.Entry entry = tuple.getObject();
            try {
                final ICommitRecord record = CommitRecordSerializer.INSTANCE.deserialize(journal.read(entry.addr));
                final long blockAddr = record.getRootAddr(AbstractJournal.DELETEBLOCK);
                if (blockAddr != 0) {
                    totalFreed += freeDeferrals(blockAddr, record.getTimestamp());
                }
                // Note: This is releasing the ICommitRecord itself. I've moved the responsibility
                // for that into AbstractJournal#removeCommitRecordEntries() (invoked below).
                // // immediateFree((int) (entry.addr >> 32), (int) entry.addr);
                commitPointsRecycled++;
            } catch (RuntimeException re) {
                throw new RuntimeException("Problem with entry at " + entry.addr, re);
            }
        }

        /*
         * @see https://sourceforge.net/apps/trac/bigdata/ticket/440
         */
        // Now remove the commit record entries from the commit record index.
        final int commitPointsRemoved = journal.removeCommitRecordEntries(fromKey, toKey);
        if (txLog.isInfoEnabled())
            txLog.info("RECYCLED: fromTime=" + fromTime + ", toTime=" + toTime
                    + ", totalFreed=" + totalFreed
                    + ", commitPointsRecycled=" + commitPointsRecycled
                    + ", commitPointsRemoved=" + commitPointsRemoved);
        if (commitPointsRecycled != commitPointsRemoved)
            throw new AssertionError("commitPointsRecycled=" + commitPointsRecycled
                    + " != commitPointsRemoved=" + commitPointsRemoved);
        return totalFreed;
    }

    /**
     * {@inheritDoc}
     * <p>
     * The {@link ContextAllocation} object manages a freeList of associated
     * allocators and an overall list of allocators. When the context is
     * detached, all allocators must be released and any that have available
     * capacity will be assigned to the global free lists. See
     * {@link AllocBlock #releaseSession}
     *
     * @param context
     *            The context to be released from all {@link FixedAllocator}s.
     */
    public void detachContext(final IAllocationContext context) {
        assertOpen();
        m_allocationWriteLock.lock();
        try {
            context.release();
            if (context.isIsolated()) {
                final ContextAllocation alloc = m_contexts.remove(context);
                if (alloc != null) {
                    alloc.release();
                } else {
                    throw new IllegalStateException("Multiple calls to detachContext");
                }
                if (m_contexts.isEmpty() && this.m_activeTxCount == 0) {
                    releaseSessions();
                }
            }
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * The ContextAllocation object manages a freeList of associated allocators
     * and an overall list of allocators. When the context is aborted then
     * allocations made by that context should be released.
     * See {@link AllocBlock #abortShadow}
     *
     * @param context
     *            The context to be released from all FixedAllocators.
     */
    public void abortContext(final IAllocationContext context) {
        assertOpen();
        m_allocationWriteLock.lock();
        try {
            context.release();
            if (context.isIsolated()) {
                final ContextAllocation alloc = m_contexts.remove(context);
                if (alloc != null) {
                    alloc.abort();
                }
            }
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * The ContextAllocation class manages a set of Allocators.
     *
     * A ContextAllocation can have a parent ContextAllocation such that when
     * it is released it will transfer its Allocators to its parent.
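     * <p>
     * Lifecycle sketch (illustrative): detachContext(context) invokes
     * release(), which returns the shadowed FixedAllocators to the parent or
     * global free lists and then replays the deferred frees collected while
     * the context was live; abortContext(context) invokes abort() instead,
     * which discards those deferred frees. Each deferred free is a single
     * long encoding the latched address in the high 32 bits and the size in
     * the low 32 bits, matching the decode in release():
     * <pre>
     * final long encoded = (((long) addr) << 32) | (sze & 0xFFFFFFFFL);
     * final int addr2 = (int) (encoded >> 32); // recovers addr
     * final int sze2 = (int) encoded;          // recovers sze
     * </pre>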
     *
     * @author Martyn Cutcher
     */
    static class ContextAllocation {

        private final RWStore m_store;

        private final ArrayList<FixedAllocator> m_freeFixed[];

        private final ArrayList<FixedAllocator> m_allFixed;

        private final ArrayList<Long> m_deferredFrees;

        // lists of free blob allocators
        // private final ArrayList<BlobAllocator> m_freeBlobs;

        private final ContextAllocation m_parent;

        private final IAllocationContext m_context;

        @SuppressWarnings("unchecked")
        ContextAllocation(final RWStore store, final int fixedBlocks,
                final ContextAllocation parent, final IAllocationContext acontext) {
            m_store = store;
            m_parent = parent;
            m_context = acontext;
            m_freeFixed = new ArrayList[fixedBlocks];
            for (int i = 0; i < m_freeFixed.length; i++) {
                m_freeFixed[i] = new ArrayList<FixedAllocator>();
            }
            m_allFixed = new ArrayList<FixedAllocator>();
            m_deferredFrees = new ArrayList<Long>();
            // m_freeBlobs = new ArrayList<BlobAllocator>();
        }

        /**
         * For frees made against a shadowed FixedAllocator that is NOT owned
         * by the context, the physical free must be deferred until the
         * context is deshadowed or aborted.
         *
         * @param encodeAddr
         *            the latched address and size packed into a single long.
         */
        public void deferFree(final long encodeAddr) {
            m_deferredFrees.add(encodeAddr);
        }

        /**
         * Must return the shadowed allocators to the parent/global
         * environment, resetting the freeList association.
         */
        void release() {
            final ArrayList<FixedAllocator> freeFixed[] = m_parent != null ? m_parent.m_freeFixed : m_store.m_freeFixed;
            final IAllocationContext pcontext = m_parent == null ? null : m_parent.m_context;
            for (FixedAllocator f : m_allFixed) {
                f.setAllocationContext(pcontext);
                // will add to free list if required
                f.setFreeList(freeFixed[m_store.fixedAllocatorIndex(f.m_size)]);
            }
            // for (int i = 0; i < m_freeFixed.length; i++) {
            // freeFixed[i].addAll(m_freeFixed[i]);
            // }
            // freeBlobs.addAll(m_freeBlobs);
            // now free all deferred frees made within this context for other
            // allocators
            if (log.isDebugEnabled())
                log.debug("Releasing " + m_deferredFrees.size() + " deferred frees");
            final boolean defer = m_store.m_minReleaseAge > 0 || m_store.m_activeTxCount > 0 || m_store.m_contexts.size() > 0;
            for (Long l : m_deferredFrees) {
                final int addr = (int) (l >> 32);
                final int sze = l.intValue();
                if (defer) {
                    m_store.deferFree(addr, sze);
                } else {
                    m_store.immediateFree(addr, sze);
                }
            }
            m_deferredFrees.clear();
        }

        void abort() {
            final ArrayList<FixedAllocator> freeFixed[] = m_parent != null ? m_parent.m_freeFixed : m_store.m_freeFixed;
            final IAllocationContext pcontext = m_parent == null ? null : m_parent.m_context;
            for (FixedAllocator f : m_allFixed) {
                f.abortAllocationContext(pcontext, m_store.m_writeCacheService);
                f.setFreeList(freeFixed[m_store.fixedAllocatorIndex(f.m_size)]);
            }
            if (log.isDebugEnabled())
                log.debug("Aborting " + m_deferredFrees.size() + " deferred frees");
            m_deferredFrees.clear();
        }

        FixedAllocator getFreeFixed(final int i) {
            final ArrayList<FixedAllocator> free = m_freeFixed[i];
            if (free.size() == 0) {
                final FixedAllocator falloc = establishFixedAllocator(i);
                if (falloc.m_pendingContextCommit) {
                    throw new IllegalStateException("Allocator on free list while pendingContextCommit");
                }
                falloc.setAllocationContext(m_context);
                // The normal check for adding to the free list is whether the
                // allocator should return to the free list, but in this case
                // we are moving it to another free list, so we do not need to
                // check the smallAllocation threshold.
falloc.setFreeList(free, true/*force*/); if (free.size() == 0 ) { throw new IllegalStateException("Free list should not be empty, pendingContextCommit: " + falloc.m_pendingContextCommit); } m_allFixed.add(falloc); } return free.get(0); // take first in list } /** * * @param i - the block-index for the allocator required * @return */ FixedAllocator establishFixedAllocator(final int i) { if (m_parent == null) { return m_store.establishFreeFixedAllocator(i); } else { return m_parent.establishFixedAllocator(i); } } } /** * A map of the {@link IAllocationContext}s. * <p> * Note: This map must be thread-safe since it is referenced from various * methods outside of the governing {@link #m_allocationLock}. */ private final Map<IAllocationContext, ContextAllocation> m_contexts = new ConcurrentHashMap<IAllocationContext, ContextAllocation>(); private ContextAllocation getContextAllocation( final IAllocationContext context) { /* * The allocation lock MUST be held to make changes in the membership of * m_contexts atomic with respect to free(). */ assert m_allocationWriteLock.isHeldByCurrentThread(); ContextAllocation ret = m_contexts.get(context); if (ret == null) { // This is no longer a valid state throw new IllegalStateException("No associated ContextAllocation"); // ret = new ContextAllocation(this, m_freeFixed.length, null, context); // // if (m_contexts.put(context, ret) != null) { // // throw new AssertionError(); // // } // // if (log.isTraceEnabled()) // log.trace("Establish ContextAllocation: " + ret // + ", total: " + m_contexts.size() // + ", requests: " + ++m_contextRequests // + ", removals: " + m_contextRemovals // + ", allocators: " + m_allocs.size() ); // // // if (log.isInfoEnabled()) // log.info("Context: ncontexts=" + m_contexts.size() // + ", context=" + context); } return ret; } public int getSlotSize(final int data_len) { int i = 0; int ret = m_minFixedAlloc; while (data_len > ret) { i++; // If we write directly to the writeCache then the data_len // may be larger than largest slot if (i == m_allocSizes.length) return data_len; ret = 64 * m_allocSizes[i]; } return ret; } /** * The maximum allocation size (bytes). */ public int getMaxAllocSize() { return m_maxFixedAlloc; } /** * This can be called as part of the HA downstream replication. * * FIXME: If part of downstream replication then the current metabits * held by the RWStore will not be in sync with that stored on disk. * * This will only be a problem if the RWStore needs to take over as * leader and be able to allocate and write to the store. * * Note that the metabits are not needed in order to determine the * physical address mapping of an rw-native address. * * @param rootBlock * @param forceOnCommit */ public void writeRootBlock(final IRootBlockView rootBlock, final ForceEnum forceOnCommit) { if (rootBlock == null) throw new IllegalArgumentException(); checkRootBlock(rootBlock); assertOpen(); if (log.isTraceEnabled()) { log.trace("Writing new rootblock with commitCounter: " + rootBlock.getCommitCounter() + ", commitRecordAddr: " + rootBlock.getCommitRecordAddr() + ", commitRecordIndexAddr: " + rootBlock.getCommitRecordIndexAddr()); } try { final ByteBuffer data = rootBlock.asReadOnlyBuffer(); final long pos = rootBlock.isRootBlock0() ? FileMetadata.OFFSET_ROOT_BLOCK0 : FileMetadata.OFFSET_ROOT_BLOCK1; /* * Note: This uses the [opener] to automatically retry the operation * in case concurrent readers are interrupting, causing an * asynchronous close of the backing channel. */ // Note: extensionLock required for file IO. 
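            // The store maintains two root blocks which alternate:
            // isRootBlock0() selects which fixed slot (OFFSET_ROOT_BLOCK0 or
            // OFFSET_ROOT_BLOCK1, computed into [pos] above) receives this
            // update, leaving the other root block untouched.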
final Lock lock = m_extensionLock.readLock(); lock.lock(); try { // Update the root block. FileChannelUtility.writeAll(m_reopener, data, pos); /* * Generally, you want to force the file data to the disk here. * The file metadata MIGHT not matter since we always force it * to the disk when we change the file size (unless the file * system updates other aspects of file metadata during normal * writes). */ // sync the disk. m_reopener.reopenChannel().force(forceOnCommit == ForceEnum.ForceMetadata); // Update counters. final StoreCounters<?> c = (StoreCounters<?>) storeCounters.get() .acquire(); try { c.nwriteRootBlock++; } finally { c.release(); } // ensure cached commitNextAllocation if (m_committedNextAllocation != m_nextAllocation ) { if (log.isTraceEnabled()) log.trace("Updating committedNextAllocation from writeRootBlock"); m_committedNextAllocation = m_nextAllocation; } } finally { lock.unlock(); } } catch (IOException ex) { throw new RuntimeException(ex); } if (log.isDebugEnabled()) log.debug("wrote root block: "+rootBlock); } public ByteBuffer readRootBlock(final boolean rootBlock0) { assertOpen(); // assertNoRebuild(); final ByteBuffer tmp = ByteBuffer .allocate(RootBlockView.SIZEOF_ROOT_BLOCK); // Guard IO against concurrent file extension. final Lock lock = m_extensionLock.readLock(); lock.lock(); try { FileChannelUtility.readAll(m_reopener, tmp, rootBlock0 ? FileMetadata.OFFSET_ROOT_BLOCK0 : FileMetadata.OFFSET_ROOT_BLOCK1); tmp.position(0); // resets the position. } catch (IOException ex) { throw new RuntimeException(ex); } finally { lock.unlock(); } return tmp; } /** * Striped performance counters for {@link IRawStore} access, including * operations that read or write through to the underlying media. * <p> * Note: The performance counters for writes to the disk are reported by the * {@link WriteCacheService}. The {@link RWStore} never writes directly onto * the disk (other than the root blocks). * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan * Thompson</a> * @param <T> * * @todo report elapsed time and average latency for force, reopen, and * writeRootBlock. * * FIXME CAT may be much faster than striped locks (2-3x faster). */ static public class StoreCounters<T extends StoreCounters<T>> extends StripedCounters<T> { /** * #of read requests. */ public volatile long nreads; /** * #of read requests that read through to the backing file. */ public volatile long ndiskRead; /** * #of bytes read. */ public volatile long bytesRead; /** * #of bytes that have been read from the disk. */ public volatile long bytesReadFromDisk; /** * Total elapsed time for reads. */ public volatile long elapsedReadNanos; /** * Total elapsed time for reading on the disk. */ public volatile long elapsedDiskReadNanos; /** * The #of checksum errors while reading on the local disk. */ public volatile long checksumErrorCount; /** * #of write requests. */ public volatile long nwrites; // This is reported by the WriteCacheService. // /** // * #of write requests that write through to the backing file. // */ // public volatile long ndiskWrite; /** * The size of the largest record read. */ public volatile long maxReadSize; /** * The size of the largest record written. */ public volatile long maxWriteSize; /** * #of bytes written. */ public volatile long bytesWritten; // This is reported by the WriteCacheService. // /** // * #of bytes that have been written on the disk. // */ // public volatile long bytesWrittenOnDisk; /** * Total elapsed time for writes. 
         */
        public volatile long elapsedWriteNanos;

        // This is reported by the WriteCacheService.
        // /**
        // * Total elapsed time for writing on the disk.
        // */
        // public volatile long elapsedDiskWriteNanos;

        /**
         * #of times the data were forced to the disk.
         */
        public volatile long nforce;

        /**
         * #of times the length of the file was changed (typically, extended).
         */
        public volatile long ntruncate;

        /**
         * #of times the file has been reopened after it was closed by an
         * interrupt.
         */
        public volatile long nreopen;

        /**
         * #of times one of the root blocks has been written.
         */
        public volatile long nwriteRootBlock;

        /**
         * buffer counters
         */
        public volatile long bufferDataBytes;
        public volatile long bufferDataWrites;
        public volatile long bufferFileWrites;

        /**
         * {@inheritDoc}
         */
        public StoreCounters() {
            super();
        }

        /**
         * {@inheritDoc}
         */
        public StoreCounters(final int batchSize) {
            super(batchSize);
        }

        /**
         * {@inheritDoc}
         */
        public StoreCounters(final int nstripes, final int batchSize) {
            super(nstripes, batchSize);
        }

        @Override
        public void add(final T o) {
            super.add(o);
            nreads += o.nreads;
            ndiskRead += o.ndiskRead;
            bytesRead += o.bytesRead;
            bytesReadFromDisk += o.bytesReadFromDisk;
            maxReadSize = Math.max(maxReadSize, o.maxReadSize);
            elapsedReadNanos += o.elapsedReadNanos;
            elapsedDiskReadNanos += o.elapsedDiskReadNanos;
            checksumErrorCount += o.checksumErrorCount;
            nwrites += o.nwrites;
            // ndiskWrite += o.ndiskWrite;
            maxWriteSize = Math.max(maxWriteSize, o.maxWriteSize);
            bytesWritten += o.bytesWritten;
            // bytesWrittenOnDisk += o.bytesWrittenOnDisk;
            elapsedWriteNanos += o.elapsedWriteNanos;
            // elapsedDiskWriteNanos += o.elapsedDiskWriteNanos;
            nforce += o.nforce;
            ntruncate += o.ntruncate;
            nreopen += o.nreopen;
            nwriteRootBlock += o.nwriteRootBlock;
        }

        @Override
        public T subtract(final T o) {
            // make a copy of the current counters.
            final T t = super.subtract(o);
            // subtract out the given counters.
            t.nreads -= o.nreads;
            t.ndiskRead -= o.ndiskRead;
            t.bytesRead -= o.bytesRead;
            t.bytesReadFromDisk -= o.bytesReadFromDisk;
            t.maxReadSize -= o.maxReadSize; // @todo report max? min?
            t.elapsedReadNanos -= o.elapsedReadNanos;
            t.elapsedDiskReadNanos -= o.elapsedDiskReadNanos;
            t.checksumErrorCount -= o.checksumErrorCount;
            t.nwrites -= o.nwrites;
            // t.ndiskWrite -= o.ndiskWrite;
            t.maxWriteSize -= o.maxWriteSize; // @todo report max? min?
            t.bytesWritten -= o.bytesWritten;
            // t.bytesWrittenOnDisk -= o.bytesWrittenOnDisk;
            t.elapsedWriteNanos -= o.elapsedWriteNanos;
            // t.elapsedDiskWriteNanos -= o.elapsedDiskWriteNanos;
            t.nforce -= o.nforce;
            t.ntruncate -= o.ntruncate;
            t.nreopen -= o.nreopen;
            t.nwriteRootBlock -= o.nwriteRootBlock;
            return t;
        }

        @Override
        public void clear() {
            // reset all counters to zero.
nreads = 0; ndiskRead = 0; bytesRead = 0; bytesReadFromDisk = 0; maxReadSize = 0; elapsedReadNanos = 0; elapsedDiskReadNanos = 0; checksumErrorCount = 0; nwrites = 0; // ndiskWrite = 0; maxWriteSize = 0; bytesWritten = 0; // bytesWrittenOnDisk = 0; elapsedWriteNanos = 0; // elapsedDiskWriteNanos = 0; nforce = 0; ntruncate = 0; nreopen = 0; nwriteRootBlock = 0; } @Override public CounterSet getCounters() { final CounterSet root = super.getCounters(); // IRawStore API { /* * reads */ root.addCounter("nreads", new Instrument<Long>() { public void sample() { setValue(nreads); } }); root.addCounter("bytesRead", new Instrument<Long>() { public void sample() { setValue(bytesRead); } }); root.addCounter("readSecs", new Instrument<Double>() { public void sample() { final double elapsedReadSecs = (elapsedReadNanos / 1000000000.); setValue(elapsedReadSecs); } }); root.addCounter("bytesReadPerSec", new Instrument<Double>() { public void sample() { final double readSecs = (elapsedReadNanos / 1000000000.); final double bytesReadPerSec = (readSecs == 0L ? 0d : (bytesRead / readSecs)); setValue(bytesReadPerSec); } }); root.addCounter("maxReadSize", new Instrument<Long>() { public void sample() { setValue(maxReadSize); } }); root.addCounter("checksumErrorCount", new Instrument<Long>() { public void sample() { setValue(checksumErrorCount); } }); /* * writes */ root.addCounter("nwrites", new Instrument<Long>() { public void sample() { setValue(nwrites); } }); root.addCounter("bytesWritten", new Instrument<Long>() { public void sample() { setValue(bytesWritten); } }); root.addCounter("writeSecs", new Instrument<Double>() { public void sample() { final double writeSecs = (elapsedWriteNanos / 1000000000.); setValue(writeSecs); } }); root.addCounter("bytesWrittenPerSec", new Instrument<Double>() { public void sample() { final double writeSecs = (elapsedWriteNanos / 1000000000.); final double bytesWrittenPerSec = (writeSecs == 0L ? 0d : (bytesWritten / writeSecs)); setValue(bytesWrittenPerSec); } }); root.addCounter("maxWriteSize", new Instrument<Long>() { public void sample() { setValue(maxWriteSize); } }); } // IRawStore // BufferedWriter final CounterSet bc = root.makePath("buffer"); bc.addCounter("ndataWrites", new Instrument<Long>() { public void sample() { setValue(bufferDataWrites); } }); bc.addCounter("nfileWrites", new Instrument<Long>() { public void sample() { setValue(bufferFileWrites); } }); // disk statistics { final CounterSet disk = root.makePath("disk"); /* * read */ disk.addCounter("nreads", new Instrument<Long>() { public void sample() { setValue(ndiskRead); } }); disk.addCounter("bytesRead", new Instrument<Long>() { public void sample() { setValue(bytesReadFromDisk); } }); disk.addCounter("bytesPerRead", new Instrument<Double>() { public void sample() { final double bytesPerDiskRead = (ndiskRead == 0 ? 0d : (bytesReadFromDisk / (double) ndiskRead)); setValue(bytesPerDiskRead); } }); disk.addCounter("readSecs", new Instrument<Double>() { public void sample() { final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.); setValue(diskReadSecs); } }); disk.addCounter("bytesReadPerSec", new Instrument<Double>() { public void sample() { final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.); final double bytesReadPerSec = (diskReadSecs == 0L ? 
0d : bytesReadFromDisk / diskReadSecs); setValue(bytesReadPerSec); } }); disk.addCounter("secsPerRead", new Instrument<Double>() { public void sample() { final double diskReadSecs = (elapsedDiskReadNanos / 1000000000.); final double readLatency = (diskReadSecs == 0 ? 0d : diskReadSecs / ndiskRead); setValue(readLatency); } }); /* * write */ // disk.addCounter("nwrites", new Instrument<Long>() { // public void sample() { // setValue(ndiskWrite); // } // }); // // disk.addCounter("bytesWritten", new Instrument<Long>() { // public void sample() { // setValue(bytesWrittenOnDisk); // } // }); // // disk.addCounter("bytesPerWrite", new Instrument<Double>() { // public void sample() { // final double bytesPerDiskWrite = (ndiskWrite == 0 ? 0d // : (bytesWrittenOnDisk / (double) ndiskWrite)); // setValue(bytesPerDiskWrite); // } // }); // // disk.addCounter("writeSecs", new Instrument<Double>() { // public void sample() { // final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.); // setValue(diskWriteSecs); // } // }); // // disk.addCounter("bytesWrittenPerSec", new Instrument<Double>() { // public void sample() { // final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.); // final double bytesWrittenPerSec = (diskWriteSecs == 0L ? 0d // : bytesWrittenOnDisk / diskWriteSecs); // setValue(bytesWrittenPerSec); // } // }); // // disk.addCounter("secsPerWrite", new Instrument<Double>() { // public void sample() { // final double diskWriteSecs = (elapsedDiskWriteNanos / 1000000000.); // final double writeLatency = (diskWriteSecs == 0 ? 0d // : diskWriteSecs / ndiskWrite); // setValue(writeLatency); // } // }); /* * other */ disk.addCounter("nforce", new Instrument<Long>() { public void sample() { setValue(nforce); } }); disk.addCounter("nextend", new Instrument<Long>() { public void sample() { setValue(ntruncate); } }); disk.addCounter("nreopen", new Instrument<Long>() { public void sample() { setValue(nreopen); } }); disk.addCounter("rootBlockWrites", new Instrument<Long>() { public void sample() { setValue(nwriteRootBlock); } }); } // disk return root; } // getCounters() } // class StoreCounters /** * Striped performance counters for this class. */ @SuppressWarnings("unchecked") private final AtomicReference<StoreCounters> storeCounters = new AtomicReference<StoreCounters>(); /** * Returns the striped performance counters for the store. */ public StoreCounters<?> getStoreCounters() { return storeCounters.get(); } /** * Replaces the {@link StoreCounters} object. * * @param storeCounters * The new {@link Counter}s. * * @throws IllegalArgumentException * if the argument is <code>null</code>. */ public void setStoreCounters(final StoreCounters<?> storeCounters) { if (storeCounters == null) throw new IllegalArgumentException(); this.storeCounters.set(storeCounters); } /** * Return interesting information about the write cache and file operations. * * @todo allocations data? user extent allocated? user extent used? etc. */ public CounterSet getCounters() { final CounterSet root = new CounterSet(); // root.addCounter("nextOffset", new Instrument<Long>() { // public void sample() { // setValue(nextOffset.get()); // } // }); root.addCounter("extent", new Instrument<Long>() { public void sample() { setValue(getStoreFile().length()); } }); // attach the most recently updated values from the striped counters. 
    /**
     * Return interesting information about the write cache and file
     * operations.
     * 
     * @todo allocations data? user extent allocated? user extent used? etc.
     */
    public CounterSet getCounters() {

        final CounterSet root = new CounterSet();

//        root.addCounter("nextOffset", new Instrument<Long>() {
//            public void sample() {
//                setValue(nextOffset.get());
//            }
//        });

        root.addCounter("extent", new Instrument<Long>() {
            public void sample() {
                setValue(getStoreFile().length());
            }
        });

        // attach the most recently updated values from the striped counters.
        root.attach(storeCounters.get().getCounters());

        if (m_writeCacheService != null) {
            final CounterSet tmp = root.makePath("writeCache");
            tmp.attach(m_writeCacheService.getCounters());
        }

        return root;

    }
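    /*
     * Illustrative sketch (hypothetical helper, not part of the original
     * API): the buffer contract assumed by writeRawBuffer() below. A
     * replicated buffer arrives with position=0 and limit=#ofbyteswritten,
     * but WriteCache.flush() expects position=limit, so the position is
     * advanced to the limit before the flush.
     */
    private static ByteBuffer positionAtLimitForFlush(final ByteBuffer b) {
        // e.g. pos=0, limit=8192 on entry; pos=8192, limit=8192 on exit.
        b.position(b.limit());
        return b;
    }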
    public void writeRawBuffer(final IHAWriteMessage msg, final IBufferAccess b)
            throws IOException, InterruptedException {

        // expand buffer before writing on the store.
        final ByteBuffer xb = msg.expand(b.buffer());

        if (log.isTraceEnabled()) {
            log.trace("expanded buffer, position: " + xb.position()
                    + ", limit: " + xb.limit());
        }

        final IBufferAccess ba = new IBufferAccess() {

            @Override
            public ByteBuffer buffer() {
                return xb;
            }

            @Override
            public void release() throws InterruptedException {
            }

            @Override
            public void release(long timeout, TimeUnit unit)
                    throws InterruptedException {
            }
        };

        /*
         * Wrap up the data from the message as a WriteCache object. This will
         * build up a RecordMap containing the allocations to be made, and
         * including a ZERO (0) data length if any offset winds up being
         * deleted (released).
         * 
         * Note: We do not need to pass in the compressorKey here. It is
         * ignored by WriteCache.flush(). We have expanded the payload above.
         * Now we are just flushing the write cache onto the disk.
         */
        final WriteCache writeCache = m_writeCacheService.newWriteCache(ba,
                true/* useChecksums */, true/* bufferHasData */, m_reopener,
                msg.getFileExtent());

        // Ensure that replicated buffers are not compacted.
        writeCache.closeForWrites();

        /*
         * Setup buffer for writing. We receive the buffer with pos=0,
         * limit=#ofbyteswritten. However, flush() expects pos=limit, will
         * clear pos to zero and then write bytes up to the limit. So, we set
         * the position to the limit before calling flush.
         */
        final ByteBuffer bb = ba.buffer();
        final int limit = bb.limit();
        bb.position(limit);

        /*
         * Flush the scattered writes in the write cache to the backing store.
         * 
         * Note: WriteCacheImpl.writeOnChannel() will take the extensionLock
         * for the IO against the channel.
         */
//        final Lock lock = m_allocationReadLock; // TODO Is the allocation lock required here? I doubt it.
//        lock.lock();
//        try {
        // Flush writes.
        writeCache.flush(false/* force */);
//        } finally {
//            lock.unlock();
//        }

        // install reads into readCache (if any)
        m_writeCacheService.installReads(writeCache);

    }

    public Future<Void> sendHALogBuffer(final IHALogRequest req,
            final IHAWriteMessage msg, final IBufferAccess buf)
            throws IOException, InterruptedException {

        final ByteBuffer b = buf.buffer();

        assert b.remaining() > 0 : "Empty buffer: " + b;

        @SuppressWarnings("unchecked")
        final QuorumPipeline<HAPipelineGlue> quorumMember = (QuorumPipeline<HAPipelineGlue>) m_quorum
                .getMember();

        final Future<Void> remoteWriteFuture = quorumMember.replicate(req,
                msg, b);

        return remoteWriteFuture;

    }

    /**
     * @see IHABufferStrategy#sendRawBuffer(IHARebuildRequest, long, long,
     *      long, long, int, ByteBuffer)
     */
    public Future<Void> sendRawBuffer(final IHARebuildRequest req, //
            final long commitCounter, final long commitTime,
            final long sequence, final long quorumToken, final long fileExtent,
            final long offset, final int nbytes, final ByteBuffer b)
            throws IOException, InterruptedException {

        // read direct from store
        final ByteBuffer clientBuffer = b;
        clientBuffer.position(0);
        clientBuffer.limit(nbytes);

        readRaw(/* nbytes, */offset, clientBuffer);

        assert clientBuffer.remaining() > 0 : "Empty buffer: " + clientBuffer;

        @SuppressWarnings("unchecked")
        final QuorumPipeline<HAPipelineGlue> quorumMember = (QuorumPipeline<HAPipelineGlue>) m_quorum
                .getMember();

        final int chk = ChecksumUtility.threadChk.get().checksum(b);

        final IHAWriteMessage msg = new HAWriteMessage(m_storeUUID,
                -1L/* commitCounter */, -1L/* commitTime */, sequence, nbytes,
                chk, StoreTypeEnum.RW, quorumToken, fileExtent,
                offset/* firstOffset */);

        final Future<Void> remoteWriteFuture = quorumMember.replicate(req,
                msg, clientBuffer);

        return remoteWriteFuture;

    }

    public void writeOnStream(final OutputStream os,
            final ISnapshotData snapshotData,
            final Quorum<HAGlue, QuorumService<HAGlue>> quorum, final long token)
            throws IOException, QuorumException, InterruptedException {

        // final FileInputStream filein = new FileInputStream(this.m_fd);
        final FileChannelUtility.ReopenerInputStream filein = new FileChannelUtility.ReopenerInputStream(
                m_reopener);
        try {
            MergeStreamWithSnapshotData.process(filein, snapshotData, os);
        } finally {
            filein.close();
        }

        if (quorum != null && !quorum.getClient().isJoinedMember(token)) {
            // See #1172
            throw new QuorumException();
        }

    }
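    /*
     * Illustrative sketch (hypothetical helper, not part of the original
     * API): the first-block adjustment made by writeOnStream2() below.
     * Replication starts at headerSize (the root blocks are skipped), so the
     * first block is shortened by headerSize; every subsequent read then
     * starts on a bufferCapacity boundary, which is friendlier to the disk.
     * For example, with a hypothetical bufferCapacity of 1048576 and
     * headerSize of 1024, the first block is 1047552 bytes and the second
     * block begins at offset 1048576.
     */
    private static int firstBlockSize(final int bufferCapacity,
            final int headerSize, final long remaining) {
        int nbytes = (int) Math.min((long) bufferCapacity, remaining);
        if (nbytes == bufferCapacity && remaining > bufferCapacity) {
            // Shorten the first block so later blocks are aligned.
            nbytes -= headerSize;
        }
        return nbytes;
    }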
    public void writeOnStream2(final OutputStream os,
            final Set<java.util.Map.Entry<Long, byte[]>> snapshotData,
            final Quorum<HAGlue, QuorumService<HAGlue>> quorum, final long token)
            throws IOException, QuorumException {

        IBufferAccess buf = null;
        try {

            try {
                // Acquire a buffer.
                buf = DirectBufferPool.INSTANCE.acquire();
            } catch (InterruptedException ex) {
                // Wrap and re-throw.
                throw new IOException(ex);
            }

            // The backing ByteBuffer.
            final ByteBuffer b = buf.buffer();

            // The capacity of that buffer (typically 1MB).
            final int bufferCapacity = b.capacity();

            // A big enough byte[].
            final byte[] a = new byte[bufferCapacity];

            // The size of the root blocks (which we skip).
            final int headerSize = FileMetadata.headerSize0;

            /*
             * The size of the file at the moment we begin. We will not
             * replicate data on new extensions of the file. Those data will
             * be captured by HALog files that are replayed by the service
             * that is doing the rebuild.
             */
            // final long fileExtent = getExtent();
            final long fileExtent = getStoreFile().length();

            // The #of bytes to be transmitted.
            final long totalBytes = fileExtent - headerSize;

            // The #of bytes remaining.
            long remaining = totalBytes;

            // The offset from which data is retrieved.
            long offset = headerSize;

            long sequence = 0L;

            if (log.isInfoEnabled())
                log.info("Writing on stream: nbytes=" + totalBytes);

            while (remaining > 0) {

                int nbytes = (int) Math.min((long) bufferCapacity, remaining);

                if (sequence == 0L && nbytes == bufferCapacity
                        && remaining > bufferCapacity) {

                    /*
                     * Adjust the first block so the remainder will be aligned
                     * on the bufferCapacity boundaries (IO efficiency).
                     */
                    nbytes -= headerSize;

                }

                if (log.isDebugEnabled())
                    log.debug("Writing block: sequence=" + sequence
                            + ", offset=" + offset + ", nbytes=" + nbytes);

                // read direct from store
                final ByteBuffer clientBuffer = b;
                clientBuffer.position(0);
                clientBuffer.limit(nbytes);

                readRaw(/* nbytes, */offset, clientBuffer);

                assert clientBuffer.remaining() > 0 : "Empty buffer: "
                        + clientBuffer;

                if (BytesUtil.toArray(clientBuffer, false/* forceCopy */,
                        a/* dst */) != a) {
                    // Should have copied into our array.
                    throw new AssertionError();
                }

                // write onto the stream.
                os.write(a, 0/* off */, nbytes/* len */);

                remaining -= nbytes;

                offset += nbytes;

                sequence++;

                if (!quorum.getClient().isJoinedMember(token))
                    throw new QuorumException();

            }

            if (log.isInfoEnabled())
                log.info("Wrote on stream: #blocks=" + sequence + ", #bytes="
                        + (fileExtent - headerSize));

        } finally {

            if (buf != null) {
                try {
                    // Release the direct buffer.
                    buf.release();
                } catch (InterruptedException e) {
                    log.warn(e);
                }
            }

        }

    }
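    /*
     * Illustrative usage sketch (hypothetical helper, not part of the
     * original API) for readRaw() below: the caller sizes the read window
     * with position/limit, readRaw() fills it from the backing file, and the
     * buffer comes back with its original position restored so it is ready
     * for reading.
     */
    private ByteBuffer readRawExample(final long offset, final int nbytes) {
        final ByteBuffer dst = ByteBuffer.allocate(nbytes);
        dst.position(0);
        dst.limit(nbytes);
        // Fills dst from the file starting at offset; position is restored.
        return readRaw(offset, dst);
    }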
    /**
     * Read on the backing file. {@link ByteBuffer#remaining()} bytes will be
     * read into the caller's buffer, starting at the specified offset in the
     * backing file.
     * 
     * @param offset
     *            The offset of the first byte (relative to the start of the
     *            data region).
     * @param dst
     *            Where to put the data. Bytes will be written at position
     *            until limit.
     * 
     * @return The caller's buffer, prepared for reading.
     */
    public ByteBuffer readRaw(final long offset, final ByteBuffer dst) {

        // Guard against concurrent file extension.
        final Lock lock = m_extensionLock.readLock();
        lock.lock();
        try {

            final int position = dst.position();

            try {

                final long beginDisk = System.nanoTime();

                // the offset into the disk file.
                // final long pos = FileMetadata.headerSize0 + offset;
                final long pos = offset;

                final int length = dst.limit();

                // read on the disk.
                final int ndiskRead = FileChannelUtility.readAll(m_reopener,
                        dst, pos);

                m_diskReads += ndiskRead;

                final long now = System.nanoTime();

                // update performance counters.
                final StoreCounters<?> c = (StoreCounters<?>) storeCounters
                        .get().acquire();
                try {
                    c.ndiskRead += ndiskRead;
                    final int nbytes = length;
                    c.nreads++;
                    c.bytesRead += nbytes;
                    c.bytesReadFromDisk += nbytes;
                    c.elapsedReadNanos += now - beginDisk;
                    c.elapsedDiskReadNanos += now - beginDisk;
                } finally {
                    c.release();
                }

            } catch (IOException ex) {

                throw new RuntimeException(ex);

            }

            // reset for reading
            dst.position(position);

            return dst;

        } finally {

            lock.unlock();

        }

    }

    public int getMaxBlobSize() {
        return m_maxBlobAllocSize - 4; // allow for checksum
    }

    public StorageStats getStorageStats() {
        return m_storageStats;
    }

    private final class RawTx implements IRawTx {

        private final AtomicBoolean m_open = new AtomicBoolean(true);

        RawTx() {
            activateTx();
        }

        @Override
        public void close() {
            if (m_open.compareAndSet(true/* expect */, false/* update */)) {
                deactivateTx();
            }
        }
    }

    @Override
    public IRawTx newTx() {
        return new RawTx();
    }

    private void activateTx() {
        m_allocationWriteLock.lock();
        try {
            m_activeTxCount++;
            if (log.isInfoEnabled())
                log.info("#activeTx=" + m_activeTxCount);
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    private void deactivateTx() {
        m_allocationWriteLock.lock();
        try {
            if (log.isInfoEnabled())
                log.info("Deactivating TX " + m_activeTxCount);

            if (m_activeTxCount == 0) {
                throw new IllegalStateException("Tx count must be positive!");
            }
            m_activeTxCount--;

            if (log.isInfoEnabled())
                log.info("#activeTx=" + m_activeTxCount);

            if (m_activeTxCount == 0 && m_contexts.isEmpty()) {
                releaseSessions();
            }
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * Debug ONLY method added to permit unit tests to verify that the native
     * transaction counter is correctly decremented to zero. The returned
     * value is ONLY valid while holding the {@link #m_allocationLock}.
     * Therefore this method MAY NOT be used reliably outside of code that can
     * guarantee that there are no concurrent committers on the
     * {@link RWStore}.
     * 
     * @see <a href="http://trac.blazegraph.com/ticket/1036"> Journal file
     *      growth reported with 1.3.3 </a>
     */
    public int getActiveTxCount() {
        m_allocationWriteLock.lock();
        try {
            return m_activeTxCount;
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * Returns the slot size associated with this address.
     */
    @Override
    public int getAssociatedSlotSize(final int addr) {
        return getBlock(addr).getBlockSize();
    }

    /**
     * lockAddress adds the address passed to a lock list. This is for debug
     * only and is not intended to be used generally for the live system.
     * 
     * @param addr
     *            - address to be locked
     */
    public void lockAddress(final int addr) {
        if (m_lockAddresses.putIfAbsent(addr, System.currentTimeMillis()) != null) {
            throw new IllegalStateException(
                    "address already locked, logical: " + addr
                            + ", physical: " + physicalAddress(addr, true));
        }
    }

    public void showWriteCacheDebug(final long paddr) {
        log.warn("WriteCacheDebug: " + paddr + " - "
                + m_writeCacheService.addrDebugInfo(paddr));
    }

    public CounterSet getWriteCacheCounters() {
        return m_writeCacheService.getCounters();
    }

    /**
     * If historical data is maintained then this will return the earliest
     * time for which data can be safely retrieved.
     * 
     * @return time of last release
     */
    @Override
    public long getLastReleaseTime() {
        return m_lastDeferredReleaseTime;
    }

    private ConcurrentWeakValueCache<Long, ICommitter> m_externalCache = null;
    private int m_cachedDatasize = 0;

    @Override
    public void registerExternalCache(
            final ConcurrentWeakValueCache<Long, ICommitter> externalCache,
            final int dataSize) {

        m_allocationWriteLock.lock();
        try {
            m_externalCache = externalCache;
            m_cachedDatasize = getSlotSize(dataSize);
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * Return <code>true</code> iff the allocation having that address is
     * flagged as committed. The caller must be holding the allocation lock in
     * order for the result to remain valid outside of the method call.
     * 
     * @param addr
     *            The address.
     * 
     * @return <code>true</code> iff the address is currently committed.
     */
    public boolean isCommitted(final int rwaddr) {

        // FIXME ALLOCATION LOCK : Why not use the ReadLock here?
        final Lock lock = m_allocationWriteLock;

        lock.lock();
        try {

            final FixedAllocator alloc = getBlockByAddress(rwaddr);

            final int offset = getOffset(rwaddr);

            return alloc.isCommitted(offset);

        } finally {
            lock.unlock();
        }
    }

    public boolean inWriteCache(final int rwaddr) {

        return m_writeCacheService.isPresent(physicalAddress(rwaddr, true));

    }

    @Override
    public InputStream getInputStream(long addr) {
        return new PSInputStream(this, addr);
    }

    @Override
    public IPSOutputStream getOutputStream() {
        return getOutputStream(null);
    }

    public IPSOutputStream getOutputStream(final IAllocationContext context) {
        checkContext(context);
        return PSOutputStream.getNew(this, m_maxFixedAlloc, context);
    }

    /**
     * Low level routine used when we replace the root blocks of an empty
     * journal in HA with those from the leader.
     * <p>
     * Note: This method is only invoked in contexts where there should not be
     * concurrent access to the journal. Thus we should not need to worry
     * about concurrent readers during
     * {@link #resetFromHARootBlock(IRootBlockView)}.
     * 
     * @see #postHACommit(IRootBlockView)
     */
    public void resetFromHARootBlock(final IRootBlockView rootBlock) {

        /*
         * Acquire exclusive access to the allocators.
         * 
         * Note: Since the allocation lock must be held before you may take
         * the extensionLock, and we have exclusive access to the allocation
         * lock, we SHOULD NOT need to take the extension lock as well.
         */
        final Lock outerLock = m_allocationWriteLock;
        outerLock.lock();
        try {

            // Exclude IOs.
            final Lock innerLock = m_extensionLock.writeLock();
            innerLock.lock();
            try {

                // should not be any dirty allocators
                // assert m_commitList.size() == 0;

                // Remove all current allocators
                m_allocs.clear();

                assert m_nextAllocation != 0;

                m_nextAllocation = 0;

                initfromRootBlock(rootBlock);

                // KICK external cache into touch
                // FIXME: handle with improved Allocator synchronization
                m_externalCache.clear();

                assert m_nextAllocation != 0;

            } finally {
                innerLock.unlock();
            }

        } catch (IOException e) {

            throw new RuntimeException(e);

        } finally {

            outerLock.unlock();

        }

    }
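    /*
     * Illustrative sketch (hypothetical helper, not part of the original
     * API): the nested lock discipline used by resetFromHARootBlock() above
     * and postHACommit() below. The allocation write lock is always taken
     * before the extension write lock, and they are released in the reverse
     * order; taking them in the opposite order elsewhere could deadlock
     * against these methods.
     */
    private void withExclusiveStoreAccess(final Runnable task) {
        final Lock outerLock = m_allocationWriteLock; // guards allocators
        outerLock.lock();
        try {
            final Lock innerLock = m_extensionLock.writeLock(); // excludes IOs
            innerLock.lock();
            try {
                task.run();
            } finally {
                innerLock.unlock();
            }
        } finally {
            outerLock.unlock();
        }
    }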
    /**
     * Called from {@link AbstractJournal} commit2Phase to ensure that a
     * downstream HA quorum member is able to read committed data that has
     * been streamed directly to the backing store.
     * <p>
     * The data stream will have included metabits and modified
     * {@link FixedAllocator}s so these must be reset using the metabitsAddr
     * data in the root block.
     * <p>
     * Note: Reads on the {@link RWStore} MUST block during this method since
     * some allocators may be replaced as part of the post-commit protocol.
     * <p>
     * Ticket #778 was for a problem when a follower takes over as leader and
     * was not correctly synchronised. This was traced, eventually, to a
     * problem in calculating the diskAddr metabit for the modified Allocator.
     * The problem was demonstrated by a temporary method to reserve
     * metaAllocations by extending and setting the m_transient bits. But that
     * has to be done within the commit() method before it attempts to save
     * all the dirty allocators. If we need to contrive a similar scenario in
     * the future, a better approach would be a special debug property on the
     * RWStore that indicates a "TRANSIENT_RESERVE" or something similar.
     * 
     * @param rbv
     *            The new {@link IRootBlockView}.
     */
    @SuppressWarnings("unchecked")
    public void postHACommit(final IRootBlockView rbv) {

        /*
         * Acquire exclusive access to the allocators.
         * 
         * Note: Since the allocation lock must be held before you may take
         * the extensionLock, and we have exclusive access to the allocation
         * lock, we SHOULD NOT need to take the extension lock as well.
         */
        final Lock outerLock = m_allocationWriteLock;
        outerLock.lock();
        try {

            final Lock innerLock = m_extensionLock.writeLock();
            innerLock.lock();
            try {

                // Current FixedAllocators for sanity
                if (log.isTraceEnabled()) {
                    log.trace("POSTHACOMMIT START");
                    for (int index = 0; index < m_allocs.size(); index++) {
                        final FixedAllocator xfa = m_allocs.get(index);
                        log.trace("Allocator " + index + ", size: "
                                + xfa.m_size + ", startAddress: "
                                + xfa.getStartAddr() + ", allocated: "
                                + (xfa.getAllocatedSlots() / xfa.m_size));
                    }
                }

                /*
                 * Update the m_metaBits addr and m_nextAllocation to ensure
                 * that we are able to allocate as well as read!
                 */
                {
                    final long nxtOffset = rbv.getNextOffset();

                    // next allocation to be made (in -32K units).
                    m_nextAllocation = -(int) (nxtOffset >> 32);

                    if (m_nextAllocation == 0) {
                        throw new IllegalStateException(
                                "Invalid state for non-empty store");
                    }

                    m_committedNextAllocation = m_nextAllocation;

                    // latched offset of the metabits region.
                    m_metaBitsAddr = -(int) nxtOffset;
                }

                final ArrayList<FixedAllocator> nallocs = new ArrayList<FixedAllocator>();

                // current metabits
                final int[] oldmetabits = m_metaBits;
                // new metabits
                final RootBlockInfo rbi = new RootBlockInfo(rbv, m_reopener);
                m_metaBits = rbi.m_metabits;

                // ...and grab the last deferred release and storageStats!
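                /*
                 * Illustrative note (added commentary, not original): each
                 * metabits block is cDefaultMetaBitsSize ints, laid out as
                 * int[0] = start address of the region and int[1..] =
                 * allocation bits. The "modded" bits computed below are the
                 * bits set in the new metabits but clear in the old ones,
                 * i.e. newly written allocator slots. For example, with old
                 * bits 0b0110 and new bits 0b1110, modded = 0b1110 & ~0b0110
                 * = 0b1000: only bit 3 identifies a freshly written
                 * allocator.
                 */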
                m_lastDeferredReleaseTime = rbi.m_lastDeferredReleaseTime;
                m_storageStatsAddr = rbi.m_storageStatsAddr;

                if (log.isTraceEnabled())
                    log.trace("Metabits length: " + m_metaBits.length);

                // Valid metabits should be multiples of default sizes
                if (oldmetabits.length % cDefaultMetaBitsSize != 0)
                    throw new AssertionError();
                if (m_metaBits.length % cDefaultMetaBitsSize != 0)
                    throw new AssertionError("New metabits: "
                            + m_metaBits.length + ", old: "
                            + oldmetabits.length);

                // Is it always valid to assume that:
                // metabits.length >= oldmetabits.length
                if (m_metaBits.length < oldmetabits.length)
                    throw new AssertionError();

                /*
                 * Compute the modded metabits, those newly written slots, by
                 * ANDing the new bits with the complement of the current
                 * bits.
                 */
                final int[] moddedBits = m_metaBits.clone();
                for (int b = 0; b < oldmetabits.length; b += cDefaultMetaBitsSize) {
                    // int[0] is startAddr, int[1:cDefaultMetaBitsSize] bits
                    for (int i = 1; i < cDefaultMetaBitsSize; i++) {
                        moddedBits[b + i] &= ~oldmetabits[b + i];
                    }
                }

                if (log.isTraceEnabled()) {
                    final StringBuilder sb = new StringBuilder();
                    final Iterator<Entry<Long, WeakReference<ICommitter>>> entries = m_externalCache
                            .entryIterator();
                    while (entries.hasNext()) {
                        sb.append(entries.next().getKey() + "|");
                    }
                    log.trace("External Cache Start Size: "
                            + m_externalCache.size() + ", entries: "
                            + sb.toString());
                }

                /*
                 * Stage One: Count moddedBits.
                 * Stage Two: Compute Address of modded bits.
                 * Stage Three: Read Allocator from modded address.
                 * Stage Four: Update Live Allocators.
                 */
                int modCount = 0;
                int totalFreed = 0;
                for (int i = 0; i < moddedBits.length; i += cDefaultMetaBitsSize) {
                    final long startAddr = convertAddr(m_metaBits[i]);
                    for (int j = 1; j < cDefaultMetaBitsSize; j++) {
                        final int chkbits = moddedBits[i + j];
                        for (int b = 0; b < 32; b++) {
                            if ((chkbits & (1 << b)) != 0) {
                                modCount++;

                                // Calculate address
                                final int bit = b + (32 * (j - 1));
                                final long paddr = startAddr
                                        + (bit * ALLOC_BLOCK_SIZE);

                                if (log.isTraceEnabled())
                                    log.trace("Allocator at: " + paddr);

                                // metaBit
                                // final int metaBit = (i * cDefaultMetaBitsSize * 32) + (j * 32) + b;
                                final int metaBit = ((i + j) * 32) + b;

                                // Now try to read it in
                                final FixedAllocator nalloc = readAllocator(paddr);

                                if (log.isTraceEnabled())
                                    log.trace("Allocator read of size: "
                                            + nalloc.m_size + ", metaBit: "
                                            + metaBit);

                                nalloc.setDiskAddr(metaBit);

                                /*
                                 * Now can we find an existing one to replace?
                                 * Otherwise we need to add to the new list.
                                 */
                                boolean found = false;
                                if (log.isTraceEnabled())
                                    log.trace("Checking allocator at "
                                            + nalloc.getStartAddr());
                                for (int index = 0; !found
                                        && index < m_allocs.size(); index++) {
                                    final FixedAllocator xfa = m_allocs
                                            .get(index);
                                    if (xfa.getStartAddr() == nalloc
                                            .getStartAddr()) {
                                        if (log.isTraceEnabled())
                                            log.trace("Found updated allocator at "
                                                    + index
                                                    + ", size: "
                                                    + xfa.m_size
                                                    + " vs "
                                                    + nalloc.m_size
                                                    + ", allocated slots: "
                                                    + (xfa.getAllocatedSlots() / xfa.m_size)
                                                    + " vs "
                                                    + (nalloc.getAllocatedSlots() / xfa.m_size));

                                        // Compare allocators to see if same
                                        found = true;

                                        // Replace old with new
                                        m_allocs.set(index, nalloc);
                                        nalloc.setIndex(index);

                                        // remove old from free list (if set)
                                        xfa.removeFromFreeList();

                                        // now clear any cached writes now freed
                                        totalFreed += nalloc.removeFreedWrites(
                                                xfa, m_externalCache);
                                    }
                                }
                                if (!found) {
                                    nallocs.add(nalloc);
                                }
                            }
                        }
                    }
                }

                if (log.isInfoEnabled())
                    log.info("Released: " + totalFreed + " addresses from "
                            + modCount + " modified Allocators");

                if (log.isTraceEnabled()) {
                    log.trace("OLD BITS: "
                            + BytesUtil.toHexString(oldmetabits));
                    log.trace("NEW BITS: "
                            + BytesUtil.toHexString(m_metaBits));
                    log.trace("MODDED BITS: "
                            + BytesUtil.toHexString(moddedBits));
                    log.trace("MODDED COUNT: " + modCount + " from "
                            + m_allocs.size() + " Allocators");
                }

                /*
                 * Now add in any new allocators, first sorting and setting
                 * their index numbers.
                 */
                if (nallocs.size() > 0) {
                    Collections.sort(nallocs);
                    final int sindex = m_allocs.size();
                    for (int index = 0; index < nallocs.size(); index++) {
                        ((Allocator) nallocs.get(index)).setIndex(sindex
                                + index);
                        if (log.isTraceEnabled())
                            log.trace("New Allocator, index: "
                                    + (sindex + index));
                    }
                    if (log.isTraceEnabled())
                        log.trace("Adding new allocators: " + sindex);
                    m_allocs.addAll(nallocs);
                }

                {
                    final long nxtOffset = rbv.getNextOffset();

                    // next allocation to be made (in -32K units).
                    m_nextAllocation = -(int) (nxtOffset >> 32);

                    if (m_nextAllocation == 0) {
                        /*
                         * Skip the first 32K in the file. The root blocks
                         * live here but nothing else.
                         */
                        m_nextAllocation = -(1 + META_ALLOCATION);
                    }

                    m_committedNextAllocation = m_nextAllocation;
                }

                if (log.isTraceEnabled()) {
                    log.trace("POSTHACOMMIT END");
                    for (int index = 0; index < m_allocs.size(); index++) {
                        final FixedAllocator xfa = m_allocs.get(index);
                        log.trace("Allocator " + index + ", startAddress: "
                                + xfa.getStartAddr() + ", allocated: "
                                + xfa.getAllocatedSlots());
                    }
                }

                if (log.isTraceEnabled())
                    log.trace("External Cache Pre Clear Size: "
                            + m_externalCache.size());

                /*
                 * If FixedAllocator.removeFreedWrites does its job then we do
                 * not need to clear the external cache.
                 */
                // m_externalCache.clear();

                assert m_nextAllocation != 0;

            } finally {
                innerLock.unlock();
            }

        } catch (IOException e) {

            throw new RuntimeException(e);

        } finally {

            outerLock.unlock();

        }

        // FIXME: Remove once allocators are synced
        // log.error("Complete implementation of postHACommit()");
        //
        // resetFromHARootBlock(rbv);
        //
        // log.warn("POSTHACOMMIT AFTER RESET");
        // for (int index = 0; index < m_allocs.size(); index++) {
        //     final FixedAllocator xfa = m_allocs.get(index);
        //     log.warn("Allocator " + index + ", startAddress: "
        //             + xfa.getStartAddr() + ", allocated: "
        //             + xfa.getAllocatedSlots());
        // }
    }
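    /*
     * Illustrative sketch (hypothetical helpers, not part of the original
     * API): how postHACommit() above maps a set metabit back to an allocator.
     * Within block i (a multiple of cDefaultMetaBitsSize), word j and bit b
     * identify metaBit = ((i + j) * 32) + b, and the allocator lives at
     * startAddr + bit * ALLOC_BLOCK_SIZE where bit = b + 32 * (j - 1).
     */
    private static int exampleMetaBit(final int i, final int j, final int b) {
        // e.g. i=0, j=1, b=3 -> metaBit 35.
        return ((i + j) * 32) + b;
    }

    private static long exampleAllocatorAddr(final long startAddr,
            final int j, final int b, final int allocBlockSize) {
        // The first bit word is j=1, so the bit offset within the region is
        // b + 32 * (j - 1); e.g. j=1, b=3 -> bit 3.
        final int bit = b + (32 * (j - 1));
        return startAddr + ((long) bit * allocBlockSize);
    }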
    /**
     * Simple class to collect up DeleteBlockStats; returned by
     * checkDeleteBlocks, called from DumpJournal.
     */
    public static class DeleteBlockStats {

        private int m_commitRecords = 0;
        private int m_addresses = 0;
        private int m_blobs = 0;
        private int m_badAddresses = 0;
        private final HashMap<Integer, Integer> m_freed = new HashMap<Integer, Integer>();

        /**
         * The latched address of each address that appears more than once
         * across the delete blocks.
         */
        private final Set<Integer> m_duplicates = new LinkedHashSet<Integer>();

//        /**
//         * The hexstring version of the data associated with the addresses
//         * that are present more than once in the delete blocks.
//         */
//        private final ArrayList<String> m_dupData = new ArrayList<String>();

        /**
         * The #of commit records that would be processed.
         */
        public int getCommitRecords() {
            return m_commitRecords;
        }

        /**
         * Return the #of addresses in the delete blocks across the commit
         * records.
         */
        public int getAddresses() {
            return m_addresses;
        }

        /**
         * Return the #of addresses that are not committed data across the
         * commit records.
         */
        public int getBadAddresses() {
            return m_badAddresses;
        }

        /**
         * Return the latched addresses that appear more than once in the
         * delete blocks across the commit records.
         */
        public Set<Integer> getDuplicateAddresses() {
            return m_duplicates;
        }

        public String toString(final RWStore store) {
            final StringBuilder sb = new StringBuilder();
            sb.append("CommitRecords: " + m_commitRecords + ", Addresses: "
                    + m_addresses + ", Blobs: " + m_blobs + ", bad: "
                    + m_badAddresses);
            if (!m_duplicates.isEmpty()) {
                for (int latchedAddr : m_duplicates) {
                    // final int latchedAddr = m_duplicates.get(i);
                    sb.append("\nDuplicate: latchedAddr=" + latchedAddr + "\n");
                    /*
                     * Note: Now dumped by DumpJournal.
                     */
//                    final byte[] data;
//                    try {
//                        data = store.readFromLatchedAddress(latchedAddr);
//                    } catch (IOException ex) {
//                        final String msg = "Could not read data: addr="
//                                + latchedAddr;
//                        log.error(msg, ex);
//                        sb.append(msg);
//                        continue;
//                    }
//
//                    final String hexStr = BytesUtil.toHexString(data,
//                            data.length);
//
//                    BytesUtil.printHexString(sb, hexStr);
                }
            }
            return sb.toString();
        }
    }

    /**
     * Utility to check the deleteBlocks associated with each active
     * CommitRecord.
     */
    public DeleteBlockStats checkDeleteBlocks(final AbstractJournal journal) {

        final DeleteBlockStats stats = new DeleteBlockStats();

        /*
         * Commit can be called prior to Journal initialisation, in which case
         * the commitRecordIndex will not be set.
         */
        final IIndex commitRecordIndex = journal.getReadOnlyCommitRecordIndex();

        if (commitRecordIndex == null) {
            return stats;
        }

        @SuppressWarnings("unchecked")
        final ITupleIterator<CommitRecordIndex.Entry> commitRecords = commitRecordIndex
                .rangeIterator();

        while (commitRecords.hasNext()) {

            final ITuple<CommitRecordIndex.Entry> tuple = commitRecords.next();

            final CommitRecordIndex.Entry entry = tuple.getObject();

            try {

                final ICommitRecord record = CommitRecordSerializer.INSTANCE
                        .deserialize(journal.read(entry.addr));

                final long blockAddr = record
                        .getRootAddr(AbstractJournal.DELETEBLOCK);

                if (blockAddr != 0) {

                    checkDeferrals(blockAddr, record.getTimestamp(), stats);

                }

                stats.m_commitRecords++;

            } catch (RuntimeException re) {

                throw new RuntimeException("Problem with entry at "
                        + entry.addr, re);

            }

        }

        return stats;

    }
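    /*
     * Illustrative sketch (hypothetical helpers, not part of the original
     * API): the packed long consumed by checkDeferrals() below. A deferred
     * delete block reference packs the latched int address into the high 32
     * bits and the block size into the low 32 bits.
     */
    private static int deferredBlockAddr(final long blockAddr) {
        // High word: the latched address of the delete block.
        return (int) (blockAddr >> 32);
    }

    private static int deferredBlockSize(final long blockAddr) {
        // Low word: the #of bytes in the delete block.
        return (int) (blockAddr & 0xFFFFFFFFL);
    }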
    /**
     * Utility method to verify the deferred delete blocks.
     * 
     * @param blockAddr
     *            The address of a deferred delete block.
     * @param commitTime
     *            The commitTime associated with the {@link ICommitRecord}.
     * @param stats
     *            Where to collect statistics.
     */
    private void checkDeferrals(final long blockAddr, final long commitTime,
            final DeleteBlockStats stats) {

        /*
         * Debug flag. When true, writes all frees onto stderr so they can be
         * read into a worksheet for analysis.
         */
        final boolean writeAll = false;

        final int addr = (int) (blockAddr >> 32);
        final int sze = (int) (blockAddr & 0xFFFFFFFFL);

        // Resolution for BLZG-1236 (recycler error)
        if (log.isTraceEnabled())
            log.trace("freeDeferrals at " + physicalAddress(addr) + ", size: "
                    + sze + " releaseTime: " + commitTime);

        final byte[] buf = new byte[sze + 4]; // allow for checksum
        getData(addr, buf);

        final DataInputStream strBuf = new DataInputStream(
                new ByteArrayInputStream(buf));

        m_allocationWriteLock.lock();
        // int totalFreed = 0;
        try {
            int nxtAddr = strBuf.readInt();

            // int cnt = 0;

            while (nxtAddr != 0) { // while (false && addrs-- > 0) {

                stats.m_addresses++;

                if (nxtAddr > 0) { // Blob
                    stats.m_blobs++;
                    final int bloblen = strBuf.readInt();
                    assert bloblen > 0; // a Blob address MUST have a size

                    nxtAddr = -nxtAddr;
                }

                if (!isCommitted(nxtAddr)) {
                    stats.m_badAddresses++;
                }

                if (stats.m_freed.containsKey(nxtAddr)) {
                    stats.m_duplicates.add(nxtAddr);
                    if (writeAll) {
                        log.warn("" + commitTime + " " + nxtAddr
                                + " FREE DUP");
                    }
                } else {
                    stats.m_freed.put(nxtAddr, nxtAddr);
                    if (writeAll) {
                        log.warn("" + commitTime + " " + nxtAddr + " FREE");
                    }
                }

                nxtAddr = strBuf.readInt();
            }

            // now check delete block
            assert isCommitted(addr);

        } catch (IOException e) {
            throw new RuntimeException("Problem checking deferrals: " + e, e);
        } finally {
            m_allocationWriteLock.unlock();
        }
    }

    /**
     * A low level utility method that reads directly from the backing
     * {@link FileChannel}.
     * <p>
     * Note: The latched address does not encode the actual length of the
     * data. Therefore, all data in the slot addressed by the latched address
     * will be returned.
     * 
     * @param nxtAddr
     *            The latched address.
     * 
     * @return The byte[] in the addressed slot.
     * 
     * @throws IOException
     */
    public final byte[] readFromLatchedAddress(final int nxtAddr)
            throws IOException {

        final Lock outerLock = m_allocationReadLock;
        outerLock.lock();
        try {

            final FixedAllocator alloc = getBlockByAddress(nxtAddr);

            final byte[] data = new byte[alloc.m_size];

            final ByteBuffer bb = ByteBuffer.wrap(data);

            final int offset = getOffset(nxtAddr);

            final long paddr = alloc.getPhysicalAddress(offset);

            // Guard IO against concurrent file extension.
            final Lock innerLock = m_extensionLock.readLock();
            innerLock.lock();
            try {

                FileChannelUtility.readAll(m_reopener, bb, paddr);

            } finally {

                innerLock.unlock();

            }

            return data;

        } finally {

            outerLock.unlock();

        }

    }

    /**
     * @see IHABufferStrategy#getBlockSequence()
     */
    public long getBlockSequence() {

        return lastBlockSequence;

    }

    private long lastBlockSequence = 0;

    /**
     * @see IHABufferStrategy#getCurrentBlockSequence()
     */
    public long getCurrentBlockSequence() {

        final WriteCacheService tmp = m_writeCacheService;

        if (tmp == null) {
            /*
             * Either this is not an HA strategy mode -or- we are in abort()
             * and the value is temporarily [null]. If there is an abort(),
             * then the counter will be reset to 0L.
             */
            return 0L;
        }

        return tmp.getSequence();

    }

//    private HARebuildRequest m_rebuildRequest = null;

//    /**
//     * Only blacklist the addr if not already available; in other words, a
//     * blacklisted address only makes sense if it is for previously committed
//     * data and not instantly recyclable.
//     */
//    public void blacklistAddress(int addr, final String info) {
//        if (m_blacklist == null) {
//            // add delay/synchronization to emulate blacklist delay?
//            return;
//        }
//
//        if (physicalAddress(addr) == 0)
//            throw new IllegalStateException(
//                    "Blacklist should only be called for a valid address");
//
//        if (info == null) {
//            throw new IllegalStateException("Blacklist must have info String");
//        }
//
//        if (m_blacklist.putIfAbsent(addr, info) != null)
//            throw new IllegalStateException("Address already blacklisted: "
//                    + addr + ", info: " + info + ", prev: "
//                    + m_blacklist.get(addr));
//    }

    /**
     * @see IHABufferStrategy#computeDigest(Object, MessageDigest)
     */
    public void computeDigest(final Object snapshot, final MessageDigest digest)
            throws DigestException, IOException {

        if (true) {
            computeDigestOld(snapshot, digest);
        } else {
            computeDigestAlt(snapshot, digest);
        }

    }

    private void computeDigestOld(final Object snapshot,
            final MessageDigest digest) throws DigestException, IOException {

        if (snapshot != null)
            throw new UnsupportedOperationException();

        IBufferAccess buf = null;
        try {

            try {
                // Acquire a buffer.
                buf = DirectBufferPool.INSTANCE.acquire();
            } catch (InterruptedException ex) {
                // Wrap and re-throw.
                throw new IOException(ex);
            }

            // The backing ByteBuffer.
            final ByteBuffer b = buf.buffer();

//            // A byte[] with the same capacity as that ByteBuffer.
//            final byte[] a = new byte[b.capacity()];

            // The capacity of that buffer (typically 1MB).
            final int bufferCapacity = b.capacity();

            // The size of the file at the moment we begin.
            final long fileExtent = getStoreFile().length();

            // The #of bytes whose digest will be computed.
            final long totalBytes = fileExtent;

            // The #of bytes remaining.
            long remaining = totalBytes;

            // The offset of the current block.
            long offset = 0L;

            // The block sequence.
            long sequence = 0L;

            if (log.isInfoEnabled())
                log.info("Computing digest: nbytes=" + totalBytes);

            while (remaining > 0) {

                final int nbytes = (int) Math.min((long) bufferCapacity,
                        remaining);

                if (log.isTraceEnabled())
                    log.trace("Computing digest: sequence=" + sequence
                            + ", offset=" + offset + ", nbytes=" + nbytes);

                // Setup for read.
                b.position(0);
                b.limit(nbytes);

                // read block
                readRaw(/* nbytes, */offset, b);

//                // Copy data into our byte[].
//                final byte[] c = BytesUtil.toArray(b, false/* forceCopy */, a);

                // update digest
                // digest.update(c, 0/* off */, nbytes/* len */);
                digest.update(b);

                remaining -= nbytes;

                offset += nbytes;

                sequence++;

            }

            if (log.isInfoEnabled())
                log.info("Computed digest: #blocks=" + sequence + ", #bytes="
                        + totalBytes);

            // Done.
            return;

        } finally {

            if (buf != null) {
                try {
                    // Release the direct buffer.
                    buf.release();
                } catch (InterruptedException e) {
                    log.warn(e);
                }
            }

        }

    }
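    /*
     * Illustrative sketch (hypothetical helper, not part of the original
     * API): the chunked digest pattern used by computeDigestOld() above,
     * reduced to its essentials. The file is visited in fixed-capacity
     * windows and MessageDigest.update(ByteBuffer) consumes each window in
     * turn; digest() then yields the hash over the whole extent.
     */
    private void digestExample(final long fileExtent, final ByteBuffer b,
            final MessageDigest digest) {
        final int bufferCapacity = b.capacity();
        long remaining = fileExtent;
        long offset = 0L;
        while (remaining > 0) {
            final int nbytes = (int) Math.min((long) bufferCapacity, remaining);
            b.position(0);
            b.limit(nbytes);
            readRaw(offset, b); // fill the window from the backing file
            digest.update(b); // consume [position, limit)
            remaining -= nbytes;
            offset += nbytes;
        }
    }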
    /**
     * This alternative implementation checks only the live allocations.
     * 
     * @param snapshot
     * @param digest
     * 
     * @throws DigestException
     * @throws IOException
     */
    private void computeDigestAlt(final Object snapshot,
            final MessageDigest digest) throws DigestException, IOException {

        if (snapshot != null)
            throw new UnsupportedOperationException();

        m_allocationWriteLock.lock();
        try {
            // FIXME add digest for RootBlocks!

            for (FixedAllocator fa : m_allocs) {
                fa.computeDigest(snapshot, digest);
            }
        } finally {
            m_allocationWriteLock.unlock();
        }

        {
            final byte[] data = digest.digest();
            final StringBuilder sb = new StringBuilder();
            for (byte b : data) {
                if (sb.length() > 0)
                    sb.append(",");
                sb.append(b);
            }

            log.warn("STORE DIGEST: " + sb.toString());
            log.warn("Free Deferrals: "
                    + this.m_deferredFreeOut.getBytesWritten());
        }

    }

    /**
     * Used as part of the rebuild protocol.
     * 
     * @throws IOException
     */
    public void writeRaw(final long offset, final ByteBuffer transfer)
            throws IOException {

        if (log.isDebugEnabled())
            log.debug("writeRaw: " + offset);

        // Guard IO against concurrent file extension.
        final Lock lock = m_extensionLock.readLock();

        lock.lock();

        try {

            FileChannelUtility.writeAll(m_reopener, transfer, offset);

        } finally {

            lock.unlock();

        }

    }

    private String showAllocatorList() {
        final StringBuilder sb = new StringBuilder();

        for (int index = 0; index < m_allocs.size(); index++) {
            final FixedAllocator xfa = m_allocs.get(index);
            sb.append("Allocator " + index + ", size: " + xfa.m_size
                    + ", startAddress: " + xfa.getStartAddr()
                    + ", allocated: " + xfa.getAllocatedSlots() + "\n");
        }

        return sb.toString();
    }

//    /**
//     * @return whether WCS is flushed
//     * 
//     * @see IBufferStrategy#isFlushed()
//     */
//    public boolean isFlushed() {
//        return this.m_writeCacheService.isFlushed();
//    }

    public static class RWStoreState implements StoreState {

        /**
         * Generated ID
         */
        private static final long serialVersionUID = 4315400143557397323L;

        /*
         * Transient state necessary for a consistent HA leader transition.
         */
        private final int m_fileSize;
        private final int m_nextAllocation;
        private final int m_committedNextAllocation;
        private final long m_minReleaseAge;
        private final long m_lastDeferredReleaseTime;
        private final long m_storageStatsAddr;
        private final int m_allocsSize;
        private final int m_metaBitsAddr;
        private final int m_metaBitsSize;

        private RWStoreState(final RWStore store) {
            m_fileSize = store.m_fileSize;
            m_nextAllocation = store.m_nextAllocation;
            m_committedNextAllocation = store.m_committedNextAllocation;
            m_minReleaseAge = store.m_minReleaseAge;
            m_lastDeferredReleaseTime = store.m_lastDeferredReleaseTime;
            m_storageStatsAddr = store.m_storageStatsAddr;
            m_allocsSize = store.m_allocs.size();
            m_metaBitsAddr = store.m_metaBitsAddr;
            m_metaBitsSize = store.m_metaBits.length;
        }

        @Override
        public boolean equals(final Object obj) {
            if (obj == null || !(obj instanceof RWStoreState))
                return false;
            final RWStoreState other = (RWStoreState) obj;
            return m_fileSize == other.m_fileSize
                    && m_nextAllocation == other.m_nextAllocation
                    && m_committedNextAllocation == other.m_committedNextAllocation
                    && m_minReleaseAge == other.m_minReleaseAge
                    && m_lastDeferredReleaseTime == other.m_lastDeferredReleaseTime
                    && m_storageStatsAddr == other.m_storageStatsAddr
                    && m_allocsSize == other.m_allocsSize
                    && m_metaBitsAddr == other.m_metaBitsAddr
                    && m_metaBitsSize == other.m_metaBitsSize;
        }

        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();
            sb.append("RWStoreState\n");
            sb.append("fileSize: " + m_fileSize + "\n");
            sb.append("nextAllocation: " + m_nextAllocation + "\n");
            sb.append("committedNextAllocation: " + m_committedNextAllocation
                    + "\n");
            sb.append("minReleaseAge: " + m_minReleaseAge + "\n");
            sb.append("lastDeferredReleaseTime: " + m_lastDeferredReleaseTime
                    + "\n");
            sb.append("storageStatsAddr: " + m_storageStatsAddr + "\n");
            sb.append("allocsSize: " + m_allocsSize + "\n");
            sb.append("metaBitsAddr: " + m_metaBitsAddr + "\n");
            sb.append("metaBitsSize: " + m_metaBitsSize + "\n");
            return sb.toString();
        }
    }
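    /*
     * Illustrative usage sketch (hypothetical helper, not part of the
     * original API): RWStoreState above captures the transient fields needed
     * to check that two services agree on store state across an HA leader
     * transition. equals() compares every captured field, so a single
     * divergent value (e.g. metaBitsAddr) is enough to flag the stores as
     * inconsistent.
     */
    private static boolean sameStoreState(final StoreState expected,
            final StoreState actual) {
        // Field-by-field comparison is delegated to RWStoreState.equals().
        return expected.equals(actual);
    }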
sb.append("metaBitsSize: " + m_metaBitsSize + "\n"); return sb.toString(); } } /** * Can be used to determine if an address is within an allocated slot. * * @param addr * @return whether addr is within slot allocated area */ public boolean verifyAllocatedAddress(final long addr) { for (int index = 0; index < m_allocs.size(); index++) { final FixedAllocator xfa = m_allocs.get(index); if (xfa.verifyAllocatedAddress(addr)) return true; } return false; } public StoreState getStoreState() { final RWStoreState ret = new RWStoreState(this); return ret; } /** * Forces a reset of the metabits allocation on the next commit. * <p> * Note that a side-effect of this is that there will be a memory leak * of either a FixedAllocation slot or an existing demi-space. * <p> * @param useDemispace * @return whether the storage has been modified. */ public boolean ensureMetabitsDemispace(final boolean useDemispace) { final boolean isDemispace = m_metaBitsAddr > 0; if (isDemispace != useDemispace || m_useMetabitsDemispace != useDemispace) { m_useMetabitsDemispace = useDemispace; m_metaBitsAddr = 0; m_recentAlloc = true; // force commit return true; } else { return false; } } public boolean isUsingDemiSpace() { return m_metaBitsAddr > 0; } /** * Add the address/byte[] to the snapshot representing the metabits allocaiton data * * @throws IOException */ public void snapshotMetabits(final ISnapshotData tm) throws IOException { final long mba; if (m_metaBitsAddr < 0) { mba = physicalAddress((int) m_metaBitsAddr); } else { // long ret = physicalAddress((int) m_metaBitsAddr); mba = convertAddr(-m_metaBitsAddr); // maximum 48 bit address range } tm.put(mba, genMetabitsData()); } /** * Add the address/allocator associated with each FixedAllocator to the snapshot map */ public void snapshotAllocators(final ISnapshotData tm) { for(FixedAllocator alloc : m_allocs) { alloc.snapshot(tm); } } class AllocationContext implements IAllocationContext { boolean m_active = true; final boolean m_isolated; public AllocationContext(boolean isolated) { m_isolated = isolated; } final public void checkActive() { if (!m_active) { throw new IllegalStateException(); } } final public void release() { checkActive(); m_active = false; } @Override public boolean isIsolated() { return m_isolated; } } public IAllocationContext newAllocationContext(final boolean isolated) { m_allocationWriteLock.lock(); try { final IAllocationContext ret = new AllocationContext(isolated); if (isolated) { final ContextAllocation ca = new ContextAllocation(this, m_freeFixed.length, null, ret); if (m_contexts.put(ret, ca) != null) { throw new AssertionError(); } } return ret; } finally { m_allocationWriteLock.unlock(); } } // public void prepareForRebuild(final HARebuildRequest req) { // assert m_rebuildRequest == null; // // m_rebuildRequest = req; // } // // public void completeRebuild(final HARebuildRequest req, final IRootBlockView rbv) { // assert m_rebuildRequest != null; // // assert m_rebuildRequest.equals(req); // // // TODO: reinit from file // this.resetFromHARootBlock(rbv); // // m_rebuildRequest = null; // } }